In [19]:
import requests
from bs4 import BeautifulSoup
import json
import uuid

In [78]:
joke_websites = {
    # Chuck Norris Top 100 Joke List 
    'chuckURL': 'https://punsandjokes.com/chuck-norris-jokes/',
    # Dad Joke List from /r/dadjokes
    'dadURL': 'https://www.reddit.com/r/dadjokes/comments/dcwqu1/514_dad_jokes/',
    # bs4 HEADER
    'header': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
}

{'chuckURL': 'https://punsandjokes.com/chuck-norris-jokes/',
 'dadURL': 'https://www.reddit.com/r/dadjokes/comments/dcwqu1/514_dad_jokes/',
 'header': {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}}

In [83]:
# Creating function to return page contents
def get_page_contents(site: str, header: dict) -> BeautifulSoup:
    response = requests.get(site, headers=header)
    return BeautifulSoup(response.content, 'html.parser')

Scrapping `Chuck Norris` jokes and saving to a list. Each joke will have a `jokeID` created via `uuid4`

In [84]:
chuck_joke_soup = get_page_contents(joke_websites['chuckURL'], joke_websites['header'])

In [86]:
# Getting list of li elements in the section of html where jokes are stored.
li_elements = chuck_joke_soup.select('div.entry-content.blog_post_text.blog_post_description.clearfix li')

# Creating empty list to append each chuck norris joke and jokeID to.
chuck_joke_list = []

for jk in li_elements:

    chuck_joke_list.append({
        'jokeID': str(uuid.uuid4()),
        'jokeTXT': jk.text.strip()
    })

In [92]:
# Checking total length of list and looking at first three elments to check successfull append
len(chuck_joke_list), chuck_joke_list[0:3]

(225,
 [{'jokeID': '6c8ed576-b7d2-4651-ba6d-510f028b732a',
   'jokeTXT': 'When Chuck Norris was a baby he farted for the first time, scientists say this is when the big bang occured.'},
  {'jokeID': '5e1fc6cb-b059-4024-9551-d0b904a61746',
   'jokeTXT': 'After Chuck Norris was born, he drove his Mother home from the hospital.'},
  {'jokeID': '67895b3f-4025-42e5-b553-2378c46dc6ca',
   'jokeTXT': 'Chuck Norris’ tears cure cancer. The problem is that he has never cried.'}])

Scrapping `Dad Jokes` and saving to a list. Each joke will ahve a `jokeID` created via `uuid4()`

Unlike the `Chuck Norris` joke list, if the dad joke contains and question mark it is split into a `jokeContent` and `jokeAnswer` key:values

In [93]:
dad_joke_soup = get_page_contents(joke_websites['dadURL'], joke_websites['header'])

In [94]:
# Extracting all joke elements from post contents. Each joke is contained in a seperate <p> element.
dad_joke_list = []

div_element = dad_joke_soup.find("div", {"slot": "text-body"})

if not div_element:  # if div_element is still None
    print("Unable to find the main div.")
else:
    jokes = div_element.find_all("p")

    for joke in jokes:
        joke_text = joke.text.strip()
    
        if joke_text:
            if '?' in joke_text:
                split_joke = joke_text.split('?', 1)
                joke_content = split_joke[0] + '?'
                joke_answer = split_joke[1].strip() if len(split_joke) > 1 else ''

                joke_data = {
                    'jokeID': str(uuid.uuid4()),
                    'jokeContent': joke_content,
                    'jokeAnser': joke_answer
                }

            else:
                joke_data = {
                    "jokeId": str(uuid.uuid4()),
                    "content": joke_text
                }

        dad_joke_list.append(joke_data)    

In [95]:
# Checking total length of list and looking at first three elments to check successfull append
len(dad_joke_list), dad_joke_list[0:3]

(635,
 [{'jokeID': 'c5aca0e2-fb73-41cd-b86d-83f03c3dcad5',
   'jokeContent': 'What do you call a fake noodle?',
   'jokeAnser': 'An Impasta.'},
  {'jokeID': 'c5aca0e2-fb73-41cd-b86d-83f03c3dcad5',
   'jokeContent': 'What do you call a fake noodle?',
   'jokeAnser': 'An Impasta.'},
  {'jokeId': '2ce785a5-3848-4cee-b816-12aac92cca78',
   'content': 'I would avoid the sushi if I was you. It’s a little fishy.'}])

Saving jokes out to JSON files for later use or import into a DB for use by DadBot

In [None]:
# Creating a single dict of all jokes
all_jokes = {
    'chuckJokes': chuck_joke_list,
    'dadJokes': dad_joke_list
}

In [101]:
# Writting out to json files.
with open('/repos/dadbot-py/data/new_joke_list.json', 'w') as file:
    json.dump(all_jokes, file, indent=4)

with open('/repos/dadbot-py/data/chuck_norris_only_list.json', 'w') as file:
    json.dump(chuck_joke_list, file, indent=4)

with open('/repos/dadbot-py/data/dad_joke_list.json', 'w') as file:
    json.dump(chuck_joke_list, file, indent=4)