## Pulling Yo momma jokes from the internet

This notebook pulls jokes from four different sources and standardizes them for training a GPT-2 model.

In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re

Source "A Prairie Home Companion" (lol): https://www.prairiehome.org/story/1997/04/05/yo-mama-jokes.html

In [40]:
# Fetch the page and fail loudly on HTTP errors, so an error page is never
# silently parsed into an empty joke list.
prarie_page = requests.get(
    "https://www.prairiehome.org/story/1997/04/05/yo-mama-jokes.html",
    timeout=30,
)
prarie_page.raise_for_status()

soup = BeautifulSoup(prarie_page.text, 'html.parser')

# The candidate jokes are the direct <p> children of the first <blockquote>.
blockquote = soup.find("blockquote")
if blockquote is None:
    raise ValueError("No <blockquote> found on the page; its layout may have changed")
text = blockquote.find_all("p", recursive=False)

prarie_jokes = {"source": [], "ym_joke": []}

for ym_joke in text:

    # Strip double quotes from the paragraph text.
    joke = ym_joke.get_text().replace('"', "")

    # Only paragraphs that mention "momma" are kept.
    if "momma" in joke:
        prarie_jokes["source"].append("prarie home")
        prarie_jokes["ym_joke"].append(joke)

prarie = pd.DataFrame.from_dict(prarie_jokes)

# standardize lingo: "momma's" becomes "momma" (literal match, not a regex)
prarie["ym_joke"] = prarie["ym_joke"].str.replace("momma's", "momma", regex=False)

prarie.to_csv("data/prarie_ym_jokes.csv", index=False)

Source "yomomma-api": https://github.com/rdegges/yomomma-api/blob/master/jokes.txt

In [37]:
# Parse the first HTML table on the GitHub blob page.
# NOTE(review): assumes column 0 is a filler column and column 1 holds the joke
# text — confirm against the page's current markup.
ym_api = (
    pd.read_html("https://github.com/rdegges/yomomma-api/blob/master/jokes.txt")[0]
    .reset_index(drop=True)
    .rename(columns={0: "source", 1: "ym_joke"})
)

# Column 0 is repurposed to hold the source label.
ym_api["source"] = "ym api"

# standardize lingo
ym_api["ym_joke"] = ym_api["ym_joke"].str.replace("mama", "momma", regex=True, flags=re.IGNORECASE)
ym_api["ym_joke"] = ym_api["ym_joke"].str.replace("Momma's", "momma", regex=True, flags=re.IGNORECASE)
# literal match; regex=False keeps this identical across pandas versions
ym_api["ym_joke"] = ym_api["ym_joke"].str.replace("mommags", "momma", regex=False)

# na=False: rows with missing joke text are dropped rather than producing a
# NaN in the mask, which boolean indexing rejects.
ym_api = ym_api[ym_api["ym_joke"].str.startswith("Yo ", na=False)]

ym_api.to_csv("data/ym_api_jokes.csv", index=False)

Source: "IRC-Bot": https://github.com/iambibhas/IRC-Bot/blob/master/insults.txt

In [39]:
# Parse the first HTML table on the GitHub blob page.
# NOTE(review): assumes column 0 is a filler column and column 1 holds the
# insult text — confirm against the page's current markup.
irc_bot = (
    pd.read_html("https://github.com/iambibhas/IRC-Bot/blob/master/insults.txt")[0]
    .reset_index(drop=True)
    .rename(columns={0: "source", 1: "ym_joke"})
)

# Column 0 is repurposed to hold the source label.
irc_bot["source"] = "irc bot"

# Keep only insults mentioning mom/mam (any case).
# - Non-capturing group: a capturing group makes pandas warn that the pattern
#   "has match groups".
# - na=False: missing text counts as "no match" instead of breaking the mask.
# - .copy(): the column assignments below then write to an independent frame
#   rather than a slice of the original (avoids SettingWithCopyWarning).
is_mom_insult = irc_bot["ym_joke"].str.contains(
    "(?:mom|mam)", flags=re.IGNORECASE, regex=True, na=False
)
irc_bot = irc_bot[is_mom_insult].copy()

# Spelling fixes, applied strictly in this order (later entries rely on
# earlier ones). Each entry is (pattern, replacement, ignore_case).
irc_lingo_fixes = [
    ("Your ", "Yo ", False),
    ("You're ", "Yo ", False),
    ("mommas", "momma", False),
    ("momma's", "momma", True),
    ("mama's", "momma", True),
    ("mama", "momma", True),
    ("mamma's", "momma", True),
    ("mamma", "momma", True),
    ("Momma", "momma", False),
    ("mom ", "momma ", False),
    ("mom's", "momma", False),
]

for pattern, replacement, ignore_case in irc_lingo_fixes:
    if ignore_case:
        # Case-insensitive matching requires the regex engine.
        irc_bot["ym_joke"] = irc_bot["ym_joke"].str.replace(
            pattern, replacement, flags=re.IGNORECASE, regex=True
        )
    else:
        # Plain literal replacement; explicit regex=False is version-stable.
        irc_bot["ym_joke"] = irc_bot["ym_joke"].str.replace(
            pattern, replacement, regex=False
        )

# Only entries that now open with "Yo " are kept.
irc_bot = irc_bot[irc_bot["ym_joke"].str.startswith("Yo ", na=False)]

irc_bot.to_csv("data/irc_bot_jokes.csv", index=False)

  return func(self, *args, **kwargs)


Source corz.org: https://corz.org/public/docs/miscy/insults.txt

In [28]:
# Fetch the page and fail loudly on HTTP errors instead of parsing an error body.
corz_page = requests.get("https://corz.org/public/docs/miscy/insults.txt", timeout=30)
corz_page.raise_for_status()

soup = BeautifulSoup(corz_page.content, 'html.parser')

# The insults are read from the children of the "textview-body" div.
text = soup.find("div", attrs={"class": "textview-body"})
if text is None:
    raise ValueError('No <div class="textview-body"> found; the page layout may have changed')

corz_jokes = {"source": [], "ym_joke": []}

for line in text:
    # Children of a tag can be nested tags as well as strings. Only string
    # children are treated as text lines; a tag has no string .replace().
    if not isinstance(line, str):
        continue

    if "mam" in line or "mom" in line:
        # drop newlines and replace latin nbsp with a regular space
        line = line.replace("\n", "").replace(u'\xa0', u' ')
        corz_jokes["source"].append("corz")
        corz_jokes["ym_joke"].append(line)

corz = pd.DataFrame.from_dict(corz_jokes)

# standardize the lingo (literal matches; regex=False is version-stable)
corz["ym_joke"] = corz["ym_joke"].str.replace("mamma", "momma", regex=False)
corz["ym_joke"] = corz["ym_joke"].str.replace("momma's", "momma", regex=False)

corz["ym_joke"] = corz["ym_joke"].str.replace("your mom ", "Yo momma ", regex=False)
corz["ym_joke"] = corz["ym_joke"].str.replace("Your momma ", "Yo momma ", regex=False)

# remove truncated jokes (15 characters or fewer) and only grab yo momma jokes
corz = corz[(corz["ym_joke"].str.len() > 15) & (corz["ym_joke"].str.startswith("Yo "))]

corz.to_csv("data/corz_ym_jokes.csv", index=False)

## Combine the data sources, standardize them in a few different ways

In [42]:
# Per-source CSVs, combined in this order.
csv_file_list = [
    "data/prarie_ym_jokes.csv",
    "data/ym_api_jokes.csv",
    "data/irc_bot_jokes.csv",
    "data/corz_ym_jokes.csv",
]

# One DataFrame per source file.
list_of_dataframes = [pd.read_csv(filename) for filename in csv_file_list]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every source restarts at 0, leaving duplicate labels that make label-based
# lookups ambiguous. The "source" column is not needed for training.
ym = pd.concat(list_of_dataframes, ignore_index=True).drop(columns=["source"])

ym.head()

Unnamed: 0,ym_joke
0,Yo momma so dumb she bought a solar-powered fl...
1,Yo momma so dumb she watches The Three Stooges...
2,Yo momma so dumb it took her 2 hours to watch ...
3,"Yo momma so dumb she sits on the TV, and watch..."
4,Yo momma so dumb she stepped on a crack and br...


### style 1: raw jokes

In [43]:
# Write the combined jokes as a CSV, leaving out the DataFrame index.
raw_jokes_path = "data/for_training/all_ym_jokes_raw.csv"
ym.to_csv(raw_jokes_path, index=False)

### style 2: a text file in the style of the Shakespeare script file

In [58]:
ym_jokes = ym.values

# Explicit UTF-8 so the file does not depend on the platform's default
# encoding.
with open('data/for_training/all_ym_jokes.txt', 'w', encoding="utf-8") as f:
    for joke in ym_jokes:
        # joke[0] is the first value of the row, i.e. the joke text.
        # maxsplit=1 removes only the first "Yo momma ". An unbounded split
        # with [-1] keeps just the text after the LAST occurrence and would
        # truncate any joke that repeats the phrase.
        joke = joke[0].split("Yo momma ", 1)[-1]
        # Script-style layout: a speaker line, the line itself, a blank line.
        f.write("Yo momma:\n")
        f.write(joke)
        f.write("\n\n")