# Data Cleaning

## Getting The Data

Scrapt transcript from scrapsfromtheloft, just pick the comedians I am interested

In [1]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle

# Scrapes transcript data from scrapsfromtheloft.com
def url_to_transcript(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "lxml")
#     content = soup.find(class_="elementor-element elementor-element-74af9a5b elementor-widget elementor-widget-theme-post-content")
#     text = content.find_all('p')
    text = [p.text for p in soup.find(class_="elementor-element elementor-element-74af9a5b elementor-widget elementor-widget-theme-post-content").find_all('p')]
    return text


# URLs of transcripts in scope
urls = ['https://scrapsfromtheloft.com/2021/05/20/tig-notaro-boyish-girl-interrupted-transcript/',
        'https://scrapsfromtheloft.com/2021/04/18/joe-list-i-hate-myself-transcript/',
        'https://scrapsfromtheloft.com/2020/11/12/sam-morril-i-got-this-2020-transcript/',
        'https://scrapsfromtheloft.com/2021/03/27/nate-bargatze-greatest-average-american-transcript/',
        'https://scrapsfromtheloft.com/2020/11/27/larry-the-cable-guy-remain-seated-transcript/',
        'https://scrapsfromtheloft.com/2020/11/05/bill-burr-snl-monologue-2020-transcript/',
        'https://scrapsfromtheloft.com/2020/08/22/rob-schneider-asian-momma-mexican-kids-transcript/',
        'https://scrapsfromtheloft.com/2020/06/25/eric-andre-legalize-everything-transcript/',
        'https://scrapsfromtheloft.com/2020/06/02/chris-gethard-career-suicide-transcript/',
        'https://scrapsfromtheloft.com/2020/05/26/hannah-gadsby-douglas-transcript/',
        'https://scrapsfromtheloft.com/2018/05/15/ali-wong-hard-knock-wife-full-transcript/',
        'https://scrapsfromtheloft.com/2018/05/06/kavin-jay-everybody-calm-down-full-transcript/']

# Comedian names
comedians = ['tig', 'joe', 'sam', 'nate', 'larry', 'bill', 'rob', 'eric', 'chris', 'hannah', 'ali', 'kavin']

In [2]:
# # Actually request transcripts (takes a few minutes to run)
transcripts = [url_to_transcript(u) for u in urls]

transcripts

[['(Chattering) (music playing)',
  '(audience cheering)',
  'Thank you. Oh, my gosh. Are you kidding me? My goodness. (Chuckles) Wow, thank you. People are like, “Tig!”',
  '(audience laughs)',
  '“Why are you shooting your special in Boston?”',
  '(audience laughs)',
  'Woman: Whoo!',
  'I’ll tell ya something, my grandfather… was originally from Boston.',
  '(Audience cheers)',
  'And my mother lived in Boston when she was a tiny, little person.',
  'Man: Whoo!',
  'And this rug… Hear me out. This rug I am standing on has been in my family since the 1800s and was in my mother’s house in Boston in the ’40s.',
  '(audience cheers)',
  'Why am I shooting my special… in Boston? I wanted to show you my rug.',
  '(audience laughs)',
  'Why am I shooting my special in Boston? No more stupid questions. (laughs) Please. I performed in Las Vegas and, um, when you do stand-up in Vegas, typically, you have to do an entire week, seven nights in a row, two shows a night. There’s an early show and

In [3]:
# Pickle files for later use

# Make a new directory to hold the text files
!mkdir transcripts

for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "wb") as file:
        pickle.dump(transcripts[i], file)

mkdir: transcripts: File exists


In [4]:
# Load pickled files
data = {}
for i, c in enumerate(comedians):
    with open("transcripts/" + c + ".txt", "rb") as file:
        data[c] = pickle.load(file)

In [5]:
# Double check to make sure data has been loaded properly
data.keys()

dict_keys(['tig', 'joe', 'sam', 'nate', 'larry', 'bill', 'rob', 'eric', 'chris', 'hannah', 'ali', 'kavin'])

In [6]:
# More checks
data['tig'][:2]

['(Chattering) (music playing)', '(audience cheering)']

## Cleaning The Data

In [7]:
# Let's take a look at our data again
next(iter(data.keys()))

'tig'

In [8]:
# Notice that our dictionary is currently in key: comedian, value: list of text format
next(iter(data.values()))

['(Chattering) (music playing)',
 '(audience cheering)',
 'Thank you. Oh, my gosh. Are you kidding me? My goodness. (Chuckles) Wow, thank you. People are like, “Tig!”',
 '(audience laughs)',
 '“Why are you shooting your special in Boston?”',
 '(audience laughs)',
 'Woman: Whoo!',
 'I’ll tell ya something, my grandfather… was originally from Boston.',
 '(Audience cheers)',
 'And my mother lived in Boston when she was a tiny, little person.',
 'Man: Whoo!',
 'And this rug… Hear me out. This rug I am standing on has been in my family since the 1800s and was in my mother’s house in Boston in the ’40s.',
 '(audience cheers)',
 'Why am I shooting my special… in Boston? I wanted to show you my rug.',
 '(audience laughs)',
 'Why am I shooting my special in Boston? No more stupid questions. (laughs) Please. I performed in Las Vegas and, um, when you do stand-up in Vegas, typically, you have to do an entire week, seven nights in a row, two shows a night. There’s an early show and a late show, an

In [9]:
# We are going to change this to key: comedian, value: string format
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [10]:
# Combine it!
data_combined = {key: [combine_text(value)] for (key, value) in data.items()}

In [11]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame.from_dict(data_combined).transpose()
data_df.columns = ['transcript']
data_df = data_df.sort_index()
data_df

Unnamed: 0,transcript
ali,"Ladies and gentlemen, please welcome to the stage Ali Wong! ♪ What y’all thought Y’all wasn’t gon’ see me? ♪\n♪ I’m the Osirus of this shit♪\n♪ Wu..."
bill,"Original air date: October 10, 2020 Host Bill Burr does stand-up about the COVID-19 pandemic, cancel culture and white women. Ladies and gentl..."
chris,♪ I hate my brain ♪\n♪ Because the things I think sometimes ♪\n♪ Are so judgmental and lame ♪\n♪ I’ve got everything that I want ♪\n♪ Except my se...
eric,Opening Sketch [dramatic music playing] [street bustles] [cars honk] [drumming rhythmically] [New Orleans jazz band plays] [sirens wail] [police d...
hannah,"The following is the transcript of Hannah Gadbsy: Douglas. In her second Netflix special, named after her dog, Gadsby explores how autism affects ..."
joe,"[Emcee] Ladies and gentlemen, Joe List. [audience applauds and cheers] Thank you. [audience applauds and cheers] Thank you. That was way too much...."
kavin,"Ladies and gentlemen, welcome to Kavin Jay: Everybody Calm Down! Now, please put your hands together for the guy whose name is in the title, Kavin..."
larry,"[Announcer] Ladies and gentlemen, Larry, The Cable Guy! [crowd cheer and applaud] All right. Thank you. Please remain seated. Do not rush the stag..."
nate,[folk rock music playing] ♪ Family ♪\n♪ Singin’ in the kitchen ♪\n♪ Family ♪\n♪ Runnin’ through the yard… ♪\n♪ Family ♪\n♪ Goin’ on vacation ♪\n♪ ...
rob,"[Rob Schneider] Ladies and gentlemen, please welcome… Rob Schneider. [audience cheering] [upbeat music playing] [cheering, clapping] Thank you ver..."


In [12]:
# Let's take a look at the transcript for Ali Wong
data_df.transcript.loc['ali']

'Ladies and gentlemen, please welcome to the stage Ali Wong! ♪ What y’all thought Y’all wasn’t gon’ see me? ♪\n♪ I’m the Osirus of this shit♪\n♪ Wu-Tang is here forever, motherfuckers♪\n♪ It’s like this ninety-seven ♪\n♪ Aight my n i g g a s and my n i g g arettes♪\n♪ Let’s do it like this♪\n♪ I’ma rub your ass in the moonshine♪\n♪ Let’s take it back to seventy-nine♪\n♪ I bomb atomically♪\n♪ Socrates’ philosophies and hypotheses♪\n♪ Can’t define How I be droppin’ these mockeries♪\n♪ Lyrically perform armed robbery ♪\n♪ Flee with the lottery Possibly they spotted me♪\n♪ Battle-scarred shogun♪\n♪ Explosion when my pen hits ♪ Oh, my goodness! I heard a rumor that all of the Asians in this city… Have congregated in this theater tonight. Yeah. Thank you for coming with your white boyfriends. I really… Appreciate it, from the bottom of my heart. I’m so excited to be here. I have not been performing that much at all, in the past two years, because two years ago, I gave birth to a baby girl. A

In [13]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

round1 = lambda x: clean_text_round1(x)

In [14]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_df.transcript.apply(round1))
data_clean

Unnamed: 0,transcript
ali,ladies and gentlemen please welcome to the stage ali wong ♪ what y’all thought y’all wasn’t gon’ see me ♪\n♪ i’m the osirus of this shit♪\n♪ wutan...
bill,original air date october host bill burr does standup about the pandemic cancel culture and white women ladies and gentlemen bill burr ♪ t...
chris,♪ i hate my brain ♪\n♪ because the things i think sometimes ♪\n♪ are so judgmental and lame ♪\n♪ i’ve got everything that i want ♪\n♪ except my se...
eric,opening sketch i stole this from the evidence room man legalize it he got weed you a damn fool what… really you police y...
hannah,the following is the transcript of hannah gadbsy douglas in her second netflix special named after her dog gadsby explores how autism affects her ...
joe,ladies and gentlemen joe list thank you thank you that was way too much i feel like i feel like everyone’s aware that that was too much there w...
kavin,ladies and gentlemen welcome to kavin jay everybody calm down now please put your hands together for the guy whose name is in the title kavin jay ...
larry,ladies and gentlemen larry the cable guy all right thank you please remain seated do not rush the stage thank you well this is awesome who cance...
nate,♪ family ♪\n♪ singin’ in the kitchen ♪\n♪ family ♪\n♪ runnin’ through the yard… ♪\n♪ family ♪\n♪ goin’ on vacation ♪\n♪ family ♪\n♪ on a credit c...
rob,ladies and gentlemen please welcome… rob schneider thank you very much thank you salt lake you can do it how are you you know what i realized...


In [15]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Get rid of some additional punctuation and non-sensical text that was missed the first time around, remove symbols'''
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub(r'[^\w]', ' ', text)
    return text

round2 = lambda x: clean_text_round2(x)

In [16]:
# Let's take a look at the updated text
data_clean = pd.DataFrame(data_clean.transcript.apply(round2))
data_clean

Unnamed: 0,transcript
ali,ladies and gentlemen please welcome to the stage ali wong what yall thought yall wasnt gon see me im the osirus of this shit wutang is here...
bill,original air date october host bill burr does standup about the pandemic cancel culture and white women ladies and gentlemen bill burr t...
chris,i hate my brain because the things i think sometimes are so judgmental and lame ive got everything that i want except my set of expe...
eric,opening sketch i stole this from the evidence room man legalize it he got weed you a damn fool what really you police ye...
hannah,the following is the transcript of hannah gadbsy douglas in her second netflix special named after her dog gadsby explores how autism affects her ...
joe,ladies and gentlemen joe list thank you thank you that was way too much i feel like i feel like everyones aware that that was too much there wa...
kavin,ladies and gentlemen welcome to kavin jay everybody calm down now please put your hands together for the guy whose name is in the title kavin jay ...
larry,ladies and gentlemen larry the cable guy all right thank you please remain seated do not rush the stage thank you well this is awesome who cance...
nate,family singin in the kitchen family runnin through the yard family goin on vacation family on a credit card hey that...
rob,ladies and gentlemen please welcome rob schneider thank you very much thank you salt lake you can do it how are you you know what i realized ...


## Organizing The Data

### Corpus

In [17]:
# Let's take a look at our dataframe
data_df

Unnamed: 0,transcript
ali,"Ladies and gentlemen, please welcome to the stage Ali Wong! ♪ What y’all thought Y’all wasn’t gon’ see me? ♪\n♪ I’m the Osirus of this shit♪\n♪ Wu..."
bill,"Original air date: October 10, 2020 Host Bill Burr does stand-up about the COVID-19 pandemic, cancel culture and white women. Ladies and gentl..."
chris,♪ I hate my brain ♪\n♪ Because the things I think sometimes ♪\n♪ Are so judgmental and lame ♪\n♪ I’ve got everything that I want ♪\n♪ Except my se...
eric,Opening Sketch [dramatic music playing] [street bustles] [cars honk] [drumming rhythmically] [New Orleans jazz band plays] [sirens wail] [police d...
hannah,"The following is the transcript of Hannah Gadbsy: Douglas. In her second Netflix special, named after her dog, Gadsby explores how autism affects ..."
joe,"[Emcee] Ladies and gentlemen, Joe List. [audience applauds and cheers] Thank you. [audience applauds and cheers] Thank you. That was way too much...."
kavin,"Ladies and gentlemen, welcome to Kavin Jay: Everybody Calm Down! Now, please put your hands together for the guy whose name is in the title, Kavin..."
larry,"[Announcer] Ladies and gentlemen, Larry, The Cable Guy! [crowd cheer and applaud] All right. Thank you. Please remain seated. Do not rush the stag..."
nate,[folk rock music playing] ♪ Family ♪\n♪ Singin’ in the kitchen ♪\n♪ Family ♪\n♪ Runnin’ through the yard… ♪\n♪ Family ♪\n♪ Goin’ on vacation ♪\n♪ ...
rob,"[Rob Schneider] Ladies and gentlemen, please welcome… Rob Schneider. [audience cheering] [upbeat music playing] [cheering, clapping] Thank you ver..."


In [18]:
# Let's add the comedians' full names as well
full_names = ['Ali wong', 'Bill burr', 'Chris gethard', 'Eric andre', 'Hannah gadsby', 'Joe list', 
             'Kavin jay', 'Larry', 'Nate bargatze', 'Rob schneider', 'Sam morril', 'Tig notaro']

data_df['full_name'] = full_names
data_df

Unnamed: 0,transcript,full_name
ali,"Ladies and gentlemen, please welcome to the stage Ali Wong! ♪ What y’all thought Y’all wasn’t gon’ see me? ♪\n♪ I’m the Osirus of this shit♪\n♪ Wu...",Ali wong
bill,"Original air date: October 10, 2020 Host Bill Burr does stand-up about the COVID-19 pandemic, cancel culture and white women. Ladies and gentl...",Bill burr
chris,♪ I hate my brain ♪\n♪ Because the things I think sometimes ♪\n♪ Are so judgmental and lame ♪\n♪ I’ve got everything that I want ♪\n♪ Except my se...,Chris gethard
eric,Opening Sketch [dramatic music playing] [street bustles] [cars honk] [drumming rhythmically] [New Orleans jazz band plays] [sirens wail] [police d...,Eric andre
hannah,"The following is the transcript of Hannah Gadbsy: Douglas. In her second Netflix special, named after her dog, Gadsby explores how autism affects ...",Hannah gadsby
joe,"[Emcee] Ladies and gentlemen, Joe List. [audience applauds and cheers] Thank you. [audience applauds and cheers] Thank you. That was way too much....",Joe list
kavin,"Ladies and gentlemen, welcome to Kavin Jay: Everybody Calm Down! Now, please put your hands together for the guy whose name is in the title, Kavin...",Kavin jay
larry,"[Announcer] Ladies and gentlemen, Larry, The Cable Guy! [crowd cheer and applaud] All right. Thank you. Please remain seated. Do not rush the stag...",Larry
nate,[folk rock music playing] ♪ Family ♪\n♪ Singin’ in the kitchen ♪\n♪ Family ♪\n♪ Runnin’ through the yard… ♪\n♪ Family ♪\n♪ Goin’ on vacation ♪\n♪ ...,Nate bargatze
rob,"[Rob Schneider] Ladies and gentlemen, please welcome… Rob Schneider. [audience cheering] [upbeat music playing] [cheering, clapping] Thank you ver...",Rob schneider


In [19]:
# Let's pickle it for later use
data_df.to_pickle("corpus.pkl")

### Document-Term Matrix

In [20]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer => Convert a collection of text documents to a matrix of token counts
cv = CountVectorizer(stop_words='english')         #  A stop word is a commonly used word (such as “the”, “a”, “an”, “in”) that a search engine has been programmed to ignore, both when indexing entries for searching and when retrieving them as the result of a search query. 
data_cv = cv.fit_transform(data_clean.transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data_clean.index
data_dtm

Unnamed: 0,aa,aah,abbreviated,abc,abduction,abetter,ability,ablazing,able,ableist,...,zinger,zip,ziploc,ziplock,zipping,zone,zoo,zoom,zumba,éses
ali,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
bill,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
chris,1,0,0,1,0,0,1,0,3,0,...,0,0,0,0,0,0,0,3,0,1
eric,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
hannah,0,0,0,0,0,0,0,1,3,1,...,0,6,0,0,0,0,0,0,0,0
joe,0,2,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
kavin,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
larry,0,1,0,0,0,0,0,0,3,0,...,0,0,0,1,1,0,0,0,0,0
nate,0,0,0,0,0,0,0,0,4,0,...,0,0,0,0,0,0,1,1,0,0
rob,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# Pickle Document-Term Matrix as later use
data_dtm.to_pickle("dtm.pkl")

In [22]:
# Let's also pickle the cleaned data (before we put it in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
pickle.dump(cv, open("cv.pkl", "wb"))