In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

## Data path

In [2]:
# raw_data
file = '03_data_merge.csv'
data_directory = os.path.join( '..','data','raw_data/{}'.format(file))

In [3]:
df = pd.read_csv(data_directory)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 5 columns):
name          244 non-null object
raw_title     244 non-null object
summary       244 non-null object
transcript    244 non-null object
link          244 non-null object
dtypes: object(5)
memory usage: 9.6+ KB


In [5]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link
0,Enissa Amani,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...","A NETFLIX ORIGINAL COMEDY SPECIAL\nHey, what’s...",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,George Carlin,Politically Correct Language – by George Carlin,George Carlin's critical thinking on pc langua...,"I know I’m a little late with this, but I’d li...",https://scrapsfromtheloft.com/2019/02/18/polit...
2,Ken Jeong,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,The Hangover… Community… Dr. Ken… Crazy Rich A...,https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,Ray Romano,"Ray Romano: Right Here, Around the Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,It’s been 23 years since I did a comedy specia...,https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,Gabriel Iglesias,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...",[crowd chanting] Fluffy! Fluffy! [Fluffy shout...,https://scrapsfromtheloft.com/2019/01/30/gabri...


# Clean
---

## YEAR
Extract year of the comedy special from `raw_title` column

In [6]:
def get_year(data):
    ''' return year'''
    patt = r'(\d{4})' # ####
    text = data.raw_title 
    pattern = re.compile(patt) 
    matches = pattern.findall(text) 
    if len(matches) == 0: #matches = 0 
        return 0 
    else: 
        value = matches[0] 
        return value

In [7]:
df['year'] = df.apply(get_year,axis=1)

In [8]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link,year
0,Enissa Amani,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...","A NETFLIX ORIGINAL COMEDY SPECIAL\nHey, what’s...",https://scrapsfromtheloft.com/2019/02/19/eniss...,2018
1,George Carlin,Politically Correct Language – by George Carlin,George Carlin's critical thinking on pc langua...,"I know I’m a little late with this, but I’d li...",https://scrapsfromtheloft.com/2019/02/18/polit...,0
2,Ken Jeong,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,The Hangover… Community… Dr. Ken… Crazy Rich A...,https://scrapsfromtheloft.com/2019/02/17/ken-j...,2019
3,Ray Romano,"Ray Romano: Right Here, Around the Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,It’s been 23 years since I did a comedy specia...,https://scrapsfromtheloft.com/2019/02/12/ray-r...,2019
4,Gabriel Iglesias,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...",[crowd chanting] Fluffy! Fluffy! [Fluffy shout...,https://scrapsfromtheloft.com/2019/01/30/gabri...,2019


## Remove name from `raw_title`
`raw_title` contains name of the comedian, since we have a column that already does that, lets remove the comedian name from column
- crate a list of unique names from column `name
- fix casing on `raw_title` to match our list of names
- remove name from `raw_title` if its in our name list

In [9]:
# create list of names
names = list(df.name.unique())

In [10]:
# fix casing 
df.raw_title = df.raw_title.str.title()

In [11]:
len(names)

115

In [12]:
names[:10]

['Enissa Amani',
 'George Carlin',
 'Ken Jeong',
 'Ray Romano',
 'Gabriel Iglesias',
 'Sebastian Maniscalco',
 'John Leguizamo',
 'Ellen Degeneres',
 'Pete Holmes',
 'Vir Das']

In [13]:
df.raw_title[:10]

0       Enissa Amani: Ehrenwort (2018) Full Transcript
1      Politically Correct Language – By George Carlin
2    Ken Jeong: You Complete Me, Ho (2019) – Full T...
3    Ray Romano: Right Here, Around The Corner (201...
4    Gabriel “Fluffy” Iglesias: One Show Fits All (...
5    Sebastian Maniscalco: Stay Hungry (2019) – Ful...
6    Latin History For Morons: John Leguizamo’S Roa...
7    Sebastian Maniscalco: Aren’T You Embarrassed? ...
8    Sebastian Maniscalco: What’S Wrong With People...
9    Ellen Degeneres: Relatable (2018) – Full Trans...
Name: raw_title, dtype: object

In [14]:
df['title_clean'] = df['raw_title'].str.replace('|'.join(names), '',case=False)

In [15]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link,year,title_clean
0,Enissa Amani,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...","A NETFLIX ORIGINAL COMEDY SPECIAL\nHey, what’s...",https://scrapsfromtheloft.com/2019/02/19/eniss...,2018,: Ehrenwort (2018) Full Transcript
1,George Carlin,Politically Correct Language – By George Carlin,George Carlin's critical thinking on pc langua...,"I know I’m a little late with this, but I’d li...",https://scrapsfromtheloft.com/2019/02/18/polit...,0,Politically Correct Language – By
2,Ken Jeong,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,The Hangover… Community… Dr. Ken… Crazy Rich A...,https://scrapsfromtheloft.com/2019/02/17/ken-j...,2019,": You Complete Me, Ho (2019) – Full Transcript"
3,Ray Romano,"Ray Romano: Right Here, Around The Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,It’s been 23 years since I did a comedy specia...,https://scrapsfromtheloft.com/2019/02/12/ray-r...,2019,": Right Here, Around The Corner (2019) – Full ..."
4,Gabriel Iglesias,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...",[crowd chanting] Fluffy! Fluffy! [Fluffy shout...,https://scrapsfromtheloft.com/2019/01/30/gabri...,2019,Gabriel “Fluffy” Iglesias: One Show Fits All (...


## remove `full_transcript`
- `title_clean` contains the  several versions of the string `full_transcipt`
- remove it by replacing it with an empty string

In [16]:
df['title_clean'] = df.title_clean.str.replace('– Full Transcript|Full Transcript|Transcript','',case=False)

In [17]:
df.title_clean.head()

0                                  : Ehrenwort (2018) 
1                   Politically Correct Language – By 
2                        : You Complete Me, Ho (2019) 
3              : Right Here, Around The Corner (2019) 
4    Gabriel “Fluffy” Iglesias: One Show Fits All (...
Name: title_clean, dtype: object

## Remove year
- since we extracted the year already,we can remove it
- regex extract 4 digits incase in parenthesis
- `(####)`

In [18]:
#yr_patt = r'\((.*?)\)'
regex_pat = re.compile(r'\((.*?)\)', flags=re.IGNORECASE)

df['title_clean'] = df.title_clean.str.replace(regex_pat, '')

In [19]:
df.title_clean.head()

0                                     : Ehrenwort  
1                Politically Correct Language – By 
2                           : You Complete Me, Ho  
3                 : Right Here, Around The Corner  
4    Gabriel “Fluffy” Iglesias: One Show Fits All  
Name: title_clean, dtype: object

In [None]:
#.title_03.str.title()
#df.title_03.str.split(':',expand=True)

# Remove random chars
- `:`
- `-`


In [20]:
df['title_clean'] = df.title_clean.str.replace(':|–','')

In [21]:
df.title_clean.head()

0                                      Ehrenwort  
1                Politically Correct Language  By 
2                            You Complete Me, Ho  
3                  Right Here, Around The Corner  
4    Gabriel “Fluffy” Iglesias One Show Fits All  
Name: title_clean, dtype: object

## REmove brackets
- [###]

In [22]:
#yr_patt = r'\((.*?)\)'
regex_pat = re.compile(r'\[(.*?)\]', flags=re.IGNORECASE)

df['title_clean'] = df.title_clean.str.replace(regex_pat, '')

In [23]:
df.title_clean.head()

0                                      Ehrenwort  
1                Politically Correct Language  By 
2                            You Complete Me, Ho  
3                  Right Here, Around The Corner  
4    Gabriel “Fluffy” Iglesias One Show Fits All  
Name: title_clean, dtype: object

## Rename 
- rename `Saturday Night Live`  to `SNL`

In [24]:
df['title_clean'] = df.title_clean.str.replace('Saturday Night Live|Snl','SNL',case=False)

In [25]:
df.title_clean.head()

0                                      Ehrenwort  
1                Politically Correct Language  By 
2                            You Complete Me, Ho  
3                  Right Here, Around The Corner  
4    Gabriel “Fluffy” Iglesias One Show Fits All  
Name: title_clean, dtype: object

## REmove names
- Some names could not be removed from the title column
- have to do it "manually"

In [26]:
names_other = [
    'Gabriel “Fluffy” Iglesias',
    '& Larry The Cable Guy',
    'David Chappelle',
    'Patrice O’Neal',
]
df['title_clean'] = df['title_clean'].str.replace('|'.join(names_other), '',case=False)

In [27]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link,year,title_clean
0,Enissa Amani,Enissa Amani: Ehrenwort (2018) Full Transcript,"Live from Hamburg, Iranian-German comedian Eni...","A NETFLIX ORIGINAL COMEDY SPECIAL\nHey, what’s...",https://scrapsfromtheloft.com/2019/02/19/eniss...,2018,Ehrenwort
1,George Carlin,Politically Correct Language – By George Carlin,George Carlin's critical thinking on pc langua...,"I know I’m a little late with this, but I’d li...",https://scrapsfromtheloft.com/2019/02/18/polit...,0,Politically Correct Language By
2,Ken Jeong,"Ken Jeong: You Complete Me, Ho (2019) – Full T...",Filmed at the Ice House Comedy Club in Pasaden...,The Hangover… Community… Dr. Ken… Crazy Rich A...,https://scrapsfromtheloft.com/2019/02/17/ken-j...,2019,"You Complete Me, Ho"
3,Ray Romano,"Ray Romano: Right Here, Around The Corner (201...",Ray Romano cut his stand-up teeth at the Comed...,It’s been 23 years since I did a comedy specia...,https://scrapsfromtheloft.com/2019/02/12/ray-r...,2019,"Right Here, Around The Corner"
4,Gabriel Iglesias,Gabriel “Fluffy” Iglesias: One Show Fits All (...,"In a new special from Houston, Gabriel ""Fluffy...",[crowd chanting] Fluffy! Fluffy! [Fluffy shout...,https://scrapsfromtheloft.com/2019/01/30/gabri...,2019,One Show Fits All


In [28]:
df.title_clean.unique()

array([' Ehrenwort  ', 'Politically Correct Language  By ',
       ' You Complete Me, Ho  ', ' Right Here, Around The Corner  ',
       ' One Show Fits All  ', ' Stay Hungry  ',
       'Latin History For Morons ’S Road To Broadway  ',
       ' Aren’T You Embarrassed?  ', ' What’S Wrong With People?  ',
       ' Relatable  ', ' Dirty Clean  ', ' Losing It  ', ' About Usa   ',
       ' Son Of Patricia  ', '  We’Ve Been Thinking  ', ' 100% Fresh  ',
       ' Totally Committed  ', ' If You Quit Listening, I’Ll Shut Up  ',
       'Comedy Central Presents   ', ' The Vagabond  ',
       '’S Award-Winning Comedy Special  ', ' Keep Talking, Pal  ',
       ' SNL Monologue S01E01  ', ' Strange Times  ',
       ' SNL Monologue S38E06  ', ' SNL Monologue S39E16  ',
       ' SNL Monologue  Season 40 | Episode 2 | 10/04/2014  ',
       ' SNL Monologue May 16, 2015 ', ' SNL Monologue  ',
       ' Contrarian  ', ' Freezing Hot  ', ' War Paint  ',
       ' Secret Time  ',
       ' Speech At St. John’S B

In [29]:
# remove any whitespace from title
df['title_clean'] = df['title_clean'].str.strip()

In [30]:
df.title_clean.unique()

array(['Ehrenwort', 'Politically Correct Language  By',
       'You Complete Me, Ho', 'Right Here, Around The Corner',
       'One Show Fits All', 'Stay Hungry',
       'Latin History For Morons ’S Road To Broadway',
       'Aren’T You Embarrassed?', 'What’S Wrong With People?',
       'Relatable', 'Dirty Clean', 'Losing It', 'About Usa',
       'Son Of Patricia', 'We’Ve Been Thinking', '100% Fresh',
       'Totally Committed', 'If You Quit Listening, I’Ll Shut Up',
       'Comedy Central Presents', 'The Vagabond',
       '’S Award-Winning Comedy Special', 'Keep Talking, Pal',
       'SNL Monologue S01E01', 'Strange Times', 'SNL Monologue S38E06',
       'SNL Monologue S39E16',
       'SNL Monologue  Season 40 | Episode 2 | 10/04/2014',
       'SNL Monologue May 16, 2015', 'SNL Monologue', 'Contrarian',
       'Freezing Hot', 'War Paint', 'Secret Time',
       'Speech At St. John’S Baptist Church, May 20, 1963',
       'The Overthinker', 'Live From Oklahoma', 'Live', 'Confirmed Kills',

## Drop unused columns
- only select our final 6 columns

In [31]:
## Select columns
df = df[['name','title_clean','summary','year','transcript','link']]

In [32]:
df.head()

Unnamed: 0,name,title_clean,summary,year,transcript,link
0,Enissa Amani,Ehrenwort,"Live from Hamburg, Iranian-German comedian Eni...",2018,"A NETFLIX ORIGINAL COMEDY SPECIAL\nHey, what’s...",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,George Carlin,Politically Correct Language By,George Carlin's critical thinking on pc langua...,0,"I know I’m a little late with this, but I’d li...",https://scrapsfromtheloft.com/2019/02/18/polit...
2,Ken Jeong,"You Complete Me, Ho",Filmed at the Ice House Comedy Club in Pasaden...,2019,The Hangover… Community… Dr. Ken… Crazy Rich A...,https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,Ray Romano,"Right Here, Around The Corner",Ray Romano cut his stand-up teeth at the Comed...,2019,It’s been 23 years since I did a comedy specia...,https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,Gabriel Iglesias,One Show Fits All,"In a new special from Houston, Gabriel ""Fluffy...",2019,[crowd chanting] Fluffy! Fluffy! [Fluffy shout...,https://scrapsfromtheloft.com/2019/01/30/gabri...


In [33]:
# rename col
df.rename(columns={"title_clean": "title"},inplace=True)

In [34]:
df.head()

Unnamed: 0,name,title,summary,year,transcript,link
0,Enissa Amani,Ehrenwort,"Live from Hamburg, Iranian-German comedian Eni...",2018,"A NETFLIX ORIGINAL COMEDY SPECIAL\nHey, what’s...",https://scrapsfromtheloft.com/2019/02/19/eniss...
1,George Carlin,Politically Correct Language By,George Carlin's critical thinking on pc langua...,0,"I know I’m a little late with this, but I’d li...",https://scrapsfromtheloft.com/2019/02/18/polit...
2,Ken Jeong,"You Complete Me, Ho",Filmed at the Ice House Comedy Club in Pasaden...,2019,The Hangover… Community… Dr. Ken… Crazy Rich A...,https://scrapsfromtheloft.com/2019/02/17/ken-j...
3,Ray Romano,"Right Here, Around The Corner",Ray Romano cut his stand-up teeth at the Comed...,2019,It’s been 23 years since I did a comedy specia...,https://scrapsfromtheloft.com/2019/02/12/ray-r...
4,Gabriel Iglesias,One Show Fits All,"In a new special from Houston, Gabriel ""Fluffy...",2019,[crowd chanting] Fluffy! Fluffy! [Fluffy shout...,https://scrapsfromtheloft.com/2019/01/30/gabri...


# Save

In [37]:
file = '04_clean_data.csv'
data_directory = os.path.join( '..','data','raw_data/{}'.format(file))

In [38]:
df.to_csv(data_directory,index=False)