In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import re

## Data path

In [2]:
# raw_data
file = '03_data_merge_OCT_19.csv'
data_directory = os.path.join( '..','data','raw_data/{}'.format(file))

In [3]:
df = pd.read_csv(data_directory)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284 entries, 0 to 283
Data columns (total 5 columns):
name          284 non-null object
raw_title     284 non-null object
summary       284 non-null object
transcript    284 non-null object
link          284 non-null object
dtypes: object(5)
memory usage: 11.2+ KB


In [5]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link
0,Nikki Glaser,Nikki Glaser: Bangin’ (2019) – Full Transcript,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,A NETFLIX ORIGINAL COMEDY SPECIAL\nThank you! ...,https://scrapsfromtheloft.com/2019/10/04/nikki...
1,Ryan Hamilton,Ryan Hamilton Stand-Up – The Tonight Show Star...,Ryan Hamilton makes his Tonight Show debut wit...,"Published on Sep 11, 2019\nThe Tonight Show St...",https://scrapsfromtheloft.com/2019/09/25/ryan-...
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up – The Tonight Show Starr...,Mark Normand is back on The Tonight Show with ...,"Aired on September 19, 2019\nHey, hey. All rig...",https://scrapsfromtheloft.com/2019/09/21/mark-...
3,George Carlin,George Carlin: Dumb Americans (2006) – Full Tr...,"Full transcript of George Carlin's ""Dumb Ameri...",From Life Is Worth Losing\nRecorded on Novembe...,https://scrapsfromtheloft.com/2019/09/12/georg...
4,Bill Burr,Bill Burr: Paper Tiger (2019) – Full Transcript,"Only a few weeks after Dave Chappelle's ""Stick...","Recorded Live at the Royal Albert Hall, London...",https://scrapsfromtheloft.com/2019/09/10/bill-...


# Clean
---

## YEAR
Extract year of the comedy special from `raw_title` column

In [6]:
def get_year(data):
    ''' return year'''
    patt = r'(\d{4})' # ####
    text = data.raw_title 
    pattern = re.compile(patt) 
    matches = pattern.findall(text) 
    if len(matches) == 0: #matches = 0 
        return 0 
    else: 
        value = matches[0] 
        return value

In [7]:
df['year'] = df.apply(get_year,axis=1)

In [10]:
df.year.value_counts()[:10]

2018    39
2017    38
2016    24
2019    22
2015    19
0       13
2013    13
2014    13
2012     9
2010     9
Name: year, dtype: int64

In [8]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link,year
0,Nikki Glaser,Nikki Glaser: Bangin’ (2019) – Full Transcript,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,A NETFLIX ORIGINAL COMEDY SPECIAL\nThank you! ...,https://scrapsfromtheloft.com/2019/10/04/nikki...,2019
1,Ryan Hamilton,Ryan Hamilton Stand-Up – The Tonight Show Star...,Ryan Hamilton makes his Tonight Show debut wit...,"Published on Sep 11, 2019\nThe Tonight Show St...",https://scrapsfromtheloft.com/2019/09/25/ryan-...,2019
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up – The Tonight Show Starr...,Mark Normand is back on The Tonight Show with ...,"Aired on September 19, 2019\nHey, hey. All rig...",https://scrapsfromtheloft.com/2019/09/21/mark-...,2019
3,George Carlin,George Carlin: Dumb Americans (2006) – Full Tr...,"Full transcript of George Carlin's ""Dumb Ameri...",From Life Is Worth Losing\nRecorded on Novembe...,https://scrapsfromtheloft.com/2019/09/12/georg...,2006
4,Bill Burr,Bill Burr: Paper Tiger (2019) – Full Transcript,"Only a few weeks after Dave Chappelle's ""Stick...","Recorded Live at the Royal Albert Hall, London...",https://scrapsfromtheloft.com/2019/09/10/bill-...,2019


## Remove name from `raw_title`
`raw_title` contains name of the comedian, since we have a column that already does that, lets remove the comedian name from column
- crate a list of unique names from column `name
- fix casing on `raw_title` to match our list of names
- remove name from `raw_title` if its in our name list

In [11]:
# create list of names
names = list(df.name.unique())

In [12]:
# fix casing 
df.raw_title = df.raw_title.str.title()

In [13]:
len(names)

133

In [14]:
names[:10]

['Nikki Glaser',
 'Ryan Hamilton',
 'Mark Normandthe Tonight Show Starring Jimmy Fallon',
 'George Carlin',
 'Bill Burr',
 'Dave Chappelle',
 'Amazon Rainforestclimate Changedeforestationhasan Minhajjair Bolsonaropatriot Act With Hasan Minhajsônia Guajajara',
 'Emily Heller',
 'David Cross',
 'Kevin Hart']

In [15]:
df.raw_title[:10]

0       Nikki Glaser: Bangin’ (2019) – Full Transcript
1    Ryan Hamilton Stand-Up – The Tonight Show Star...
2    Mark Normand Stand-Up – The Tonight Show Starr...
3    George Carlin: Dumb Americans (2006) – Full Tr...
4      Bill Burr: Paper Tiger (2019) – Full Transcript
5    Dave Chappelle: Sticks And Stones | Epilogue: ...
6    Brazil, Corruption And The Amazon Rainforest |...
7    Dave Chappelle: Sticks & Stones (2019) – Full ...
8    Emily Heller: Ice Thickeners (2019) – Full Tra...
9     David Cross: Oh Come On (2019) – Full Transcript
Name: raw_title, dtype: object

In [16]:
df['title_clean'] = df['raw_title'].str.replace('|'.join(names), '',case=False)

In [17]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link,year,title_clean
0,Nikki Glaser,Nikki Glaser: Bangin’ (2019) – Full Transcript,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,A NETFLIX ORIGINAL COMEDY SPECIAL\nThank you! ...,https://scrapsfromtheloft.com/2019/10/04/nikki...,2019,: Bangin’ (2019) – Full Transcript
1,Ryan Hamilton,Ryan Hamilton Stand-Up – The Tonight Show Star...,Ryan Hamilton makes his Tonight Show debut wit...,"Published on Sep 11, 2019\nThe Tonight Show St...",https://scrapsfromtheloft.com/2019/09/25/ryan-...,2019,Stand-Up – The Tonight Show Starring Jimmy Fa...
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up – The Tonight Show Starr...,Mark Normand is back on The Tonight Show with ...,"Aired on September 19, 2019\nHey, hey. All rig...",https://scrapsfromtheloft.com/2019/09/21/mark-...,2019,Mark Normand Stand-Up – The Tonight Show Starr...
3,George Carlin,George Carlin: Dumb Americans (2006) – Full Tr...,"Full transcript of George Carlin's ""Dumb Ameri...",From Life Is Worth Losing\nRecorded on Novembe...,https://scrapsfromtheloft.com/2019/09/12/georg...,2006,: Dumb Americans (2006) – Full Transcript
4,Bill Burr,Bill Burr: Paper Tiger (2019) – Full Transcript,"Only a few weeks after Dave Chappelle's ""Stick...","Recorded Live at the Royal Albert Hall, London...",https://scrapsfromtheloft.com/2019/09/10/bill-...,2019,: Paper Tiger (2019) – Full Transcript


In [19]:
df.name.value_counts()[:10]

George Carlin     22
Dave Chappelle    12
Louis C.K.        10
Jim Jefferies      8
Ricky Gervais      7
Bill Burr          7
Chris Rock         6
Richard Pryor      5
Kevin Hart         5
Doug Stanhope      5
Name: name, dtype: int64

## remove `full_transcript`
- `title_clean` contains the  several versions of the string `full_transcipt`
- remove it by replacing it with an empty string

In [20]:
df['title_clean'] = df.title_clean.str.replace('– Full Transcript|Full Transcript|Transcript','',case=False)

In [21]:
df.title_clean.head()

0                                    : Bangin’ (2019) 
1     Stand-Up – The Tonight Show Starring Jimmy Fa...
2    Mark Normand Stand-Up – The Tonight Show Starr...
3                             : Dumb Americans (2006) 
4                                : Paper Tiger (2019) 
Name: title_clean, dtype: object

## Remove year
- since we extracted the year already,we can remove it
- regex extract 4 digits incase in parenthesis
- `(####)`

In [22]:
#yr_patt = r'\((.*?)\)'
regex_pat = re.compile(r'\((.*?)\)', flags=re.IGNORECASE)

df['title_clean'] = df.title_clean.str.replace(regex_pat, '')

In [23]:
df.title_clean.head()

0                                          : Bangin’  
1     Stand-Up – The Tonight Show Starring Jimmy Fa...
2    Mark Normand Stand-Up – The Tonight Show Starr...
3                                   : Dumb Americans  
4                                      : Paper Tiger  
Name: title_clean, dtype: object

In [None]:
#.title_03.str.title()
#df.title_03.str.split(':',expand=True)

# Remove random chars
- `:`
- `-`


In [24]:
df['title_clean'] = df.title_clean.str.replace(':|–','')

In [25]:
df.title_clean.head()

0                                            Bangin’  
1     Stand-Up  The Tonight Show Starring Jimmy Fal...
2    Mark Normand Stand-Up  The Tonight Show Starri...
3                                     Dumb Americans  
4                                        Paper Tiger  
Name: title_clean, dtype: object

## REmove brackets
- [###]

In [26]:
#yr_patt = r'\((.*?)\)'
regex_pat = re.compile(r'\[(.*?)\]', flags=re.IGNORECASE)

df['title_clean'] = df.title_clean.str.replace(regex_pat, '')

In [27]:
df.title_clean.head()

0                                            Bangin’  
1     Stand-Up  The Tonight Show Starring Jimmy Fal...
2    Mark Normand Stand-Up  The Tonight Show Starri...
3                                     Dumb Americans  
4                                        Paper Tiger  
Name: title_clean, dtype: object

## Rename 
- rename `Saturday Night Live`  to `SNL`

In [28]:
df['title_clean'] = df.title_clean.str.replace('Saturday Night Live|Snl','SNL',case=False)

In [29]:
df.title_clean.head()

0                                            Bangin’  
1     Stand-Up  The Tonight Show Starring Jimmy Fal...
2    Mark Normand Stand-Up  The Tonight Show Starri...
3                                     Dumb Americans  
4                                        Paper Tiger  
Name: title_clean, dtype: object

## REmove names
- Some names could not be removed from the title column
- have to do it "manually"

In [30]:
names_other = [
    'Gabriel “Fluffy” Iglesias',
    '& Larry The Cable Guy',
    'David Chappelle',
    'Patrice O’Neal',
]
df['title_clean'] = df['title_clean'].str.replace('|'.join(names_other), '',case=False)

In [31]:
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link,year,title_clean
0,Nikki Glaser,Nikki Glaser: Bangin’ (2019) – Full Transcript,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,A NETFLIX ORIGINAL COMEDY SPECIAL\nThank you! ...,https://scrapsfromtheloft.com/2019/10/04/nikki...,2019,Bangin’
1,Ryan Hamilton,Ryan Hamilton Stand-Up – The Tonight Show Star...,Ryan Hamilton makes his Tonight Show debut wit...,"Published on Sep 11, 2019\nThe Tonight Show St...",https://scrapsfromtheloft.com/2019/09/25/ryan-...,2019,Stand-Up The Tonight Show Starring Jimmy Fal...
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up – The Tonight Show Starr...,Mark Normand is back on The Tonight Show with ...,"Aired on September 19, 2019\nHey, hey. All rig...",https://scrapsfromtheloft.com/2019/09/21/mark-...,2019,Mark Normand Stand-Up The Tonight Show Starri...
3,George Carlin,George Carlin: Dumb Americans (2006) – Full Tr...,"Full transcript of George Carlin's ""Dumb Ameri...",From Life Is Worth Losing\nRecorded on Novembe...,https://scrapsfromtheloft.com/2019/09/12/georg...,2006,Dumb Americans
4,Bill Burr,Bill Burr: Paper Tiger (2019) – Full Transcript,"Only a few weeks after Dave Chappelle's ""Stick...","Recorded Live at the Royal Albert Hall, London...",https://scrapsfromtheloft.com/2019/09/10/bill-...,2019,Paper Tiger


In [33]:
df.title_clean.unique()[:10]

array([' Bangin’  ',
       ' Stand-Up  The Tonight Show Starring Jimmy Fallon  ',
       'Mark Normand Stand-Up  The Tonight Show Starring Jimmy Fallon  ',
       ' Dumb Americans  ', ' Paper Tiger  ',
       ' Sticks And Stones | Epilogue The Punchline ',
       'Brazil, Corruption And The Amazon Rainforest | Patriot Act With  ',
       ' Sticks & Stones  ', ' Ice Thickeners  ', ' Oh Come On  '],
      dtype=object)

In [34]:
# remove any whitespace from title
df['title_clean'] = df['title_clean'].str.strip()

In [36]:
df.title_clean.unique()[:10]

array(['Bangin’', 'Stand-Up  The Tonight Show Starring Jimmy Fallon',
       'Mark Normand Stand-Up  The Tonight Show Starring Jimmy Fallon',
       'Dumb Americans', 'Paper Tiger',
       'Sticks And Stones | Epilogue The Punchline',
       'Brazil, Corruption And The Amazon Rainforest | Patriot Act With',
       'Sticks & Stones', 'Ice Thickeners', 'Oh Come On'], dtype=object)

## Drop unused columns
- only select our final 6 columns

In [37]:
## Select columns
df = df[['name','title_clean','summary','year','transcript','link']]

In [38]:
df.head()

Unnamed: 0,name,title_clean,summary,year,transcript,link
0,Nikki Glaser,Bangin’,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,2019,A NETFLIX ORIGINAL COMEDY SPECIAL\nThank you! ...,https://scrapsfromtheloft.com/2019/10/04/nikki...
1,Ryan Hamilton,Stand-Up The Tonight Show Starring Jimmy Fallon,Ryan Hamilton makes his Tonight Show debut wit...,2019,"Published on Sep 11, 2019\nThe Tonight Show St...",https://scrapsfromtheloft.com/2019/09/25/ryan-...
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up The Tonight Show Starri...,Mark Normand is back on The Tonight Show with ...,2019,"Aired on September 19, 2019\nHey, hey. All rig...",https://scrapsfromtheloft.com/2019/09/21/mark-...
3,George Carlin,Dumb Americans,"Full transcript of George Carlin's ""Dumb Ameri...",2006,From Life Is Worth Losing\nRecorded on Novembe...,https://scrapsfromtheloft.com/2019/09/12/georg...
4,Bill Burr,Paper Tiger,"Only a few weeks after Dave Chappelle's ""Stick...",2019,"Recorded Live at the Royal Albert Hall, London...",https://scrapsfromtheloft.com/2019/09/10/bill-...


In [39]:
# rename col
df.rename(columns={"title_clean": "title"},inplace=True)

In [40]:
df.head()

Unnamed: 0,name,title,summary,year,transcript,link
0,Nikki Glaser,Bangin’,A NETFLIX ORIGINAL COMEDY SPECIAL Thank you! T...,2019,A NETFLIX ORIGINAL COMEDY SPECIAL\nThank you! ...,https://scrapsfromtheloft.com/2019/10/04/nikki...
1,Ryan Hamilton,Stand-Up The Tonight Show Starring Jimmy Fallon,Ryan Hamilton makes his Tonight Show debut wit...,2019,"Published on Sep 11, 2019\nThe Tonight Show St...",https://scrapsfromtheloft.com/2019/09/25/ryan-...
2,Mark Normandthe Tonight Show Starring Jimmy Fa...,Mark Normand Stand-Up The Tonight Show Starri...,Mark Normand is back on The Tonight Show with ...,2019,"Aired on September 19, 2019\nHey, hey. All rig...",https://scrapsfromtheloft.com/2019/09/21/mark-...
3,George Carlin,Dumb Americans,"Full transcript of George Carlin's ""Dumb Ameri...",2006,From Life Is Worth Losing\nRecorded on Novembe...,https://scrapsfromtheloft.com/2019/09/12/georg...
4,Bill Burr,Paper Tiger,"Only a few weeks after Dave Chappelle's ""Stick...",2019,"Recorded Live at the Royal Albert Hall, London...",https://scrapsfromtheloft.com/2019/09/10/bill-...


# Save

In [41]:
file = '04_clean_data_OCT_19.csv'
data_directory = os.path.join( '..','data','raw_data/{}'.format(file))

In [42]:
df.to_csv(data_directory,index=False)