In [3]:
import pandas as pd
import string
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
import os
from IPython.core.display import clear_output
plt.style.use('ggplot')
sns.set(rc={'figure.figsize':(10.7,4.27)})

## Load data

In [4]:
# comedy link list
file = 'raw_comedy_05.pkl'
data_directory = os.path.join('..','data','raw_data/{}'.format(file))
df = pd.read_pickle(data_directory)
df.head()

Unnamed: 0,name,raw_title,summary,transcript,link,year,title,len_transcript,dob,sex
0,Louis C.K.,Sincerely Louis Ck (2020),Great comedy is finally back. Louis C.K. is no...,Great comedy is finally back. Louis C.K. is no...,https://scrapsfromtheloft.com/2020/05/02/since...,2020,Sincerely Louis Ck,44135,1967-09-12,Male
1,Louis C.K.,Louis C.K.: Chewed Up (2008) – Full Transcript,"Transcript of 'Chewed Up', Louis C.K.'s second...",Filmed at the Berklee Performance Center in Bo...,https://scrapsfromtheloft.com/2017/07/07/louis...,2008,Chewed Up,40121,1967-09-12,Male
2,Louis C.K.,Louis C.K.: Live At The Beacon Theatre (2011) ...,"Louis jokes about fatherhood, success, and fly...",[indistinct chatter] — Louis! — Louis! [indist...,https://scrapsfromtheloft.com/2017/06/28/louis...,2011,Live At The Beacon Theatre,43930,1967-09-12,Male
3,Louis C.K.,Louis C.K.: Shameless (2007) – Full Transcript,Please welcome Louis C.K.! Thank you. Thank yo...,Please welcome Louis C.K.! Thank you. Thank yo...,https://scrapsfromtheloft.com/2017/05/30/louis...,2007,Shameless,48454,1967-09-12,Male
4,Louis C.K.,Louis C.K.: Oh My God (2013) – Full Transcript,'Oh My God' is the fifth comedy special perfor...,Intro\nFade the music out. Let’s roll. Hold th...,https://scrapsfromtheloft.com/2017/05/06/louis...,2013,Oh My God,39141,1967-09-12,Male


In [5]:
data = df[['name','dob','sex','title','year','transcript']]

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270 entries, 0 to 269
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   name        270 non-null    object        
 1   dob         269 non-null    datetime64[ns]
 2   sex         270 non-null    object        
 3   title       270 non-null    object        
 4   year        270 non-null    object        
 5   transcript  270 non-null    object        
dtypes: datetime64[ns](1), object(5)
memory usage: 14.8+ KB


In [7]:
data.head()

Unnamed: 0,name,dob,sex,title,year,transcript
0,Louis C.K.,1967-09-12,Male,Sincerely Louis Ck,2020,Great comedy is finally back. Louis C.K. is no...
1,Louis C.K.,1967-09-12,Male,Chewed Up,2008,Filmed at the Berklee Performance Center in Bo...
2,Louis C.K.,1967-09-12,Male,Live At The Beacon Theatre,2011,[indistinct chatter] — Louis! — Louis! [indist...
3,Louis C.K.,1967-09-12,Male,Shameless,2007,Please welcome Louis C.K.! Thank you. Thank yo...
4,Louis C.K.,1967-09-12,Male,Oh My God,2013,Intro\nFade the music out. Let’s roll. Hold th...


## English only

In [8]:
data.title.value_counts()[:10]

Live                                             3
Does This Need To Be Said?                       1
Totally Committed                                1
Lower Classy                                     1
Live At Webster Hall                             1
Unveiled                                         1
Jamming In New York   Testo Italiano Completo    1
Nanette                                          1
I Be Knowin’                                     1
Latin History For Morons ’S Road To Broadway     1
Name: title, dtype: int64

In [9]:
filter_data = data.title.str.contains('Italiano')
data[filter_data]

Unnamed: 0,name,dob,sex,title,year,transcript
57,Bill Burr,1968-06-10,Male,Why Do I Do This Testo Italiano Completo,2008,"Va bene, grazie, grazie mille. Va bene, Gesù. ..."
97,George Carlin,1937-05-12,Male,Jamming In New York Testo Italiano Completo,1992,"Ciao, grazie. Grazie. Grazie. Grazie molte. Gr..."
98,George Carlin,1937-05-12,Male,You Are All Diseased Testo Italiano Completo,1999,"Siete gentili, grazie! Grazie mille, lo apprez..."
99,George Carlin,1937-05-12,Male,It’S Bad For Ya! Testo Italiano Completo,2008,"Grazie, grazie, grazie! Mi piacerebbe iniziare..."


In [10]:
data = data[~filter_data]

In [11]:
data[filter_data]

  """Entry point for launching an IPython kernel.


Unnamed: 0,name,dob,sex,title,year,transcript


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 266 entries, 0 to 269
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   name        266 non-null    object        
 1   dob         265 non-null    datetime64[ns]
 2   sex         266 non-null    object        
 3   title       266 non-null    object        
 4   year        266 non-null    object        
 5   transcript  266 non-null    object        
dtypes: datetime64[ns](1), object(5)
memory usage: 14.5+ KB


## Clean Text

In [13]:
def clean_text(data):
    #print(type(data))
    text = data.lower()
    text = re.sub('\[.*?\]', '', text)  # remove brackets & text inside
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuations
    text = re.sub('\w*\d\w*', '', text)  # remove digits surounded by words
    text = re.sub('[‘’“”…]', '', text)  # remove quotes & elipses
    text = re.sub('\n', ' ', text)  # remove new line
    text = re.sub('\t', ' ', text)  # remove new line
    #text = re.sub(string.whitespace, '', text)  # remove \n\t..
    return text

In [14]:
%%time
data['clean_text'] = data.transcript.apply(clean_text)

CPU times: user 1.75 s, sys: 78.1 ms, total: 1.83 s
Wall time: 1.83 s


In [15]:
data.head()

Unnamed: 0,name,dob,sex,title,year,transcript,clean_text
0,Louis C.K.,1967-09-12,Male,Sincerely Louis Ck,2020,Great comedy is finally back. Louis C.K. is no...,great comedy is finally back louis ck is now a...
1,Louis C.K.,1967-09-12,Male,Chewed Up,2008,Filmed at the Berklee Performance Center in Bo...,filmed at the berklee performance center in bo...
2,Louis C.K.,1967-09-12,Male,Live At The Beacon Theatre,2011,[indistinct chatter] — Louis! — Louis! [indist...,— louis — louis alright lets get started go...
3,Louis C.K.,1967-09-12,Male,Shameless,2007,Please welcome Louis C.K.! Thank you. Thank yo...,please welcome louis ck thank you thank you th...
4,Louis C.K.,1967-09-12,Male,Oh My God,2013,Intro\nFade the music out. Let’s roll. Hold th...,intro fade the music out lets roll hold there ...


In [16]:
# drop transcript
data.drop(['transcript'],axis=1,inplace=True)

In [17]:
data.head()

Unnamed: 0,name,dob,sex,title,year,clean_text
0,Louis C.K.,1967-09-12,Male,Sincerely Louis Ck,2020,great comedy is finally back louis ck is now a...
1,Louis C.K.,1967-09-12,Male,Chewed Up,2008,filmed at the berklee performance center in bo...
2,Louis C.K.,1967-09-12,Male,Live At The Beacon Theatre,2011,— louis — louis alright lets get started go...
3,Louis C.K.,1967-09-12,Male,Shameless,2007,please welcome louis ck thank you thank you th...
4,Louis C.K.,1967-09-12,Male,Oh My God,2013,intro fade the music out lets roll hold there ...


## Save

In [18]:
data_directory_saves = os.path.join( '..','data','raw_data/')
data.to_pickle(data_directory_saves+'raw_comedy_06.pkl')