In [10]:
import requests 
from bs4 import BeautifulSoup
import pickle
import pandas as pd
import re
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

pd.set_option('max_colwidth', 200)
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/vignesh/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [2]:
# scraping the transcript data
def url_to_transcript(url):
    ''' returns the transcript from the given url '''
    page = requests.get(url).text
    soup = BeautifulSoup(page, 'lxml')
    text = [p.text for p in soup.find(class_="post-content").find_all('p')]
    print(url)
    return text

In [3]:
# urls of transcripts in scope
urls = ['http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/',
        'http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/',
        'http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/',
        'http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/',
        'http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-2016-full-transcript/']


In [2]:
# comedian_names
comedians = ['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe']

In [9]:
transcripts = [url_to_transcript(url) for url in urls]

http://scrapsfromtheloft.com/2017/05/06/louis-ck-oh-my-god-full-transcript/
http://scrapsfromtheloft.com/2017/04/11/dave-chappelle-age-spin-2017-full-transcript/
http://scrapsfromtheloft.com/2018/03/15/ricky-gervais-humanity-transcript/
http://scrapsfromtheloft.com/2017/08/07/bo-burnham-2013-full-transcript/
http://scrapsfromtheloft.com/2017/05/24/bill-burr-im-sorry-feel-way-2014-full-transcript/
http://scrapsfromtheloft.com/2017/04/21/jim-jefferies-bare-2014-full-transcript/
http://scrapsfromtheloft.com/2017/08/02/john-mulaney-comeback-kid-2015-full-transcript/
http://scrapsfromtheloft.com/2017/10/21/hasan-minhaj-homecoming-king-2017-full-transcript/
http://scrapsfromtheloft.com/2017/09/19/ali-wong-baby-cobra-2016-full-transcript/
http://scrapsfromtheloft.com/2017/08/03/anthony-jeselnik-thoughts-prayers-2015-full-transcript/
http://scrapsfromtheloft.com/2018/03/03/mike-birbiglia-my-girlfriends-boyfriend-2013-full-transcript/
http://scrapsfromtheloft.com/2017/08/19/joe-rogan-triggered-

In [12]:
# pickle the raw content for later use
!mkdir transcripts

for i, c in enumerate(comedians):
    with open('transcripts/' + c + '.txt', 'wb') as file:
        pickle.dump(transcripts[i], file)

In [3]:
raw_data = {}
for i, c in enumerate(comedians):
    with open('transcripts/' + c + '.txt', 'rb') as file:
        raw_data[c] = pickle.load(file)

In [7]:
raw_data.keys()

dict_keys(['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe'])

In [6]:
raw_data['louis'][:2]

['Intro\nFade the music out. Let’s roll. Hold there. Lights. Do the lights. Thank you. Thank you very much. I appreciate that. I don’t necessarily agree with you, but I appreciate very much. Well, this is a nice place. This is easily the nicest place For many miles in every direction. That’s how you compliment a building And shit on a town with one sentence. It is odd around here, as I was driving here. There doesn’t seem to be any difference Between the sidewalk and the street for pedestrians here. People just kind of walk in the middle of the road. I love traveling And seeing all the different parts of the country. I live in New York. I live in a– There’s no value to your doing that at all.',
 '“The Old Lady And The Dog”\nI live– I live in New York. I always– Like, there’s this old lady in my neighborhood, And she’s always walking her dog. She’s always just– she’s very old. She just stands there just being old, And the dog just fights gravity every day, just– The two of them, it’s re

In [8]:
# currently the transcripts are in list format, we will change it to a single content

In [4]:
def combine_text(list_of_text):
    ''' takes a list of text and combines them into one large chunk of text'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [5]:
text_combined_data = { key : combine_text(value) for (key, value) in raw_data.items()}

In [13]:
text_combined_data.keys()

dict_keys(['louis', 'dave', 'ricky', 'bo', 'bill', 'jim', 'john', 'hasan', 'ali', 'anthony', 'mike', 'joe'])

In [24]:
text_combined_data['louis']

'Intro\nFade the music out. Let’s roll. Hold there. Lights. Do the lights. Thank you. Thank you very much. I appreciate that. I don’t necessarily agree with you, but I appreciate very much. Well, this is a nice place. This is easily the nicest place For many miles in every direction. That’s how you compliment a building And shit on a town with one sentence. It is odd around here, as I was driving here. There doesn’t seem to be any difference Between the sidewalk and the street for pedestrians here. People just kind of walk in the middle of the road. I love traveling And seeing all the different parts of the country. I live in New York. I live in a– There’s no value to your doing that at all. “The Old Lady And The Dog”\nI live– I live in New York. I always– Like, there’s this old lady in my neighborhood, And she’s always walking her dog. She’s always just– she’s very old. She just stands there just being old, And the dog just fights gravity every day, just– The two of them, it’s really–

In [6]:
# loading the data into a pandas dataframe
data_df = pd.DataFrame.from_dict(text_combined_data, orient='index', columns=['transcript'])
data_df.head()

Unnamed: 0,transcript
louis,"Intro\nFade the music out. Let’s roll. Hold there. Lights. Do the lights. Thank you. Thank you very much. I appreciate that. I don’t necessarily agree with you, but I appreciate very much. Well, t..."
dave,"This is Dave. He tells dirty jokes for a living. That stare is where most of his hard work happens. It signifies a profound train of thought, the alchemist’s fire that transforms fear and tragedy ..."
ricky,"Hello. Hello! How you doing? Great. Thank you. Wow. Calm down. Shut the fuck up. Thank you. What a lovely welcome. I’m gonna try my hardest tonight. You’re thinking, “Relax, we’ve had our money’s ..."
bo,Bo What? Old MacDonald had a farm E I E I O And on that farm he had a pig E I E I O Here a snort There a Old MacDonald had a farm E I E I O [Applause] This is Bo Burnham. He’s 22 years old. He’s a...
bill,"[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a pleasure to be here in the greater Atlanta, Georgi..."


In [9]:
# checking the transcript of a comedian
data_df.transcript.loc['ali']

"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have to get this shit over with, ’cause I have to pee in, like, ten minutes. But thank you, everybody, so much for coming. Um… It’s a very exciting day for me. It’s been a very exciting year for me. I turned 33 this year. Yes! Thank you, five people. I appreciate that. Uh, I can tell that I’m getting older, because, now, when I see an 18-year-old girl, my automatic thought… is “Fuck you.” “Fuck you. I don’t even know you, but fuck you!” ‘Cause I’m straight up jealous. I’m jealous, first and foremost, of their metabolism. Because 18-year-old girls, they could just eat like shit, and then they take a shit and have a six-pack, right? They got that-that beautiful inner thigh clearance where they put their feet together and there’s that huge gap here with the light of potential just radiating through.\nAnd then, when they go to sleep, they just go to 

In [7]:
stop_words = set(nltk.corpus.stopwords.words("english"))

lemmatizer = nltk.stem.WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()

In [8]:
# applying text cleaning techniques.

def clean_text(text):
    
    # replace the \n character.
    text = re.sub('\n', ' ', text)
    
    # tokenize the words
    words = nltk.word_tokenize(text)
    
    # remove stop words
    words_ = [word for word in words if word not in stop_words]
    text = ' '.join(words_)
    
    # remove all punctuations
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    
    # remove characters with digits
    text = re.sub('\w*\d+\w*', '', text)
    
    # remove curly quotes, dash and triple dot special characters
    text = re.sub('[“”‘’…–]', '', text)
    
    text = text.lower()
    
    text = re.sub('[^a-z ]', '', text)
    
    # apply stemming
    text = stemmer.stem(lemmatizer.lemmatize(text, pos='v'))
    
    return text

In [37]:
clean_text(data_df.transcript.loc['ali'])

'ladies gentlemen  please welcome stage  ali wong  hi  hello  welcome  thank  thank coming  hello  hello  we gon na get shit   cause i pee  like  ten minutes  but thank  everybody  much coming  um it  exciting day  it  exciting year  i turned  year  yes  thank  five people  i appreciate  uh  i tell i  getting older    i see  girl  automatic thought  fuck you   fuck  i  even know  fuck    cause i  straight jealous  i  jealous  first foremost  metabolism  because  girls  could eat like shit  take shit sixpack  right  they got thatthat beautiful inner thigh clearance put feet together  huge gap light potential radiating  and  go sleep  go sleep  right  they  insomnia yet  they  know  like take ambien download meditation oasis podcast calm chatter regret resentment towards family cluttering mind  they whole lives ahead  they  hpv yet  they go sleep peace night  everybody hpv  ok  everybody  it  ok come already  everybody  if  yet  go get  you go get  it  coming  you  hpv yet   fucking lose

In [34]:
data_df.transcript.loc['ali']

"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have to get this shit over with, ’cause I have to pee in, like, ten minutes. But thank you, everybody, so much for coming. Um… It’s a very exciting day for me. It’s been a very exciting year for me. I turned 33 this year. Yes! Thank you, five people. I appreciate that. Uh, I can tell that I’m getting older, because, now, when I see an 18-year-old girl, my automatic thought… is “Fuck you.” “Fuck you. I don’t even know you, but fuck you!” ‘Cause I’m straight up jealous. I’m jealous, first and foremost, of their metabolism. Because 18-year-old girls, they could just eat like shit, and then they take a shit and have a six-pack, right? They got that-that beautiful inner thigh clearance where they put their feet together and there’s that huge gap here with the light of potential just radiating through.\nAnd then, when they go to sleep, they just go to 

In [11]:
cleaned_data = pd.DataFrame(data_df['transcript'].apply(clean_text))

In [12]:
cleaned_data

Unnamed: 0,transcript
louis,intro fade music let roll hold lights do lights thank thank much i appreciate i necessarily agree i appreciate much well nice place this easily nicest place for many miles every dire...
dave,this dave he tells dirty jokes living that stare hard work happens it signifies profound train thought alchemist fire transforms fear tragedy levity livelihood dave calls look trance play...
ricky,hello hello how great thank wow calm shut fuck thank what lovely welcome i gon na try hardest tonight you thinking relax money worth seeing you what you legend shut what i ...
bo,bo what old macdonald farm e i e i o and farm pig e i e i o here snort there old macdonald farm e i e i o applause this bo burnham he years old he male and looks like genetic product gira...
bill,cheers applause all right thank thank much thank thank thank how what going thank it pleasure greater atlanta georgia area oasis it nice i know i came june it nice wasn thi...
jim,car horn honks audience cheering announcer ladies gentlemen please welcome stage mr jim jefferies upbeat music playing hello sit sit sit sit sit chuckles thank boston i appreci...
john,all right petunia wish luck you die august that pretty good all right hello hello chicago nice see thank that nice thank look wonderful crowd i need keep energy entire show o...
hasan,theme music orchestral hiphop crowd roars what davis i home i bring back netflix said where want special la chicago new york i like nah son davis california cheering who...
ali,ladies gentlemen please welcome stage ali wong hi hello welcome thank thank coming hello hello we gon na get shit cause i pee like ten minutes but thank everybody much coming um ...
anthony,thank thank thank san francisco thank much so good people surprised i told em i gon na tape special san francisco said why would that politically correct city world not i stage i ...


In [13]:
full_names = ['Ali Wong', 'Anthony Jeselnik', 'Bill Burr', 'Bo Burnham', 'Dave Chappelle', 'Hasan Minhaj',
              'Jim Jefferies', 'Joe Rogan', 'John Mulaney', 'Louis C.K.', 'Mike Birbiglia', 'Ricky Gervais']

In [14]:
data_df = data_df.sort_index()
data_df['full_name'] = full_names
data_df.head()

Unnamed: 0,transcript,full_name
ali,"Ladies and gentlemen, please welcome to the stage: Ali Wong! Hi. Hello! Welcome! Thank you! Thank you for coming. Hello! Hello. We are gonna have to get this shit over with, ’cause I have to pee i...",Ali Wong
anthony,"Thank you. Thank you. Thank you, San Francisco. Thank you so much. So good to be here. People were surprised when I told ’em I was gonna tape my special in San Francisco. Said, “Why would you do t...",Anthony Jeselnik
bill,"[cheers and applause] All right, thank you! Thank you very much! Thank you. Thank you. Thank you. How are you? What’s going on? Thank you. It’s a pleasure to be here in the greater Atlanta, Georgi...",Bill Burr
bo,Bo What? Old MacDonald had a farm E I E I O And on that farm he had a pig E I E I O Here a snort There a Old MacDonald had a farm E I E I O [Applause] This is Bo Burnham. He’s 22 years old. He’s a...,Bo Burnham
dave,"This is Dave. He tells dirty jokes for a living. That stare is where most of his hard work happens. It signifies a profound train of thought, the alchemist’s fire that transforms fear and tragedy ...",Dave Chappelle


In [15]:
data_df.to_pickle("uncleaned_corpus.pkl")

In [16]:
cv = CountVectorizer()
data_cv = cv.fit_transform(cleaned_data['transcript'])
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = cleaned_data.index
data_dtm

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zealand,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,1,1,1,0
jim,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
john,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
hasan,0,0,0,0,0,0,0,0,0,0,...,0,2,1,0,1,0,0,0,0,0
ali,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
anthony,0,0,0,0,0,0,0,0,0,0,...,10,0,0,0,0,0,0,0,0,0


In [17]:
# lets do another step of text cleaning by finding the most commonly used words by each comedian to identify and remove
# the words that are common among all comedians

In [18]:
data_tdm = data_dtm.transpose()
data_tdm.head()

Unnamed: 0,louis,dave,ricky,bo,bill,jim,john,hasan,ali,anthony,mike,joe
aaaaah,0,0,0,0,1,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,1,0,0,0,0,0,0,0,0,0,0


In [19]:
# Find the top 30 words said by each comedian
top_dict = {}
for c in data_tdm.columns:
    top = data_tdm[c].sort_values(ascending=False).head(30)
    top_dict[c] = list(zip(top.index, top.values))

In [37]:
top_dict.items()

dict_items([('ali', [('like', 126), ('and', 81), ('know', 68), ('you', 56), ('na', 49), ('it', 45), ('got', 40), ('get', 40), ('they', 38), ('that', 36), ('shit', 34), ('gon', 28), ('we', 27), ('ok', 26), ('lot', 24), ('so', 22), ('ta', 21), ('oh', 21), ('see', 21), ('wan', 21), ('would', 21), ('go', 21), ('husband', 20), ('women', 19), ('right', 19), ('time', 19), ('cause', 18), ('day', 17), ('when', 16), ('people', 16)]), ('anthony', [('and', 74), ('you', 63), ('like', 50), ('got', 44), ('it', 39), ('know', 39), ('one', 37), ('but', 36), ('joke', 35), ('get', 34), ('said', 31), ('anthony', 27), ('say', 26), ('day', 26), ('go', 24), ('what', 23), ('that', 23), ('people', 23), ('guys', 22), ('ever', 21), ('my', 21), ('would', 20), ('na', 20), ('see', 19), ('tell', 19), ('grandma', 19), ('thing', 18), ('think', 18), ('no', 18), ('right', 18)]), ('bill', [('like', 200), ('you', 143), ('right', 131), ('know', 99), ('got', 78), ('na', 77), ('gon', 77), ('fucking', 70), ('yeah', 67), ('get'

In [38]:
for comedian, top_words in top_dict.items():
    print(comedian)
    print(', '.join([word for word, count in top_words[0: 14]]))
    print('---')

ali
like, and, know, you, na, it, got, get, they, that, shit, gon, we, ok
---
anthony
and, you, like, got, it, know, one, but, joke, get, said, anthony, say, day
---
bill
like, you, right, know, got, na, gon, fucking, yeah, get, shit, that, go, it
---
bo
voice, you, and, know, guy, like, we, bro, love, bo, it, girl, think, but
---
dave
like, and, know, said, you, he, it, shit, the, people, ahah, time, that, fuck
---
hasan
like, you, and, know, dad, it, we, get, she, go, laughter, so, going, what
---
jim
and, like, you, right, fucking, go, know, went, it, get, that, people, one, day
---
joe
like, people, they, you, and, it, fucking, fuck, na, got, we, gon, that, know
---
john
like, and, know, you, one, it, but, said, we, go, bill, clinton, would, so
---
louis
like, and, you, know, that, get, it, life, people, go, thing, na, one, gon
---
mike
and, like, know, said, one, get, think, well, says, you, it, would, jenny, even
---
ricky
right, and, it, you, like, go, he, that, they, know, but,

In [20]:
words = []
for comedian in data_tdm.columns:
    top = [word for (word, count) in top_dict[comedian]]
    for t in top:
        words.append(t)
        
words

['like',
 'and',
 'you',
 'know',
 'that',
 'get',
 'it',
 'life',
 'people',
 'go',
 'thing',
 'na',
 'one',
 'gon',
 'cause',
 'if',
 'every',
 'but',
 'he',
 'shit',
 'they',
 'would',
 'time',
 'good',
 'tit',
 'there',
 'think',
 'right',
 'really',
 'see',
 'like',
 'and',
 'know',
 'said',
 'you',
 'he',
 'it',
 'shit',
 'the',
 'people',
 'ahah',
 'time',
 'that',
 'fuck',
 'fucking',
 'black',
 'one',
 'what',
 'man',
 'got',
 'would',
 'but',
 'good',
 'see',
 'back',
 'so',
 'get',
 'show',
 'right',
 'they',
 'right',
 'and',
 'it',
 'you',
 'like',
 'go',
 'he',
 'that',
 'they',
 'know',
 'but',
 'said',
 'got',
 'what',
 'yeah',
 'so',
 'fucking',
 'say',
 'she',
 'one',
 'went',
 'we',
 'people',
 'the',
 'no',
 'little',
 'joke',
 'would',
 'get',
 'thing',
 'voice',
 'you',
 'and',
 'know',
 'guy',
 'like',
 'we',
 'bro',
 'love',
 'bo',
 'it',
 'girl',
 'think',
 'but',
 'stuff',
 'robotic',
 'repeat',
 'so',
 'yeah',
 'laughter',
 'want',
 'applause',
 'right',
 'th

In [21]:
Counter(words).most_common()

[('like', 12),
 ('and', 12),
 ('you', 12),
 ('know', 12),
 ('it', 12),
 ('right', 12),
 ('that', 11),
 ('get', 11),
 ('one', 11),
 ('people', 10),
 ('go', 10),
 ('got', 10),
 ('but', 9),
 ('so', 9),
 ('na', 8),
 ('we', 8),
 ('he', 7),
 ('would', 7),
 ('think', 7),
 ('what', 7),
 ('gon', 6),
 ('they', 6),
 ('see', 6),
 ('said', 6),
 ('shit', 5),
 ('time', 5),
 ('fucking', 5),
 ('no', 5),
 ('thing', 4),
 ('cause', 4),
 ('the', 4),
 ('fuck', 4),
 ('yeah', 4),
 ('say', 4),
 ('good', 3),
 ('she', 3),
 ('guy', 3),
 ('want', 3),
 ('this', 3),
 ('day', 3),
 ('there', 2),
 ('really', 2),
 ('man', 2),
 ('back', 2),
 ('went', 2),
 ('little', 2),
 ('joke', 2),
 ('love', 2),
 ('laughter', 2),
 ('applause', 2),
 ('dude', 2),
 ('oh', 2),
 ('going', 2),
 ('goes', 2),
 ('my', 2),
 ('life', 1),
 ('if', 1),
 ('every', 1),
 ('tit', 1),
 ('ahah', 1),
 ('black', 1),
 ('show', 1),
 ('voice', 1),
 ('bro', 1),
 ('bo', 1),
 ('girl', 1),
 ('stuff', 1),
 ('robotic', 1),
 ('repeat', 1),
 ('cos', 1),
 ('eye', 1),
 

In [22]:
# for each word in the list of most common word, if more than half of the comedian have a that word 
# as their common word, we consider that as a stop word

In [23]:
additional_stop_words = [word for word, count in Counter(words).most_common() if count >= 6]
additional_stop_words

['like',
 'and',
 'you',
 'know',
 'it',
 'right',
 'that',
 'get',
 'one',
 'people',
 'go',
 'got',
 'but',
 'so',
 'na',
 'we',
 'he',
 'would',
 'think',
 'what',
 'gon',
 'they',
 'see',
 'said']

In [24]:
cv = CountVectorizer(stop_words=additional_stop_words)
data_cv = cv.fit_transform(cleaned_data.transcript)
data_dtm_stop = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm_stop.index = cleaned_data.index

In [25]:
data_dtm_stop.head()

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zealand,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,1,1,1,0


In [26]:
data_dtm_stop.to_pickle('dtm.pkl')

In [27]:
cleaned_data.to_pickle('cleaned_corpus.pkl')

In [28]:
pickle.dump(cv, open('cv.pkl', 'wb'))