In [1]:
import pandas as pd
import pickle
from gensim import matutils, models
import scipy.sparse
from nltk import word_tokenize, pos_tag

import nltk
nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vignesh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
data = pd.read_pickle('dtm.pkl')
data.head()

Unnamed: 0,aaaaah,aaaaahhhhhhh,aaaaauuugghhhhhh,aaaahhhhh,aaah,aah,abc,abcs,ability,abject,...,zealand,zee,zen,zeppelin,zero,zillion,zombie,zombies,zoning,zoo
louis,0,0,0,0,0,3,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
dave,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1
bo,0,1,1,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
bill,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,1,1,1,1,0


In [3]:
tdm = data.transpose()
tdm.head()

Unnamed: 0,louis,dave,ricky,bo,bill,jim,john,hasan,ali,anthony,mike,joe
aaaaah,0,0,0,0,1,0,0,0,0,0,0,0
aaaaahhhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaaauuugghhhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaaahhhhh,0,0,0,1,0,0,0,0,0,0,0,0
aaah,0,1,0,0,0,0,0,0,0,0,0,0


In [4]:
sparse_counts = scipy.sparse.csr_matrix(tdm)

In [5]:
sparse_counts

<7583x12 sparse matrix of type '<class 'numpy.int64'>'
	with 17668 stored elements in Compressed Sparse Row format>

In [6]:
corpus = matutils.Sparse2Corpus(sparse_counts)

In [7]:
corpus

<gensim.matutils.Sparse2Corpus at 0x7fb9697b38d0>

In [8]:
cv = pickle.load(open("cv.pkl", "rb"))
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [9]:
# topic modeling - with all text

In [10]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=2, passes=10)
lda.print_topics()

[(0,
  '0.009*"fucking" + 0.006*"fuck" + 0.006*"yeah" + 0.005*"shit" + 0.005*"voice" + 0.005*"thing" + 0.005*"guy" + 0.005*"say" + 0.005*"the" + 0.004*"going"'),
 (1,
  '0.005*"she" + 0.005*"time" + 0.005*"the" + 0.005*"no" + 0.004*"oh" + 0.004*"shit" + 0.004*"dad" + 0.004*"say" + 0.004*"well" + 0.004*"going"')]

In [11]:
lda = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=3, passes=10)
lda.print_topics()

[(0,
  '0.007*"voice" + 0.006*"she" + 0.005*"no" + 0.005*"say" + 0.005*"time" + 0.005*"dad" + 0.005*"love" + 0.004*"the" + 0.004*"well" + 0.004*"want"'),
 (1,
  '0.010*"fucking" + 0.007*"shit" + 0.007*"yeah" + 0.007*"fuck" + 0.005*"say" + 0.005*"the" + 0.005*"no" + 0.004*"she" + 0.004*"going" + 0.004*"little"'),
 (2,
  '0.008*"fucking" + 0.006*"shit" + 0.006*"fuck" + 0.006*"the" + 0.005*"went" + 0.005*"good" + 0.005*"thing" + 0.005*"time" + 0.004*"man" + 0.004*"cause"')]

In [12]:
# topic modelling - with noun only

In [13]:
def nouns(text):
    """ given a string of text, tokenize the text and pull out only the nouns."""
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)]
    return ' '.join(all_nouns)

In [14]:
data_clean = pd.read_pickle('cleaned_corpus.pkl')
data_clean.head()

Unnamed: 0,transcript
louis,intro fade music let roll hold lights do ...
dave,this dave he tells dirty jokes living that s...
ricky,hello hello how great thank wow calm sh...
bo,bo what old macdonald farm e i e i o and farm...
bill,cheers applause all right thank thank much...


In [15]:
data_nouns = pd.DataFrame(data_clean.transcript.apply(nouns))
data_nouns.head()

Unnamed: 0,transcript
louis,intro fade music let roll lights lights i i pl...
dave,dave jokes work train alchemist fire transform...
ricky,hello wow calm thank i gon tonight money guy r...
bo,macdonald farm e i o farm pig e i i macdonald ...
bill,cheers thank thank thank pleasure georgia area...


In [16]:
add_stop_words = ['like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

In [17]:
cvn = CountVectorizer(stop_words = add_stop_words)
data_cvn = cvn.fit_transform(data_nouns.transcript)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn.head()

Unnamed: 0,aaaaah,aaaahhhhh,aah,abc,ability,abortion,abortions,abuse,accent,accents,...,youth,youtube,yulin,zealand,zee,zeppelin,zero,zillion,zombie,zombies
louis,0,0,2,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
dave,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
bo,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
bill,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1


In [18]:
# creating the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [19]:
ldan = models.LdaModel(corpus=corpusn, id2word=id2wordn, num_topics=2, passes=10)
ldan.print_topics()

[(0,
  '0.028*"voice" + 0.009*"laughter" + 0.007*"dad" + 0.006*"life" + 0.006*"bo" + 0.006*"guy" + 0.006*"man" + 0.005*"repeat" + 0.005*"stuff" + 0.004*"show"'),
 (1,
  '0.011*"thing" + 0.010*"day" + 0.009*"man" + 0.008*"cause" + 0.008*"life" + 0.007*"way" + 0.007*"guy" + 0.007*"shit" + 0.006*"gon" + 0.006*"women"')]

In [20]:
ldan = models.LdaModel(corpus=corpusn, id2word=id2wordn, num_topics=3, passes=10)
ldan.print_topics()

[(0,
  '0.013*"day" + 0.010*"thing" + 0.009*"man" + 0.009*"cause" + 0.007*"house" + 0.007*"women" + 0.007*"way" + 0.006*"life" + 0.006*"things" + 0.006*"lot"'),
 (1,
  '0.016*"voice" + 0.009*"guy" + 0.008*"man" + 0.008*"thing" + 0.007*"way" + 0.007*"day" + 0.006*"shit" + 0.006*"years" + 0.006*"gon" + 0.006*"something"'),
 (2,
  '0.014*"life" + 0.008*"dad" + 0.008*"thing" + 0.007*"laughter" + 0.006*"kids" + 0.006*"way" + 0.005*"mom" + 0.005*"man" + 0.005*"school" + 0.005*"parents"')]

In [21]:
ldan = models.LdaModel(corpus=corpusn, id2word=id2wordn, num_topics=4, passes=10)
ldan.print_topics()

[(0,
  '0.035*"voice" + 0.012*"laughter" + 0.008*"dad" + 0.008*"life" + 0.007*"bo" + 0.007*"guy" + 0.007*"man" + 0.006*"repeat" + 0.006*"stuff" + 0.005*"show"'),
 (1,
  '0.001*"man" + 0.000*"voice" + 0.000*"thing" + 0.000*"day" + 0.000*"life" + 0.000*"cause" + 0.000*"way" + 0.000*"things" + 0.000*"something" + 0.000*"years"'),
 (2,
  '0.012*"thing" + 0.009*"life" + 0.009*"day" + 0.009*"cause" + 0.008*"way" + 0.008*"guy" + 0.007*"years" + 0.007*"gon" + 0.006*"man" + 0.006*"kind"'),
 (3,
  '0.012*"man" + 0.011*"day" + 0.009*"thing" + 0.009*"women" + 0.009*"shit" + 0.008*"cause" + 0.007*"lot" + 0.006*"fuck" + 0.006*"things" + 0.006*"way"')]

In [22]:
# topic modeling - with nouns and adjectives

In [23]:
def nouns_adj(text):
    """given a string of text, tokenize the text and pull out only the nouns and adjectives"""
    is_noun_adj = lambda pos: pos[:2] == 'NN' or pos[:2] == 'JJ'
    tokenized = word_tokenize(text)
    nouns_adj = [word for (word, pos) in pos_tag(tokenized) if is_noun_adj(pos)]
    return ' '.join(nouns_adj)

In [24]:
data_nouns_adj = pd.DataFrame(data_clean.transcript.apply(nouns_adj))
data_nouns_adj.head()

Unnamed: 0,transcript
louis,intro fade music let roll lights lights much i...
dave,dave dirty jokes stare hard work train thought...
ricky,hello great thank wow calm fuck thank welcome ...
bo,old macdonald farm e i i o farm pig e i i old ...
bill,cheers right thank much thank thank pleasure g...


In [51]:
cvna = CountVectorizer(stop_words=stop_words, max_df=.8)
data_cvna = cvna.fit_transform(data_nouns_adj.transcript)
data_dtmna = pd.DataFrame(data_cvna.toarray(), columns=cvna.get_feature_names())
data_dtmna.index = data_nouns_adj.index
data_dtmna.head()

Unnamed: 0,aaaaah,aaaahhhhh,aah,abc,ability,abject,able,ablebodied,abortion,abortions,...,yummy,ze,zealand,zee,zen,zeppelin,zero,zillion,zombie,zombies
louis,0,0,2,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
dave,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
ricky,0,0,0,0,1,1,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
bo,0,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
bill,1,0,0,0,0,0,1,0,0,0,...,1,1,0,0,0,0,0,1,1,1


In [52]:
corpusna = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmna.transpose()))

# creating the vocabulary dictionary
id2wordna = dict((v, k) for k, v in cvna.vocabulary_.items())

In [27]:
ldana = models.LdaModel(corpus=corpusna, num_topics=2, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.005*"dude" + 0.004*"mom" + 0.004*"wife" + 0.003*"joke" + 0.003*"ahah" + 0.003*"jenny" + 0.003*"friend" + 0.003*"parents" + 0.002*"anthony" + 0.002*"son"'),
 (1,
  '0.014*"voice" + 0.005*"audience" + 0.004*"laughter" + 0.004*"bro" + 0.003*"fucking" + 0.003*"joke" + 0.003*"bo" + 0.003*"robotic" + 0.003*"mom" + 0.003*"dude"')]

In [28]:
ldana = models.LdaModel(corpus=corpusna, num_topics=3, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.005*"ahah" + 0.004*"jenny" + 0.004*"laughter" + 0.004*"parents" + 0.003*"hasan" + 0.003*"mom" + 0.003*"door" + 0.002*"minutes" + 0.002*"comedy" + 0.002*"friend"'),
 (1,
  '0.013*"voice" + 0.006*"dude" + 0.005*"joke" + 0.005*"mom" + 0.004*"bro" + 0.003*"bo" + 0.003*"robotic" + 0.003*"fucking" + 0.003*"wife" + 0.003*"jokes"'),
 (2,
  '0.008*"audience" + 0.007*"guns" + 0.005*"ass" + 0.005*"girlfriend" + 0.005*"gun" + 0.004*"accent" + 0.004*"cunt" + 0.004*"class" + 0.004*"fucking" + 0.004*"son"')]

In [29]:
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=10)
ldana.print_topics()

[(0,
  '0.000*"voice" + 0.000*"dude" + 0.000*"na" + 0.000*"mom" + 0.000*"audience" + 0.000*"ass" + 0.000*"dead" + 0.000*"parents" + 0.000*"dog" + 0.000*"date"'),
 (1,
  '0.007*"ahah" + 0.007*"joke" + 0.005*"son" + 0.005*"audience" + 0.005*"anthony" + 0.005*"guns" + 0.004*"american" + 0.004*"gun" + 0.004*"fucking" + 0.004*"party"'),
 (2,
  '0.015*"voice" + 0.005*"mom" + 0.005*"laughter" + 0.004*"bro" + 0.004*"parents" + 0.003*"dude" + 0.003*"audience" + 0.003*"bo" + 0.003*"robotic" + 0.003*"joke"'),
 (3,
  '0.006*"jenny" + 0.006*"dude" + 0.004*"morning" + 0.003*"gun" + 0.003*"jesus" + 0.003*"dog" + 0.003*"idea" + 0.003*"date" + 0.003*"parents" + 0.003*"andy"')]

In [30]:
# out of all the above topic models , the noun and adjective, 4 topic one made the most sense, lets move on with
# that model

In [53]:
ldana = models.LdaModel(corpus=corpusna, num_topics=4, id2word=id2wordna, passes=70)
ldana.print_topics()

[(0,
  '0.006*"mom" + 0.006*"clinton" + 0.005*"wife" + 0.005*"president" + 0.005*"dude" + 0.004*"cow" + 0.004*"parents" + 0.004*"dick" + 0.003*"stupid" + 0.003*"office"'),
 (1,
  '0.007*"mom" + 0.006*"laughter" + 0.006*"joke" + 0.005*"parents" + 0.005*"anthony" + 0.005*"hasan" + 0.003*"date" + 0.003*"brown" + 0.003*"york" + 0.003*"birthday"'),
 (2,
  '0.006*"dude" + 0.006*"ahah" + 0.006*"jenny" + 0.003*"gay" + 0.003*"wife" + 0.003*"son" + 0.003*"gun" + 0.003*"nigga" + 0.003*"morning" + 0.003*"friend"'),
 (3,
  '0.021*"voice" + 0.006*"audience" + 0.005*"bro" + 0.005*"joke" + 0.005*"bo" + 0.004*"robotic" + 0.004*"um" + 0.004*"repeat" + 0.004*"ass" + 0.003*"eye"')]

In [56]:
# we will settle with the above topics and try to interpret them.
# topic 0: [conversation comedies]
# topic 1: [accident, gun, spirituality]
# topic 2: [teenager, profanity]
# topic 3: [husband, wife, family]

In [57]:
# lets take a look at which topics each transcript contains
corpus_transformed = ldana[corpusna]
list(zip([a for [(a, b)] in corpus_transformed], data_dtmna.index))

[(1, 'louis'),
 (2, 'dave'),
 (3, 'ricky'),
 (3, 'bo'),
 (2, 'bill'),
 (3, 'jim'),
 (0, 'john'),
 (1, 'hasan'),
 (3, 'ali'),
 (1, 'anthony'),
 (2, 'mike'),
 (0, 'joe')]

In [58]:
# from our first pass of LDA, we come to following conclusion
# topic 0: [conversation comedies] -> (john, joe)
# topic 1: [accident, gun, spirituality] -> (louis, hasan, anthony)
# topic 2: [teenager, profanity] -> (dave, bill, mike)
# topic 3: [husband, wife, family] -> (ricky, bo, jim, ali)