In [1]:
import pickle 

import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

import pandas as pd

import re
import string

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import LatentDirichletAllocation

# Get Data

In [25]:
with open('video_id_transcript_bulk_df_semi_clean.pkl', 'rb') as picklefile:
    df = pickle.load(picklefile)

In [26]:
#with open('video_id_comments_bulk_df_2.pkl', 'rb') as picklefile:
#    df2 = pickle.load(picklefile)

In [27]:
df.head()

Unnamed: 0,url,transcript
0,_5DXs8xxaMU,[{'text': 'yeah I really don't understand why'...
1,DNrMPF3I_bs,"[{'text': '[Music]'}, {'text': 'it's the Unite..."
3,TwN8soCzjPM,[{'text': 'testing one two I don't know if thi...
4,8OMUdYoIJhI,[{'text': 'hello again I'm Walter to continue ...
5,xk_MHfOAfRQ,[{'text': 'oh whoa who are you I'm Julie from ...


In [28]:
len(df)

277

In [29]:
#move to II
#df.reset_index(inplace=True)
df.rename(columns={'url': 'video_id'}, inplace=True)

# Text Preprocessing

In [30]:
#stemmer=LancasterStemmer()
#porter=PorterStemmer()

In [31]:
englishStemmer=SnowballStemmer("english", ignore_stopwords=True)
def stemComment(comment_line):
    token_words=word_tokenize(comment_line)
    stem_line=[]
    for word in token_words:
        stem_line.append(englishStemmer.stem(word))
        stem_line.append(" ")
    return "".join(stem_line)

In [32]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(comment_line):
    token_words=word_tokenize(str(comment_line))
    filtered_comment_line = [w for w in token_words if not w in stop_words] 
    
    return " ".join(filtered_comment_line)

In [33]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [34]:
df['transcript'] = df['transcript'].map(remove_stop_words).map(alphanumeric).map(punc_lower)

In [83]:
#https://stackoverflow.com/questions/29270917/removing-custom-stop-words-form-a-phrase-in-python
my_stop_words_lst = ['music', 'applause', 'text', 'br', 'https', 'http', 'youtu', 'href', 'com', 'video', 'www', 'youtube', 
                    'watch', 'nice', 'don', 'shall', 'virus', 'just', 'corona', 'coronavirus', 'covid',
                     'love', 'like', 
                     'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
                     'yeah', 'right', 'na', 'oh', 'think', 'got', 'say', 'want', 'thing',
                     'hey', 'fuck', 'motherfucker', 'subscribed', 'okay',
                     'subscribe', 'subscriber', 'subscribing', 'subscribers', 'please', 'make', 'thank', 'channel', 
                     'um', ' yo', 'uh'
                    ]

for w in my_stop_words_lst:
    pattern = r'\b'+w+r'\b'
    custom_stop = lambda x: re.sub(pattern, ' ', x)
    df['transcript'] = df['transcript'].map(custom_stop)

In [84]:
#stemSentence('Greetings from UK.. I love your videos' )

In [85]:
df['transcript'] = df['transcript'].map(stemComment)

In [86]:
df.head()

Unnamed: 0,video_id,transcript
0,_5DXs8xxaMU,i realli n t understand everybodi n t follow r...
1,DNrMPF3I_bs,s unit state europ i m greg shapiro american n...
3,TwN8soCzjPM,test one two i n t know gon work be cool i bou...
4,8OMUdYoIJhI,hello i m walter continu seri co vid public se...
5,xk_MHfOAfRQ,whoa i m juli four month futur actual tell s g...


In [87]:
df['transcript'][1]

's unit state europ i m greg shapiro american netherland remind work home that plagu shakespear wrote all king lea i lousi re part limit spread kovat some other as cousin florida hooda would this social distanc birthday is social distanc let kid play social distanc keep distanc ll show meter his club i go hous walk dog buy groceri let dog guard groceri i need guard dog i valuabl toilet paper stuff grow tree peopl s not peopl joke time global pandem i re s i ve stolen bunch peopl s joke yes s time favorit nineteen covet joke start number s lockdown dutch author announc ban public gather of peopl theatr show word show must go is go mani show cancel dutch cinema marqu that read noth case go home re provid essenti servic good for essenti servic hospit worker stay for stay home us aka grandpar call war re being call sit couch do hashtag stay home cour come the horror quarantin tweet shit get real i m give drink month sorri punctuat i m give drink month re drink may suggest quarantin ii s re

# Build Document-Term matrix

In [88]:
vectorizer = CountVectorizer(stop_words='english') 
doc_word = vectorizer.fit_transform(list(df.transcript))
doc_word

<277x12987 sparse matrix of type '<class 'numpy.int64'>'
	with 51739 stored elements in Compressed Sparse Row format>

In [89]:
vectorizer = TfidfVectorizer(stop_words='english') 
doc_word = vectorizer.fit_transform(list(df.transcript))
doc_word

<277x12987 sparse matrix of type '<class 'numpy.float64'>'
	with 51739 stored elements in Compressed Sparse Row format>

In [90]:
pd.DataFrame(doc_word.toarray(), columns=vectorizer.get_feature_names()).shape

(277, 12987)

In [91]:
dtm = pd.DataFrame(doc_word.toarray(), index=df['video_id'], columns=vectorizer.get_feature_names())

# Topic Modeling

In [92]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))


### Latent Semantic Analysis (LSA)

In [93]:
lsa = TruncatedSVD(5)
doc_topic = lsa.fit_transform(dtm)
lsa.explained_variance_ratio_

array([0.02138391, 0.03487296, 0.01344659, 0.01257489, 0.01037169])

In [94]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaaaah,aaaaaw,aaahhh,aaron,aasimm,ab,abacus,abanda,abandon,...,منتننتب,منني,يبخؤن,يسي,يمنن,くじょう,不好意思,法國人,阴茎病毒,ﻻهص
component_1,0.0,0.0,0.0,0.0,0.001,0.0,0.003,0.0,0.001,0.003,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,-0.001,0.0,0.005,-0.0,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_3,-0.0,-0.0,-0.0,-0.0,0.0,-0.0,0.005,-0.0,0.001,-0.006,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_4,-0.0,-0.0,-0.0,-0.0,0.001,-0.0,0.004,0.0,0.001,-0.002,...,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_5,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.002,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
display_topics(lsa, vectorizer.get_feature_names(), 30)


Topic  0
know, peopl, gon, time, ve, realli, come, good, look, ll, work, mean, lot, home, need, day, way, let, laughter, man, stay, talk, guy, toilet, paper, said, hand, tri, tell, world

Topic  1
laughter, fool, obvious, surpri, marfa, myka, plea, check, sub, switch, flip, tintin, visit, subscrib, penc, trump, god, comedi, buddi, defam, buta, vqe, bruno, vato, andrea, lovato, ibaka, hemic, tetherb, americana

Topic  2
toilet, paper, gon, home, stay, guy, man, clean, ass, hand, shit, cuz, hack, kid, touch, need, bore, drink, wipe, school, babi, subscrib, quarantin, ll, plea, groceri, stop, damn, glove, hell

Topic  3
know, man, gon, good, realli, money, mean, ll, codi, stuff, ve, funni, guy, littl, god, better, kind, babi, boy, bad, ta, appreci, someth, tell, comedi, subscrib, time, comedian, resort, happen

Topic  4
stay, home, subscrib, plea, song, wash, comedi, let, isol, money, happi, play, sure, spread, insid, yes, friend, birthday, famili, nto, hand, teacher, mommi, infect, toge

In [96]:
lsa.components_

array([[ 7.97015135e-06,  7.97015135e-06,  7.97015135e-06, ...,
         2.55044843e-04,  7.97015135e-06,  7.97015135e-06],
       [-1.43566278e-06, -1.43566278e-06, -1.43566278e-06, ...,
        -4.59412088e-05, -1.43566278e-06, -1.43566278e-06],
       [-1.84324712e-06, -1.84324712e-06, -1.84324712e-06, ...,
        -5.89839080e-05, -1.84324712e-06, -1.84324712e-06],
       [-5.85833527e-06, -5.85833527e-06, -5.85833527e-06, ...,
        -1.87466729e-04, -5.85833527e-06, -5.85833527e-06],
       [ 6.87392304e-07,  6.87392304e-07,  6.87392304e-07, ...,
         2.19965537e-05,  6.87392304e-07,  6.87392304e-07]])

In [76]:
doc_topic_lsa = pd.DataFrame(doc_topic,
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_lsa

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
_5DXs8xxaMU,0.313722,-0.037640,0.067371,-0.253653,0.093814
DNrMPF3I_bs,0.369401,-0.036863,0.036569,-0.298927,0.008293
TwN8soCzjPM,0.512953,-0.060678,0.366408,-0.085243,-0.241111
8OMUdYoIJhI,0.365220,-0.035835,0.084806,0.093842,0.099401
xk_MHfOAfRQ,0.443260,0.051666,0.068540,0.353056,-0.073502
...,...,...,...,...,...
qlH5-G576fQ,0.526200,-0.049693,0.016550,0.315958,0.019130
uLXyFJEbj_s,0.260255,-0.025891,-0.207168,-0.112689,-0.079400
A3riThai7MU,0.423757,-0.044581,0.021968,0.242374,0.011728
CrfnkgwU978,0.271992,0.087251,-0.001373,0.018363,-0.053234


In [108]:
cosine_similarity((doc_topic_lsa.values[0], 
                   doc_topic_lsa.values[1], 
                   doc_topic_lsa.values[3],  
                   doc_topic_lsa.values[4], 
                   doc_topic_lsa.values[5], 
                   doc_topic_lsa.values[6]))

array([[1.        , 0.97231186, 0.68426166, 0.17952577, 0.93627735,
        0.87603034],
       [0.97231186, 1.        , 0.74072561, 0.2383418 , 0.86494457,
        0.88806269],
       [0.68426166, 0.74072561, 1.        , 0.80877308, 0.68394879,
        0.92116077],
       [0.17952577, 0.2383418 , 0.80877308, 1.        , 0.3207386 ,
        0.60785513],
       [0.93627735, 0.86494457, 0.68394879, 0.3207386 , 1.        ,
        0.89214641],
       [0.87603034, 0.88806269, 0.92116077, 0.60785513, 0.89214641,
        1.        ]])

In [109]:
#cosine_similarity((doc_topic_lsa.values[0], doc_topic_lsa.values[6]))

### NMF (Non-Negative Matrix Factorization)

In [98]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)

In [99]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaaaah,aaaaaw,aaahhh,aaron,aasimm,ab,abacus,abanda,abandon,...,منتننتب,منني,يبخؤن,يسي,يمنن,くじょう,不好意思,法國人,阴茎病毒,ﻻهص
component_1,0.0,0.0,0.0,0.0,0.003,0.0,0.008,0.001,0.001,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.01,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.0,0.0,0.0,0.003,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.011,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.0
component_5,0.0,0.0,0.0,0.0,0.001,0.0,0.002,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [100]:
display_topics(nmf_model, vectorizer.get_feature_names(), 14)


Topic  0
know, gon, peopl, good, man, realli, time, ve, ll, mean, come, guy, look, talk

Topic  1
laughter, fool, obvious, surpri, plea, check, myka, marfa, sub, switch, flip, tintin, god, visit

Topic  2
toilet, paper, gon, ass, need, hack, panic, day, shelv, hand, groceri, wipe, john, store

Topic  3
peopl, china, countri, trump, presid, test, case, number, american, know, infect, chine, come, spread

Topic  4
home, stay, plea, wash, hand, let, distanc, spread, bore, social, insid, eat, isol, yes


In [114]:
doc_topic_nmf = pd.DataFrame(doc_topic.round(5),
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_nmf

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
_5DXs8xxaMU,0.00000,0.00000,0.13633,0.19867,0.00000
DNrMPF3I_bs,0.00000,0.00066,0.20240,0.19646,0.03069
TwN8soCzjPM,0.13429,0.00000,0.03938,0.36182,0.02518
8OMUdYoIJhI,0.16283,0.00000,0.05823,0.05626,0.00207
xk_MHfOAfRQ,0.34669,0.04909,0.00000,0.00000,0.00000
...,...,...,...,...,...
qlH5-G576fQ,0.30730,0.00000,0.04917,0.00000,0.03118
uLXyFJEbj_s,0.00000,0.00246,0.23338,0.00000,0.00000
A3riThai7MU,0.21781,0.00000,0.05861,0.00000,0.00000
CrfnkgwU978,0.06823,0.06130,0.09577,0.02117,0.04522


In [115]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[1]))

array([[1.        , 0.97453512],
       [0.97453512, 1.        ]])

In [116]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[26]))

array([[1.        , 0.89487066],
       [0.89487066, 1.        ]])

### LDA

Change vectorizer!!!!

In [120]:
lda = LatentDirichletAllocation(n_components=5)

In [121]:
doc_topic = lda.fit_transform(dtm)


In [122]:
lda.score

<bound method LatentDirichletAllocation.score of LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)>

In [123]:
topic_word = pd.DataFrame(lda.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaaaah,aaaaaw,aaahhh,aaron,aasimm,ab,abacus,abanda,abandon,...,منتننتب,منني,يبخؤن,يسي,يمنن,くじょう,不好意思,法國人,阴茎病毒,ﻻهص
component_1,0.2,0.2,0.2,0.2,0.2,0.2,2.2,0.2,0.223,0.2,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
component_2,0.2,0.2,0.2,0.2,1.2,0.2,0.2,1.2,0.213,2.435,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
component_3,0.2,0.2,0.2,0.2,0.2,0.2,1.2,0.2,0.202,3.965,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
component_4,1.2,1.2,1.2,1.2,3.2,34.2,1.2,0.2,1.163,0.2,...,1.2,1.2,1.2,1.2,1.2,1.2,3.2,32.2,1.2,1.2
component_5,0.2,0.2,0.2,0.2,1.2,0.2,0.2,0.2,0.2,0.2,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2


In [124]:
display_topics(lda, vectorizer.get_feature_names(), 15)


Topic  0
yeah, know, think, peopl, mean, right, thing, make, realli, ve, way, time, got, someth, laughter

Topic  1
know, peopl, come, say, oh, yeah, right, year, na, think, want, time, got, ve, look

Topic  2
know, peopl, na, right, gon, got, say, thing, want, time, think, ve, realli, need, come

Topic  3
georgehotz, lul, pogchamp, claim, lol, georg, make, dj, vicio, use, work, stream, theonlymonka, harbad, know

Topic  4
know, peopl, time, say, right, think, okay, thank, thing, ve, na, look, want, ll, mask


In [79]:
lda.components_

array([[0.20000031, 0.20000031, 0.20000031, ..., 0.20000702, 0.20000031,
        0.20000031],
       [0.20054092, 0.20054092, 0.20054092, ..., 0.21819025, 0.20054092,
        0.20054092],
       [0.20000031, 0.20000031, 0.20000031, ..., 0.20000713, 0.20000031,
        0.20000031],
       [0.20000031, 0.20000031, 0.20000031, ..., 0.20000714, 0.20000031,
        0.20000031],
       [0.20025767, 0.20025767, 0.20025767, ..., 0.20737333, 0.20025767,
        0.20025767]])

In [80]:
doc_topic_lda = pd.DataFrame(doc_topic.round(5),
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_lda

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
_5DXs8xxaMU,0.01856,0.92575,0.01856,0.01856,0.01856
DNrMPF3I_bs,0.01291,0.94838,0.01290,0.01290,0.01290
TwN8soCzjPM,0.01373,0.94509,0.01373,0.01373,0.01373
8OMUdYoIJhI,0.01509,0.93965,0.01509,0.01509,0.01509
xk_MHfOAfRQ,0.01924,0.92306,0.01923,0.01923,0.01923
...,...,...,...,...,...
qlH5-G576fQ,0.01129,0.95486,0.01128,0.01128,0.01128
uLXyFJEbj_s,0.01758,0.92969,0.01758,0.01758,0.01758
A3riThai7MU,0.02080,0.91681,0.02080,0.02080,0.02080
CrfnkgwU978,0.02080,0.91683,0.02079,0.02079,0.02079


In [81]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[1]))

array([[1.        , 0.96752052],
       [0.96752052, 1.        ]])

In [55]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[136]))

array([[1.        , 0.84052477],
       [0.84052477, 1.        ]])