In [414]:
import pickle 

import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

import pandas as pd

import re
import string

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.decomposition import LatentDirichletAllocation

# Get Data

In [415]:
with open('video_id_comments_bulk_df.pkl', 'rb') as picklefile:
    df = pickle.load(picklefile)

In [416]:
#with open('video_id_comments_bulk_df_2.pkl', 'rb') as picklefile:
#    df2 = pickle.load(picklefile)

In [420]:
df.head()

Unnamed: 0,video_id,comments
0,-6Li2qT0ZkI,{Greetings from UK.. I love your videos ! Your...
1,-CCW4Xnp_sQ,"{Same problem over here Randy, only with less ..."
2,-Cg1jBuYZYM,"{Kuch jahilo se toh kutte Jada samjhdar hai, 😄..."
3,-LAaN5VnTDo,{Where the vid where the Italian people in the...
4,-skA4GhVX7k,{Love yo ass. Hope you’re okay. Remember to us...


In [418]:
len(df)

496

In [419]:
#move to II
df.reset_index(inplace=True)
df.rename(columns={'snippet.topLevelComment.snippet.textDisplay': 'comments', 'snippet.videoId':'video_id'}, inplace=True)

# Text Preprocessing

In [381]:
#stemmer=LancasterStemmer()
#porter=PorterStemmer()

In [421]:
englishStemmer=SnowballStemmer("english", ignore_stopwords=True)
def stemComment(comment_line):
    token_words=word_tokenize(comment_line)
    stem_line=[]
    for word in token_words:
        stem_line.append(englishStemmer.stem(word))
        stem_line.append(" ")
    return "".join(stem_line)

In [422]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(comment_line):
    token_words=word_tokenize(comment_line)
    filtered_comment_line = [w for w in token_words if not w in stop_words] 
    
    return " ".join(filtered_comment_line)

In [423]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [424]:
df['comments'] = df['comments'].map(remove_stop_words).map(alphanumeric).map(punc_lower)

In [425]:
#https://stackoverflow.com/questions/29270917/removing-custom-stop-words-form-a-phrase-in-python
my_stop_words_lst = ['br', 'https', 'http', 'youtu', 'href', 'com', 'video', 'www', 'youtube', 
                    'watch', 'nice', 'don', 'shall', 'virus', 'just', 'corona', 'coronavirus', 'covid',
                     'love', 'like', 
                     'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
                    ]

for w in my_stop_words_lst:
    pattern = r'\b'+w+r'\b'
    custom_stop = lambda x: re.sub(pattern, ' ', x)
    df['comments'] = df['comments'].map(custom_stop)

In [426]:
#stemSentence('Greetings from UK.. I love your videos' )

In [427]:
df['comments'] = df['comments'].map(stemComment)

In [428]:
df.head()

Unnamed: 0,video_id,comments
0,-6Li2qT0ZkI,greet uk i video your dialect uniqu articul pl...
1,-CCW4Xnp_sQ,same problem randi less chin tut tut brilliant...
2,-Cg1jBuYZYM,kuch jahilo se toh kutt jada samjhdar hai 😄😄😄😄...
3,-LAaN5VnTDo,where vid italian peopl neighborhood sing anim...
4,-skA4GhVX7k,yo ass hope ’ okay rememb use suppositori gave...


In [429]:
df['comments'][1]

'same problem randi less chin tut tut brilliant brilliant say brilliant randi rainbow presid what ’ bet dislik southern lol 🤣 god bless randi rainbow genius truth teller we need ‘ randi rainbow ’ tragic trump puke face penc fiasco kill thousand american you love right point 😘😘😘😘😘 the lie liar lie tell we want novemb the open line b floor die laughter b bravo randi you gurl randi rainbow god work randi funni talent thank ’ servic fabulosoooooo 💗❤💕👌😄 he sassi version disney u give me hope that i will smile again and be back to being more concern of random shoot hit n run serious would take daili wake um quot quot presid quot quot bwteen human unspecifi speci i prefer u randi than a channel that is now play crypt code with my post ty randi say world think btw im aussi dislik his nasti immor way shot vice mannequin mr penc i wonder mani peopl work lyric if wow everyon spam donald trump twitter honest thing busi randi rainbow song give clear messag stupid dumb till tendon show 😷 🤣😂🤣😂🤣😂🤣😂 mr

# Build Document-Term matrix

In [430]:
vectorizer = CountVectorizer(stop_words='english') 
doc_word = vectorizer.fit_transform(list(df.comments))
doc_word

<496x10343 sparse matrix of type '<class 'numpy.int64'>'
	with 35952 stored elements in Compressed Sparse Row format>

In [431]:
pd.DataFrame(doc_word.toarray(), columns=vectorizer.get_feature_names()).shape

(496, 10343)

In [433]:
dtm = pd.DataFrame(doc_word.toarray(), index=df['video_id'], columns=vectorizer.get_feature_names())

# Topic Modeling

In [434]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))


### Latent Semantic Analysis (LSA)

In [435]:
lsa = TruncatedSVD(5)
doc_topic = lsa.fit_transform(dtm)
lsa.explained_variance_ratio_

array([0.1223746 , 0.04737919, 0.02751112, 0.02625209, 0.02374624])

In [436]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaa,aaaaaaaaaahhhh,aaaaacchoooo,aaat,aab,aaccha,aag,aai,aaj,...,빡빡,어뎐더머,𝚌𝚘𝚖𝚎𝚍𝚢,𝚍𝚒𝚍,𝚕𝚒𝚟𝚎,𝚕𝚘𝚟𝚎,𝚖𝚊𝚛𝚔𝚊𝚗𝚐𝚎𝚕,𝚠𝚑𝚒𝚕𝚎,𝚢𝚘𝚞,𝚢𝚘𝚞𝚛
component_1,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-0.002,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.002,-0.0,-0.002,...,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_3,-0.005,-0.0,-0.001,0.0,-0.001,-0.0,-0.001,-0.001,-0.001,-0.003,...,-0.001,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0
component_4,0.013,0.001,-0.0,-0.001,0.002,0.0,0.003,0.011,0.002,0.01,...,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,-0.007,-0.0,0.0,-0.0,-0.001,-0.0,-0.001,-0.007,-0.001,-0.005,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [437]:
display_topics(lsa, vectorizer.get_feature_names(), 15)


Topic  0
peopl, quot, world, god, time, amp, know, good, make, trump, look, come, great, say, govern

Topic  1
god, world, believ, beast, earth, greek, church, rev, come, antichrist, fg, easter, deceiv, jesus, word

Topic  2
peopl, china, govern, trump, countri, blame, god, white, death, jamaican, politician, west, jamaica, america, corrupt

Topic  3
hai, polic, ko, god, govern, india, ki, jamaican, se, bhi, politician, jamaica, ka, ye, ho

Topic  4
god, govern, jamaican, politician, corrupt, jamaica, thank, funni, follow, pharaoh, life, need, song, laugh, stay


In [438]:
doc_topic_lsa.values[0]

array([11.60979868, -2.54190063,  5.58639827,  1.99213588,  1.26193309])

In [439]:
lsa.components_

array([[ 8.59314658e-04,  6.17811859e-05,  1.94607477e-04, ...,
         1.29832157e-04,  1.29832157e-04,  1.29832157e-04],
       [-2.40756576e-03, -1.57509702e-04, -3.20121141e-04, ...,
        -2.00739288e-04, -2.00739288e-04, -2.00739288e-04],
       [-4.54650307e-03, -2.47061698e-04, -6.78593185e-04, ...,
        -4.45400873e-04, -4.45400873e-04, -4.45400873e-04],
       [ 1.30882376e-02,  6.42974789e-04, -2.39772710e-05, ...,
         1.75141570e-04,  1.75141570e-04,  1.75141570e-04],
       [-6.83825627e-03, -2.78207516e-04,  3.11205294e-04, ...,
         3.63147437e-04,  3.63147437e-04,  3.63147437e-04]])

In [440]:
doc_topic_lsa = pd.DataFrame(doc_topic,
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_lsa

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-6Li2qT0ZkI,11.609798,-2.541641,5.597281,1.977983,1.279138
-CCW4Xnp_sQ,5.013508,-0.664213,-0.730655,-1.433681,0.460213
-Cg1jBuYZYM,0.055430,-0.113917,-0.061930,0.580788,-0.396235
-LAaN5VnTDo,5.062876,-2.060751,-1.970809,-0.870109,0.452020
-skA4GhVX7k,3.885162,-1.092680,-0.758483,-0.986067,0.069929
...,...,...,...,...,...
yt5vLJpy_sw,1.042221,-1.020226,-1.335610,0.400347,0.944982
zJD7DImCThk,6.170190,-1.906691,-2.527261,-0.821817,0.852193
zTMlWzMe-h8,2.416103,-1.295802,-0.181443,0.130939,0.262855
zqOMkTvqaDY,1.395695,0.023629,-0.979471,-0.950793,-0.391154


In [441]:
cosine_similarity((doc_topic_lsa.values[0], 
                   doc_topic_lsa.values[1], 
                   doc_topic_lsa.values[3],  
                   doc_topic_lsa.values[4], 
                   doc_topic_lsa.values[5], 
                   doc_topic_lsa.values[6]))

array([[1.        , 0.75329915, 0.65917892, 0.74107536, 0.33196703,
        0.75403626],
       [0.75329915, 1.        , 0.94443716, 0.98681935, 0.34860319,
        0.89560995],
       [0.65917892, 0.94443716, 1.        , 0.97653666, 0.59777773,
        0.97153068],
       [0.74107536, 0.98681935, 0.97653666, 1.        , 0.44466841,
        0.93749536],
       [0.33196703, 0.34860319, 0.59777773, 0.44466841, 1.        ,
        0.70954623],
       [0.75403626, 0.89560995, 0.97153068, 0.93749536, 0.70954623,
        1.        ]])

In [442]:
#cosine_similarity((doc_topic_lsa.values[0], doc_topic_lsa.values[6]))

### NMF (Non-Negative Matrix Factorization)

In [443]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)

In [444]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaa,aaaaaaaaaahhhh,aaaaacchoooo,aaat,aab,aaccha,aag,aai,aaj,...,빡빡,어뎐더머,𝚌𝚘𝚖𝚎𝚍𝚢,𝚍𝚒𝚍,𝚕𝚒𝚟𝚎,𝚕𝚘𝚟𝚎,𝚖𝚊𝚛𝚔𝚊𝚗𝚐𝚎𝚕,𝚠𝚑𝚒𝚕𝚎,𝚢𝚘𝚞,𝚢𝚘𝚞𝚛
component_1,0.005,0.001,0.007,0.001,0.001,0.001,0.0,0.0,0.0,0.0,...,0.009,0.003,0.005,0.005,0.005,0.005,0.005,0.005,0.005,0.005
component_2,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_3,0.0,0.0,0.0,0.005,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.11,0.005,0.0,0.0,0.014,0.002,0.022,0.097,0.02,0.084,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [445]:
display_topics(nmf_model, vectorizer.get_feature_names(), 10)


Topic  0
quot, funni, amp, thank, good, make, time, laugh, lol, realli

Topic  1
god, world, quot, peopl, believ, come, beast, church, greek, earth

Topic  2
peopl, trump, china, countri, quot, death, blame, look, world, want

Topic  3
hai, polic, ko, india, peopl, ki, se, bhi, ye, ka

Topic  4
god, govern, jamaican, politician, corrupt, jamaica, world, pharaoh, follow, power


In [446]:
doc_topic_nmf = pd.DataFrame(doc_topic.round(5),
                             index=df['snippet.videoId'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_nmf

KeyError: 'snippet.videoId'

In [447]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[1]))

array([[1.        , 0.68465228],
       [0.68465228, 1.        ]])

In [448]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[26]))

array([[1.        , 0.10847377],
       [0.10847377, 1.        ]])

### LDA

In [449]:
lda = LatentDirichletAllocation(n_components=5)

In [450]:
doc_topic = lda.fit_transform(dtm)


In [451]:
lda.score

<bound method LatentDirichletAllocation.score of LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)>

In [452]:
topic_word = pd.DataFrame(lda.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaa,aaaaaaaaaahhhh,aaaaacchoooo,aaat,aab,aaccha,aag,aai,aaj,...,빡빡,어뎐더머,𝚌𝚘𝚖𝚎𝚍𝚢,𝚍𝚒𝚍,𝚕𝚒𝚟𝚎,𝚕𝚘𝚟𝚎,𝚖𝚊𝚛𝚔𝚊𝚗𝚐𝚎𝚕,𝚠𝚑𝚒𝚕𝚎,𝚢𝚘𝚞,𝚢𝚘𝚞𝚛
component_1,0.226,0.2,0.2,0.2,1.195,0.2,0.2,0.2,0.2,0.2,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
component_2,6.173,0.2,0.2,0.2,0.2,0.2,1.2,2.2,1.2,1.2,...,0.2,0.2,1.199,1.199,1.199,1.199,1.199,1.199,1.199,1.199
component_3,0.2,0.2,1.2,1.2,0.2,0.2,0.2,0.2,0.2,0.2,...,1.2,1.2,0.201,0.201,0.201,0.201,0.201,0.201,0.201,0.201
component_4,2.2,0.201,0.2,0.2,0.205,0.2,0.2,1.2,0.2,0.2,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2
component_5,4.2,1.199,0.2,0.2,0.2,1.2,0.2,0.2,0.2,3.2,...,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2


In [454]:
display_topics(lda, vectorizer.get_feature_names(), 15)


Topic  0
good, god, quot, great, funni, world, amp, peopl, time, lol, laugh, come, guy, thank, make

Topic  1
bro, bhai, super, ko, hai, good, sir, ke, hahaha, sa, funni, comedi, channel, polic, best

Topic  2
peopl, quot, make, thank, amp, time, trump, good, know, laugh, stay, need, funni, think, world

Topic  3
polic, hai, peopl, india, ko, good, se, ki, pleas, guy, amp, lockdown, ka, ye, way

Topic  4
funni, kkkkkkkkkkkkkkkkk, comedi, ha, hai, peopl, song, use, amp, good, bro, man, quot, video, mark


In [455]:
lda.components_

array([[0.22627258, 0.20000062, 0.20000081, ..., 0.20000038, 0.20000038,
        0.20000038],
       [6.17310532, 0.20000101, 0.20000128, ..., 1.1988437 , 1.1988437 ,
        1.1988437 ],
       [0.20037544, 0.20000025, 1.19999564, ..., 0.20115488, 0.20115488,
        0.20115488],
       [2.2000981 , 0.20103679, 0.20000121, ..., 0.20000052, 0.20000052,
        0.20000052],
       [4.20014856, 1.19896133, 0.20000106, ..., 0.20000052, 0.20000052,
        0.20000052]])

In [459]:
doc_topic_lda = pd.DataFrame(doc_topic.round(5),
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_lda

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-6Li2qT0ZkI,0.00065,0.00065,0.99740,0.00065,0.00065
-CCW4Xnp_sQ,0.00151,0.00149,0.99403,0.00149,0.00149
-Cg1jBuYZYM,0.45003,0.01451,0.01447,0.50660,0.01438
-LAaN5VnTDo,0.00141,0.00140,0.99438,0.00141,0.00141
-skA4GhVX7k,0.49276,0.00161,0.50241,0.00161,0.00162
...,...,...,...,...,...
yt5vLJpy_sw,0.00532,0.00522,0.59006,0.00521,0.39419
zJD7DImCThk,0.00107,0.25328,0.74353,0.00106,0.00106
zTMlWzMe-h8,0.00399,0.32590,0.66219,0.00395,0.00397
zqOMkTvqaDY,0.96480,0.00873,0.00894,0.00875,0.00877


In [460]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[1]))

array([[1.        , 0.68465228],
       [0.68465228, 1.        ]])

In [461]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[136]))

array([[1.        , 0.65006751],
       [0.65006751, 1.        ]])