In [107]:
import pandas as pd

import pickle 

import re
import string

from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.decomposition import LatentDirichletAllocation

from sklearn.metrics.pairwise import cosine_similarity

# Get Data

In [108]:
with open('ted_video_transcripts_2416.pkl', 'rb') as picklefile:
    df = pickle.load(picklefile)

In [109]:
df.head()

Unnamed: 0,video_id,transcript
0,YgAuFqEs6yk,[{'text': 'I remember watching my father raise...
1,bNmRr-BYnxA,[{'text': 'Transcriber: Joseph Geni Reviewer: ...
2,FVUkKKc3Vvk,"[{'text': 'Hi, everyone, my name is Elizabeth,..."
3,8bj0GR34XWc,[{'text': 'Transcriber: Ivana Korom Reviewer: ...
4,eaCrsBtiYA4,"[{'text': 'I am a public policy wonk.'}, {'tex..."


In [110]:
len(df)

2416

# Text Preprocessing

In [84]:
#stemmer=LancasterStemmer()
#porter=PorterStemmer()

In [85]:
englishStemmer=SnowballStemmer("english", ignore_stopwords=True)
def stemComment(comment_line):
    token_words=word_tokenize(comment_line)
    stem_line=[]
    for word in token_words:
        stem_line.append(englishStemmer.stem(word))
        stem_line.append(" ")
    return "".join(stem_line)

In [86]:
stop_words = set(stopwords.words('english'))

def remove_stop_words(comment_line):
    token_words=word_tokenize(str(comment_line))
    filtered_comment_line = [w for w in token_words if not w in stop_words] 
    
    return " ".join(filtered_comment_line)

In [87]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

In [88]:
df['transcript'] = df['transcript'].map(remove_stop_words).map(alphanumeric).map(punc_lower)

In [89]:
#https://stackoverflow.com/questions/29270917/removing-custom-stop-words-form-a-phrase-in-python
my_stop_words_lst = ['ted', 'text', 'thank', 'you']

for w in my_stop_words_lst:
    pattern = r'\b'+w+r'\b'
    custom_stop = lambda x: re.sub(pattern, ' ', x)
    df['transcript'] = df['transcript'].map(custom_stop)

In [90]:
df['transcript'] = df['transcript'].map(stemComment)

In [91]:
df.head()

Unnamed: 0,video_id,transcript
0,YgAuFqEs6yk,i rememb watch father rais pistol mother s hea...
1,bNmRr-BYnxA,transcrib joseph geni nreview camill martínez ...
2,FVUkKKc3Vvk,hi everyon name elizabeth and i work trade flo...
3,8bj0GR34XWc,transcrib ivana korom nreview krystian aparta ...
4,eaCrsBtiYA4,i public polici wonk i investig data point pro...


In [92]:
df['transcript'][1]

'transcrib joseph geni nreview camill martínez i never thought i would give nmi talk somewher like but like half human i ve spent last nfour week lockdown due global pandem ncreat covid i extrem fortun nthat time i ve abl come wood nnear home southern england these wood alway inspir and human tri think about nhow find inspir to retak control action terribl thing ndo n t come road without us take action avert i thought good place nfor us talk and i d like begin nthat stori six year ago when i first join nthe unit nation now i firm believ nthat un unparallel import in world right to promot collabor cooper but n t tell join is essenti work deliv main form nof extrem bore meet extrem long bore meet now may feel attend nsome long bore meet life i m sure but un meet next level and everyon work there napproach level calm normal achiev zen master but i n t readi i join expect drama nand tension breakthrough what i n t readi was process seem move nat speed glacier at speed glacier nuse move now

# Build Document-Term matrix

In [93]:
vectorizer = TfidfVectorizer(stop_words='english') 
doc_word = vectorizer.fit_transform(list(df.transcript))
doc_word

<2416x39840 sparse matrix of type '<class 'numpy.float64'>'
	with 1012696 stored elements in Compressed Sparse Row format>

In [94]:
pd.DataFrame(doc_word.toarray(), columns=vectorizer.get_feature_names()).shape

(2416, 39840)

In [95]:
dtm = pd.DataFrame(doc_word.toarray(), index=df['video_id'], columns=vectorizer.get_feature_names())

# Topic Modeling

In [96]:
def display_topics(model, feature_names, no_top_words, topic_names=None):
    for ix, topic in enumerate(model.components_):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))


### Latent Semantic Analysis (LSA)

In [100]:
lsa = TruncatedSVD(10)
doc_topic = lsa.fit_transform(dtm)
lsa.explained_variance_ratio_

array([0.00691832, 0.00867407, 0.00722984, 0.00638489, 0.00584166,
       0.00547615, 0.00517708, 0.0046375 , 0.00452422, 0.00410085])

In [101]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5",
                     "component_6","component_7", "component_8","component_9", "component_10"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaargh,aaaaaah,aaaab,aaaah,aaaahhh,...,ān,čapek,ōfunato,ʾan,ʾilla,ʾilāha,อย,อยman,อร,送你葱
component_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,-0.001,0.0,-0.0,-0.0,0.001,-0.0,-0.0,-0.0,-0.0,0.0,...,-0.0,-0.0,0.0,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0
component_3,0.001,-0.0,0.002,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.0,0.0,0.0,-0.0,-0.0,-0.0,0.001,0.001,0.002,-0.0
component_4,0.001,-0.0,-0.001,-0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,...,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,-0.001,-0.0
component_5,-0.001,0.002,-0.0,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.001,0.001,0.001,0.0
component_6,-0.0,0.0,0.001,0.0,-0.0,0.0,0.0,-0.0,-0.0,0.0,...,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.001,0.0
component_7,-0.001,0.0,0.002,-0.0,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,...,0.0,0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,-0.0,0.0
component_8,0.0,-0.0,-0.0,0.0,-0.001,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.001,0.0
component_9,-0.001,-0.001,-0.0,-0.0,0.001,-0.0,-0.0,-0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.0
component_10,0.0,-0.0,-0.001,0.0,-0.001,0.0,0.0,-0.0,0.0,0.0,...,-0.0,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,0.0


In [102]:
display_topics(lsa, vectorizer.get_feature_names(), 15)


Topic  0
peopl, nto, like, nand, nof, nthat, think, thing, know, time, laughter, year, work, want, make

Topic  1
women, girl, men, school, children, said, famili, woman, parent, kid, mother, educ, peopl, young, nand

Topic  2
music, laughter, like, brain, thing, play, love, know, song, video, feel, robot, said, sound, realli

Topic  3
cell, brain, cancer, patient, diseas, nto, women, drug, nthat, nand, health, bodi, doctor, nof, medic

Topic  4
peopl, data, patient, think, inform, health, actual, govern, drug, thing, cancer, compani, know, diseas, countri

Topic  5
women, music, cell, water, ocean, girl, men, cancer, fish, planet, diseas, year, children, patient, anim

Topic  6
music, patient, health, applaus, nof, nto, govern, countri, guitar, sound, technolog, play, drug, compani, song

Topic  7
women, robot, men, girl, comput, woman, ai, gender, machin, sex, femal, data, technolog, sexual, imag

Topic  8
school, robot, design, citi, kid, build, student, children, educ, teacher, gi

In [62]:
lsa.components_

array([[ 9.68620709e-05,  1.03763392e-04,  7.48935844e-05, ...,
         2.98614931e-05,  5.97229861e-05,  1.11542964e-05],
       [ 1.88035532e-05,  1.64924212e-04,  4.20325867e-03, ...,
         6.46077822e-04,  1.29215564e-03,  2.59712183e-05],
       [ 1.10812732e-03, -5.44983553e-04, -1.47105038e-04, ...,
         1.22011256e-04,  2.44022511e-04, -8.35359701e-05],
       [ 3.75091143e-04,  5.80494082e-05,  3.32403708e-04, ...,
         1.41503516e-04,  2.83007031e-04, -1.28096529e-04],
       [ 2.62605934e-04, -1.35827142e-03,  7.39542845e-04, ...,
        -7.08270320e-04, -1.41654064e-03, -2.02189834e-04]])

In [63]:
doc_topic_lsa = pd.DataFrame(doc_topic,
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_lsa

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YgAuFqEs6yk,0.890676,0.042962,-0.056709,0.004102,-0.062049
bNmRr-BYnxA,0.883629,-0.040627,-0.010975,-0.019726,0.053228
FVUkKKc3Vvk,0.885644,-0.025310,-0.068833,0.007190,0.038097
8bj0GR34XWc,0.848307,-0.024130,0.027906,0.039088,0.000483
eaCrsBtiYA4,0.852875,-0.035804,-0.060799,-0.018492,0.026517
...,...,...,...,...,...
HUM2rCIUdeI,0.838504,0.006174,-0.023050,-0.043234,-0.045791
C_SBGTJgBGo,0.665815,0.252792,-0.011495,0.026302,0.079951
MLU7qcMYKO8,0.682086,0.002008,0.119328,-0.127289,0.008149
yyemG7V5ynQ,0.787745,0.005806,-0.005591,-0.023757,-0.052328


In [64]:
cosine_similarity((doc_topic_lsa.values[0], 
                   doc_topic_lsa.values[1], 
                   doc_topic_lsa.values[3],  
                   doc_topic_lsa.values[4], 
                   doc_topic_lsa.values[5], 
                   doc_topic_lsa.values[6]))

array([[1.        , 0.98557882, 0.98916038, 0.99057338, 0.98948304,
        0.9254981 ],
       [0.98557882, 1.        , 0.99472814, 0.99785056, 0.99538184,
        0.94692381],
       [0.98916038, 0.99472814, 1.        , 0.99177268, 0.99965085,
        0.96728182],
       [0.99057338, 0.99785056, 0.99177268, 1.        , 0.99180318,
        0.93060307],
       [0.98948304, 0.99538184, 0.99965085, 0.99180318, 1.        ,
        0.96603434],
       [0.9254981 , 0.94692381, 0.96728182, 0.93060307, 0.96603434,
        1.        ]])

In [109]:
#cosine_similarity((doc_topic_lsa.values[0], doc_topic_lsa.values[6]))

### NMF (Non-Negative Matrix Factorization)

In [65]:
nmf_model = NMF(5)
doc_topic = nmf_model.fit_transform(doc_word)

In [66]:
topic_word = pd.DataFrame(nmf_model.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaargh,aaaaaah,aaaab,aaaah,aaaahhh,...,ān,čapek,ōfunato,ʾan,ʾilla,ʾilāha,อย,อยman,อร,送你葱
component_1,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_2,0.0,0.0,0.007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0
component_3,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_4,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
component_5,0.0,0.0,0.0,0.0,0.001,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.001,0.001,0.001,0.0,0.0,0.0,0.0


In [67]:
display_topics(nmf_model, vectorizer.get_feature_names(), 14)


Topic  0
text, peopl, like, thing, think, know, laughter, want, time, realli, ve, say, make, year

Topic  1
music, text, applaus, guitar, play, song, sound, cheer, sing, cello, musician, end, compos, nois

Topic  2
text, water, planet, ocean, earth, nof, fish, climat, nto, energi, carbon, mar, solar, nand

Topic  3
text, cell, cancer, brain, patient, diseas, drug, dna, gene, protein, blood, bodi, health, tumor

Topic  4
text, nto, nand, nthat, nof, women, nin, nthe, nfor, na, peopl, nis, communiti, nwith


In [68]:
doc_topic_nmf = pd.DataFrame(doc_topic.round(5),
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_nmf

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
YgAuFqEs6yk,0.14555,0.00648,0.00000,0.00000,0.01010
bNmRr-BYnxA,0.11341,0.00000,0.05289,0.00312,0.03076
FVUkKKc3Vvk,0.12999,0.00000,0.00000,0.00000,0.04142
8bj0GR34XWc,0.11079,0.00000,0.01119,0.07735,0.01531
eaCrsBtiYA4,0.12552,0.00000,0.00304,0.00000,0.03675
...,...,...,...,...,...
HUM2rCIUdeI,0.13222,0.00023,0.01592,0.00000,0.00846
C_SBGTJgBGo,0.08482,0.17426,0.00000,0.00077,0.00000
MLU7qcMYKO8,0.03039,0.00000,0.22262,0.00000,0.00000
yyemG7V5ynQ,0.13147,0.00000,0.00000,0.00328,0.00331


In [69]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[1]))

array([[1.        , 0.89336073],
       [0.89336073, 1.        ]])

In [70]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[26]))

array([[1.        , 0.31067444],
       [0.31067444, 1.        ]])

### LDA

Changing vectorizer from TF-IDF

In [71]:
vectorizer = CountVectorizer(stop_words='english') 
doc_word = vectorizer.fit_transform(list(df.transcript))
doc_word

<2416x39840 sparse matrix of type '<class 'numpy.int64'>'
	with 1017301 stored elements in Compressed Sparse Row format>

In [72]:
pd.DataFrame(doc_word.toarray(), columns=vectorizer.get_feature_names()).shape

(2416, 39840)

In [73]:
dtm = pd.DataFrame(doc_word.toarray(), index=df['video_id'], columns=vectorizer.get_feature_names())

In [74]:
lda = LatentDirichletAllocation(n_components=5)

In [75]:
doc_topic = lda.fit_transform(dtm)


In [76]:
lda.score

<bound method LatentDirichletAllocation.score of LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)>

In [77]:
topic_word = pd.DataFrame(lda.components_.round(3),
             index = ["component_1","component_2", "component_3","component_4", "component_5"],
             columns = vectorizer.get_feature_names())
topic_word

Unnamed: 0,aa,aaa,aaaa,aaaaa,aaaaaa,aaaaaaaargh,aaaaaah,aaaab,aaaah,aaaahhh,...,ān,čapek,ōfunato,ʾan,ʾilla,ʾilāha,อย,อยman,อร,送你葱
component_1,0.2,2.361,0.2,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.208,0.2,0.2,0.202,0.202,0.202,0.2,0.2,0.2,0.207
component_2,0.2,0.2,0.201,0.2,0.2,0.2,0.2,0.2,0.2,0.2,...,0.2,0.2,0.211,0.2,0.2,0.2,0.2,0.2,0.2,0.2
component_3,0.212,0.203,0.214,0.201,0.2,0.2,1.198,0.2,0.475,0.2,...,0.2,0.336,0.2,0.2,0.2,0.2,0.219,0.219,0.215,0.2
component_4,0.203,6.504,0.201,0.2,1.2,0.2,0.201,0.2,0.202,0.208,...,0.201,1.059,1.18,1.198,1.198,1.198,0.202,0.202,0.202,0.204
component_5,10.185,0.732,4.184,2.199,0.2,1.2,0.201,1.2,0.923,1.192,...,1.191,0.204,0.208,0.2,0.2,0.2,1.179,1.179,2.183,1.189


In [78]:
display_topics(lda, vectorizer.get_feature_names(), 15)


Topic  0
text, peopl, nto, nand, nthat, countri, nof, work, think, nin, world, nthe, thing, right, like

Topic  1
text, citi, build, water, nof, design, space, nto, nand, use, materi, nthat, earth, energi, nin

Topic  2
text, like, use, brain, nto, make, thing, think, time, nof, realli, cell, nthat, way, work

Topic  3
text, peopl, like, nto, nand, know, nthat, laughter, time, nof, year, want, say, think, said

Topic  4
text, like, know, think, peopl, year, thing, look, realli, time, world, actual, ve, make, use


In [79]:
lda.components_

array([[0.20000031, 0.20000031, 0.20000031, ..., 0.20000702, 0.20000031,
        0.20000031],
       [0.20054092, 0.20054092, 0.20054092, ..., 0.21819025, 0.20054092,
        0.20054092],
       [0.20000031, 0.20000031, 0.20000031, ..., 0.20000713, 0.20000031,
        0.20000031],
       [0.20000031, 0.20000031, 0.20000031, ..., 0.20000714, 0.20000031,
        0.20000031],
       [0.20025767, 0.20025767, 0.20025767, ..., 0.20737333, 0.20025767,
        0.20025767]])

In [80]:
doc_topic_lda = pd.DataFrame(doc_topic.round(5),
                             index=df['video_id'],
                             columns = ["component_1","component_2", "component_3","component_4", "component_5"])
doc_topic_lda

Unnamed: 0_level_0,component_1,component_2,component_3,component_4,component_5
video_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
_5DXs8xxaMU,0.01856,0.92575,0.01856,0.01856,0.01856
DNrMPF3I_bs,0.01291,0.94838,0.01290,0.01290,0.01290
TwN8soCzjPM,0.01373,0.94509,0.01373,0.01373,0.01373
8OMUdYoIJhI,0.01509,0.93965,0.01509,0.01509,0.01509
xk_MHfOAfRQ,0.01924,0.92306,0.01923,0.01923,0.01923
...,...,...,...,...,...
qlH5-G576fQ,0.01129,0.95486,0.01128,0.01128,0.01128
uLXyFJEbj_s,0.01758,0.92969,0.01758,0.01758,0.01758
A3riThai7MU,0.02080,0.91681,0.02080,0.02080,0.02080
CrfnkgwU978,0.02080,0.91683,0.02079,0.02079,0.02079


In [81]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[1]))

array([[1.        , 0.96752052],
       [0.96752052, 1.        ]])

In [55]:
cosine_similarity((doc_topic_nmf.values[0], doc_topic_nmf.values[136]))

array([[1.        , 0.84052477],
       [0.84052477, 1.        ]])