In [1]:
# Find most relevant terms for each topic using LDA clustering

In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
df_transcripts = pd.read_csv("transcripts_m10.csv")

In [5]:
#df_transcripts.head(5)

In [6]:
tfv = TfidfVectorizer(ngram_range = (1,1))

In [7]:
vec_text = tfv.fit_transform(df_transcripts['lemmatized_text'])

In [8]:
df_transcripts.columns

Index(['text', 'alpha_text', 'no_stop', 'lemmatized_text', 'MIN(start)',
       'MAX(start)', 'SUM(duration)', 'video_id'],
      dtype='object')

In [9]:
words = tfv.get_feature_names()

In [10]:
words[:10]

['abandon',
 'abandoned',
 'abate',
 'abc',
 'aberrational',
 'abide',
 'ability',
 'able',
 'aboard',
 'abolish']

In [11]:
# now working through https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [12]:
lda_model = LatentDirichletAllocation(n_components=10)

#https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn
lda_output = lda_model.fit_transform(vec_text)

In [13]:
print(lda_output)  # Model attributes

[[0.38579781 0.01673363 0.01673377 ... 0.01673566 0.48032668 0.01673317]
 [0.01812466 0.01812206 0.01812206 ... 0.31263031 0.54238816 0.01812305]
 [0.01601654 0.01600869 0.01601033 ... 0.01601058 0.253058   0.01600942]
 ...
 [0.01655544 0.01655553 0.01655512 ... 0.01655565 0.85098233 0.01655486]
 [0.01770985 0.0177113  0.01771019 ... 0.0177104  0.42294599 0.01770945]
 [0.0195406  0.01953783 0.01953968 ... 0.01953928 0.19950729 0.01953646]]


In [14]:
df_documents = pd.DataFrame(lda_output)

In [15]:
words = tfv.get_feature_names()

In [16]:
max(lda_model.components_[0])

1.842512915385023

In [17]:
for i, comp in enumerate(lda_model.components_):
    words_comp = dict(zip(words, comp))
    sorted_words = sorted(words_comp.items(), reverse=True, key=lambda item: item[1])
    print("Topic", i)
    for w in sorted_words[:10]:
        print(w[0], w[1])
    print("\n")

Topic 0
like 1.842512915385023
expansion 1.4183721700844463
soviet 1.3868562475271848
reform 1.319380496102072
change 1.2123600721126724
certainly 1.211063319596629
union 1.2055244685097901
war 1.1969552799618337
vietnam 1.1731787286794726
far 1.1514801120533074


Topic 1
soviet 1.4967339452016655
gorbachev 1.4210557022753878
capacity 1.0847735054139396
communist 1.050863217312524
criticism 1.0501550612058963
mistake 0.9817152421268656
seen 0.9299104702591494
essentially 0.9042562842679622
wall 0.8867086315427518
week 0.8513878107654173


Topic 2
relative 1.906315400012874
comparison 1.4145012960497159
value 1.3105909137025966
absolute 1.26842113912032
gain 1.188893807710677
china 1.1231378120480973
self 1.0426785084925767
common 1.0249437571522955
one 1.0152098402302694
weak 1.0049987474009385


Topic 3
kuwait 1.8875971657943647
saddam 1.5623160010968868
nation 1.412780213537012
united 1.329129573557858
hussein 1.2954186559289862
force 1.0533598763515253
harder 1.046985572773545
winne

In [18]:
# make a prediction# make a prediction
vec_text = ["people many think like working know getting rich",
           "giving devane lectures",
           "might saying china vietnam interested called reform"]
lda_model.fit_transform(tfv.transform(vec_text))

array([[0.02672151, 0.02672151, 0.02672151, 0.02672151, 0.7595064 ,
        0.02672151, 0.02672151, 0.02672151, 0.02672151, 0.02672151],
       [0.04159295, 0.04159295, 0.04159295, 0.62566344, 0.04159295,
        0.04159295, 0.04159295, 0.04159295, 0.04159295, 0.04159295],
       [0.02809462, 0.02809462, 0.02809462, 0.02809462, 0.02809462,
        0.7471484 , 0.02809462, 0.02809462, 0.02809462, 0.02809462]])

In [19]:
# view top document matches for a particular category

In [20]:
df_all = pd.concat([df_documents, df_transcripts], axis=1)
df_all.sort_values(2, ascending=False).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,text,alpha_text,no_stop,lemmatized_text,MIN(start),MAX(start),SUM(duration),video_id
640,0.017011,0.016995,0.846955,0.017,0.016995,0.016996,0.017001,0.016994,0.01706,0.016993,"of history which sweep\nmen one way or another. In our own time, we've seen brave men overcome obstacles that seemed insurmountable and forces that seemed overwhelming. Men with courage and vision can still determine their own destiny. They can choose slavery\nour freedom, war or peace. I have no doubt which they will choose. The treaty we are signing here today",of history which sweep men one way or another In our own time we ve seen brave men overcome obstacles that seemed insurmountable and forces that seemed overwhelming Men with courage and vision can still determine their own destiny They can choose slavery our freedom war or peace I have no doubt which they will choose The treaty we are signing here today,history sweep men one way another time seen brave men overcome obstacles seemed insurmountable forces seemed overwhelming men courage vision still determine destiny choose slavery freedom war peace doubt choose treaty signing today,history sweep men one way another time seen brave men overcome obstacle seemed insurmountable force seemed overwhelming men courage vision still determine destiny choose slavery freedom war peace doubt choose treaty signing today,1101.15,1127.94,25.489,s48b9B5gd88
644,0.01521,0.015216,0.724685,0.015215,0.015215,0.015211,0.015209,0.015213,0.153615,0.01521,"recognized by Article\n51 of the UN Charter, will assist the party\nor parties so attacked by taking forthwith, individually and in\nconcert with other parties, such action as it deems necessary, including the use of\narmed force to restore and maintain the security\nof the North Atlantic area. And any such armed attack\nand all measures taken as a result thereof shall\nimmediately be reported to the Security Council.",recognized by Article of the UN Charter will assist the party or parties so attacked by taking forthwith individually and in concert with other parties such action as it deems necessary including the use of armed force to restore and maintain the security of the North Atlantic area And any such armed attack and all measures taken as a result thereof shall immediately be reported to the Security Council,recognized article un charter assist party parties attacked taking forthwith individually concert parties action deems necessary including use armed force restore maintain security north atlantic area armed attack measures taken result thereof shall immediately reported security council,recognized article un charter assist party party attacked taking forthwith individually concert party action deems necessary including use armed force restore maintain security north atlantic area armed attack measure taken result thereof shall immediately reported security council,1238.89,1268.84,31.52,s48b9B5gd88
126,0.015951,0.015951,0.7148,0.015951,0.015953,0.015951,0.015951,0.015951,0.157589,0.015952,"while making it hard for the discredited to take a single step. The model is a place called Rong Chang it's in Chandon province. It's orderly, streets are spotless, cars slow down, which is unheard of in China. (dogs barking) Words like honesty and credibility appear on propaganda posters.",while making it hard for the discredited to take a single step The model is a place called Rong Chang it s in Chandon province It s orderly streets are spotless cars slow down which is unheard of in China dogs barking Words like honesty and credibility appear on propaganda posters,making hard discredited take single step model place called rong chang chandon province orderly streets spotless cars slow unheard china dogs barking words like honesty credibility appear propaganda posters,making hard discredited take single step model place called rong chang chandon province orderly street spotless car slow unheard china dog barking word like honesty credibility appear propaganda poster,4231.79,4252.98,22.503,4eUS8trd_yI
450,0.017058,0.017059,0.696284,0.017057,0.017062,0.017066,0.017061,0.017082,0.167215,0.017056,"the man in question Bill Browder, Hermitage Capital Management. Bill, good to have you here,\nI bet you're not surprised to hear Vladimir Putin\nrefer to you directly. - It's not the first time and\nit won't be the last time. Vladimir Putin is very mad at me. He's mad at me because\nof the Magnitsky Act. The Magnitsky Act is\na piece of legislation named after my murdered\nlawyer Sergei Magnitsky. He was murdered after\nuncovering a $230 million",the man in question Bill Browder Hermitage Capital Management Bill good to have you here I bet you re not surprised to hear Vladimir Putin refer to you directly It s not the first time and it won t be the last time Vladimir Putin is very mad at me He s mad at me because of the Magnitsky Act The Magnitsky Act is a piece of legislation named after my murdered lawyer Sergei Magnitsky He was murdered after uncovering a million,man question bill browder hermitage capital management bill good bet surprised hear vladimir putin refer directly first time last time vladimir putin mad mad magnitsky act magnitsky act piece legislation named murdered lawyer sergei magnitsky murdered uncovering million,man question bill browder hermitage capital management bill good bet surprised hear vladimir putin refer directly first time last time vladimir putin mad mad magnitsky act magnitsky act piece legislation named murdered lawyer sergei magnitsky murdered uncovering million,3636.19,3657.76,25.61,f5nbT4xQqwI
693,0.018881,0.018881,0.667595,0.018884,0.018882,0.018881,0.018882,0.01888,0.181343,0.018892,"not the sharpest knife in congress, to lobby him on the grounds that if a bank was sufficiently big, if it had five billion dollars in assets, it should be exempted\nfrom capital requirements or have reduced capital requirements because they were big\nenough to self-insure. The thought that because they were so big is maybe why they should\nhave the capital requirements didn't seem to occur to him.",not the sharpest knife in congress to lobby him on the grounds that if a bank was sufficiently big if it had five billion dollars in assets it should be exempted from capital requirements or have reduced capital requirements because they were big enough to self insure The thought that because they were so big is maybe why they should have the capital requirements didn t seem to occur to him,sharpest knife congress lobby grounds bank sufficiently big five billion dollars assets exempted capital requirements reduced capital requirements big enough self insure thought big maybe capital requirements seem occur,sharpest knife congress lobby ground bank sufficiently big five billion dollar asset exempted capital requirement reduced capital requirement big enough self insure thought big maybe capital requirement seem occur,3020.47,3050.3,30.516,s48b9B5gd88
32,0.016949,0.016945,0.663111,0.016944,0.016944,0.016943,0.016945,0.016943,0.201334,0.016943,"the Democracy Wall protesters started shifting their attention\nfrom attacking the past to attacking the current\nleadership in China. Again, a lot of demands\nnow for more democracy. Wei Jinsheng was a protester who came out and made very strong\ncriticisms of the government and this went on until\nlate 1980s, early 1981 and then there was a lot of\npushback from the regime. And so much of the 1980s were this era of testing,\npolitical testing,",the Democracy Wall protesters started shifting their attention from attacking the past to attacking the current leadership in China Again a lot of demands now for more democracy Wei Jinsheng was a protester who came out and made very strong criticisms of the government and this went on until late s early and then there was a lot of pushback from the regime And so much of the s were this era of testing political testing,democracy wall protesters started shifting attention attacking past attacking current leadership china lot demands democracy wei jinsheng protester came made strong criticisms government went late early lot pushback regime much era testing political testing,democracy wall protester started shifting attention attacking past attacking current leadership china lot demand democracy wei jinsheng protester came made strong criticism government went late early lot pushback regime much era testing political testing,1118.09,1165.93,38.779,4eUS8trd_yI
237,0.016256,0.016257,0.657029,0.016256,0.016256,0.016256,0.016256,0.016256,0.212921,0.016256,"it is rooted in a\nstraightforward view of society. In the understanding that\nthe individual does best in a strong and decent community of people with principles and standards and common aims and values. A new politics, a politics of courage\nand honesty and trust. Now it means telling it as it is, it means not opposing everything every other party does for the sake of it.",it is rooted in a straightforward view of society In the understanding that the individual does best in a strong and decent community of people with principles and standards and common aims and values A new politics a politics of courage and honesty and trust Now it means telling it as it is it means not opposing everything every other party does for the sake of it,rooted straightforward view society understanding individual best strong decent community people principles standards common aims values new politics politics courage honesty trust means telling means opposing everything every party sake,rooted straightforward view society understanding individual best strong decent community people principle standard common aim value new politics politics courage honesty trust mean telling mean opposing everything every party sake,453.83,480.55,26.646,T3-VlQu3iRM
39,0.016414,0.016408,0.651583,0.016407,0.016407,0.016407,0.016433,0.016408,0.217126,0.016406,"As the afternoon wears on here, the crowd gets bigger and bigger hoping that this protest\nwill produce results, that the government will\nrespond to the students demand for meaningful dialogue and begin to implement the\nkind of genuine reforms as opposed to cosmetic reforms\nthat the students want. - So these protests were going on and it soon became clear that the regime was\nnot going tolerate them",As the afternoon wears on here the crowd gets bigger and bigger hoping that this protest will produce results that the government will respond to the students demand for meaningful dialogue and begin to implement the kind of genuine reforms as opposed to cosmetic reforms that the students want So these protests were going on and it soon became clear that the regime was not going tolerate them,afternoon wears crowd gets bigger bigger hoping protest produce results government respond students demand meaningful dialogue begin implement kind genuine reforms opposed cosmetic reforms students want protests going soon became clear regime going tolerate,afternoon wear crowd get bigger bigger hoping protest produce result government respond student demand meaningful dialogue begin implement kind genuine reform opposed cosmetic reform student want protest going soon became clear regime going tolerate,1371.27,1397.47,27.553,4eUS8trd_yI
55,0.017696,0.017692,0.639553,0.017696,0.017693,0.017693,0.017698,0.017693,0.218893,0.017692,"elsewhere in East Asia. So the fact of being\na late developer alone is obviously not the advantage. They were predominantly\nrural at the start. So the argument this is\nin Moleskine London paper, the fact that they didn't have inefficient urban economies to dismantle, they didn't have a big, inefficient economies to dismantle, made it easier that as they urbanized,",elsewhere in East Asia So the fact of being a late developer alone is obviously not the advantage They were predominantly rural at the start So the argument this is in Moleskine London paper the fact that they didn t have inefficient urban economies to dismantle they didn t have a big inefficient economies to dismantle made it easier that as they urbanized,elsewhere east asia fact late developer alone obviously advantage predominantly rural start argument moleskine london paper fact inefficient urban economies dismantle big inefficient economies dismantle made easier urbanized,elsewhere east asia fact late developer alone obviously advantage predominantly rural start argument moleskine london paper fact inefficient urban economy dismantle big inefficient economy dismantle made easier urbanized,1953.07,1988.35,27.772,4eUS8trd_yI
3,0.018509,0.01851,0.636884,0.018508,0.018508,0.018509,0.01851,0.018508,0.215049,0.018507,"connecting 95% of the country's villages and overtaking the US as the country with the most extensive\nhighway system by almost 50%. Over the past decade, China has constructed the world's longest high speed rail network. 12,000 miles of rail lines that carry passengers between cities, at speeds up to 180 miles per hour. China now has more high speed rail tracks",connecting of the country s villages and overtaking the US as the country with the most extensive highway system by almost Over the past decade China has constructed the world s longest high speed rail network miles of rail lines that carry passengers between cities at speeds up to miles per hour China now has more high speed rail tracks,connecting country villages overtaking us country extensive highway system almost past decade china constructed world longest high speed rail network miles rail lines carry passengers cities speeds miles per hour china high speed rail tracks,connecting country village overtaking u country extensive highway system almost past decade china constructed world longest high speed rail network mile rail line carry passenger city speed mile per hour china high speed rail track,112.28,135.52,25.74,4eUS8trd_yI
