In [1]:
# Find most relevant terms for each topic using LDA clustering

In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
df_transcripts = pd.read_csv("transcripts_m10.csv")

In [5]:
#df_transcripts.head(5)

In [6]:
tfv = TfidfVectorizer(ngram_range = (1,1))

In [7]:
vec_text = tfv.fit_transform(df_transcripts['lemmatized_text'])

In [8]:
df_transcripts.columns

Index(['text', 'alpha_text', 'no_stop', 'lemmatized_text', 'MIN(start)',
       'MAX(start)', 'SUM(duration)', 'video_id'],
      dtype='object')

In [9]:
words = tfv.get_feature_names()

In [10]:
words[:10]

['abandon',
 'abandoned',
 'abate',
 'abc',
 'aberrational',
 'abide',
 'ability',
 'able',
 'aboard',
 'abolish']

In [11]:
# now working through https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [12]:
lda_model = LatentDirichletAllocation(n_components=10)

#https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn
lda_output = lda_model.fit_transform(vec_text)

In [13]:
print(lda_output)  # Model attributes

[[0.01673446 0.75139778 0.01674529 ... 0.01673474 0.01674906 0.01673322]
 [0.01812175 0.44429351 0.01812221 ... 0.01812189 0.01812211 0.01812215]
 [0.0160079  0.01601787 0.01602299 ... 0.01601391 0.01601787 0.01600855]
 ...
 [0.01655496 0.01656752 0.01655877 ... 0.49448248 0.01656368 0.01655549]
 [0.01771    0.01774456 0.01771487 ... 0.0177109  0.01771414 0.01771359]
 [0.01953647 0.01955398 0.82412428 ... 0.01953792 0.01953704 0.01953649]]


In [14]:
df_documents = pd.DataFrame(lda_output)

In [15]:
words = tfv.get_feature_names()

In [16]:
max(lda_model.components_[0])

0.9940087121393331

In [17]:
for i, comp in enumerate(lda_model.components_):
    words_comp = dict(zip(words, comp))
    sorted_words = sorted(words_comp.items(), reverse=True, key=lambda item: item[1])
    print("Topic", i)
    for w in sorted_words[:10]:
        print(w[0], w[1])
    print("\n")

Topic 0
capitalism 0.9940087121393331
trump 0.9352044165103239
saw 0.9318481712171199
putin 0.8946461670303033
talk 0.8339201417139129
magnitsky 0.7507338972998028
ago 0.7287078037389257
islamist 0.7055452766516559
act 0.6953297405854966
fall 0.6731580653055813


Topic 1
gonna 5.764452662878344
soviet 5.515675763448479
war 5.1507511280021285
union 4.7817239337027395
nato 4.2999457586072785
cold 4.06859684760112
talk 3.7939712780543515
student 3.770526451927841
like 3.5290190895272784
nation 3.4963490573321603


Topic 2
one 2.292093704347
party 2.080506596855935
give 1.8796653924027285
music 1.804441187147431
communist 1.7640256314107494
see 1.7343044742038274
yeltsin 1.678229498134157
country 1.5529600670300563
decade 1.5393917956406367
first 1.4958593605065273


Topic 3
conservative 1.3859954095741545
criticism 1.2060978964246447
europe 1.1210155022983654
late 1.0034734756840717
crisis 0.9682857326298118
government 0.8885868025477593
mile 0.8477926690046178
thatcher 0.8399016472999926

In [18]:
# make a prediction# make a prediction
vec_text = ["people many think like working know getting rich",
           "giving devane lectures",
           "might saying china vietnam interested called reform"]
lda_model.fit_transform(tfv.transform(vec_text))

array([[0.02672151, 0.02672151, 0.02672151, 0.02672151, 0.02672151,
        0.02672151, 0.7595064 , 0.02672151, 0.02672151, 0.02672151],
       [0.04159295, 0.04159295, 0.62566344, 0.04159295, 0.04159295,
        0.04159295, 0.04159295, 0.04159295, 0.04159295, 0.04159295],
       [0.02809462, 0.02809462, 0.02809462, 0.02809462, 0.02809462,
        0.02809462, 0.02809462, 0.02809462, 0.7471484 , 0.02809462]])

In [19]:
# view top document matches for a particular category

In [20]:
df_all = pd.concat([df_documents, df_transcripts], axis=1)
df_all.sort_values(2, ascending=False).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,text,alpha_text,no_stop,lemmatized_text,MIN(start),MAX(start),SUM(duration),video_id
369,0.014283,0.014289,0.871433,0.014284,0.014291,0.014283,0.014285,0.014284,0.014285,0.014283,"- [Reporter] Present at\nthe rally, many of the men who once served with Mikhail Gorbachev. Economic advisor Stanislav Shatalin, and former Foreign Minster\nEduard Shevardnadze. The men who helped mold glasnost now siding with the man of the hour. The Mayor of Moscow, Gavriil Popov, said to regain the\nsupport of the democrats, Gorbachev had to break\nwith the Communist Party and recognize its seven decades\nin power were at an end.",Reporter Present at the rally many of the men who once served with Mikhail Gorbachev Economic advisor Stanislav Shatalin and former Foreign Minster Eduard Shevardnadze The men who helped mold glasnost now siding with the man of the hour The Mayor of Moscow Gavriil Popov said to regain the support of the democrats Gorbachev had to break with the Communist Party and recognize its seven decades in power were at an end,reporter present rally many men served mikhail gorbachev economic advisor stanislav shatalin former foreign minster eduard shevardnadze men helped mold glasnost siding man hour mayor moscow gavriil popov said regain support democrats gorbachev break communist party recognize seven decades power end,reporter present rally many men served mikhail gorbachev economic advisor stanislav shatalin former foreign minster eduard shevardnadze men helped mold glasnost siding man hour mayor moscow gavriil popov said regain support democrat gorbachev break communist party recognize seven decade power end,811.249,834.28,26.761,f5nbT4xQqwI
25,0.015377,0.015384,0.861564,0.01538,0.015394,0.015379,0.015385,0.015377,0.015383,0.015378,"It's a recreation to some degree of what it existed before the war, but nothing like the levels that were seen in years afterward. But as you can see, per capita GDP in China\nwas basically flat. There was no economic\ndevelopment to speak of. 1978 Mao is now dead. The third plenum of the Communist\nParty Central Committee, led vice premier soon to\nbe premier, Deng Xiaoping",It s a recreation to some degree of what it existed before the war but nothing like the levels that were seen in years afterward But as you can see per capita GDP in China was basically flat There was no economic development to speak of Mao is now dead The third plenum of the Communist Party Central Committee led vice premier soon to be premier Deng Xiaoping,recreation degree existed war nothing like levels seen years afterward see per capita gdp china basically flat economic development speak mao dead third plenum communist party central committee led vice premier soon premier deng xiaoping,recreation degree existed war nothing like level seen year afterward see per caput gdp china basically flat economic development speak mao dead third plenum communist party central committee led vice premier soon premier deng xiaoping,876.65,906.99,31.298,4eUS8trd_yI
298,0.015812,0.01582,0.857668,0.015812,0.015824,0.015811,0.015814,0.015811,0.015818,0.015811,"I know she is a young person but she has a big influence\non a lot of people. And when people say that, if you took the words white\nand black and you reversed them you might think David Duke\nwas giving that speech. - So that was very dishonest\nin its presentation, because she had not actually said that, she had talked about a\nrap singer singing that. It wasn't her music, she was talking about a rap\nsong that made those claims",I know she is a young person but she has a big influence on a lot of people And when people say that if you took the words white and black and you reversed them you might think David Duke was giving that speech So that was very dishonest in its presentation because she had not actually said that she had talked about a rap singer singing that It wasn t her music she was talking about a rap song that made those claims,know young person big influence lot people people say took words white black reversed might think david duke giving speech dishonest presentation actually said talked rap singer singing music talking rap song made claims,know young person big influence lot people people say took word white black reversed might think david duke giving speech dishonest presentation actually said talked rap singer singing music talking rap song made claim,2467.52,2496.69,29.47,T3-VlQu3iRM
40,0.015855,0.015872,0.857262,0.015856,0.015872,0.015855,0.015857,0.015856,0.015856,0.015857,"and tanks started appearing\nin Tiananmen Square. (crowd chattering) (crowd yelling) So that's not that\ndifferent from the video we were looking at outside\nthe White House in Russia when Yeltsin climbed the tank, and there was the real expectation among many of the protesters that the Chinese army would\nnot attack its own citizens. And they certainly were appeals\ngoing out to that effect,",and tanks started appearing in Tiananmen Square crowd chattering crowd yelling So that s not that different from the video we were looking at outside the White House in Russia when Yeltsin climbed the tank and there was the real expectation among many of the protesters that the Chinese army would not attack its own citizens And they certainly were appeals going out to that effect,tanks started appearing tiananmen square crowd chattering crowd yelling different video looking outside white house russia yeltsin climbed tank real expectation among many protesters chinese army would attack citizens certainly appeals going effect,tank started appearing tiananmen square crowd chattering crowd yelling different video looking outside white house russia yeltsin climbed tank real expectation among many protester chinese army would attack citizen certainly appeal going effect,1400.73,1500.77,34.958,4eUS8trd_yI
295,0.01589,0.015897,0.856965,0.01589,0.015904,0.015891,0.015891,0.015891,0.015892,0.01589,"because they no longer\nthink that they can get the solidaristic support\nof people on the left. And indeed, Clinton felt that he actually had to attack the left of the Democratic Party to get his bona fides as someone who could one in this new world. And this is his famous\nspeech about Sista Souljah. - You had a rap singer here\nlast night named Sista Souljah, I defend her right to\nexpress herself through music",because they no longer think that they can get the solidaristic support of people on the left And indeed Clinton felt that he actually had to attack the left of the Democratic Party to get his bona fides as someone who could one in this new world And this is his famous speech about Sista Souljah You had a rap singer here last night named Sista Souljah I defend her right to express herself through music,longer think get solidaristic support people left indeed clinton felt actually attack left democratic party get bona fides someone could one new world famous speech sista souljah rap singer last night named sista souljah defend right express music,longer think get solidaristic support people left indeed clinton felt actually attack left democratic party get bona fides someone could one new world famous speech sista souljah rap singer last night named sista souljah defend right express music,2386.36,2414.7,31.107,T3-VlQu3iRM
