In [1]:
# Find most relevant terms for each topic using LDA clustering

In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
df_transcripts = pd.read_csv("transcripts.csv")

In [5]:
df_transcripts.head(5)

Unnamed: 0,text,start,duration,video_id
0,- Hello everybody and welcome.,8.06,2.66,BDqvzFY72mg
1,How is everybody today?,10.72,1.613,BDqvzFY72mg
2,Great.,13.404,0.916,BDqvzFY72mg
3,"Well, I'm delighted to\nhave the opportunity",14.32,3.54,BDqvzFY72mg
4,to be giving the DeVane Lectures.,17.86,2.92,BDqvzFY72mg


In [6]:
df_transcripts['text'] = df_transcripts['text'].str.lower()

In [29]:
len(df_transcripts)

7338

In [7]:
# remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
df_transcripts['text'] = df_transcripts['text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

In [8]:
df_transcripts['text'] = df_transcripts['text'].str.replace("[^\w\d'\s]+", ' ')

In [9]:
#df_transcripts['text']

In [10]:
split_text = [word for word in df_transcripts['text'] if not word in stop and len(word) >2] 

In [11]:
split_text[:5]

['  hello everybody welcome ',
 'everybody today ',
 'great ',
 "well  i'm delighted opportunity",
 'giving devane lectures ']

In [12]:
tfv = TfidfVectorizer(stop_words = stop, ngram_range = (1,1))

In [13]:
vec_text = tfv.fit_transform(split_text)

In [14]:
words = tfv.get_feature_names()

In [15]:
words[:10]

['00', '000', '10', '1000', '101', '109', '11', '11th', '12', '125']

In [16]:
# now working through https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [17]:
lda_model = LatentDirichletAllocation(n_components=10)

#https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn
lda_output = lda_model.fit_transform(vec_text)

In [18]:
print(lda_output)  # Model attributes

[[0.03673116 0.03673116 0.22974091 ... 0.03673116 0.03673116 0.03673116]
 [0.04155288 0.04156735 0.62599856 ... 0.04155469 0.04155759 0.04155288]
 [0.05       0.05000134 0.05000404 ... 0.05       0.54999463 0.05      ]
 ...
 [0.03095316 0.03095272 0.03095332 ... 0.0309614  0.72140147 0.03095281]
 [0.04195536 0.04195536 0.04195536 ... 0.04195536 0.04195536 0.62234438]
 [0.04149914 0.04149974 0.04149767 ... 0.04150083 0.04149767 0.04149767]]


In [19]:
df_documents = pd.DataFrame(lda_output)

In [20]:
len(df_documents)

7323

In [22]:
words = tfv.get_feature_names()

for i, comp in enumerate(lda_model.components_):
    words_comp = dict(zip(words, comp))
    sorted_words = sorted(words_comp.items(), reverse=True, key=lambda item: item[1])
    print("Document", i)
    for w in sorted_words[:10]:
        print(w[0], w[1])
    print("\n")

Document 0
way 28.65859391139933
unions 13.516396971461036
one 12.71129912448487
people 12.554178900157712
members 12.308468931195609
get 11.894586658645311
us 11.355480800263976
question 10.20093958352817
number 10.096977402449951
student 9.397452741553238


Document 1
countries 29.508405717863962
part 23.257130386871836
might 22.50703998757949
gonna 15.961265491882527
crisis 13.008737034265021
government 11.435850850842565
middle 11.317408255119522
war 11.289138244335648
able 11.14055326127264
cold 10.685524576451868


Document 2
union 41.02958328069116
soviet 35.31132791753403
left 21.545172372122263
european 19.504239247310235
want 18.45253773033481
talking 18.40809701461062
right 15.108975137947072
center 13.17156046756472
getting 12.240396383225994
governments 11.489513447328843


Document 3
communist 16.913756117331793
even 15.556392602985502
little 15.436284119300733
europe 14.606778505719406
gone 14.597512773491365
world 14.205742538937175
think 13.986783346587362
people 13.62

In [23]:
# make a prediction# make a prediction
vec_text = ["people many think like working know getting rich",
           "giving devane lectures",
           "might saying china vietnam interested called reform"]
lda_model.fit_transform(tfv.transform(vec_text))

array([[0.02636813, 0.02636813, 0.02636813, 0.02636813, 0.02636813,
        0.76268684, 0.02636813, 0.02636813, 0.02636813, 0.02636813],
       [0.03664977, 0.03664977, 0.03664977, 0.03664977, 0.03664977,
        0.03664977, 0.67015211, 0.03664977, 0.03664977, 0.03664977],
       [0.02775566, 0.02775566, 0.02775566, 0.75019904, 0.02775566,
        0.02775566, 0.02775566, 0.02775566, 0.02775566, 0.02775566]])

In [40]:
# view top document matches for a particular category

In [39]:
df_all = pd.concat([df_documents, df_transcripts], axis=1)
df_all.sort_values(2, ascending=False).head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,text,start,duration,video_id
6781,0.027559,0.027598,0.75164,0.027604,0.027559,0.027559,0.027559,0.027568,0.027796,0.027559,clinton balanced 12 budgets proposed,2358.62,2.36,T3-VlQu3iRM
3420,0.027725,0.027724,0.750479,0.02771,0.02773,0.027729,0.02776,0.027709,0.027716,0.02772,one could pick give snapshot,143.49,4.03,4eUS8trd_yI
3797,0.029095,0.029096,0.738153,0.029094,0.029093,0.029093,0.029093,0.029094,0.029098,0.029093,heading road hazardous business,1530.63,2.3,4eUS8trd_yI
1354,0.02909,0.029185,0.738069,0.029087,0.029088,0.029087,0.029093,0.029088,0.029101,0.029112,analytical schema social sciences,1702.62,5.0,f5nbT4xQqwI
4940,0.029136,0.029169,0.737757,0.029135,0.029135,0.029133,0.029134,0.029133,0.029135,0.029135,implications two party systems,838.229,3.211,q53DF6ySOZg
5907,0.029191,0.02921,0.737333,0.029181,0.029179,0.029184,0.029181,0.029182,0.02918,0.02918,seen pay,3896.93,2.185,q53DF6ySOZg
2575,0.029234,0.02927,0.736835,0.029237,0.029236,0.029237,0.029235,0.029235,0.029236,0.029246,difficult thing right,1600.32,3.04,s48b9B5gd88
5106,0.029246,0.029247,0.73676,0.029248,0.029264,0.029247,0.029248,0.029245,0.029247,0.029247,much easier discount that,1405.47,3.36,q53DF6ySOZg
3285,0.029306,0.029314,0.736207,0.029306,0.029306,0.029306,0.029308,0.029334,0.029306,0.029306,less dramatic fashion,4111.86,2.38,s48b9B5gd88
4745,0.029323,0.029326,0.736038,0.029372,0.029321,0.029322,0.029327,0.029323,0.029326,0.029321,kind refundable tax credit,230.41,2.45,q53DF6ySOZg
