In [1]:
# Find most relevant terms for each topic using LDA clustering

In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
df_transcripts = pd.read_csv("transcripts.csv")

In [30]:
df_transcripts.head(5)

Unnamed: 0,text,start,duration,video_id
0,hello everybody welcome,8.06,2.66,BDqvzFY72mg
1,everybody today,10.72,1.613,BDqvzFY72mg
2,great,13.404,0.916,BDqvzFY72mg
3,well i'm delighted opportunity,14.32,3.54,BDqvzFY72mg
4,giving devane lectures,17.86,2.92,BDqvzFY72mg


In [6]:
df_transcripts['text'] = df_transcripts['text'].str.lower()

In [7]:
# remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
df_transcripts['text'] = df_transcripts['text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

In [8]:
df_transcripts['text'] = df_transcripts['text'].str.replace("[^\w\d'\s]+", ' ')

In [9]:
#df_transcripts['text']

In [10]:
split_text = [word for word in df_transcripts['text'] if not word in stop and len(word) >2] 

In [11]:
split_text[:5]

['  hello everybody welcome ',
 'everybody today ',
 'great ',
 "well  i'm delighted opportunity",
 'giving devane lectures ']

In [12]:
tfv = TfidfVectorizer(stop_words = stop, ngram_range = (1,1))

In [13]:
vec_text = tfv.fit_transform(split_text)

In [14]:
words = tfv.get_feature_names()

In [15]:
words[:10]

['00', '000', '10', '1000', '101', '109', '11', '11th', '12', '125']

In [16]:
# now working through https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [17]:
lda_model = LatentDirichletAllocation(n_components=10)

#https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn
lda_output = lda_model.fit_transform(vec_text)

In [18]:
print(lda_output)  # Model attributes

[[0.0367344  0.03673116 0.03674078 ... 0.03673116 0.03673453 0.03673291]
 [0.04155443 0.04155411 0.04155764 ... 0.04155398 0.04155531 0.04155372]
 [0.54998393 0.05       0.05       ... 0.05000438 0.05       0.05      ]
 ...
 [0.03095321 0.03097081 0.03095434 ... 0.03096456 0.03095409 0.40897099]
 [0.04195617 0.39368446 0.04195536 ... 0.04195536 0.04195675 0.04195536]
 [0.04149767 0.04150554 0.04149767 ... 0.04153792 0.04149767 0.04150513]]


In [19]:
df_documents = pd.DataFrame(lda_output)

In [20]:
df_documents.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.036734,0.036731,0.036741,0.036734,0.427152,0.278958,0.036751,0.036731,0.036735,0.036733
1,0.041554,0.041554,0.041558,0.041555,0.041559,0.041553,0.626003,0.041554,0.041555,0.041554
2,0.549984,0.05,0.05,0.05,0.050009,0.05,0.050003,0.050004,0.05,0.05
3,0.037014,0.037014,0.037015,0.037016,0.037014,0.037014,0.037014,0.037017,0.037026,0.666857
4,0.03665,0.03665,0.03665,0.463608,0.03665,0.036652,0.03665,0.03665,0.03665,0.243191
5,0.03667,0.036721,0.036671,0.669921,0.03667,0.03667,0.036669,0.036669,0.036669,0.03667
6,0.66935,0.036733,0.036734,0.036759,0.036734,0.036745,0.036733,0.036736,0.03674,0.036736
7,0.697162,0.033648,0.033646,0.033649,0.033646,0.033647,0.033662,0.033647,0.033646,0.033648
8,0.669906,0.036711,0.036673,0.036672,0.036672,0.036672,0.036679,0.036672,0.036672,0.036672
9,0.69943,0.03339,0.033394,0.033424,0.033394,0.033399,0.03339,0.03339,0.033393,0.033395


In [21]:
words = tfv.get_feature_names()

for i, comp in enumerate(lda_model.components_):
    words_comp = dict(zip(words, comp))
    sorted_words = sorted(words_comp.items(), reverse=True, key=lambda item: item[1])
    print("Document", i)
    for w in sorted_words[:10]:
        print(w[0], w[1])
    print("\n")

Document 0
union 44.631182350297365
soviet 39.76222632740541
party 22.34158761333463
european 20.58872893354754
countries 19.745561598014017
part 19.192330999721626
left 14.151042961600218
two 14.087844811319728
systems 13.129011585867444
political 12.891593513535051


Document 1
communist 20.712448105199602
us 16.52534653581773
point 14.356099508240176
crisis 13.008710797065708
gonna 12.856419594219364
leadership 12.000357828466337
party 11.96903441472972
came 11.566601858844166
since 11.00740735836723
indeed 9.647301149380096


Document 2
might 17.916577154235224
saying 16.073930065010853
china 15.569717646606476
vietnam 12.546514067650332
basically 11.237976329419425
people 10.486091594353057
interest 9.581819474690212
called 9.497123833007521
reform 8.534647570300898
language 8.304013360837917


Document 3
way 20.96988347410647
people 18.18912158388891
unions 15.545972152769
even 14.76596840047766
middle 13.56857296822916
better 13.43391213750844
get 13.123503181655474
idea 12.1766

In [31]:
# make a prediction# make a prediction
vec_text = ["people many think like working know getting rich",
           "giving devane lectures",
           "might saying china vietnam interested called reform"]
lda_model.fit_transform(tfv.transform(vec_text))

array([[0.02636813, 0.02636813, 0.02636813, 0.76268684, 0.02636813,
        0.02636813, 0.02636813, 0.02636813, 0.02636813, 0.02636813],
       [0.03664977, 0.03664977, 0.03664977, 0.03664977, 0.03664977,
        0.03664977, 0.03664977, 0.67015211, 0.03664977, 0.03664977],
       [0.02775566, 0.02775566, 0.02775566, 0.02775566, 0.02775566,
        0.02775566, 0.75019904, 0.02775566, 0.02775566, 0.02775566]])