In [1]:
# Find the most relevant terms for each topic using LDA (Latent Dirichlet Allocation) topic modelling

In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
from nltk.corpus import stopwords

In [3]:
# Lift pandas' display limits: frames render every column and row, with untruncated cell text.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_colwidth", None)

In [4]:
# Read the transcript table from the working directory; later cells use its 'text' column.
df_transcripts = pd.read_csv(filepath_or_buffer="transcripts.csv")

In [5]:
#df_transcripts

In [6]:
# Lower-case every transcript line and write the result back to the same column.
text_lowered = df_transcripts['text'].str.lower()
df_transcripts['text'] = text_lowered

In [7]:
# Remove English stop words from each transcript line.
# `stop` stays a list because later cells pass it on unchanged (e.g. as the
# vectoriser's stop_words); the set exists only for constant-time membership
# tests inside the per-token loop.
# Tokens are lower-cased *before* the membership test so the comparison and the
# emitted text always agree on case.
# NOTE(review): this runs before punctuation is stripped, so tokens with attached
# punctuation (e.g. "the,") are not matched here - confirm the ordering is intended.
from nltk.corpus import stopwords
stop = stopwords.words('english')
stop_lookup = set(stop)
df_transcripts['text'] = df_transcripts['text'].apply(
    lambda line: ' '.join(token for token in line.lower().split() if token not in stop_lookup)
)

In [8]:
# Replace every run of characters that are not word characters, digits,
# apostrophes or whitespace with a single space.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place.
# The raw string avoids Python's invalid-escape warnings for \w, \d and \s.
df_transcripts['text'] = df_transcripts['text'].str.replace(r"[^\w\d'\s]+", ' ', regex=True)

In [9]:
#df_transcripts['text']

In [10]:
# Keep the transcript lines whose full text is longer than two characters and is
# not itself exactly one stop word. The filter looks at whole lines, not at
# individual words, so this list can be shorter than df_transcripts.
split_text = [
    line
    for line in df_transcripts['text']
    if len(line) > 2 and line not in stop
]

In [11]:
# Preview the first five cleaned transcript lines.
split_text[:5]

['  hello everybody welcome ',
 'everybody today ',
 'great ',
 "well  i'm delighted opportunity",
 'giving devane lectures ']

In [12]:
# Unigram TF-IDF vectoriser that ignores the NLTK stop-word list.
# NOTE(review): LDA is usually fitted on raw term counts; confirm TF-IDF weights are intended here.
tfv = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=stop,
)

In [13]:
# Learn the vocabulary from the cleaned lines and build the document-term matrix.
vec_text = tfv.fit_transform(raw_documents=split_text)

In [14]:
# Vocabulary terms learned by the vectoriser.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() when it exists and fall back on older releases.
# list(...) keeps `words` a plain Python list on every version.
_feature_names = getattr(tfv, "get_feature_names_out", None) or tfv.get_feature_names
words = list(_feature_names())

In [15]:
# Preview the first ten vocabulary terms.
words[:10]

['00', '000', '10', '1000', '101', '109', '11', '11th', '12', '125']

In [16]:
# now working through https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6

In [17]:
# Fit a 10-topic LDA model. random_state pins the otherwise random initialisation
# so that a Restart & Run All reproduces the same topics.
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)

#https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn
# fit_transform returns the document-topic matrix: one row per input line,
# one column per topic.
lda_output = lda_model.fit_transform(vec_text)

In [18]:
print(lda_output)  # document-topic matrix returned by lda_model.fit_transform (not model attributes)

[[0.03673116 0.03673116 0.03674541 ... 0.03673116 0.03674366 0.03673116]
 [0.04155288 0.04155288 0.0415531  ... 0.04155527 0.62596624 0.04155288]
 [0.05001821 0.05000901 0.05000495 ... 0.05       0.05       0.05001014]
 ...
 [0.72138052 0.03095272 0.03095986 ... 0.03095547 0.0309534  0.03095454]
 [0.04195567 0.04195536 0.04195568 ... 0.04195536 0.04195536 0.04195627]
 [0.04149767 0.04149767 0.04149767 ... 0.04150043 0.04149767 0.04149767]]


In [19]:
# Wrap the document-topic matrix in a DataFrame (default integer labels for rows and columns).
df_documents = pd.DataFrame(data=lda_output)

In [20]:
# Show the per-topic weights of the first ten rows.
df_documents.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.036731,0.036731,0.036745,0.036731,0.036732,0.036736,0.669387,0.036731,0.036744,0.036731
1,0.041553,0.041553,0.041553,0.041564,0.041557,0.041571,0.041575,0.041555,0.625966,0.041553
2,0.050018,0.050009,0.050005,0.050009,0.549943,0.050001,0.050004,0.05,0.05,0.05001
3,0.037016,0.037017,0.349009,0.037017,0.037015,0.037016,0.354864,0.037016,0.037016,0.037015
4,0.03665,0.03665,0.036661,0.03665,0.469656,0.036656,0.036652,0.03665,0.237126,0.03665
5,0.036669,0.036669,0.036676,0.036669,0.669964,0.036673,0.036669,0.036669,0.036669,0.036672
6,0.669325,0.03674,0.036735,0.036733,0.036735,0.036734,0.036786,0.036741,0.036733,0.036739
7,0.033651,0.499121,0.033675,0.033646,0.033648,0.033648,0.033664,0.033646,0.231656,0.033646
8,0.669892,0.036683,0.036676,0.036689,0.036678,0.036677,0.036676,0.036672,0.036672,0.036685
9,0.033393,0.033392,0.69938,0.033471,0.033393,0.033392,0.033392,0.033394,0.033402,0.03339


In [21]:
# Print the highest-weighted vocabulary terms of every topic.
# Each row of lda_model.components_ holds one topic's weight per vocabulary term,
# so the header is labelled "Topic" (these rows are not documents).
# get_feature_names() was removed in scikit-learn 1.2; prefer get_feature_names_out()
# and fall back on older releases. list(...) keeps `words` a plain Python list.
_feature_names = getattr(tfv, "get_feature_names_out", None) or tfv.get_feature_names
words = list(_feature_names())

# How many terms to list per topic.
N_TOP_WORDS = 10

for i, comp in enumerate(lda_model.components_):
    # Stable argsort on the negated weights gives descending order with ties kept
    # in vocabulary order, without sorting a full (word, weight) dict per topic.
    top_indices = (-comp).argsort(kind="stable")[:N_TOP_WORDS]
    print("Topic", i)
    for idx in top_indices:
        print(words[idx], comp[idx])
    print("\n")

Document 0
germany 21.18204340335756
last 20.052609695396406
foreign 18.38699650225307
time 16.225280661575635
two 14.862553776811033
like 14.834040700807131
britain 14.637458889540044
part 14.438016979379041
things 13.970756971512884
fact 12.587642542559378


Document 1
soviet 30.869115847422773
union 27.57095540898192
important 13.852164651560004
sector 12.612740383225908
thought 12.558256579025004
leadership 12.070486504751173
party 11.844037996625548
really 11.492792047953191
collapse 11.449280140340452
different 11.390517397074362


Document 2
war 27.389147070355786
communist 18.60625012519783
cold 16.57060467418705
would 15.951813966692441
look 13.875407901661587
case 12.504112852515322
start 12.05891693143643
governments 11.90425377847024
might 11.901669909321802
systems 10.87214066772887


Document 3
see 38.94785053313053
europe 18.1688878122595
median 17.694273953945917
audience 16.14020130784168
voter 14.832405758072708
thing 14.294714878223932
way 13.541938869670224
economie