In [1]:
# Find most relevant terms for each topic using KMeans clustering

In [2]:
import pandas as pd
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import nltk
from nltk.corpus import stopwords

In [3]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [4]:
df_transcripts = pd.read_csv("transcripts.csv")

In [5]:
#df_transcripts

In [6]:
df_transcripts['text'] = df_transcripts['text'].str.lower()

In [7]:
# remove stop words
from nltk.corpus import stopwords
stop = stopwords.words('english')
df_transcripts['text'] = df_transcripts['text'].apply(lambda words: ' '.join(word.lower() for word in words.split() if word not in stop))

In [8]:
df_transcripts['text'] = df_transcripts['text'].str.replace("[^\w\d'\s]+", ' ')

In [9]:
#df_transcripts['text']

In [10]:
text = [word for word in df_transcripts['text'] if not word in stop and len(word) >2] 
text = [re.sub('[^a-zA-Z]', ' ', word) for word in df_transcripts['text']] 

In [11]:
tfv = TfidfVectorizer(stop_words = stop, ngram_range = (1,1))

In [12]:
vec_text = tfv.fit_transform(text)

In [13]:
#vec_text

In [14]:
words = tfv.get_feature_names()

In [15]:
#words[:10]

In [16]:
kmeans = KMeans(n_clusters = 10)
kmeans.fit(vec_text)
cluster_words = kmeans.cluster_centers_
df_cluster_words = pd.DataFrame(cluster_words, columns=words).T

In [17]:
#df_cluster_words.iloc[0:10, 0:10]

In [18]:
for i in range(0, 10):
    print('Document', i)
    print(df_cluster_words.sort_values(i, ascending=False)[i].head(10))
    print('\n')

Document 0
one          0.065011
countries    0.046566
course       0.045828
world        0.041901
going        0.041706
look         0.035687
get          0.034797
go           0.034541
war          0.034068
actually     0.033550
Name: 0, dtype: float64


Document 1
people       0.362198
many         0.028673
think        0.017858
like         0.015475
working      0.014433
know         0.013327
get          0.010886
getting      0.010596
rich         0.010496
interests    0.010259
Name: 1, dtype: float64


Document 2
gonna         0.223932
union         0.159861
soviet        0.130003
european      0.030905
talk          0.025435
membership    0.016111
less          0.013773
look          0.013525
get           0.011969
pressure      0.011828
Name: 2, dtype: float64


Document 3
would       0.226111
think       0.187755
thought     0.015108
way         0.012828
let         0.012222
come        0.011906
make        0.011155
good        0.010401
maybe       0.010357
argument    0.00991

In [47]:
# make a prediction
vec_text = ["people many think like working know getting rich",
           "how china cold war united states",
           "political parties played blame game",
           "lawsuits said lawyers actually going back earlier",
           "political parties played blame game lawsuits said lawyers actually going back earlier"]

kmeans.predict(tfv.transform(vec_text))

array([1, 0, 5, 8, 8], dtype=int32)