In [44]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [45]:
input_file_name = 'outputWithDates.csv'
output_file_name = 'lda_result_15topics.csv'

In [46]:
df = pd.read_csv(input_file_name)
df.head(3)

Unnamed: 0,uuid,date,text
0,e826d5b5-d176-41b4-b100-3f2ccf2b55ff,2021-02-17 05:00:53,Private equity group CVC Capital Partners has ...
1,db70a308-aa88-439e-bae2-e0ca30ab84ec,2020-10-09 18:18:10,Your level-headed briefing on how the coronavi...
2,e593e7d4-b82a-4bf9-8497-426eee43bcbc,2020-04-07 12:00:29,"Harry Truman, US president from 1945-53, reput..."


In [47]:
cv = CountVectorizer(max_df=0.95, min_df=2, token_pattern='[a-zA-Z]{3,}', stop_words='english')
dtm = cv.fit_transform(df['text'])

In [48]:
lda_model = LatentDirichletAllocation(n_components=2, random_state=42)
lda_model.fit(dtm)

LatentDirichletAllocation(n_components=2, random_state=42)

In [49]:
for i, topic in enumerate(lda_model.components_):
    print(f"The top 15 words for topic #{i}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-15:]])
    print('\n')

The top 15 words for topic #0
['covid', 'market', 'crisis', 'economy', 'companies', 'week', 'vaccine', 'economic', 'new', 'pandemic', 'coronavirus', 'government', 'year', 'cent', 'said']


The top 15 words for topic #1
['world', 'time', 'cent', 'home', 'virus', 'says', 'year', 'pandemic', 'government', 'health', 'covid', 'new', 'coronavirus', 'people', 'said']




In [50]:
doc_topic_results = lda_model.transform(dtm)
df["topic"] = doc_topic_results.argmax(axis=1)
df.head(3)

Unnamed: 0,uuid,date,text,topic
0,e826d5b5-d176-41b4-b100-3f2ccf2b55ff,2021-02-17 05:00:53,Private equity group CVC Capital Partners has ...,1
1,db70a308-aa88-439e-bae2-e0ca30ab84ec,2020-10-09 18:18:10,Your level-headed briefing on how the coronavi...,0
2,e593e7d4-b82a-4bf9-8497-426eee43bcbc,2020-04-07 12:00:29,"Harry Truman, US president from 1945-53, reput...",0


In [12]:
df.to_csv(output_file_name)

In [51]:
# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [52]:
panel = pyLDAvis.sklearn.prepare(lda_model, dtm, cv, mds='tsne')

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  dtype = np.float
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  error = np.finfo(np.float).max
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  best_error = np.finfo(np.float).max
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  error = np.finfo(np.float).max
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  best_error = np.finfo(np.float).max


In [53]:
pyLDAvis.display(panel)

In [33]:
lda_model.score(dtm)

-6783329.938308954

In [34]:
lda_model.perplexity(dtm)

3831.692225399539

In [38]:
search_params = {'n_components': [2,5,10,15]}

In [39]:
lda = LatentDirichletAllocation(random_state=42)

In [40]:
from sklearn.model_selection import GridSearchCV

In [41]:
model = GridSearchCV(lda, param_grid=search_params)

In [43]:
model.fit(dtm)
best_lda_model = model.best_estimator_
print("Best model's params: ", model.best_params_)
print("Best log likelihood score: ", model.best_score_)
print("Model perplexity: ", best_lda_model.perplexity(dtm))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  fold_sizes = np.full(n_splits, n_samples // n_splits, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  test_mask = np.zeros(_num_samples(X), dtype=np.bool)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-no

Best model's params:  {'n_components': 2}
Best log likelihood score:  -1432751.617289872


NameError: name 'feature_matrix' is not defined