In [None]:
import numpy as np
import pandas as pd
import gensim
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# One-time setup: uncomment to download the NLTK resources / upgrade gensim.
#nltk.download('stopwords')
#nltk.download('wordnet')
# !pip install -U gensim
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus object
# to the list of English stop words; later cells extend and query that list.
stopwords = stopwords.words('english')


In [None]:
# Loading the datasets

# Location of the raw article export.
# NOTE(review): this is an absolute, machine-specific path; adjust it to where
# your copy of the data lives.
NYT_CSV_PATH = '/Users/sketcha/lda/nyt.csv'

nyt_raw = pd.read_csv(NYT_CSV_PATH, sep=',')
print('Original Number of Columns: {}, rows: {}'.format(nyt_raw.shape[1], nyt_raw.shape[0]))

# Keep only the columns used downstream, then drop repeated article bodies and
# incomplete rows. The chain returns a new frame instead of mutating a column
# subset in place, which avoids SettingWithCopyWarning on later assignments and
# leaves the raw frame available for inspection.
columns = ['pub_date', 'web_url', 'content']
nyt_df = (
    nyt_raw[columns]
    .drop_duplicates(subset='content')
    .dropna()
)

# Shape of the frame that is actually used from here on
print(nyt_df.shape)

nyt_df.head()

**Pre-processing**

In [None]:
# Pre-processing steps

# Initializing the WordNet lemmatizer
lm = WordNetLemmatizer()
# Creating a new column to store each row's text after processing
nyt_df['post_processed'] = ''

# Extra words to treat as stop words (presumably page boilerplate in this
# dataset -- confirm). They are appended only when missing, so re-running this
# cell does not keep growing the list with duplicates.
for extra_word in ('advertisement', 'supported'):
    if extra_word not in stopwords:
        stopwords.append(extra_word)
# Matches every character that is neither a word character nor whitespace,
# i.e. punctuation and symbols. Compiled once instead of on every call.
_PUNCTUATION_RX = re.compile(r'[^\w\s]')


def processing(content, stop_words=None, lemmatizer=None):
    """Turn one raw article body into a space-separated string of lemmas.

    Steps, applied to each whitespace-separated token:
      1. lower-case it;
      2. discard it if it is a stop word;
      3. strip punctuation characters from it;
      4. discard it if what remains is a stop word, is shorter than two
         characters, or is not purely alphabetic;
      5. lemmatize it.

    Parameters
    ----------
    content : str
        Raw text. Any non-string value (e.g. a missing value) yields ''.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the notebook-level `stopwords` list.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level `lm` lemmatizer.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    if not isinstance(content, str):
        return ''

    # Set membership is O(1); the notebook-level stop-word container is a list.
    stop_set = set(stopwords if stop_words is None else stop_words)
    lemmatize = (lm if lemmatizer is None else lemmatizer).lemmatize

    kept = []
    # split() with no argument also splits on newlines and tabs
    for raw_token in content.split():
        word = raw_token.lower()
        # Checked before stripping so that stop words which themselves
        # contain punctuation still match.
        if word in stop_set:
            continue
        # Strip punctuation first; otherwise a word followed by a comma or
        # quote would fail the isalpha() test below and be lost.
        word = _PUNCTUATION_RX.sub('', word)
        if len(word) > 1 and word.isalpha() and word not in stop_set:
            kept.append(lemmatize(word))

    return ' '.join(kept)

# Removing opinion articles: rows whose URL contains 'opinion' are dropped
# outright. Merely skipping them would leave them in the frame with an empty
# 'post_processed' value, and they would still be fed to the topic model and
# receive a topic label.
is_opinion = nyt_df['web_url'].str.contains('opinion', regex=False)
nyt_df = nyt_df.loc[~is_opinion].copy()

# Clean every remaining article body
nyt_df['post_processed'] = nyt_df['content'].apply(processing)

# Drop articles with no tokens left after cleaning; dropna() would not catch
# these because an empty string is not a missing value.
nyt_df = nyt_df.loc[nyt_df['post_processed'] != ''].copy()
nyt_df.head()

In [None]:
# Frequency filtering

import gensim.corpora as corpora

# Decomposing each processed document into tokens. split() with no argument
# returns [] for an empty document, whereas split(' ') would return [''] and
# add an empty-string "word" to the dictionary.
tokens = [document.split() for document in nyt_df['post_processed']]

# Create Dictionary
dct = corpora.Dictionary(tokens)
print('Unique words before filtering', len(dct))
# Drop rare and very common words: no_below is a minimum document count,
# no_above a maximum fraction of documents.
dct.filter_extremes(no_below=20, no_above=0.25)
print('Unique words after filtering', len(dct))
# Create Corpus: one bag-of-words representation per document
corpus = [dct.doc2bow(doc_tokens) for doc_tokens in tokens]

**Coherence**

In [None]:
# Obtaining coherence results: one LDA model is trained for every topic count
# from 1 to 25 and its c_v coherence is recorded in `scores`.

from gensim.models import CoherenceModel

scores = []
for k in range(1, 26):
    lda_model = gensim.models.LdaModel(
        corpus=corpus,
        id2word=dct,
        num_topics=k,
        random_state=100,
        chunksize=128,
        passes=10,
        per_word_topics=True,
    )

    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=tokens,
        dictionary=dct,
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('coherence_lda', coherence_lda)

    # scores[i] holds the coherence of the model with i + 1 topics
    scores += [coherence_lda]

In [None]:
# Plotting coherence results

# scores[i] belongs to the model trained with i + 1 topics (the sweep starts at
# one topic), so the best topic count is looked up through the same axis that
# is plotted rather than through a hand-written offset.
topic_counts = list(range(1, len(scores) + 1))
selected_topics = topic_counts[int(np.argmax(scores))]

plt.plot(topic_counts, scores, color="blue")
plt.xlabel('Number of topics')
plt.ylabel('Coherence Scores')
plt.savefig('nyt_coherence')

**Final Model**

In [None]:
# Constructing the final LDA model

# Topic count for the final model, fixed by hand here
selected_topics = 6
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=dct,
    num_topics=selected_topics,
    random_state=100,
    chunksize=128,
    passes=10,
)

In [None]:
# Save the trained LDA model under the name "nyt.model" (a relative path, so it
# is resolved against the current working directory).
lda_model.save("nyt.model")

**Visualization tool**

In [None]:
# One-time setup: uncomment to install pyLDAvis.
# !pip install pyLDAvis
import pyLDAvis.gensim_models
# import pickle 
import pyLDAvis

# Switch pyLDAvis to notebook display, then build the visualization from the
# final model, the bag-of-words corpus and the dictionary. The prepare() call
# is the cell's last expression, so its result is what the cell displays.
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, corpus, dct)

**Plotting**

In [None]:
# Labeling the topics

# Human-readable labels, looked up by LDA topic id.
# NOTE(review): seven labels are listed while the final model is built with six
# topics, and 'Travel Restrictions' appears twice -- confirm the mapping against
# the topic listing displayed at the end of this cell.
topics_name = ['Sports', 'Community',  'U.S Politics', 'Economy', 'Travel Restrictions', 'Vaccination', 'Travel Restrictions']
predicted_topics = lda_model[corpus]

probs, topics = [], []
for doc_topics in predicted_topics:
    # doc_topics is a list of (topic_id, probability) pairs. Take the pair with
    # the HIGHEST probability; sorting ascending and reading element 0 would
    # pick the least likely topic instead.
    best_id, best_prob = max(doc_topics, key=lambda pair: pair[1])
    topics.append(topics_name[best_id])
    probs.append(best_prob)

nyt_df['Predicted Topic'] = topics
nyt_df['Probability'] = probs
# Keep only the part of the timestamp before the 'T' separator
dates = [row.split('T')[0] for row in nyt_df['pub_date']]
nyt_df['pub_date'] = dates

# Last expression of the cell so that the topic listing is actually displayed
lda_model.show_topics()

In [None]:
# Monthly article counts for the topics discussed in the results section

topic_name = ['Economy', 'Travel Restrictions', 'U.S Politics', 'Vaccination']

# Keep only the part of each timestamp before the 'T' separator
dates = [stamp.split('T')[0] for stamp in nyt_df['pub_date']]
nyt_df['pub_date'] = dates

# Index the frame by the parsed publication date so rows can be grouped by month
nyt_df['new_date'] = pd.to_datetime(nyt_df['pub_date'], format='%Y-%m-%d')
nyt_df = nyt_df.set_index('new_date', drop=True)

plt.figure()
for label in topic_name:
    # Rows predicted to belong to this topic
    x = nyt_df[nyt_df['Predicted Topic'] == label]
    # Count rows per 'YYYY-MM' key; the key comes back as the 'new_date' column
    month_keys = x.index.strftime('%Y-%m')
    curr = x.groupby(month_keys).count().reset_index()
    curr['x_axis'] = pd.to_datetime(curr['new_date'], format='%Y-%m')

    plt.plot(curr.x_axis, curr['Predicted Topic'], label=label)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.xticks()
plt.yticks()
plt.savefig('nyt_alltopics_plot')
plt.show()

In [None]:
# Plotting for the topics "Vaccination" and "Travel Restrictions"

topic_name = ['Vaccination', 'Travel Restrictions']
# One distinct colour per topic; drawing both series in the same colour makes
# the two lines impossible to tell apart.
topic_colors = ['red', 'blue']

# Keep only the part of each timestamp before the 'T' separator
dates = [row.split('T')[0] for row in nyt_df['pub_date']]
nyt_df['pub_date'] = dates

# Index the frame by the parsed publication date so rows can be grouped by month
nyt_df['new_date'] = pd.to_datetime(nyt_df['pub_date'], format='%Y-%m-%d')
nyt_df = nyt_df.set_index('new_date', drop=True)

plt.figure()
for name, color in zip(topic_name, topic_colors):
    # Rows predicted to belong to this topic
    x = nyt_df[nyt_df['Predicted Topic'] == name]
    # Count rows per 'YYYY-MM' key; the key comes back as the 'new_date' column
    curr = x.groupby(x.index.strftime('%Y-%m')).count()
    curr = curr.reset_index()
    curr['x_axis'] = pd.to_datetime(curr['new_date'], format='%Y-%m')

    plt.plot(curr.x_axis, curr['Predicted Topic'], label=name, color=color)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.savefig('nyt_vaccine_travel')
plt.show()

In [None]:
# Monthly article counts for the topic "Economy"

topic_name = ['Economy']

# Keep only the part of each timestamp before the 'T' separator
dates = [stamp.split('T')[0] for stamp in nyt_df['pub_date']]
nyt_df['pub_date'] = dates

# Index the frame by the parsed publication date so rows can be grouped by month
nyt_df['new_date'] = pd.to_datetime(nyt_df['pub_date'], format='%Y-%m-%d')
nyt_df = nyt_df.set_index('new_date', drop=True)

plt.figure()
for label in topic_name:
    # Rows predicted to belong to this topic
    x = nyt_df[nyt_df['Predicted Topic'] == label]
    # Count rows per 'YYYY-MM' key; the key comes back as the 'new_date' column
    month_keys = x.index.strftime('%Y-%m')
    curr = x.groupby(month_keys).count().reset_index()
    curr['x_axis'] = pd.to_datetime(curr['new_date'], format='%Y-%m')

    plt.plot(curr.x_axis, curr['Predicted Topic'], color='blue', label=label)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.xticks()
plt.yticks()
plt.savefig('nyt_economy_')
plt.show()

In [None]:
# Monthly article counts for the topic "U.S Politics"

topic_name = ['U.S Politics']

# Keep only the part of each timestamp before the 'T' separator
dates = [stamp.split('T')[0] for stamp in nyt_df['pub_date']]
nyt_df['pub_date'] = dates

# Index the frame by the parsed publication date so rows can be grouped by month
nyt_df['new_date'] = pd.to_datetime(nyt_df['pub_date'], format='%Y-%m-%d')
nyt_df = nyt_df.set_index('new_date', drop=True)

plt.figure()
for label in topic_name:
    # Rows predicted to belong to this topic
    x = nyt_df[nyt_df['Predicted Topic'] == label]
    # Count rows per 'YYYY-MM' key; the key comes back as the 'new_date' column
    month_keys = x.index.strftime('%Y-%m')
    curr = x.groupby(month_keys).count().reset_index()
    curr['x_axis'] = pd.to_datetime(curr['new_date'], format='%Y-%m')

    plt.plot(curr.x_axis, curr['Predicted Topic'], color='blue', label=label)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.xticks()
plt.yticks()
plt.savefig('us_politics_plot')
plt.show()

In [None]:
# Saving the labelled articles as .csv

import pandas as pd

# Replace the current index with a fresh positional one; it is written out as
# the first column of the file because index=True.
nyt_df.reset_index(drop=True, inplace=True)
export_columns = ['pub_date', 'content', 'Predicted Topic']
nyt_df.to_csv('nyt_topics.csv', sep=',', columns=export_columns, index=True)

In [None]:
# Viewing the top documents for a given topic

# Articles labelled with the first topic name, most confident predictions first
x = nyt_df[nyt_df['Predicted Topic'] == topics_name[0]]
x = x.sort_values('Probability', ascending=False)

# Show the article text next to the probability it was ranked by. A 'keywords'
# column is never created on nyt_df in this notebook, so selecting it raises
# a KeyError.
x = x[['content', 'Probability']]
x.head(7)