In [None]:
import numpy as np
import pandas as pd
import gensim
import matplotlib.pyplot as plt

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
import re

# nltk.download('stopwords')
# nltk.download('wordnet')
# !pip install -U gensim
# NOTE: this rebinds the name `stopwords` from the imported nltk corpus module to a
# plain list of English stop words; later cells extend and query this list.
stopwords = stopwords.words('english')

In [None]:
# Load the raw Guardian export and keep only the fields used downstream.
# NOTE(review): the CSV location is an absolute, machine-specific path.

guard_df = pd.read_csv('/Users/sketcha/lda/guardian.csv', sep=',')
n_rows, n_cols = guard_df.shape
print('For Guardian original Number of Columns: {}, rows: {}'.format(n_cols, n_rows))

columns = ['firstPublicationDate', 'webUrl', 'bodyText']
# Restrict to the three columns above, then keep one row per distinct article body.
guard_df = guard_df[columns].drop_duplicates(subset="bodyText")

print(guard_df.shape)
# Discard rows with a missing value in any of the retained columns.
guard_df = guard_df.dropna()

guard_df.head()

**Pre-processing**

In [None]:
# Pre-processing steps

# Initializing the WordNet lemmatizer used by processing().
lm = WordNetLemmatizer()
# Column that will hold the cleaned text. It is created with object dtype so that
# strings can be written into it later without a dtype-incompatibility warning;
# rows that are never filled stay NaN.
guard_df['post_processed'] = pd.Series(np.nan, index=guard_df.index, dtype=object)

# Corpus-specific boilerplate words. They are added only if missing so that
# re-running this cell does not keep growing the stop-word list.
for extra_word in ('advertisement', 'supported'):
    if extra_word not in stopwords:
        stopwords.append(extra_word)
def processing(content):
    """Clean one article body and return it as a space-separated string of lemmas.

    Steps, in order: split on whitespace, lower-case, drop stop words, strip
    punctuation, drop tokens that are a single character or not purely
    alphabetic, drop stop words again (now that punctuation is gone), lemmatize.

    Parameters
    ----------
    content : str
        Raw article text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (may be empty).
    """
    # The hyphen is placed last in the class so it is a literal '-'. Written as
    # '!-(' it forms a character range (! " # $ % & ' () and '-' itself is never
    # removed. The characters of that range are listed explicitly here so the
    # set of stripped characters only gains the hyphen.
    rx = re.compile(r"""[!"#$%&'().:?-]+""")
    # `stopwords` is a list; a set makes the per-word membership test O(1).
    stop_set = set(stopwords)

    lemmas = []
    # split() without an argument also breaks on newlines/tabs and never yields
    # empty tokens, unlike split(' ').
    for raw in content.split():
        token = raw.lower()
        # First pass catches stop words that contain punctuation (e.g. "don't").
        if token in stop_set:
            continue
        token = rx.sub('', token)
        # Second pass catches stop words that were glued to punctuation
        # (e.g. "the." -> "the").
        if len(token) > 1 and token.isalpha() and token not in stop_set:
            lemmas.append(lm.lemmatize(token))

    return ' '.join(lemmas)

# Removing opinion articles and cleaning the remaining ones.
# Columns are addressed by name instead of by position, and the cleaned text is
# assigned as a whole column, so no string is written cell-by-cell into a
# NaN-initialised column.
is_opinion = guard_df['webUrl'].str.contains('opinion', regex=False)
guard_df = guard_df.loc[~is_opinion].copy()
guard_df['post_processed'] = guard_df['bodyText'].map(processing)

# Drop any row that still has a missing value in some column.
guard_df = guard_df.dropna()
guard_df.head()

In [None]:
# Frequency filtering

import gensim.corpora as corpora

# One token list per document, obtained by splitting the cleaned text on single spaces.
tokens = []
for document in guard_df['post_processed']:
    tokens.append(document.split(' '))

# Build the vocabulary, then prune it with filter_extremes
# (no_below=20, no_above=0.25).
dct = corpora.Dictionary(tokens)
print('Unique words before filtering', len(dct))
dct.filter_extremes(no_below=20, no_above=0.25)
print('Unique words after filtering', len(dct))

# Bag-of-words representation of every document over the pruned vocabulary.
corpus = list(map(dct.doc2bow, tokens))

**Coherence**

In [None]:
# Obtaining coherence results

%%time
from gensim.models import CoherenceModel
import time

scores = []
for k in range(1,26):
  lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dct, num_topics=k, 
                                       random_state=100, chunksize=128, passes=10,
                                       per_word_topics=True)

  coherence_model_lda = CoherenceModel(model=lda_model, texts=tokens, dictionary=dct, coherence='c_v')
  coherence_lda = coherence_model_lda.get_coherence()
  print( 'coherence_lda', coherence_lda)
  # time.sleep(20)

  scores.append(coherence_lda)

In [None]:
# Plotting coherence results

# Topic counts that were evaluated; must match the range used when `scores` was filled.
topic_counts = list(range(1, 26))
# scores[i] belongs to topic_counts[i], so the best number of topics is looked up
# by position instead of adding a hand-computed offset to the argmax.
selected_topics = topic_counts[int(np.argmax(scores))]
plt.plot(topic_counts, scores, color="green")
plt.xlabel('Number of topics')
plt.ylabel('Coherence Scores')
plt.savefig('guardian_coherence')

<b> Final Model </b>

In [None]:
# Train the final LDA model with a fixed number of topics.

selected_topics = 7

final_lda_kwargs = dict(
    corpus=corpus,
    id2word=dct,
    num_topics=selected_topics,
    random_state=100,
    chunksize=128,
    passes=10,
)
lda_model = gensim.models.LdaModel(**final_lda_kwargs)

In [None]:
# Persist the trained LDA model to disk.
model_path = "guardian.model"
lda_model.save(model_path)

**Visualization tool**

In [None]:
# !pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim_models

# Switch pyLDAvis to notebook rendering, then build the view of the model.
# The prepared object is the cell's last expression, so the notebook displays it.
pyLDAvis.enable_notebook()
lda_vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, dct)
lda_vis

**Plotting**

In [None]:
# The 'apa.mplstyle' file has to be placed inside matplotlib's 'stylelib' folder
# before this cell is run; see https://github.com/sollan/apa.mplstyle

import matplotlib

apa_style_path = matplotlib.get_data_path() + '/stylelib/apa.mplstyle'
matplotlib.style.use(apa_style_path)  # selecting the style sheet

In [None]:
# labelling the topics

# Human-readable label for each topic id (position in the list == topic id).
topic_names = ['Sports', 'Vaccination',  'Policy', 'Economy', 'U.S Politics', 'Travel Restrictions',
               'Australian News']
predicted_topics = lda_model[corpus]

# For every document keep the single MOST probable topic. Each item yielded by
# the model is a list of (topic_id, probability) pairs; taking element 0 after
# an ascending sort would select the least probable topic instead.
probs, topics = [], []
for doc_topics in predicted_topics:
    best_id, best_prob = max(doc_topics, key=lambda pair: pair[1])
    topics.append(topic_names[best_id])
    probs.append(best_prob)

guard_df['Predicted Topic'] = topics
guard_df['Probability'] = probs

# Keep only the part of the publication timestamp before the 'T' separator.
dates = [row.split('T')[0] for row in guard_df['firstPublicationDate']]
guard_df['firstPublicationDate'] = dates


In [None]:
# Plotting for the topics discussed in the results section only

topic_name = ['Economy', 'Travel Restrictions', 'U.S Politics', 'Vaccination' ]

# Index the frame by publication date so rows can be bucketed per month.
guard_df['new_date'] =  pd.to_datetime(guard_df['firstPublicationDate'], format='%Y-%m-%d')
guard_df = guard_df.set_index('new_date', drop=True)

plt.figure()
# Iterate over the labels selected for this figure (not over the full label list).
for name in topic_name:
    topic_rows = guard_df[guard_df['Predicted Topic'] == name]
    # Number of articles with this label per 'YYYY-MM' bucket.
    monthly = topic_rows.groupby(topic_rows.index.strftime('%Y-%m'))['Predicted Topic'].count()
    month_starts = pd.to_datetime(monthly.index, format='%Y-%m')

    plt.plot(month_starts, monthly.values, label=name)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
# File name uses the 'guardian_' prefix like the other figures saved by this notebook.
plt.savefig('guardian_results_topics_plot')
plt.show()

In [None]:
# Plotting for the topics "Vaccination" and "Travel Restrictions"

topic_name = ['Vaccination', 'Travel Restrictions' ]

# Index the frame by publication date so rows can be bucketed per month.
guard_df['new_date'] =  pd.to_datetime(guard_df['firstPublicationDate'], format='%Y-%m-%d')
guard_df = guard_df.set_index('new_date', drop=True)

plt.figure()
# Iterate over the labels selected for this figure (not over the full label list).
for name in topic_name:
    topic_rows = guard_df[guard_df['Predicted Topic'] == name]
    # Number of articles with this label per 'YYYY-MM' bucket.
    monthly = topic_rows.groupby(topic_rows.index.strftime('%Y-%m'))['Predicted Topic'].count()
    month_starts = pd.to_datetime(monthly.index, format='%Y-%m')

    plt.plot(month_starts, monthly.values, label=name)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.savefig('guardian_vaccine_travel_plot')
plt.show()

In [None]:
# Plotting for the topic "Economy"

topic_name = ['Economy']

# Index the frame by publication date so rows can be bucketed per month.
guard_df['new_date'] =  pd.to_datetime(guard_df['firstPublicationDate'], format='%Y-%m-%d')
guard_df = guard_df.set_index('new_date', drop=True)

plt.figure()
# Iterate over the labels selected for this figure (not over the full label list).
for name in topic_name:
    topic_rows = guard_df[guard_df['Predicted Topic'] == name]
    # Number of articles with this label per 'YYYY-MM' bucket.
    monthly = topic_rows.groupby(topic_rows.index.strftime('%Y-%m'))['Predicted Topic'].count()
    month_starts = pd.to_datetime(monthly.index, format='%Y-%m')

    plt.plot(month_starts, monthly.values, label=name)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
# File name uses the 'guardian_' prefix like the other figures saved by this notebook.
plt.savefig('guardian_economy_plot')
plt.show()

In [None]:
# Plotting for the topic "U.S Politics"

topic_name = ['U.S Politics']

# Index the frame by publication date so rows can be bucketed per month.
guard_df['new_date'] =  pd.to_datetime(guard_df['firstPublicationDate'], format='%Y-%m-%d')
guard_df = guard_df.set_index('new_date', drop=True)

plt.figure(figsize=(15,6))
# Iterate over the labels selected for this figure (not over the full label list).
for name in topic_name:
    topic_rows = guard_df[guard_df['Predicted Topic'] == name]
    # Number of articles with this label per 'YYYY-MM' bucket.
    monthly = topic_rows.groupby(topic_rows.index.strftime('%Y-%m'))['Predicted Topic'].count()
    month_starts = pd.to_datetime(monthly.index, format='%Y-%m')

    plt.plot(month_starts, monthly.values, label=name)

plt.legend(loc=0)
plt.xlabel('Time')
plt.ylabel('Frequency')
plt.savefig('guardian_uspolitics_plot')
plt.show()

In [None]:
# Export the labelled articles. The frame built in this notebook is `guard_df`
# (no `df` is defined). reset_index(drop=True) swaps whatever index the frame
# currently has for a 0..n-1 range, which is written as the first CSV column
# because index=True.
guard_df.reset_index(drop=True).to_csv(
    'topics.csv',
    index=True,
    columns=['firstPublicationDate', 'bodyText', 'Predicted Topic'],
    sep=',',
)