# LDA Topic Modelling

* This notebook showcases the process of building an NLP topic model using the `Latent Dirichlet Allocation` (LDA) method.
* The dataset we are going to use in this notebook is from `scrapped_fox_data_clean.csv`. 
* We tried topic modeling with the `apify` data but were not able to train a reliable model. The reason could be that the `apify` data contains news articles from the whole `foxnews` site, whereas our scraped dataset contains only political news.

## Table Of Contents

## Installations


In [1]:
# ## installing required libraries
# ! pip install pandas
# ! pip install numpy
# ! pip install plotly
# ! pip install nbformat
# ! pip install ipykernel
# ! pip install matplotlib
# ! pip install wordcloud
# ! pip install gensim
# ! pip install pyLDAvis
# ! pip install nltk
# ! pip install spacy
# !python -m spacy download en_core_web_lg 

## Imports

In [18]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from pprint import pprint

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gaurang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading Data

In [4]:
## Read the manually scraped Fox News articles and print the frame's shape.
data = pd.read_csv('../data/scrapped_fox_data_clean.csv')
print(data.shape)

(3972, 12)


### Preparing Stop Words


In [12]:
## Build the extended stop-word list used by remove_stopwords().
# spaCy pipeline; used later by lemmatization() for POS tags and lemmas.
nlp = spacy.load('en_core_web_lg')
# Start from NLTK's English stop words.
stop_words = nltk.corpus.stopwords.words('english')

## Load a larger stop-word superset from CSV.
# NOTE(review): pandas parses cells such as "null" / "nan" as missing values by default — confirm the "term" column has no NaN entries.
stopwords_super_set = pd.read_csv("../data/stopwords/sw10k.csv")

## Keep only the terms whose "type" column equals "G".
# NOTE(review): presumably "G" marks general-purpose stop words — confirm against the CSV's source.
stopwords_to_remove = list(stopwords_super_set.loc[(stopwords_super_set["type"] == "G" ), "term"])


# Earlier hand-picked additions, kept commented out for reference:
# stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'say', 'one', 'time', 'people',
#                   'know', 'like', 'tell', 'get', 'year', 'go', 'around', 'award', 'actually', 'carry',
#                    'new', 'it', 'show', 'news', 'go', 'fox', 'make', 'do', 'not', 'say',
#                    'also', 'love', 'it', 'star', 'go', 'do', 'say', 'not', 'said'
#                    ])

# Append the filtered superset terms to the NLTK list (duplicates are not removed).
stop_words.extend(stopwords_to_remove)
# print(stop_words)

## Utility Functions for Text Cleaning

In [13]:
## Utility Functions for Text Cleaning
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences``.

    Every item is coerced to ``str`` and handed to gensim's
    ``simple_preprocess`` with ``deacc=True``. One token list is yielded
    per input item, in input order.
    """
    for raw_text in map(str, sentences):
        yield simple_preprocess(raw_text, deacc=True)

# function to clean html tags from text
def clean_html(html):
    """Strip markup from ``html`` and return the remaining visible text.

    ``<style>``, ``<script>``, ``<code>`` and ``<a>`` elements are removed
    together with their contents (so link text is dropped too). The text
    fragments that remain are whitespace-stripped and joined with single
    spaces.
    """
    unwanted_tags = ['style', 'script', 'code', 'a']
    parsed = BeautifulSoup(html, "html.parser")
    for element in parsed.find_all(unwanted_tags):
        # decompose() removes the element and everything nested inside it
        element.decompose()
    return ' '.join(parsed.stripped_strings)

# function to convert text to lowercase


def lower_case(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

# function to remove line breaks


def remove_line_breaks(text):
    """Replace line breaks in ``text`` with a single space.

    Each run of newline / carriage-return characters becomes one space.
    Deleting the break outright would glue the last word of one line to the
    first word of the next ("line<break>next" -> "linenext"), which corrupts
    tokens downstream.

    Parameters:
        text: the string to normalise.

    Returns:
        ``text`` with every run of line-break characters replaced by ``' '``.
    """
    return re.sub(r'[\r\n]+', ' ', text)

# function to remove punctuation


def remove_punctuation(text):
    """Delete every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(deletion_table)

# function to remove numbers


def remove_numbers(text):
    """Strip every run of digits from ``text``; all other characters are kept."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# function to remove extra spaces


def remove_extra_spaces(text):
    """Collapse each run of space characters in ``text`` into a single space.

    Only the space character is matched; tabs and other whitespace are left
    untouched, and leading/trailing spaces are reduced to one, not stripped.
    """
    space_run = re.compile(' +')
    return space_run.sub(' ', text)

# function to remove stopwords


def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Each document is stringified and re-tokenized with gensim's
    ``simple_preprocess``; tokens found in the module-level ``stop_words``
    collection are discarded.

    Parameters:
        texts: iterable of documents (token lists or strings).

    Returns:
        A list with one filtered token list per input document, in order.
    """
    # `stop_words` is a (long) list, so `word not in stop_words` is a linear
    # scan per token. Build a set once per call for O(1) membership tests;
    # the filtered output is unchanged.
    stop_word_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_word_set]
        for doc in texts
    ]


# function for text lemmatization using spacy
def lemmatization(texts, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Lemmatize tokenized documents with spaCy, keeping selected parts of speech.

    See https://spacy.io/api/annotation for the POS tag inventory.

    Parameters:
        texts: iterable of token lists; each list is joined with spaces and
            parsed by the module-level ``nlp`` pipeline.
        allowed_postags: coarse POS tags (``token.pos_``) to keep. The default
            is an immutable tuple rather than a shared mutable list; callers
            may still pass a list.

    Returns:
        A list with one list of lemmas per input document, in input order.
    """
    # nlp.pipe processes the documents as a batched stream, which is faster
    # than invoking nlp() once per document.
    joined_docs = (" ".join(sent) for sent in texts)
    return [
        [token.lemma_ for token in doc if token.pos_ in allowed_postags]
        for doc in nlp.pipe(joined_docs)
    ]


def make_bigrams(texts, bigram_mod):
    """Apply the phrase model ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each document, in order.
    """
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts, bigram_mod, trigram_mod):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each document,
    in order.
    """
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs


## Gensim LDA with BOW

### Text Pre-processing

In [14]:
def preprocess_text(text):
    """Run the raw article text through the cleaning helpers, in order.

    Steps: strip HTML, lowercase, remove line breaks, remove punctuation,
    remove digits, collapse repeated spaces. Returns the cleaned string.
    """
    cleaning_steps = (
        clean_html,
        lower_case,
        remove_line_breaks,
        remove_punctuation,
        remove_numbers,
        remove_extra_spaces,
    )
    for step in cleaning_steps:
        text = step(text)
    return text


# Clean every article and store the result in a new "cleaned_text" column.
data["cleaned_text"] = data["text"].apply(preprocess_text)

### Tokenizing

In [16]:
# Tokenize every cleaned article; sent_to_words() is a generator, so list() materialises one token list per document.
data_words = list(sent_to_words(data['cleaned_text']))

### Creating Bigram & Trigram Models

In [20]:
# Build the bigram and trigram models
# min_count=5: ignore token pairs seen fewer than 5 times; higher threshold means fewer phrases.
bigram = Phrases(data_words, min_count=5, threshold=100)
# The trigram model is trained on the bigram-transformed corpus, so an already-merged bigram can join a neighbouring token.
trigram = Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# See trigram example: apply both models to the first document
print(trigram_mod[bigram_mod[data_words[0]]])

['former', 'governor', 'and', 'first', 'term', 'democratic_sen_maggie', 'hassan', 'of', 'new_hampshire', 'and', 'republican', 'challenger', 'don_bolduc', 'took_aim', 'at', 'each', 'other', 'over', 'inflation', 'abortion', 'national_security', 'the', 'border', 'crisis', 'election', 'denialism', 'and', 'many', 'more', 'issues', 'in', 'their', 'third', 'and', 'final', 'debate', 'in', 'their', 'crucial_battleground', 'state', 'race', 'that', 'among', 'handful_across', 'the', 'country', 'that', 'will', 'likely', 'determine', 'if', 'the', 'gop', 'wins', 'back', 'the', 'senate', 'majority', 'but', 'ahead', 'of', 'the', 'verbal', 'crossfire', 'on', 'the', 'debate', 'stage', 'bolduc', 'former', 'army', 'general', 'who', 'served', 'ten', 'tours', 'of', 'duty', 'in', 'the', 'war', 'in', 'afghanistan', 'was', 'allegedly', 'assaulted', 'as', 'he', 'arrived', 'at', 'the', 'debate', 'site', 'at', 'saint_anselm_college', 'new_hampshire', 'institute', 'of', 'politics', 'on', 'wednesday', 'evening', 'ac

In [23]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
# NOTE: only bigram_mod is applied here; trigram_mod is not used in this cell.
data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

# lemmatization() uses the module-level `nlp` pipeline (en_core_web_lg, loaded in the stop-words cell).
# Leftover alternative that would load a lighter pipeline with parser/NER disabled, kept for reference:
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Show the lemmas of the first document as a sanity check
print(data_lemmatized[:1])

[['aim', 'abortion', 'border', 'crisis', 'denialism', 'debate', 'battleground', 'race', 'handful', 'win', 'crossfire', 'debate', 'tour', 'war', 'debate', 'campaign', 'bystander', 'crowd', 'debate', 'swing', 'campaign', 'graze', 'campaign', 'punch', 'libertarian', 'activist', 'countdown', 'stake', 'campaign', 'behavior', 'libertarian', 'activist', 'campaign', 'volunteer', 'debate', 'reference', 'altercation', 'debate', 'attack', 'problem', 'fuel', 'strike', 'campaign', 'spokesperson', 'tonight', 'temperature', 'discourse', 'debate', 'crowd', 'punch', 'apprehend', 'response', 'enforcement', 'scene', 'midterm', 'campaign', 'cycle', 'emphasize', 'magarepublican', 'outsider', 'credential', 'combustible', 'nomination', 'opinion', 'polling', 'survey', 'race', 'margin', 'contest', 'candidate', 'fire', 'debate', 'combustible', 'abortion', 'campaign', 'group', 'showcase', 'clip', 'bolduc', 'vow', 'vote', 'rule', 'legalized_abortion', 'tout', 'effort', 'push', 'abortion', 'abortion', 'debate', 'g

### Create Dictionary & Corpus

In [24]:
# Vocabulary: maps every unique lemma to an integer id
id2word = corpora.Dictionary(data_lemmatized)

# The lemmatized documents are the texts the corpus is built from
texts = data_lemmatized

# Bag-of-words corpus: one list of (token_id, count) pairs per document
corpus = list(map(id2word.doc2bow, texts))

# Peek at the first document's bag-of-words vector
print(corpus[:1])

[[(0, 6), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 10), (18, 1), (19, 1), (20, 1), (21, 1), (22, 2), (23, 2), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 2), (31, 1), (32, 10), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 4), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 1), (61, 1), (62, 2), (63, 2), (64, 2), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 1), (72, 3), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 2), (80, 2), (81, 1), (82, 2), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 1), (89, 1), (90, 2), (91, 2), (92, 1), (93, 1), (94, 1), (95, 1), (96, 1), (97, 1), (98, 2), (99, 1), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), (110, 

In [26]:
# Human-readable view of the first document: (term, frequency) instead of (token_id, frequency)
[
    [(id2word[token_id], count) for token_id, count in doc_bow]
    for doc_bow in corpus[:1]
]


[[('abortion', 6),
  ('activist', 2),
  ('aim', 1),
  ('altercation', 1),
  ('app', 1),
  ('apprehend', 1),
  ('attack', 2),
  ('ballot', 1),
  ('battleground', 1),
  ('behavior', 1),
  ('bill', 1),
  ('birth', 1),
  ('bolduc', 1),
  ('border', 1),
  ('bottom', 1),
  ('bus', 1),
  ('bystander', 1),
  ('campaign', 10),
  ('candidate', 1),
  ('challenger', 1),
  ('chance', 1),
  ('clip', 1),
  ('combustible', 2),
  ('conceal', 2),
  ('conclusion', 1),
  ('contest', 1),
  ('countdown', 1),
  ('credential', 1),
  ('crisis', 1),
  ('crossfire', 1),
  ('crowd', 2),
  ('cycle', 1),
  ('debate', 10),
  ('decision', 2),
  ('denialism', 1),
  ('dinge', 1),
  ('disavow', 1),
  ('discourse', 1),
  ('doctor', 1),
  ('doubt', 1),
  ('dump', 1),
  ('effort', 1),
  ('election', 1),
  ('emphasize', 1),
  ('enforcement', 1),
  ('extremism', 1),
  ('fastball', 1),
  ('feed', 1),
  ('fire', 1),
  ('focus', 1),
  ('fuel', 1),
  ('future', 1),
  ('gas', 2),
  ('granite_stater', 4),
  ('graze', 1),
  ('group

### Building the Topic Model

In [74]:
# Train the gensim LDA topic model on the bag-of-words corpus.
# Parameter notes follow the gensim LdaModel documentation.
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,          # id -> term mapping, so topics print as words
                     num_topics=30,            # number of latent topics to extract
                     random_state=100,         # fixed seed for reproducible training
                     update_every=1,           # online learning: update the model after each chunk
                     chunksize=250,            # documents per training chunk
                     passes=75,                # full passes over the corpus during training
                     alpha='auto',             # learn an asymmetric document-topic prior from the corpus
                     per_word_topics=True)     # also compute the most likely topics per word


In [75]:
# Print the top keywords of the learned topics.
# NOTE: the model has 30 topics, but print_topics() is called with its defaults
# (20 topics, 10 words each per the gensim docs), so not every topic is listed.
pprint(lda_model.print_topics())
# Apply the model to the corpus to get per-document topic assignments; not used further in the visible cells.
doc_lda = lda_model[corpus]

[(25,
  '0.069*"prayer" + 0.056*"parent" + 0.054*"thought" + 0.044*"ideology" + '
  '0.026*"suicide" + 0.022*"cell" + 0.021*"denial" + 0.021*"mourn" + '
  '0.020*"artist" + 0.020*"federally"'),
 (26,
  '0.107*"immigration" + 0.102*"ice" + 0.049*"title" + 0.046*"removal" + '
  '0.043*"transport" + 0.034*"enforcement" + 0.024*"immigrant" + '
  '0.021*"priority" + 0.021*"arrest" + 0.021*"customs_enforcement"'),
 (14,
  '0.060*"email" + 0.053*"recommendation" + 0.050*"shelter" + 0.039*"calendar" '
  '+ 0.035*"advice" + 0.032*"bias" + 0.032*"arrival" + 0.031*"ceremony" + '
  '0.029*"hotel" + 0.025*"interpret"'),
 (29,
  '0.094*"aid" + 0.053*"vaccinate" + 0.051*"burn" + 0.051*"rep" + 0.049*"girl" '
  '+ 0.042*"mass" + 0.038*"team" + 0.028*"accident" + 0.026*"century" + '
  '0.026*"punishment"'),
 (23,
  '0.117*"veteran" + 0.050*"mom" + 0.048*"army" + 0.046*"ethic" + 0.039*"pit" '
  '+ 0.037*"strength" + 0.033*"establishment" + 0.022*"tuition" + '
  '0.021*"reaffirm" + 0.019*"exit"'),
 (5,
  

### Analyzing Model

In [76]:
# Compute Perplexity
# NOTE: log_perplexity() returns the per-word likelihood bound (a log-scale value), not the
# perplexity itself; perplexity = 2 ** (-bound), so a bound closer to zero means a better fit.
# The score is computed on the training corpus, not on held-out data.
print('\nPerplexity: ', lda_model.log_perplexity(corpus))

# Compute Coherence Score (c_v measure; higher is better)
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -12.754988345523484

Coherence Score:  0.4702192022350141


## Manual Parameter Tuning

num_topics:25
chunk_size:250
passes:20
Perplexity:  -11.926176931631348
Coherence Score:  0.4738435729328295

----------


num_topics:30
chunk_size:250
passes:20
Perplexity:  -12.765642173710603
Coherence Score:  0.4682941776918114

----------

num_topics:25
chunk_size:300
passes:20
Perplexity:  -11.44238576300284
Coherence Score:  0.4532938311246478

----------

num_topics:30
chunk_size:250
passes:25
Perplexity:  -12.763182274323162
Coherence Score:  0.4667890931756996

----------

num_topics:30
chunk_size:250
passes:35
Perplexity:  -12.760102214774426
Coherence Score:  0.472253052110792


### Visualize Topics

In [55]:
# Visualize the topics
# Enable pyLDAvis notebook mode so the visualization is displayed in the cell output.
pyLDAvis.enable_notebook()
# Prepare the visualization data from the trained model, the corpus and the dictionary.
vis = gensimvis.prepare(lda_model, corpus, id2word)
# Last expression of the cell: displays the visualization.
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
