# LDA Topic Modelling

* This notebook is showcases the process of building an NLP Topic Model using `Latent Dirichlet Allocation` method. 
* The dataset we are going to use are `title` and `soft title` from `apify_dataset_clean.csv`. 

## Table Of Contents

## Installations


In [1]:
# ## installing required libraries
! pip install pandas
! pip install numpy
! pip install plotly
! pip install nbformat
! pip install ipykernel
! pip install matplotlip
! pip install wordcloud
! pip install gensim
! pip install pyLDAvis
! pip install nltk
! pip install spacy[transformers,lookups]
!python -m spacy download en_core_web_trf 

/bin/bash: /home/gaurang/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
/bin/bash: /home/gaurang/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
/bin/bash: /home/gaurang/miniconda3/lib/libtinfo.so.6: no version information available (required by /bin/bash)

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[

## Imports

In [2]:
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import re
import string
from bs4 import BeautifulSoup
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
from pprint import pprint

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

from gensim import corpora, models
from gensim.utils import simple_preprocess
from gensim.models import Phrases
from gensim.models.phrases import Phraser
from gensim.models import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn
import pyLDAvis.gensim_models as gensimvis
import matplotlib.pyplot as plt
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/gaurang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


OSError: /home/gaurang/workspace/Aletheia/env/lib/python3.10/site-packages/torch/lib/../../nvidia/cublas/lib/libcublas.so.11: undefined symbol: cublasLtGetStatusString, version libcublasLt.so.11

## Reading Data

In [8]:
## reading manaully scrapped data
data = pd.read_csv('../data/scrapped_fox_data_clean.csv')
print(data.shape)

(3972, 12)


### Preparing Stop Words


In [36]:
## extending stopwords
# lets break down the cleaning functions into smaller functions
nlp = spacy.load('en_core_web_trf')
stop_words = nltk.corpus.stopwords.words('english')

## trying to remove stopwords from stopwords super set. 
stopwords_super_set = pd.read_csv("../data/stopwords/sw1k.csv")

## filtering stopwords to pronouns and other type
stopwords_to_remove = list(stopwords_super_set.loc[(stopwords_super_set["type"] == "G" ), "term"])


# stop_words.extend(['from', 'subject', 're', 'edu', 'use', 'say', 'one', 'time', 'people',
#                   'know', 'like', 'tell', 'get', 'year', 'go', 'around', 'award', 'actually', 'carry',
#                    'new', 'it', 'show', 'news', 'go', 'fox', 'make', 'do', 'not', 'say',
#                    'also', 'love', 'it', 'star', 'go', 'do', 'say', 'not', 'said'
#                    ])

# stop_words.extend(stopwords_to_remove)
print(stop_words)

OSError: [E050] Can't find model 'en_core_web_trf'. It doesn't seem to be a Python package or a valid path to a data directory.

## Utility Functions for Text Cleaning

In [10]:
## Utility Functions for Text Cleaning
def sent_to_words(sentences):
    for sentence in sentences:
        yield(simple_preprocess(str(sentence), deacc=True))

# function to clean html tags from text
def clean_html(html):
    # parse html content
    soup = BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script', 'code', 'a']):
        # Remove tags
        data.decompose()
    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)

# function to convert text to lowercase


def lower_case(text):
    return text.lower()

# function to remove line breaks


def remove_line_breaks(text):
    return re.sub(r'\n', '', text)

# function to remove punctuation


def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# function to remove numbers


def remove_numbers(text):
    return re.sub(r'\d+', '', text)

# function to remove extra spaces


def remove_extra_spaces(text):
    return re.sub(' +', ' ', text)

# function to remove stopwords


def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]


# function for text lemmatization using spacy
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out


def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


## Gensim LDA with BOW

### Text Pre-processing

In [11]:
def preprocess_text(text):
     text = clean_html(text)
     text = lower_case(text)
     text = remove_line_breaks(text)
     text = remove_punctuation(text)
     text = remove_numbers(text)
     text = remove_extra_spaces(text)
     return text

data["cleaned_soft_title"] = data["title"].apply(preprocess_text)



### Tokenizing

In [12]:
data_words = list(sent_to_words(data['cleaned_soft_title']))

In [13]:
data_words

[['hassan',
  'and',
  'bolduc',
  'trade',
  'fire',
  'in',
  'final',
  'showdown',
  'after',
  'gop',
  'nominee',
  'comes',
  'under',
  'attack',
  'arriving',
  'at',
  'debate'],
 ['biden',
  'suggests',
  'voting',
  'for',
  'republicans',
  'is',
  'threat',
  'to',
  'democracy'],
 ['nycs',
  'naked',
  'cowboy',
  'makes',
  'endorsement',
  'for',
  'gov',
  'while',
  'performing',
  'on',
  'times',
  'square',
  'restore',
  'law',
  'and',
  'order'],
 ['wisconsin',
  'courts',
  'shoot',
  'down',
  'liberal',
  'groups',
  'attempts',
  'to',
  'change',
  'rules',
  'for',
  'absentee',
  'ballots'],
 ['texas',
  'gubernatorial',
  'candidate',
  'beto',
  'orourke',
  'joins',
  'obama',
  'in',
  'using',
  'tiktok',
  'to',
  'push',
  'getoutthevote',
  'message'],
 ['white',
  'house',
  'wont',
  'say',
  'if',
  'it',
  'plans',
  'to',
  'pay',
  'for',
  'twitter',
  'or',
  'if',
  'republic',
  'will',
  'survive',
  'gop',
  'takeover',
  'of',
  'con

### Creating Bigram & Tigram Models

In [14]:
# Build the bigram and trigram models
bigram = Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = Phrases(bigram[data_words], threshold=100)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = Phraser(bigram)
trigram_mod = Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[10]]])

['rnc', 'chair', 'ronna', 'mcdaniel', 'says', 'gop', 'seeing', 'huge', 'enthusiasm', 'with', 'less', 'than', 'week', 'until', 'election', 'day']


In [15]:


# Form Bigrams
data_words_bigrams = make_bigrams(data_words, bigram_mod)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
# nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
data_lemmatized
# Remove Stop Words
# data_words_nostops = remove_stopwords(data_lemmatized)

# print(data_lemmatized[:1])

[['trade',
  'fire',
  'final',
  'showdown',
  'nominee',
  'come',
  'attack',
  'arrive',
  'debate'],
 ['suggest', 'voting', 'threat', 'democracy'],
 ['naked',
  'cowboy',
  'make',
  'endorsement',
  'perform',
  'time',
  'square',
  'restore',
  'law',
  'order'],
 ['court',
  'shoot',
  'liberal',
  'group',
  'attempt',
  'change',
  'rule',
  'absentee',
  'ballot'],
 ['gubernatorial',
  'candidate',
  'join',
  'use',
  'tiktok',
  'push',
  'getoutthevote',
  'message'],
 ['white_house', 'say', 'plan', 'pay', 'republic', 'survive', 'takeover'],
 ['slam', 'divide', 'nation', 'ahead', 'speech', 'assail'],
 ['official', 'say', 'vote', 'cast', 'wrong', 'race'],
 ['campaign', 'attempt', 'block', 'trump', 'ally', 'win', 'midterm'],
 ['academic', 'urge', 'act', 'response', 'iranian', 'protest', 'crackdown'],
 ['say', 'see', 'huge', 'enthusiasm', 'less', 'week', 'election', 'day'],
 ['authority',
  'stop',
  'uber',
  'driver',
  'smuggle',
  'several',
  'illegal_immigrant'],
 ['e

In [16]:
data_lemmatized

[['trade',
  'fire',
  'final',
  'showdown',
  'nominee',
  'come',
  'attack',
  'arrive',
  'debate'],
 ['suggest', 'voting', 'threat', 'democracy'],
 ['naked',
  'cowboy',
  'make',
  'endorsement',
  'perform',
  'time',
  'square',
  'restore',
  'law',
  'order'],
 ['court',
  'shoot',
  'liberal',
  'group',
  'attempt',
  'change',
  'rule',
  'absentee',
  'ballot'],
 ['gubernatorial',
  'candidate',
  'join',
  'use',
  'tiktok',
  'push',
  'getoutthevote',
  'message'],
 ['white_house', 'say', 'plan', 'pay', 'republic', 'survive', 'takeover'],
 ['slam', 'divide', 'nation', 'ahead', 'speech', 'assail'],
 ['official', 'say', 'vote', 'cast', 'wrong', 'race'],
 ['campaign', 'attempt', 'block', 'trump', 'ally', 'win', 'midterm'],
 ['academic', 'urge', 'act', 'response', 'iranian', 'protest', 'crackdown'],
 ['say', 'see', 'huge', 'enthusiasm', 'less', 'week', 'election', 'day'],
 ['authority',
  'stop',
  'uber',
  'driver',
  'smuggle',
  'several',
  'illegal_immigrant'],
 ['e

### Create Dictionary & Corpus

In [17]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]]


In [18]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]


[[('arrive', 1),
  ('attack', 1),
  ('come', 1),
  ('debate', 1),
  ('final', 1),
  ('fire', 1),
  ('nominee', 1),
  ('showdown', 1),
  ('trade', 1)]]

### Building the Topic Model

In [32]:
lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=10,
                     random_state=100,
                     update_every=1,
                     chunksize=250,
                     passes=20,
                     alpha='auto',
                     per_word_topics=True)


In [33]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.079*"abortion" + 0.039*"state" + 0.031*"election" + 0.029*"midterm" + '
  '0.025*"ruling" + 0.021*"law" + 0.018*"ban" + 0.014*"tax" + 0.013*"federal" '
  '+ 0.013*"show"'),
 (1,
  '0.036*"migrant" + 0.025*"blast" + 0.023*"spending" + 0.021*"stop" + '
  '0.020*"charge" + 0.016*"leave" + 0.016*"death" + 0.015*"mother" + '
  '0.014*"economy" + 0.012*"launch"'),
 (2,
  '0.035*"covid" + 0.032*"vote" + 0.024*"plan" + 0.024*"threat" + 0.021*"warn" '
  '+ 0.020*"hit" + 0.017*"question" + 0.015*"pass" + 0.011*"protester" + '
  '0.011*"allow"'),
 (3,
  '0.037*"group" + 0.035*"campaign" + 0.028*"police" + 0.024*"biden" + '
  '0.023*"attack" + 0.018*"follow" + 0.015*"use" + 0.014*"help" + '
  '0.014*"conservative" + 0.012*"defund"'),
 (4,
  '0.129*"say" + 0.043*"candidate" + 0.039*"trump" + 0.028*"support" + '
  '0.023*"democratic" + 0.022*"former" + 0.021*"win" + 0.019*"house" + '
  '0.016*"make" + 0.015*"face"'),
 (5,
  '0.074*"bill" + 0.022*"school" + 0.022*"dem" + 0.021*"gun" + '
  '

### Analyzing Model

In [34]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.461233955962431

Coherence Score:  0.5071908961490298


## manually param tuning

num_topics:25
chunk_size:250
passes:20
Perplexity:  -11.926176931631348
Coherence Score:  0.4738435729328295

----------


num_topics:30
chunk_size:250
passes:20
Perplexity:  -12.765642173710603
Coherence Score:  0.4682941776918114

----------

num_topics:25
chunk_size:300
passes:20
Perplexity:  -11.44238576300284
Coherence Score:  0.4532938311246478

----------

num_topics:30
chunk_size:250
passes:25
Perplexity:  -12.763182274323162
Coherence Score:  0.4667890931756996

----------

num_topics:30
chunk_size:250
passes:35
Perplexity:  -12.760102214774426
Coherence Score:  0.472253052110792


### Visualize Topics

In [35]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda_model, corpus, id2word)
vis

  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
