<a href="https://colab.research.google.com/github/githinjimary/LDATopicModelling/blob/main/MG_gensim_lda_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

  formatvalue=lambda value: "")[1:-1]


In [None]:
# Download the NLTK stop-word corpus from inside the kernel.
# nltk.download runs in the interpreter that will later read the corpus, whereas
# `!python -m nltk.downloader` runs whichever `python` is first on PATH.
nltk.download('stopwords');

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# utility functions from -- https://ourcodingclub.github.io/tutorials/topic-modelling-python/


def remove_links(tweet):
    '''Takes a string and removes web links from it.

    Removes anything starting with "http" up to the next whitespace,
    bit.ly short links, and every literal "[link]" placeholder.
    Surrounding whitespace is left untouched.
    '''
    tweet = re.sub(r'http\S+', '', tweet)  # remove http links
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # remove bitly links (dot escaped so it only matches "bit.ly")
    # str.strip('[link]') would strip the characters [, l, i, n, k, ] from both
    # ends of the tweet (eating real words); replace removes the marker itself.
    tweet = tweet.replace('[link]', '')
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information.

    Patterns are raw strings so that "\\s" reaches the regex engine as written
    instead of triggering an invalid-escape warning at compile time.
    '''
    # "RT @handle": handle is one or more letters followed by at least one of [A-Za-z0-9_-]
    tweet = re.sub(r'RT\s@[A-Za-z]+[A-Za-z0-9_-]+', '', tweet)  # remove retweet
    # leading "b" + whitespace, optionally followed by a run of R/T characters
    tweet = re.sub(r'^b\s([RT]+)?', '', tweet)  # replace RT-tags
    tweet = re.sub(r'@[A-Za-z]+[A-Za-z0-9_-]+', '', tweet)  # remove tweeted at
    return tweet

# Characters that clean_tweet treats as punctuation.
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# English stop words that clean_tweet drops.
my_stopwords = nltk.corpus.stopwords.words('english')

# Stemming callable applied to every non-hashtag token by clean_tweet.
_porter_stemmer = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False)
word_rooter = _porter_stemmer.stem

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    '''Normalise a raw tweet into a space-separated string of stemmed tokens.

    Steps: drop retweet/@user markers and links, drop the bytes-repr prefix
    and escape sequences, lower-case, strip punctuation and digits, remove
    stop words, stem every token that does not contain '#'.

    tweet   -- the raw tweet text (str)
    bigrams -- when True, append "left_right" bigrams of adjacent tokens
               after the single tokens
    Returns the cleaned tweet as a single string with no empty tokens.
    '''
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    # The dataset stores tweets as the repr of a bytes object (b'...'), so the
    # prefix and the literal escape sequences (\n, \xe2, ...) must be removed
    # before punctuation stripping; otherwise "b", "nmarch", "xe" leak out as tokens.
    tweet = re.sub(r'''^b(?=['"])''', '', tweet)
    tweet = re.sub(r'\\x[0-9a-fA-F]{2}|\\[nrt]', ' ', tweet)
    tweet = tweet.lower()  # lower case
    # re.escape keeps every punctuation character literal inside the class
    # (unescaped, the "\\]" in my_punctuation is read as an escaped bracket).
    tweet = re.sub('[' + re.escape(my_punctuation) + ']+', ' ', tweet)  # strip punctuation
    tweet = re.sub(r'[0-9]+', '', tweet)  # remove numbers
    # split() with no argument collapses whitespace runs and never yields ''
    tweet_token_list = [word for word in tweet.split()
                        if word not in my_stopwords]  # remove stopwords

    tweet_token_list = [word if '#' in word else word_rooter(word)
                        for word in tweet_token_list]  # apply word rooter, keep hashtags intact
    if bigrams:
        tweet_token_list = tweet_token_list + [
            left + '_' + right
            for left, right in zip(tweet_token_list, tweet_token_list[1:])
        ]
    return ' '.join(tweet_token_list)

  tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
  tweet = re.sub('\s+', ' ', tweet) #remove double spacing


In [None]:
def clean2(text):
    '''Reduce a raw tweet to lower-case alphabetic words of three or more letters.

    text -- anything convertible with str(); the dataset stores tweets as the
            repr of a bytes object, e.g. "b'Some text\\nmore'".
    Returns a single-space-separated string with no leading/trailing blanks.
    '''
    text = str(text).lower()
    # Literal escape sequences from the bytes repr: without this the "n" of a
    # "\n" fuses with the next word ("\nMarch" -> "nmarch").
    text = re.sub(r'\\x[0-9a-f]{2}|\\[nrt]', ' ', text)
    # URLs go before mentions so an "@" inside a URL cannot split it.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'@\w+', ' ', text)  # @mentions
    # Everything that is not a-z or a blank becomes a blank. This already covers
    # digits, punctuation, '#', brackets, tags and newlines, so no later
    # per-symbol pass is needed.
    text = re.sub(r'[^a-z ]', ' ', text)
    # One- and two-letter words (this also removes the "b" prefix and "rt").
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()

  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [None]:
# download the tweet dataset
# -nc (no-clobber): skip the download when dataset.csv already exists, so re-running
# this cell does not pile up dataset.csv.1, dataset.csv.2, ... next to the file
# that read_csv actually opens.

!wget -nc https://dsiwork.s3.amazonaws.com/dataset.csv

--2022-04-05 03:55:05--  https://dsiwork.s3.amazonaws.com/dataset.csv
Resolving dsiwork.s3.amazonaws.com (dsiwork.s3.amazonaws.com)... 52.217.226.57
Connecting to dsiwork.s3.amazonaws.com (dsiwork.s3.amazonaws.com)|52.217.226.57|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 496370 (485K) [text/csv]
Saving to: ‘dataset.csv’


2022-04-05 03:55:06 (4.34 MB/s) - ‘dataset.csv’ saved [496370/496370]



In [None]:
# Load the downloaded tweets; date_created is parsed as a datetime column.
data = pd.read_csv(
    "dataset.csv",
    encoding="ISO-8859-1",
    parse_dates=["date_created"],
)

In [None]:
# Preview the first rows of the loaded frame (the tweet column holds bytes-repr strings such as b'...').
data.head()

Unnamed: 0,id,retweet_count,date_created,tweet
0,1508758968482635778,1092,2022-03-29 10:52:24+00:00,b'A 31-year-old Ugandan traditional healer liv...
1,1508661904192913410,2275,2022-03-29 04:26:42+00:00,"b'Date mein kya rakha hai?\nMarch 29th, a very..."
2,1508815292578816015,563,2022-03-29 14:36:12+00:00,b'Moeletsi Mbeki has accused South Africa\xe2\...
3,1509067192888926208,9,2022-03-30 07:17:10+00:00,"b""RT @ThamiMasemola: Isuzu Motors South Africa..."
4,1509067189827026945,0,2022-03-30 07:17:09+00:00,"b'SAFA President, ANC Dr Danny Jordaan \n\nIs ..."


In [None]:
# Add a cleaned copy of every tweet using clean2
# (clean_tweet is the stemming-based alternative defined earlier).
data['clean_tweet'] = data['tweet'].map(clean2)
data.head()

Unnamed: 0,id,retweet_count,date_created,tweet,clean_tweet
0,1508758968482635778,1092,2022-03-29 10:52:24+00:00,b'A 31-year-old Ugandan traditional healer liv...,year old ugandan traditional healer living so...
1,1508661904192913410,2275,2022-03-29 04:26:42+00:00,"b'Date mein kya rakha hai?\nMarch 29th, a very...",date mein kya rakha hai nmarch very significa...
2,1508815292578816015,563,2022-03-29 14:36:12+00:00,b'Moeletsi Mbeki has accused South Africa\xe2\...,moeletsi mbeki has accused south africa rulin...
3,1509067192888926208,9,2022-03-30 07:17:10+00:00,"b""RT @ThamiMasemola: Isuzu Motors South Africa...",isuzu motors south africa has begun manufactu...
4,1509067189827026945,0,2022-03-30 07:17:09+00:00,"b'SAFA President, ANC Dr Danny Jordaan \n\nIs ...",safa president anc danny jordaan nis destroyi...


In [None]:
# Drop English stop words and every token of three characters or fewer.
stop_words = set(stopwords.words("english"))
data["clean_tweet"] = data["clean_tweet"].apply(
    lambda text: " ".join(
        token.lower()
        for token in text.split()
        if len(token) > 3 and token not in stop_words
    )
)

In [None]:
# Split each cleaned tweet on whitespace into a list of tokens.
tweets = data["clean_tweet"].map(str.split)

In [None]:
# TODO: skipping lemmatization improved the topic results -- needs more investigation

# Lemmatize every token with WordNet (the corpus is fetched first).
nltk.download('wordnet')
lemma = WordNetLemmatizer()
lemming_tweets = tweets.apply(
    lambda tokens: [lemma.lemmatize(token) for token in tokens]
)
lemming_tweets


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


0       [year, ugandan, traditional, healer, living, s...
1       [date, mein, rakha, nmarch, significant, date,...
2       [moeletsi, mbeki, accused, south, africa, ruli...
3       [isuzu, motor, south, africa, begun, manufactu...
4       [safa, president, danny, jordaan, destroying, ...
                              ...                        
2395    [mined, kimberley, south, africa, carat, tiffa...
2396    [breaking, government, mexico, publicly, rejec...
2397    [giveaway, human, lion, cage, experience, life...
2398      [save, exchange, south, africa, showing, flame]
2399    [must, watch, ndrama, agony, ecstasy, access, ...
Name: clean_tweet, Length: 2400, dtype: object

# Modelling 

In [None]:
# Download the large English spaCy model (about 828 MB according to the recorded output).
# NOTE(review): no cell visible in this notebook calls spacy.load('en_core_web_lg') -- confirm the download is still needed.
!python -m spacy download en_core_web_lg

Collecting en_core_web_lg==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-2.2.5/en_core_web_lg-2.2.5.tar.gz (827.9 MB)
[K     |████████████████████████████████| 827.9 MB 1.3 MB/s 
Building wheels for collected packages: en-core-web-lg
  Building wheel for en-core-web-lg (setup.py) ... [?25l[?25hdone
  Created wheel for en-core-web-lg: filename=en_core_web_lg-2.2.5-py3-none-any.whl size=829180942 sha256=e86c8838a6db2de2df47b29f2e6b3f41c0f14287f19f1eab6205842046a63ce0
  Stored in directory: /tmp/pip-ephem-wheel-cache-b4p6btkn/wheels/11/95/ba/2c36cc368c0bd339b44a791c2c1881a1fb714b78c29a4cb8f5
Successfully built en-core-web-lg
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
# %pip installs into the environment of the running kernel and does not rely on
# IPython's automagic to turn a bare `pip` into a command.
# NOTE(review): pin the gensim version once the one used for the recorded results is confirmed.
%pip install gensim



In [None]:
# create id2word dictionary from the lemmatized token lists
# Imported in this cell because the notebook's main gensim import cell sits further
# down; on a fresh kernel (Restart & Run All) Dictionary would otherwise be undefined here.
from gensim.corpora import Dictionary

id2word = Dictionary(lemming_tweets)
print(len(id2word))  # vocabulary size

3633


In [None]:
#id2word.filter_extremes(no_below=2, no_above=.99)
#print(len(id2word))

1823


In [None]:
# Convert every token list to its bag-of-words form using id2word.
corpus = list(map(id2word.doc2bow, lemming_tweets))

In [None]:
# Instantiating a Base LDA model (multicore trainer, 8 topics)
# Imported in this cell because the notebook's main gensim import cell sits further
# down; on a fresh kernel LdaMulticore would otherwise be undefined here.
from gensim.models.ldamulticore import LdaMulticore

# random_state seeds the model initialisation, matching the second model's seed.
# NOTE(review): multi-process training may still vary slightly between runs -- confirm.
# workers is left at gensim's default (derived from the machine's core count)
# instead of a hard-coded 12 processes.
base_model = LdaMulticore(corpus=corpus, num_topics=8, id2word=id2word,
                          passes=5, random_state=100)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt

In [None]:
# Top 10 words of every topic of the base model, as one list of strings per topic.
# show_topics(formatted=False) returns (topic_id, [(word, weight), ...]) pairs, so the
# words are read directly instead of being regex-parsed out of the printed
# '0.031*"word" + ...' representation; num_topics=-1 selects all topics in id order.
words = [
    [term for term, _weight in topic_terms]
    for _topic_id, topic_terms in base_model.show_topics(
        num_topics=-1, num_words=10, formatted=False
    )
]

In [None]:
# One space-joined string per topic, built from at most its first ten words.
topics = [' '.join(topic_words[:10]) for topic_words in words]

In [None]:
# Print each topic's words under a numbered header.
# The loop variable is named topic_id: in a notebook it lives in the global
# namespace, so calling it `id` would shadow the builtin for every later cell.
for topic_id, topic_words in enumerate(topics):
    print(f"------ Topic {topic_id} ------")
    print(topic_words, end="\n\n")

------ Topic 0 ------
south africa oscar lupita would world afterparty come report worst

------ Topic 1 ------
africa south say manufacturing shift cancer bell away northern equipment

------ Topic 2 ------
south africa country power russia sanction global government western covid

------ Topic 3 ------
nelson africa never mandela south long name happy shall forgotten

------ Topic 4 ------
south africa watch look india must http access epic scene

------ Topic 5 ------
continue friend year white south indian coloured morning medium africa

------ Topic 6 ------
south africa morning brother skill rare fighter african child country

------ Topic 7 ------
south africa woman hockey world know held zimbabwe team junior

------ Topic 8 ------
south africa east asia wheat middle note anything dependent similarly

------ Topic 9 ------
south africa stand leaked call reject text across return globe



In [None]:
# Instantiating the second LDA model: single-process LdaModel, 10 topics, fixed seed
# Imported in this cell because the notebook's main gensim import cell sits further
# down; on a fresh kernel the name `gensim` would otherwise be undefined here.
import gensim

LDA = gensim.models.ldamodel.LdaModel
lda_model = LDA(corpus=corpus, id2word=id2word, num_topics=10, random_state=100,
                chunksize=1000, passes=50, iterations=100)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad 

In [None]:
# Show the second model's topics as (topic id, 'weight*"word" + ...') pairs.
lda_model.print_topics()

[(0,
  '0.031*"khatronkekhiladi" + 0.021*"away" + 0.020*"time" + 0.020*"manufacturing" + 0.019*"say" + 0.019*"bell" + 0.019*"truck" + 0.019*"shift" + 0.018*"south" + 0.018*"northern"'),
 (1,
  '0.102*"south" + 0.095*"africa" + 0.020*"african" + 0.012*"president" + 0.012*"apartheid" + 0.011*"black" + 0.009*"country" + 0.009*"europe" + 0.009*"america" + 0.008*"nation"'),
 (2,
  '0.038*"woman" + 0.030*"hockey" + 0.029*"country" + 0.024*"held" + 0.022*"world" + 0.022*"stand" + 0.019*"south" + 0.019*"team" + 0.019*"russia" + 0.019*"know"'),
 (3,
  '0.088*"africa" + 0.087*"south" + 0.028*"morning" + 0.028*"brother" + 0.027*"skill" + 0.025*"rare" + 0.025*"fighter" + 0.014*"many" + 0.012*"organized" + 0.012*"solidarity"'),
 (4,
  '0.065*"south" + 0.063*"africa" + 0.044*"india" + 0.042*"must" + 0.040*"look" + 0.039*"watch" + 0.035*"http" + 0.035*"access" + 0.034*"epic" + 0.034*"behind"'),
 (5,
  '0.050*"africa" + 0.049*"south" + 0.041*"continue" + 0.027*"friend" + 0.022*"white" + 0.013*"indian"

**VISUALIZATION**

In [None]:
# %pip installs into the environment of the running kernel; the version is pinned
# to the one the recorded run installed so the visualisation stays reproducible.
%pip install pyLDAvis==3.3.1

Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.6 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting funcy
  Downloading funcy-1.17-py2.py3-none-any.whl (33 kB)
Building wheels for collected packages: pyLDAvis
  Building wheel for pyLDAvis (PEP 517) ... [?25l[?25hdone
  Created wheel for pyLDAvis: filename=pyLDAvis-3.3.1-py2.py3-none-any.whl size=136898 sha256=2c2c25c56fc708ab077e07bfbace634136a86aff59c88e1c8dd47a371c5fe123
  Stored in directory: /root/.cache/pip/wheels/c9/21/f6/17bcf2667e8a68532ba2fbf6d5c72fdf4c7f7d9abfa4852d2f
Successfully built pyLDAvis
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-1.17 pyLDAvis-3.3.1


In [None]:
#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
import chart_studio
import chart_studio.plotly as py 
import chart_studio.tools as tls

#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
#from pprint import pprint
#from wordcloud import STOPWORDS
#stopwords = set(STOPWORDS)

Evaluation and visualization of model 1 (the base LdaMulticore model)

In [None]:
# Per-word likelihood bound of the base model on the training corpus.
# log_perplexity returns this (negative) bound, not the perplexity itself:
# values closer to zero are better. Perplexity is 2 ** (-bound); lower is better.
base_perplexity = base_model.log_perplexity(corpus)
print('\nPer-word log-likelihood bound: ', base_perplexity)
print('Perplexity: ', np.exp2(-base_perplexity))

# Compute Coherence Score (c_v) of the base model; higher is better.
coherence_model = CoherenceModel(model=base_model, texts=lemming_tweets,
                                 dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model_base)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt


Perplexity:  -6.489449066841204

Coherence Score:  0.32775855869944387


In [None]:
# Creating the topic distance visualization for the base model (base_model)
pyLDAvis.enable_notebook()
# Prepare the pyLDAvis data from the model, the bag-of-words corpus and the dictionary.
lda_viz = gensimvis.prepare(base_model, corpus, id2word)
# Last expression of the cell, so the notebook displays it.
lda_viz

  by='saliency', ascending=False).head(R).drop('saliency', 1)


Visualization and evaluation of model 2 (the single-process LdaModel)

In [None]:
# Creating the topic distance visualization for the second model (lda_model)
pyLDAvis.enable_notebook()
# Same preparation as for the base model; note that this rebinds the name lda_viz.
lda_viz = gensimvis.prepare(lda_model, corpus, id2word)
# Last expression of the cell, so the notebook displays it.
lda_viz

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [None]:
# Per-word likelihood bound of the SECOND model (lda_model) on the training corpus.
# The bound is computed from lda_model (not base_model) and stored under its own
# name, so the base model's figures are neither repeated nor overwritten.
# log_perplexity returns the (negative) bound, not the perplexity itself:
# values closer to zero are better. Perplexity is 2 ** (-bound); lower is better.
lda_perplexity = lda_model.log_perplexity(corpus)
print('\nPer-word log-likelihood bound: ', lda_perplexity)
print('Perplexity: ', np.exp2(-lda_perplexity))

# Compute Coherence Score (c_v) of the second model; higher is better.
coherence_model = CoherenceModel(model=lda_model, texts=lemming_tweets,
                                 dictionary=id2word, coherence='c_v')
coherence_lda_model = coherence_model.get_coherence()
print('\nCoherence Score: ', coherence_lda_model)

  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)
  score += np.sum(cnt


Perplexity:  -6.557480079926102

Coherence Score:  0.3445004717085379
