<a href="https://colab.research.google.com/github/githinjimary/LDATopicModelling/blob/main/MG_gensim_lda_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# download stopwords

!python -m nltk.downloader stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
# utility functions from -- https://ourcodingclub.github.io/tutorials/topic-modelling-python/


def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet=re.sub(r'^b\s([RT]+)?','',tweet)  #replace RT-tags
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub("b'", '', tweet) # remove the b'
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet

In [4]:
def clean2(text):
    text = str(text).lower()
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-z A-Z]', ' ',text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'@[A-Za-z0–9]+', '', text) 
    text = re.sub(r' +', ' ', text)
    return text

In [5]:
# download the tweet dataset 

!wget https://dsiwork.s3.amazonaws.com/dataset.csv

--2022-04-13 07:41:51--  https://dsiwork.s3.amazonaws.com/dataset.csv
Resolving dsiwork.s3.amazonaws.com (dsiwork.s3.amazonaws.com)... 54.231.128.97
Connecting to dsiwork.s3.amazonaws.com (dsiwork.s3.amazonaws.com)|54.231.128.97|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 496370 (485K) [text/csv]
Saving to: ‘dataset.csv’


2022-04-13 07:41:53 (671 KB/s) - ‘dataset.csv’ saved [496370/496370]



In [6]:
data = pd.read_csv("dataset.csv", parse_dates=["date_created"],   encoding="ISO-8859-1")

In [7]:
data.head()

Unnamed: 0,id,retweet_count,date_created,tweet
0,1508758968482635778,1092,2022-03-29 10:52:24+00:00,b'A 31-year-old Ugandan traditional healer liv...
1,1508661904192913410,2275,2022-03-29 04:26:42+00:00,"b'Date mein kya rakha hai?\nMarch 29th, a very..."
2,1508815292578816015,563,2022-03-29 14:36:12+00:00,b'Moeletsi Mbeki has accused South Africa\xe2\...
3,1509067192888926208,9,2022-03-30 07:17:10+00:00,"b""RT @ThamiMasemola: Isuzu Motors South Africa..."
4,1509067189827026945,0,2022-03-30 07:17:09+00:00,"b'SAFA President, ANC Dr Danny Jordaan \n\nIs ..."


In [8]:
#data['clean_tweet'] = data.tweet.apply(clean_tweet)
data['clean_tweet'] = data.tweet.apply(clean2)
data.head()

Unnamed: 0,id,retweet_count,date_created,tweet,clean_tweet
0,1508758968482635778,1092,2022-03-29 10:52:24+00:00,b'A 31-year-old Ugandan traditional healer liv...,year old ugandan traditional healer living so...
1,1508661904192913410,2275,2022-03-29 04:26:42+00:00,"b'Date mein kya rakha hai?\nMarch 29th, a very...",date mein kya rakha hai nmarch very significa...
2,1508815292578816015,563,2022-03-29 14:36:12+00:00,b'Moeletsi Mbeki has accused South Africa\xe2\...,moeletsi mbeki has accused south africa rulin...
3,1509067192888926208,9,2022-03-30 07:17:10+00:00,"b""RT @ThamiMasemola: Isuzu Motors South Africa...",isuzu motors south africa has begun manufactu...
4,1509067189827026945,0,2022-03-30 07:17:09+00:00,"b'SAFA President, ANC Dr Danny Jordaan \n\nIs ...",safa president anc danny jordaan nis destroyi...


In [9]:
# Remove stopwords
stop_words = set(stopwords.words("english"))
data["clean_tweet"] = data["clean_tweet"].apply(lambda x : " ".join([w.lower() for w in x.split() if w not in stop_words and len(w) > 3]))

In [10]:
#Tokenize tweet
tweets = data["clean_tweet"].apply(lambda x : x.split())

In [11]:
#skipping lemmatization improved the topic results -- needs more investigation

#lemmatize
lemma = WordNetLemmatizer()
nltk.download('wordnet')
lemming_tweets = tweets.apply(lambda x:[lemma.lemmatize(i) for i in x])
lemming_tweets


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


0       [year, ugandan, traditional, healer, living, s...
1       [date, mein, rakha, nmarch, significant, date,...
2       [moeletsi, mbeki, accused, south, africa, ruli...
3       [isuzu, motor, south, africa, begun, manufactu...
4       [safa, president, danny, jordaan, destroying, ...
                              ...                        
2395    [mined, kimberley, south, africa, carat, tiffa...
2396    [breaking, government, mexico, publicly, rejec...
2397    [giveaway, human, lion, cage, experience, life...
2398      [save, exchange, south, africa, showing, flame]
2399    [must, watch, ndrama, agony, ecstasy, access, ...
Name: clean_tweet, Length: 2400, dtype: object

# Modelling 

In [12]:
%%capture
!python -m spacy download en_core_web_lg

In [28]:
pip install gensim



In [18]:
#create id2word dictionary
from gensim.corpora import Dictionary
id2word = Dictionary(lemming_tweets)
print(len(id2word))

3633


In [16]:
#id2word.filter_extremes(no_below=2, no_above=.99)
#print(len(id2word))

In [21]:
# Creating a corpus object 
corpus = [id2word.doc2bow(d) for d in lemming_tweets]

In [23]:
# Instantiating a Base LDA model
%%capture 
from gensim.models.ldamulticore import LdaMulticore
base_model = LdaMulticore(corpus=corpus, num_topics=8, id2word=id2word, workers=12, passes=5)

In [24]:
# Filtering for words 
words = [re.findall(r'"([^"]*)"',t[1]) for t in base_model.print_topics()]

In [25]:
# Create Topics
topics = [' '.join(t[0:10]) for t in words]

In [26]:
# Getting the topics
for id, t in enumerate(topics): 
    print(f"------ Topic {id} ------")
    print(t, end="\n\n")

------ Topic 0 ------
south africa asia east wheat middle anything note similarly dependent

------ Topic 1 ------
oscar south africa world woman lupita hockey would held know

------ Topic 2 ------
south africa country russia year cape sanction global western town

------ Topic 3 ------
africa south nelson khatronkekhiladi brother skill morning rare fighter long

------ Topic 4 ------
south africa india watch look must http epic access behind

------ Topic 5 ------
south africa african drug black country work people entrepreneur anxiety

------ Topic 6 ------
continue friend away white manufacturing northern bell shift equipment hemisphere

------ Topic 7 ------
covid vaccine government south waiver breaking mexico trip publicly reject



**second model**

In [29]:
# Instantiating a Base LDA model
%%capture 
import gensim

LDA = gensim.models.ldamodel.LdaModel
lda_model = LDA(corpus=corpus, id2word=id2word, num_topics=10, random_state=100,
                chunksize=1000, passes=50,iterations=100)

In [30]:
lda_model.print_topics()

[(0,
  '0.031*"khatronkekhiladi" + 0.021*"away" + 0.020*"time" + 0.020*"manufacturing" + 0.019*"say" + 0.019*"bell" + 0.019*"truck" + 0.019*"shift" + 0.018*"south" + 0.018*"northern"'),
 (1,
  '0.102*"south" + 0.095*"africa" + 0.020*"african" + 0.012*"president" + 0.012*"apartheid" + 0.011*"black" + 0.009*"country" + 0.009*"europe" + 0.009*"america" + 0.008*"nation"'),
 (2,
  '0.038*"woman" + 0.030*"hockey" + 0.029*"country" + 0.024*"held" + 0.022*"world" + 0.022*"stand" + 0.019*"south" + 0.019*"team" + 0.019*"russia" + 0.019*"know"'),
 (3,
  '0.088*"africa" + 0.087*"south" + 0.028*"morning" + 0.028*"brother" + 0.027*"skill" + 0.025*"rare" + 0.025*"fighter" + 0.014*"many" + 0.012*"organized" + 0.012*"solidarity"'),
 (4,
  '0.065*"south" + 0.063*"africa" + 0.044*"india" + 0.042*"must" + 0.040*"look" + 0.039*"watch" + 0.035*"http" + 0.035*"access" + 0.034*"epic" + 0.034*"behind"'),
 (5,
  '0.050*"africa" + 0.049*"south" + 0.041*"continue" + 0.027*"friend" + 0.022*"white" + 0.013*"indian"

**VISUALIZATION**

In [31]:
# install if not available
%%capture
!pip install pyLDAvis

In [33]:
#Visualizations
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt 
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis


#Natural Language Processing (NLP)
import spacy
import gensim
from spacy.tokenizer import Tokenizer
from gensim.corpora import Dictionary
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS as SW
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV


visualization of model1

In [39]:
# Compute Perplexity
# a measure of how good the model is. lower the better
%%capture
base_perplexity = base_model.log_perplexity(corpus)

# Compute Coherence Score
coherence_model = CoherenceModel(model=base_model, texts=lemming_tweets, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()


In [40]:
print('\nPerplexity: ', base_perplexity) 
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -6.515977277159461

Coherence Score:  0.34292349651464127


In [35]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
lda_viz = gensimvis.prepare(base_model, corpus, id2word)
lda_viz

  by='saliency', ascending=False).head(R).drop('saliency', 1)


visualization for model2

In [36]:
pyLDAvis.enable_notebook()
lda_viz = gensimvis.prepare(lda_model, corpus, id2word)
lda_viz

  by='saliency', ascending=False).head(R).drop('saliency', 1)


In [37]:
# Compute Perplexity
# a measure of how good the model is. lower the better
%%capture
base_perplexity = base_model.log_perplexity(corpus)
 
# Compute Coherence Score
coherence_model = CoherenceModel(model=lda_model, texts=lemming_tweets, 
                                   dictionary=id2word, coherence='c_v')
coherence_lda_model_base = coherence_model.get_coherence()


In [38]:
print('\nPerplexity: ', base_perplexity)
print('\nCoherence Score: ', coherence_lda_model_base)


Perplexity:  -6.51592199107685

Coherence Score:  0.3445004717085379


*Based on the values, model two did better***