<a href="https://colab.research.google.com/github/githinjimary/LDATopicModelling/blob/main/MG_sklearn_topic_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install if not available
%%capture
!pip install pyLDAvis

In [2]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

  from collections import Iterable
  from collections import Mapping


#Preprocessing

In [3]:
# download stopwords
!python -m nltk.downloader stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
def clean2(text):
    """cleans the text to prepare for NLP"""
    
    text = str(text).lower()
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-z A-Z]', ' ',text)
    text = re.sub(r'\b\w{1,2}\b', '', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub(r'#', '', text)
    text = re.sub(r'[^\w\s]','',text)
    text = re.sub(r'@[A-Za-z0–9]+', '', text) 
    text = re.sub(r' +', ' ', text)
    return text

  text = re.sub('https?://\S+|www\.\S+', '', text)
  text = re.sub('\[.*?\]', '', text)
  text = re.sub('\w*\d\w*', '', text)


In [5]:
# download the tweet dataset 

!wget https://dsiwork.s3.amazonaws.com/dataset.csv

--2022-04-13 07:28:42--  https://dsiwork.s3.amazonaws.com/dataset.csv
Resolving dsiwork.s3.amazonaws.com (dsiwork.s3.amazonaws.com)... 52.216.184.147
Connecting to dsiwork.s3.amazonaws.com (dsiwork.s3.amazonaws.com)|52.216.184.147|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 496370 (485K) [text/csv]
Saving to: ‘dataset.csv’


2022-04-13 07:28:43 (6.13 MB/s) - ‘dataset.csv’ saved [496370/496370]



In [6]:
data = pd.read_csv("dataset.csv", parse_dates=["date_created"],   encoding="ISO-8859-1")

In [7]:
data.head()

Unnamed: 0,id,retweet_count,date_created,tweet
0,1508758968482635778,1092,2022-03-29 10:52:24+00:00,b'A 31-year-old Ugandan traditional healer liv...
1,1508661904192913410,2275,2022-03-29 04:26:42+00:00,"b'Date mein kya rakha hai?\nMarch 29th, a very..."
2,1508815292578816015,563,2022-03-29 14:36:12+00:00,b'Moeletsi Mbeki has accused South Africa\xe2\...
3,1509067192888926208,9,2022-03-30 07:17:10+00:00,"b""RT @ThamiMasemola: Isuzu Motors South Africa..."
4,1509067189827026945,0,2022-03-30 07:17:09+00:00,"b'SAFA President, ANC Dr Danny Jordaan \n\nIs ..."


In [9]:
data['clean_tweet'] = data.tweet.apply(clean2)
data.head()

Unnamed: 0,id,retweet_count,date_created,tweet,clean_tweet
0,1508758968482635778,1092,2022-03-29 10:52:24+00:00,b'A 31-year-old Ugandan traditional healer liv...,year old ugandan traditional healer living so...
1,1508661904192913410,2275,2022-03-29 04:26:42+00:00,"b'Date mein kya rakha hai?\nMarch 29th, a very...",date mein kya rakha hai nmarch very significa...
2,1508815292578816015,563,2022-03-29 14:36:12+00:00,b'Moeletsi Mbeki has accused South Africa\xe2\...,moeletsi mbeki has accused south africa rulin...
3,1509067192888926208,9,2022-03-30 07:17:10+00:00,"b""RT @ThamiMasemola: Isuzu Motors South Africa...",isuzu motors south africa has begun manufactu...
4,1509067189827026945,0,2022-03-30 07:17:09+00:00,"b'SAFA President, ANC Dr Danny Jordaan \n\nIs ...",safa president anc danny jordaan nis destroyi...


In [11]:
# Remove stopwords
stop_words = set(stopwords.words("english"))
data["clean_tweet"] = data["clean_tweet"].apply(lambda x : " ".join([w.lower() for w in x.split() if w not in stop_words and len(w) > 3]))

In [12]:
#Tokenize tweet
tweets = data["clean_tweet"].apply(lambda x : x.split())

In [13]:
data.head()

Unnamed: 0,id,retweet_count,date_created,tweet,clean_tweet
0,1508758968482635778,1092,2022-03-29 10:52:24+00:00,b'A 31-year-old Ugandan traditional healer liv...,year ugandan traditional healer living south a...
1,1508661904192913410,2275,2022-03-29 04:26:42+00:00,"b'Date mein kya rakha hai?\nMarch 29th, a very...",date mein rakha nmarch significant date cricke...
2,1508815292578816015,563,2022-03-29 14:36:12+00:00,b'Moeletsi Mbeki has accused South Africa\xe2\...,moeletsi mbeki accused south africa ruling par...
3,1509067192888926208,9,2022-03-30 07:17:10+00:00,"b""RT @ThamiMasemola: Isuzu Motors South Africa...",isuzu motors south africa begun manufacturing ...
4,1509067189827026945,0,2022-03-30 07:17:09+00:00,"b'SAFA President, ANC Dr Danny Jordaan \n\nIs ...",safa president danny jordaan destroying footba...


**Lemmatization**

In [15]:
%%capture
!python -m spacy download en_core_web_sm


In [17]:
%%capture
import spacy
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append(" ".join([token.lemma_ if token.lemma_ not in ['-PRON-'] else '' for token in doc if token.pos_ in allowed_postags]))
    return texts_out

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only Noun, Adj, Verb, Adverb
data_lemmatized = lemmatization(tweets, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

In [18]:
vectorizer = TfidfVectorizer(ngram_range=(2,3))
data_vectorized = vectorizer.fit_transform(data_lemmatized)

In [19]:
data_vectorized

<2400x6217 sparse matrix of type '<class 'numpy.float64'>'
	with 15522 stored elements in Compressed Sparse Row format>

# Modelling 

In [20]:
# LDA Implementation
number_of_topics = 10
model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

In [21]:
model.fit(data_vectorized)

LatentDirichletAllocation(random_state=0)

In [22]:
def display_topics(model, feature_names, no_top_words):
    """ 
    creates dataframe showing top words for each topic from the model
    Parameters
    ----------
    model: object instance of the topic model
    feature_names: output feature names from vectorizer e.g CountVectorizer.get_feature_names()
    no_top_words: 

    returns
    --------
    dataframe showing topics and the weight for the top words specified
    """

    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

In [23]:
# get the feature names from the vectorization

tf_feature_names = vectorizer.get_feature_names()



In [24]:
no_top_words = 20
display_topics(model, tf_feature_names, no_top_words)
#df.to_excel("topics_output.xlsx")

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,would oscar,40.1,exclusive biggboss time,16.3,name shall never,13.7,term use,7.0,similarly dependent,39.9,first experience,11.1,dose addition,9.1,condemn russian,6.7,watch access,36.1,shift manufacture articulate,8.2
1,lawlessness medicalaidracialprofiling,20.1,biggboss time,16.3,never forgotten,13.7,dictate term,7.0,note similarly dependent,39.9,zimpaper year can,5.1,suffer week,4.6,refuse condemn russian,6.7,watch access scene,36.1,dump truck,8.2
2,team qualify,11.3,exclusive biggboss,16.3,happy name,13.7,dictate term use,7.0,note similarly,39.9,fund medium,5.1,sign agreement,4.6,russian aggression,6.7,must watch,36.1,manufacture articulate dump,8.2
3,qualify woman,11.3,walk drinkabl,12.6,happy name shall,13.7,african entrepreneur,3.1,call country,6.0,can fund medium,5.1,serve cold harmful,3.4,russian aggression nread,6.7,must watch access,36.1,say shift,8.2
4,woman world hold,11.3,mile walk drinkabl,12.6,shall never,13.7,debut spotify daily,2.4,call country reject,6.0,zimpaper year,5.1,harmful drug,3.4,condemn russian aggression,6.7,look epic,36.1,manufacture articulate,8.2
5,woman world,11.3,mile walk,12.6,shall never forgotten,13.7,debut spotify,2.4,country reject,6.0,good morning abuse,5.1,harmful drug large,3.4,adamantly refuse,6.7,scene look epic,36.1,articulate dump,8.2
6,team qualify woman,11.3,child show mile,12.6,name shall,13.7,daily viral,2.4,find public,5.9,morning abuse zimpaper,5.1,serve cold,3.4,aggression nread,6.7,scene look,36.1,articulate dump truck,8.2
7,know woman,11.3,child show,12.6,government disturb drug,7.7,viral song,2.4,study find public,5.9,morning abuse,5.1,drug large,3.4,stick adamantly,6.7,access scene look,36.1,truck away,8.2
8,know woman team,11.3,show mile,12.6,drug protect,7.7,spotify daily,2.4,study find,5.9,year can,5.1,dangerous serve,3.4,stick adamantly refuse,6.7,access scene,36.1,say shift manufacture,8.2
9,woman team qualify,11.3,show mile walk,12.6,mozambique government disturb,7.7,spotify daily viral,2.4,reject weak leak,5.8,year can fund,5.1,dangerous serve cold,3.4,adamantly refuse condemn,6.7,friend continue,12.0,dump truck away,8.2


**Model Performance Metrics**

In [25]:
# log-likelihood
print(model.score(data_vectorized))
# perplexity
print(model.perplexity(data_vectorized))

-50035.89601931713
12972.267981287418


# pyLDAVis

In [27]:
pyLDAvis.sklearn.prepare(model, data_vectorized, vectorizer)

  by='saliency', ascending=False).head(R).drop('saliency', 1)


**Hyperparameter Tuning** 



**How to GridSearch the best LDA model**

In [28]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV


In [29]:
# Define Search Param
search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}

# Init the Model
lda = LatentDirichletAllocation()

# Init Grid Search Class
model2 = GridSearchCV(lda, param_grid=search_params)

# Do the Grid Search
model2.fit(data_vectorized)

GridSearchCV(estimator=LatentDirichletAllocation(),
             param_grid={'learning_decay': [0.5, 0.7, 0.9],
                         'n_components': [10, 15, 20, 25, 30]})

**How to see the best topic model and its parameters?**

In [30]:
# Best Model
best_lda_model = model2.best_estimator_

# Model Parameters
print("Best Model's Params: ", model2.best_params_)

# Log Likelihood Score
print("Best Log Likelihood Score: ", model2.best_score_)

# Perplexity
print("Model Perplexity: ", best_lda_model.perplexity(data_vectorized))


Best Model's Params:  {'learning_decay': 0.5, 'n_components': 10}
Best Log Likelihood Score:  -18443.739474704387
Model Perplexity:  12978.496776555903


*This shows us that the best model is obtained with 10 topics as done above*

# Inference

In [31]:
def get_inference(model, vectorizer, topics, text, threshold):
    """
    runs inference on text input

    paramaters
    ----------
    model: loaded model to use to transform the input
    vectorizer: instance of the vectorizer e.g TfidfVectorizer(ngram_range=(2, 3))
    topics: the list of topics in the model
    text: input string to be classified
    threshold: float of threshold to use to output a topic

    returns
    -------
    tuple => (top score, the scores for each topic)
    
    """
    v_text = vectorizer.transform([text])
    score = model.transform(v_text)

    labels = set()
    for i in range(len(score[0])):
        if score[0][i] > threshold:
            labels.add(topics[i])
    if not labels:
        return 'None', -1, set()

    return topics[np.argmax(score)], score

In [32]:
# test the model with some text

topics = list(np.arange(0,10))
result = get_inference(model, vectorizer, topics, "operation dudula", 0 )
result

(0, array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]))

# Testing inference from loading the model

In [33]:
# Save the model then test it by loading it
with open("lda_model.pk","wb") as f:
  pickle.dump(model, f)
f.close()

# then reload it
with open("lda_model.pk","rb") as f:
  lda_model = pickle.load(f)

In [34]:
# test example text

result = get_inference(lda_model, vectorizer, topics, "operation dudula", 0 )
result

(0, array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]))

In [35]:
pickle.dump(vectorizer, open("vectorizer.pickle", "wb")) 
#pickle.load(open("models/vectorizer.pickle", 'rb'))     // Load vectorizer