# Topic modelling and sentiment analysis

## Imports 

In [46]:
import re
import warnings
import gensim
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
from gensim import corpora
from wordcloud import WordCloud, STOPWORDS
from gensim.models import CoherenceModel
from nltk.stem import WordNetLemmatizer

warnings.filterwarnings('ignore')

## Data reading 

In [2]:
tweets = pd.read_csv("./clean_tweets.csv")
tweets[:2]

Unnamed: 0,created_at,original_text,polarity,subjectivity,favorite_count,retweet_count,original_author,followers_count,friends_count,possibly_sensitive,hashtags,place,hashtags_in_tweets,screen_name,device
0,2021-06-18 17:55:49+00:00,africa is in the midst of a full blown third...,0.166667,0.188889,548,612,ketuesriche,551,351,False,,mass,,"@telglobalhealth, @telglobalhealth",twitter for iphone
1,2021-06-18 17:55:59+00:00,dr moeti is head of who in africa and one of ...,0.133333,0.455556,195,92,grid1949,66,92,False,,"edinburgh, scotland",,@globalhlthtwit,twitter web app


In [3]:
tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6532 entries, 0 to 6531
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   created_at          6532 non-null   object 
 1   original_text       6532 non-null   object 
 2   polarity            6532 non-null   float64
 3   subjectivity        6532 non-null   float64
 4   favorite_count      6532 non-null   int64  
 5   retweet_count       6532 non-null   int64  
 6   original_author     6532 non-null   object 
 7   followers_count     6532 non-null   int64  
 8   friends_count       6532 non-null   int64  
 9   possibly_sensitive  6532 non-null   bool   
 10  hashtags            6532 non-null   object 
 11  place               6532 non-null   object 
 12  hashtags_in_tweets  6532 non-null   object 
 13  screen_name         6532 non-null   object 
 14  device              6532 non-null   object 
dtypes: bool(1), float64(2), int64(4), object(8)
memory usag

## Topic modelling

## Feature Extraction

We will create a dataset containg only the columns we need for topic modeling.

In [4]:
df = pd.DataFrame(columns=['clean_text'])
df['clean_text'] = tweets['original_text'].astype(str)
df[:5]

Unnamed: 0,clean_text
0,africa is in the midst of a full blown third...
1,dr moeti is head of who in africa and one of ...
2,thank you research note for creating this ama...
3,former pfizer vp and virologist dr michael y...
4,i think it s important that we don t sell cova...


### Data Pre-processing

I have already done common data preprocessing tasks like, converting to lowercase, removing Punctuation and links. Here we will go further and make the data ready for topic modelling.

### Remove stop words

In [5]:
freqX = pd.Series(
    ' '.join(df['clean_text']).split()).value_counts()[:10]

print('FREQ X: \n', freqX)

FREQ X: 
 the         7191
to          6094
vaccines    4462
of          4162
and         3501
in          3499
#covid      3086
a           2397
is          2261
are         1986
dtype: int64


As we can see most them are stop words so should we remove them. While exploring the data, I have also found while exploring the data using word cloud, there are some meaning less characters in the data. we will also remove those.

In [6]:
custom_stopwords = ['t', 'rt', 'ti', 'vk', 'to', 'co',
                    'dqlw', 'z', 'nd', 'm', 's', 'kur', 'u', 'o', 'd']
STOP_WORDS = STOPWORDS.union(custom_stopwords)

#### Tokenizing


In [7]:
df['clean_text']

0        africa is  in the midst of a full blown third...
1       dr moeti is head of who in africa  and one of ...
2       thank you  research note for creating this ama...
3       former pfizer vp and virologist  dr  michael y...
4       i think it s important that we don t sell cova...
                              ...                        
6527    rt  givenkazeni  zweli please just release the...
6528    former pfizer vp and virologist  dr  michael y...
6529     jenfeds    dcrinyyc respectfully  veterinaria...
6530     africa needs millions more doses here  amp  n...
6531    rt  shawajason  liars  you tried to load off y...
Name: clean_text, Length: 6532, dtype: object

In [8]:
df['clean_text'] = df['clean_text'].apply(
    lambda x: [item for item in x.split() if item not in STOP_WORDS])

In [9]:
df['clean_text']


0       [africa, midst, full, blown, third, wave, coro...
1       [dr, moeti, head, africa, one, best, public, h...
2       [thank, research, note, creating, amazing, cam...
3       [former, pfizer, vp, virologist, dr, michael, ...
4       [think, important, don, sell, covax, short, st...
                              ...                        
6527    [givenkazeni, zweli, please, release, graphic,...
6528    [former, pfizer, vp, virologist, dr, michael, ...
6529    [jenfeds, dcrinyyc, respectfully, veterinarian...
6530    [africa, needs, millions, doses, amp, now, cur...
6531    [shawajason, liars, tried, load, expired, vacc...
Name: clean_text, Length: 6532, dtype: object

In [10]:
sentence_list = [sent for sent in df['clean_text']]
print(sentence_list[:2])

[['africa', 'midst', 'full', 'blown', 'third', 'wave', 'coronavirus', 'head', 'whoafro', 'warned', 'cases', 'risen', 'across', 'continent', 'deaths', 'risen', 'last', 'week', 'jriggers', 'reports'], ['dr', 'moeti', 'head', 'africa', 'one', 'best', 'public', 'health', 'experts', 'leaders', 'know', 'desperate', 'request', 'vaccines', 'africa', 'plead', 'germany', 'uk', 'lift', 'patent', 'restrictions', 'urgently', 'transfer', 'technology', 'enable', 'production', 'africa']]


In [11]:
word_list = [sent for sent in sentence_list]
print(word_list[:2])

[['africa', 'midst', 'full', 'blown', 'third', 'wave', 'coronavirus', 'head', 'whoafro', 'warned', 'cases', 'risen', 'across', 'continent', 'deaths', 'risen', 'last', 'week', 'jriggers', 'reports'], ['dr', 'moeti', 'head', 'africa', 'one', 'best', 'public', 'health', 'experts', 'leaders', 'know', 'desperate', 'request', 'vaccines', 'africa', 'plead', 'germany', 'uk', 'lift', 'patent', 'restrictions', 'urgently', 'transfer', 'technology', 'enable', 'production', 'africa']]


### Lemmatization

In [12]:
lemmatizer = WordNetLemmatizer()
word_list_lematized = []

for w in word_list:
    word_list_lematized.append([lemmatizer.lemmatize(x) for x in w])
print(word_list_lematized[:2])

[['africa', 'midst', 'full', 'blown', 'third', 'wave', 'coronavirus', 'head', 'whoafro', 'warned', 'case', 'risen', 'across', 'continent', 'death', 'risen', 'last', 'week', 'jriggers', 'report'], ['dr', 'moeti', 'head', 'africa', 'one', 'best', 'public', 'health', 'expert', 'leader', 'know', 'desperate', 'request', 'vaccine', 'africa', 'plead', 'germany', 'uk', 'lift', 'patent', 'restriction', 'urgently', 'transfer', 'technology', 'enable', 'production', 'africa']]


### Model

In [13]:
#Create dictionary which contains Id and word
id2word = corpora.Dictionary(word_list_lematized)
corpus = [id2word.doc2bow(tweet) for tweet in word_list]

In [14]:
print(np.array(word_list).shape)
print(np.array(id2word).shape)
print(np.array(corpus).shape)

(6532,)
(6339,)
(6532,)


In [63]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                            id2word=id2word,
                                            num_topics=7,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto',
                                            per_word_topics=True)

In [64]:
pprint(lda_model.show_topics(formatted=True))

[(0,
  '0.056*"#covid" + 0.031*"amp" + 0.030*"don" + 0.028*"covid" + '
  '0.019*"#vaccines" + 0.016*"will" + 0.012*"provide" + 0.011*"question" + '
  '0.011*"#covidvaccination" + 0.010*"vaccinated"'),
 (1,
  '0.043*"israel" + 0.038*"expired" + 0.036*"another" + 0.035*"tried" + '
  '0.035*"war" + 0.035*"crime" + 0.035*"load" + 0.035*"shawajason" + '
  '0.027*"world" + 0.021*"need"'),
 (2,
  '0.081*"dose" + 0.057*"capacity" + 0.053*"age" + 0.048*"code" + 0.048*"pin" '
  '+ 0.048*"limit" + 0.048*"min" + 0.038*"covaxin" + 0.031*"pmcg" + '
  '0.024*"million"'),
 (3,
  '0.119*"india" + 0.038*"africa" + 0.035*"#covid" + 0.034*"vaccination" + '
  '0.032*"wave" + 0.030*"third" + 0.022*"amp" + 0.020*"full" + 0.019*"midst" + '
  '0.019*"vaccine"'),
 (4,
  '0.047*"people" + 0.041*"#covid" + 0.034*"australia" + 0.017*"vaccine" + '
  '0.015*"must" + 0.013*"huge" + 0.011*"#india" + 0.011*"response" + '
  '0.010*"now" + 0.009*"us"'),
 (5,
  '0.066*"africa" + 0.027*"dr" + 0.027*"risk" + 0.022*"re" + 0.

#### Compute Perplexity
It's a measure of how good the model is. The lower the better. Perplexity is a negative value

In [65]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))


Perplexity:  -6.791372861272658


#### Compute Coherence Score


In [66]:
doc_lda = lda_model[corpus]
coherence_model_lda = CoherenceModel(
    model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)


 Ldamodel Coherence Score/Accuracy on Tweets:  0.35099594180510907


### Visualization

In [18]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis

In [19]:
vis_data = gensimvis.prepare(lda_model, corpus, id2word)
pyLDAvis.display(vis_data)


  and should_run_async(code)


##  Sentiment analysis

### Form a new data frame (named `cleanTweet`), containing columns $\textbf{clean-text}$ and $\textbf{polarity}$

In [32]:
df = pd.DataFrame(columns=['clean_text', 'polarity'])
df['clean_text'] = tweets['original_text']
df['polarity'] = tweets['polarity']

In [31]:
def clean_tweet(tweet):
    clean_tweet = re.sub("[^a-zA-Z]",  " ",  tweet)
    return clean_tweet

In [34]:
df['clean_text'] = df['clean_text'].apply(clean_tweet)
df[:2]

Unnamed: 0,clean_text,polarity
0,africa is in the midst of a full blown third...,0.166667
1,dr moeti is head of who in africa and one of ...,0.133333


In [35]:
print("duplicate count: {}".format(df.isnull().sum().sum()))

duplicate count: 0


### Write a function `text_category` that takes a value `p` and returns, depending on the value of p, a string `'positive'`, `'negative'` or `'neutral'`.

In [37]:
def text_category(p):
  if p > 0:
    return "positive"
  elif p < 0:
    return "negative"
  else:
    return "neutral"

### Apply this function (`text_category`) on the $\textbf{polarity}$ column of `cleanTweet` in 1 above to form a new column called $\textbf{score}$ in `cleanTweet`.

In [38]:
df["polarity"] = df["polarity"].apply(text_category)
df[:2]


Unnamed: 0,clean_text,polarity
0,africa is in the midst of a full blown third...,positive
1,dr moeti is head of who in africa and one of ...,positive


### Visualize The $\textbf{score}$ column using piechart and barchart

In [44]:
category = df.groupby(['polarity']).size()
category

polarity
negative    1277
neutral     1829
positive    3426
dtype: int64

## build a classification model on the clean tweet.

### Remove rows from `cleanTweet` where $\textbf{polarity}$ $= 0$ (i.e where $\textbf{score}$ = Neutral) and reset the frame index.

In [51]:
df = df[df['polarity'] != 'neutral']
df


Unnamed: 0,clean_text,polarity
0,africa is in the midst of a full blown third...,positive
1,dr moeti is head of who in africa and one of ...,positive
2,thank you research note for creating this ama...,positive
3,former pfizer vp and virologist dr michael y...,positive
4,i think it s important that we don t sell cova...,positive
...,...,...
6524,covid vaccines reach the remotest places of ...,negative
6526,although there is not yet a vaccine for kids t...,positive
6528,former pfizer vp and virologist dr michael y...,positive
6529,jenfeds dcrinyyc respectfully veterinaria...,positive


### Construct a column $\textbf{scoremap}$ Use the mapping {'positive':1, 'negative':0} on the $\textbf{score}$ column

In [52]:
df['scoremap'] = df["polarity"].map( lambda score: 1 if score == "positive" else 0)
df

Unnamed: 0,clean_text,polarity,scoremap
0,africa is in the midst of a full blown third...,positive,1
1,dr moeti is head of who in africa and one of ...,positive,1
2,thank you research note for creating this ama...,positive,1
3,former pfizer vp and virologist dr michael y...,positive,1
4,i think it s important that we don t sell cova...,positive,1
...,...,...,...
6524,covid vaccines reach the remotest places of ...,negative,0
6526,although there is not yet a vaccine for kids t...,positive,1
6528,former pfizer vp and virologist dr michael y...,positive,1
6529,jenfeds dcrinyyc respectfully veterinaria...,positive,1


### Create feature and target variables `(X,y)` from $\textbf{clean-text}$ and $\textbf{scoremap}$ columns respectively.

In [55]:
(X, y) = df['clean_text'], df['scoremap']


### Use `train_test_split` function to construct `(X_train, y_train)` and `(X_test, y_test)` from `(X,y)`

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Build an `SGDClassifier` model from the vectorize train text data. Use `CountVectorizer()` with a $\textit{trigram}$ parameter and Evaluate your model on the test data.

In [58]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from joblib import dump, load # used for saving and loading sklearn objects
from scipy.sparse import save_npz, load_npz # used for saving and loading sparse matrices
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

In [60]:
trigram_vectorizer = CountVectorizer(ngram_range=(1, 3))
trigram_vectorizer.fit_transform(X.values)
X_trigram = trigram_vectorizer.transform(X)


def train_and_show_scores(X, y, title: str) -> None:
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.75, stratify=y
    )

    clf = SGDClassifier()
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    valid_score = clf.score(X_valid, y_valid)
    print(f'{title}\nTrain score: {round(train_score, 2)} ; Validation score: {round(valid_score, 2)}\n')

In [62]:
train_and_show_scores(X_trigram, df['scoremap'], title="sentiment")

sentiment
Train score: 1.0 ; Validation score: 0.96

