In [9]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from pprint import pprint
import string
import os
import re

# visual
import matplotlib.pyplot as plt
import seaborn as sns

# filter out noise words and more clean up on word
from wordcloud import STOPWORDS,WordCloud
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# gensim
import gensim
from gensim.models import CoherenceModel
from gensim import corpora



file_name = 'tweet_data.csv'
df = pd.read_csv(file_name)

In [10]:
df.head(2)

AttributeError: 'NoneType' object has no attribute 'items'

                       created_at  \
0  Fri Jun 18 17:55:49 +0000 2021   
1  Fri Jun 18 17:55:59 +0000 2021   

                                              source  \
0  <a href="http://twitter.com/download/iphone" r...   
1  <a href="https://mobile.twitter.com" rel="nofo...   

                                       original_text  polarity  subjectivity  \
0  🚨Africa is "in the midst of a full-blown third...  0.166667      0.188889   
1  Dr Moeti is head of WHO in Africa, and one of ...  0.133333      0.455556   

  lang  favorite_count  retweet_count original_author  followers_count  \
0   en             548          612.0     ketuesriche              551   
1   en             195           92.0        Grid1949               66   

   friends_count possibly_sensitive  hashtags  user_mentions  \
0            351                NaN       NaN            NaN   
1             92                NaN       NaN            NaN   

                 place  
0                 Mass  
1  Edinburgh

In [11]:
df.shape

(6532, 15)

In [12]:
df.columns

Index(['created_at', 'source', 'original_text', 'polarity', 'subjectivity',
       'lang', 'favorite_count', 'retweet_count', 'original_author',
       'followers_count', 'friends_count', 'possibly_sensitive', 'hashtags',
       'user_mentions', 'place'],
      dtype='object')

In [13]:
#print original_text column
print("orignal text: \n", df['original_text'])

orignal text: 
 0       🚨Africa is "in the midst of a full-blown third...
1       Dr Moeti is head of WHO in Africa, and one of ...
2       Thank you @research2note for creating this ama...
3       Former Pfizer VP and Virologist, Dr. Michael Y...
4       I think it’s important that we don’t sell COVA...
                              ...                        
6527                                                  NaN
6528    Former Pfizer VP and Virologist, Dr. Michael Y...
6529                                                  NaN
6530    "Africa needs millions more doses here &amp; n...
6531                                                  NaN
Name: original_text, Length: 6532, dtype: object


In [32]:
#check if null value is found in original_text
count_null = df['original_text'].isnull().sum()
count_not_null  = df['original_text'].notnull().sum()
print("Data Frame before removing null values: ", df.shape)
print("Number of null values in original text: ", count_null)
print("Number of not null values in orignal text: ", count_not_null)
print("Number of all orignal text value is: ", count_null + count_not_null)

Data Frame before removing null values:  (6532, 15)
Number of null values in original text:  2812
Number of not null values in orignal text:  3720
Number of all orignal text value is:  6532


In [35]:
df = df[~df['original_text'].isnull()]
count_null = df['original_text'].isnull().sum()
count_not_null = df['original_text'].notnull().sum()
print("Data Frame value after removing null value: ", df.shape)
print("Number of null values after removing null value: ", count_null)
print("Number of not null values after removing null value: ", count_not_null)

Data Frame value after removing null value:  (3720, 15)
Number of null values after removing null value:  0
Number of not null values after removing null value:  3720


In [36]:

# clean out unwanted values from the text, like links and other staff
import re
def clean_text(text):
    hash_tag_removed = re.sub('(#[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    removed_links = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', hash_tag_removed, flags=re.MULTILINE)
    result = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', removed_links)
    return result

df['original_text'] = df['original_text'].apply(clean_text)

In [54]:
cleanTweet = pd.DataFrame()
cleanTweet['clean_text'] = df['original_text']
cleanTweet['polarity'] = df['polarity']
cleanTweet.columns

Index(['clean_text', 'polarity'], dtype='object')

In [61]:
cleanTweet.dropna()

AttributeError: 'NoneType' object has no attribute 'items'

                                             clean_text  polarity
0     🚨Africa is "in the midst of a full-blown third...  0.166667
1     Dr Moeti is head of WHO in Africa, and one of ...  0.133333
2     Thank you  for creating this amazing campaign ...  0.316667
3     Former Pfizer VP and Virologist, Dr. Michael Y...  0.086111
4     I think it’s important that we don’t sell COVA...  0.280000
...                                                 ...       ...
6521  Australia is sending vaccines.\nAustralia is s...  0.100000
6522  The Truth Behind COVID-19 Vaccines (6) —— Bell... -0.386111
6524  Covid19 vaccines reach the remotest places of ... -0.050000
6528  Former Pfizer VP and Virologist, Dr. Michael Y...  0.086111
6530  "Africa needs millions more doses here &amp; n...  0.125000

[3720 rows x 2 columns]

In [62]:

# Noise words need to be removed.
def process_data():
    cleanTweet['clean_text'] = cleanTweet['clean_text'].apply(lambda x: x.lower())
    cleanTweet['clean_text'] = cleanTweet['clean_text'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))
    
    sentence_list = [tweet for tweet in cleanTweet['clean_text']]
    word_list = [sentence.split() for sentence in sentence_list]
    
    word_to_id = corpora.Dictionary(word_list)
    corpus_1= [word_to_id.doc2bow(tweet) for tweet in word_list]
    
    return word_list, word_to_id, corpus_1

In [73]:
word_list, id2word, corpus = process_data()
print("word list: \n", word_list[0])
print("--------------------------------")
print("id2word\n", id2word)
print("--------------------------------")
print("corpus \n", corpus[0])

word list: 
 ['🚨africa', 'is', 'in', 'the', 'midst', 'of', 'a', 'fullblown', 'third', 'wave', 'of', 'coronavirus', 'the', 'head', 'of', 'has', 'warned', 'cases', 'have', 'risen', 'across', 'the', 'continent', 'by', 'more', 'than', '20', 'and', 'deaths', 'have', 'also', 'risen', 'by', '15', 'in', 'the', 'last', 'week', 'reports', '🧵']
--------------------------------
id2word
 Dictionary(4493 unique tokens: ['15', '20', 'a', 'across', 'also']...)
--------------------------------
corpus 
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 2), (16, 1), (17, 1), (18, 1), (19, 1), (20, 3), (21, 1), (22, 2), (23, 1), (24, 4), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1)]


In [74]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [75]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('the', 0.059252035),
   ('of', 0.04634107),
   ('in', 0.02593328),
   ('and', 0.020663053),
   ('to', 0.018838376),
   ('vaccines', 0.01675732),
   ('by', 0.016396994),
   ('have', 0.015626788),
   ('amp', 0.01390223),
   ('on', 0.0120095955)]),
 (1,
  [('in', 0.0441797),
   ('the', 0.039230913),
   ('india', 0.037722193),
   ('of', 0.03573105),
   ('a', 0.02738507),
   ('and', 0.021746453),
   ('wave', 0.020910054),
   ('africa', 0.020825405),
   ('amp', 0.020103747),
   ('third', 0.020095466)]),
 (2,
  [('to', 0.058690723),
   ('vaccines', 0.033360586),
   ('need', 0.0325346),
   ('we', 0.029928887),
   ('the', 0.026092283),
   ('are', 0.023547826),
   ('and', 0.023392055),
   ('you', 0.02166997),
   ('from', 0.01607634),
   ('with', 0.015258445)]),
 (3,
  [('to', 0.06913969),
   ('the', 0.036816966),
   ('is', 0.02801097),
   ('of', 0.02688317),
   ('and', 0.026510056),
   ('africa', 0.02183866),
   ('vaccines', 0.019329717),
   ('australia', 0.017714353),
   ('in', 0.01611

In [76]:
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)


Perplexity:  -6.079594905886277

 Ldamodel Coherence Score/Accuracy on Tweets:  0.41991022045536186
