In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tweepy as tweepy
%matplotlib inline
import seaborn as sns
import re
import os
import time
from datetime import datetime, date, timedelta
#Count for common words
from collections import Counter
#Tweet pre-processor
import preprocessor as p
#NLTK
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
#Loading NLTK
import nltk
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import stopwords
#TextBlob
from textblob import TextBlob
#Stanza
import stanza
#stanza.download('en')
#Stanford CoreNLP
from pycorenlp import StanfordCoreNLP
#Spacy 
import spacy
#Word cloud
#pip install wordcloud
from wordcloud import WordCloud
#Creating nlp object
#Dowload package first from terminal of environment acaconda: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
#ignore warnings
import warnings
warnings.filterwarnings("ignore")
#VADER
#pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#Sentiwordnet
#nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
#wordnet
nltk.download('wordnet')
from nltk.corpus import wordnet
#pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()
import twint

# Crawl data from Twitter with twint (Python)

In [None]:
# Crawl up to 10,000 tweets from Twitter using twint with the keyword "covid19 covid vaccine"
config = twint.Config()
config.Search = "covid19 covid vaccine"
config.Limit = 10000
config.Lang = "en"
# Write the scraped tweets to a CSV file.
config.Store_csv = True
# NOTE(review): this path has no "/" after "./data", while the next section reads
# "./data/vaccine-twitter.csv" - confirm which file the analysis is meant to use.
config.Output = "./datavaccine-twitter-new.csv"
config.Pandas = True
# Run the search with the configuration above
twint.run.Search(config)

# 1. Read csv file

In [None]:
# NOTE(review): the same names are already imported in the first cell.
from datetime import datetime, date, timedelta
# Load the tweets; later cells read the 'tweet' and 'hashtags' columns of this frame.
df = pd.read_csv("./data/vaccine-twitter.csv")

In [None]:
# Keep only tweets that mention "vacc"/"Vacc" in the text or in the hashtags.
# na=False makes a missing text/hashtag value count as "no match"; without it the
# mask contains NaN and boolean indexing fails.
vacc_pattern = r"[vV]acc"
mentions_vaccine = (df['tweet'].str.contains(vacc_pattern, na=False)
                    | df['hashtags'].str.contains(vacc_pattern, na=False))
# .copy() gives an independent frame, so the columns added by later cells are not
# written to a slice of the original data.
df = df[mentions_vaccine].copy()
len(df)

# 2. Pre-processing data

## 2.1 Remove punctuation

In [None]:
import string
string.punctuation
def processTweet(tweet):
    """Normalise one raw tweet into lower-case text ready for stop-word filtering.

    Steps, in order: drop HTML entities, @mentions and $tickers, lower-case,
    drop URLs and #hashtags, drop words of one or two characters, drop
    characters outside the Basic Multilingual Plane, then collapse whitespace.

    Args:
        tweet: Raw tweet text. Anything that is not a ``str`` (for example the
            NaN pandas uses for a missing value) is treated as an empty tweet.

    Returns:
        The cleaned tweet without leading or trailing whitespace.
    """
    if not isinstance(tweet, str):
        return ''
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'&\w*;', '', tweet)
    # Remove @username mentions
    tweet = re.sub(r'@\S+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase (done before URL removal so "HTTP://" links are caught too)
    tweet = tweet.lower()
    # Remove hyperlinks. Each URL ends at the next whitespace; a greedy ".*"
    # would also swallow ordinary words up to the last "/" in the tweet.
    tweet = re.sub(r'https?://\S+', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Punctuation is left in place; the original removal step is disabled:
    #tweet = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode.
    # Done before the whitespace collapse so a removed emoji leaves no double space.
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    # Collapse runs of whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Trim whitespace left at either end by the removals above
    return tweet.strip()

In [None]:
df['cleaned_puntuation_tweet'] = df['tweet'].apply(processTweet)
# Preview the first cleaned tweet. Use positional access: the filter above may have
# dropped the row labelled 0, in which case a [0] label lookup raises KeyError.
df['cleaned_puntuation_tweet'].iloc[0]

In [None]:
#Save as CSV file
# index=False: otherwise the old index is written out and comes back as an extra
# "Unnamed: 0" column when the file is read again.
df.to_csv('./data/covid_vaccine_tweets_extracted_20211106_204713.csv', index=False)
#jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000 from terminal if it has error: data exceeded
df = pd.read_csv('./data/covid_vaccine_tweets_extracted_20211106_204713.csv')
# read_csv turns empty strings into NaN; restore them so the text steps below
# always receive a str.
df['cleaned_puntuation_tweet'] = df['cleaned_puntuation_tweet'].fillna('')

## 2.2 Remove stop words 

In [None]:
from nltk.corpus import stopwords
# NLTK's English stop words plus corpus-specific words that carry no sentiment here.
stop_words = set(stopwords.words("english"))
stop_words.update(["new", "total", "due", "first","amp", "vaccines", "covid", "get", "today", "nhs", "india","nwo", "nhs", "additional",
                   "biden","national", "usfda", "adverse", "clinical", "deltavariant", "read", "dose", "fda", "cdc", "dont", "dose",
                  "least", "daily", "many", "current", "second", "last", "long", "next", "severe", "third","read", "receive",
                 "next", "full", "covid19vaccine", "natural", "media", "old", "young", "public", "global", "covid19vic"
                  , "early", "different", "local", "social", "much", "true", "ready", "federal", "medical"])
# Tweets that were cleaned down to an empty string come back from the CSV
# round-trip as NaN (a float), which has no .split(); normalise them to '' first.
cleaned_tweets = df['cleaned_puntuation_tweet'].fillna('').astype(str)
df['removed_stopwords_tweet'] = cleaned_tweets.apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_words))
print(df['removed_stopwords_tweet'].head(10))

## 2.3 Stem words - Lemmatization

In [None]:
def _lemmatize(text):
    """Return ``text`` with every token replaced by its lemma, joined by spaces."""
    return " ".join([token.lemma_ for token in nlp(text)])

df['lemma_tweet'] = df['removed_stopwords_tweet'].apply(_lemmatize)
print(df['lemma_tweet'].head(10))

## 2.4 Tokenization

In [None]:
# Split every stop-word-free tweet into its word tokens
tweet_df = df['removed_stopwords_tweet']
tokenized_df = [word_tokenize(text) for text in tweet_df]
df['tokenized_tweet'] = tokenized_df


## 2.5 POS(part of speech)

In [None]:
# POS tagger dictionary: first letter of the tag -> WordNet part-of-speech constant
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

# Built once: rebuilding the stop-word set for every token makes tagging very slow.
_english_stopwords = frozenset(stopwords.words('english'))

def token_stop_pos(text):
    """Tokenise and POS-tag ``text``, dropping English stop words.

    Returns:
        A list of ``(word, pos)`` tuples, where ``pos`` is the ``pos_dict`` entry
        for the first letter of the word's tag, or ``None`` when the tag has no
        entry in ``pos_dict``.
    """
    return [(word, pos_dict.get(tag[0]))
            for word, tag in pos_tag(word_tokenize(text))
            if word.lower() not in _english_stopwords]

df['tagged_POS_tweet'] = df['removed_stopwords_tweet'].apply(token_stop_pos)
print(df.head(10))

In [None]:
# Count, per tweet, each distinct (word, tag) pair whose tag is 'a' (adjective)
adjective_pairs = (pair
                   for tagged_tweet in df['tagged_POS_tweet']
                   for pair in set(tagged_tweet)
                   if pair[1] == 'a')
word_counts = Counter(adjective_pairs)
common_words = word_counts.most_common(58)
# print(word_counts)

# 3. BoW and TF-IDF

In [None]:
#BoW
from sklearn.feature_extraction.text import CountVectorizer
# Toy corpus used to illustrate the bag-of-words representation
corpus = ['Covid vaccine is good.', 'Covid vaccine is good and important.', 'Covid vaccine is safe.']
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# where it exists and fall back to the old name on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    print(vectorizer.get_feature_names_out().tolist())
else:
    print(vectorizer.get_feature_names())
print(X.toarray())

In [None]:
#TF-IDF VECTORIZATION
from sklearn.feature_extraction.text import TfidfVectorizer
# Learn the vocabulary and IDF weights on the toy corpus, then vectorise that corpus
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)
X = vectorizer.transform(corpus)
print(X.toarray())

In [None]:
#MOST COMMON WORDS IN TWITTER DATASET
print("MOST COMMON WORDS IN TWITTER DATASET:")
# Flatten every stop-word-free tweet into one lower-cased word list
all_words = [word.lower()
             for line in df['removed_stopwords_tweet']
             for word in line.split()]

Counter(all_words).most_common(10)

In [None]:
# PLOT word frequency distribution of the 20 most frequent words
fd = nltk.FreqDist(all_words)
plt.figure(figsize=(12,5))
plt.xticks(fontsize=13, rotation=90)
fd.plot(20,cumulative=False)

# 4. Sentiment Analysis with 3 models: TextBlob, VADER and Stanza
#### SA Using Textblob

In [None]:
# Scores within (-v_compare, v_compare) are treated as neutral
v_compare = 0.05

def getSubjectivity(words):
    """Return the TextBlob subjectivity of ``words``."""
    blob = TextBlob(words)
    return blob.sentiment.subjectivity

def getPolarity(words):
    """Return the TextBlob polarity of ``words``."""
    blob = TextBlob(words)
    return blob.sentiment.polarity

def analysis(score):
    """Map a polarity score to 'Negative', 'Positive' or 'Neutral' using v_compare."""
    if score <= -v_compare:
        return 'Negative'
    return 'Positive' if score >= v_compare else 'Neutral'
# Keep only the text columns needed from here on. .copy() makes this an independent
# frame, so the column assignments below do not write to a slice of the previous df.
df = df[['removed_stopwords_tweet', 'lemma_tweet']].copy()
df['polarity'] = df['lemma_tweet'].apply(getPolarity)
df['analysis'] = df['polarity'].apply(analysis)


In [None]:
# Number of positive, negative and neutral tweets according to TextBlob
tb_counts = df['analysis'].value_counts()
print(tb_counts)

In [None]:
#Visualize
# Values and labels now come from the same Series.
tb_count = df.analysis.value_counts()
# One explode entry per slice, pulling out the last (least frequent) one. A fixed
# 3-tuple makes plt.pie fail whenever fewer than three classes are present.
tb_explode = [0.25 if i == len(tb_count) - 1 else 0 for i in range(len(tb_count))]
plt.figure(figsize=(10, 5))
plt.pie(tb_count.values, labels = tb_count.index, explode = tb_explode, autopct='%1.1f%%', shadow=False)
plt.legend()
plt.title("Textblob Sentiment Result")
plt.savefig('./img/SA_Textblob')

#### SA Using VADER(Valence Aware Dictionary and Sentiment Reasoner)

In [None]:
analyzer = SentimentIntensityAnalyzer()

def vadersentimentanalysis(tweet):
    """Return the VADER compound score of ``tweet``."""
    return analyzer.polarity_scores(tweet)['compound']

def vader_analysis(compound):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral' using v_compare."""
    if compound >= v_compare:
        return 'Positive'
    if compound <= -v_compare:
        return 'Negative'
    return 'Neutral'

# Score every lemmatised tweet, then label the scores
df['vader_sentiment'] = df['lemma_tweet'].apply(vadersentimentanalysis)
df['vader_analysis'] = df['vader_sentiment'].apply(vader_analysis)
df.head(10)

In [None]:
# Number of positive, negative and neutral tweets according to VADER
vader_counts = df.vader_analysis.value_counts()
vader_counts

In [None]:
vader_counts= df['vader_analysis'].value_counts()
# One explode entry per slice, pulling out the first (most frequent) one. A fixed
# 3-tuple makes plt.pie fail whenever fewer than three classes are present.
vader_explode = [0.1 if i == 0 else 0 for i in range(len(vader_counts))]
plt.figure(figsize=(10, 5))
plt.pie(vader_counts.values, labels = vader_counts.index, explode = vader_explode, autopct='%1.1f%%', shadow=False)
plt.legend()
plt.title("Vader Sentiment Result")
plt.savefig('./img/SA_Vader')

#### Sentiment Analysis with Stanza

In [None]:
# Build the Stanza pipeline (tokenizer + sentiment processor) for English.
# NOTE(review): this rebinds `nlp`, which the first cell set to the spaCy model used
# by the lemmatization cell (`w.lemma_ for w in nlp(row)`); re-running that cell
# after this one would call the Stanza pipeline instead.
nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')

In [None]:
def stanza_analyze(Text):
    """Return the mean Stanza sentence sentiment of ``Text``, shifted to [-1, 1].

    Each sentence score has 1 subtracted so the [0, 2] range becomes [-1, 1].
    Text that yields no sentences scores 0.0 (neutral) rather than the NaN that
    ``np.mean([])`` would produce.
    """
    document = nlp(Text)
    shifted_scores = [sentence.sentiment - 1 for sentence in document.sentences]
    if not shifted_scores:
        return 0.0
    return np.mean(shifted_scores)
# Obtain sentiment categorical score generated by Stanza
# (no per-tweet print: one line per tweet floods the cell output)
df['stanza_score'] = df['lemma_tweet'].apply(stanza_analyze)
# Convert average Stanza sentiment score into sentiment categories
df['stanza_sentiment'] = df['stanza_score'].apply(lambda c: 'Positive' if c >= v_compare else
                                                  ('Negative' if c <= -(v_compare) else 'Neutral'))

In [None]:
# Threshold for neutral sentiment (same value as v_compare)
neutral_thresh = 0.05
# Convert average Stanza sentiment score into sentiment categories, using the
# threshold declared in this cell.
df['stanza_sentiment'] = df['stanza_score'].apply(lambda c: 'Positive' if c >= neutral_thresh else ('Negative' if c <= -(neutral_thresh) else 'Neutral'))

In [None]:
# Number of positive, negative and neutral tweets according to Stanza
stanza_counts = df.stanza_sentiment.value_counts()
stanza_counts

In [None]:
# One explode entry per slice, pulling out the first (most frequent) one. A fixed
# 3-tuple makes plt.pie fail whenever fewer than three classes are present.
stanza_explode = [0.1 if i == 0 else 0 for i in range(len(stanza_counts))]
plt.figure(figsize=(10, 5))
plt.pie(stanza_counts.values, labels = stanza_counts.index, explode = stanza_explode, autopct='%1.1f%%')
plt.legend()
plt.title("Stanza Sentiment Result")
plt.savefig('./img/SA_stanza')


#### Visual representation of TextBlob, VADER, Stanza results by Bar chart

In [None]:
# Put the three count Series side by side, one column per analyzer. Stacking them
# with reset_index(drop=True) discarded the sentiment labels and left nine
# anonymous numbers.
df_sentiments = pd.concat([tb_counts,
                           vader_counts,
                           stanza_counts
                          ], axis=1, keys=['TextBlob', 'VADER', 'Stanza'])
df_sentiments

In [None]:
# Define function to get value counts
def get_value_counts(col_name, analyzer_name):
    count = pd.DataFrame(df[col_name].value_counts())
    percentage = pd.DataFrame(df[col_name].value_counts(normalize=True).mul(100))
    value_counts_df = pd.concat([count, percentage], axis = 1)
    value_counts_df = value_counts_df.reset_index()
    value_counts_df.columns = ['sentiment', 'counts', 'percentage']
    value_counts_df.sort_values('sentiment', inplace = True)
    value_counts_df['percentage'] = value_counts_df['percentage'].apply(lambda x: round(x,2))
    value_counts_df = value_counts_df.reset_index(drop = True)
    value_counts_df['analyzer'] = analyzer_name
    return value_counts_df

In [None]:
#SA by VADER
sia = SentimentIntensityAnalyzer()
neutral_thresh = 0.05

def _label_vader_score(c):
    """Map a compound score to 'Positive', 'Negative' or 'Neutral' using neutral_thresh."""
    if c >= neutral_thresh:
        return 'Positive'
    if c <= -(neutral_thresh):
        return 'Negative'
    return 'Neutral'

# Full polarity dictionary per tweet, then its compound entry
df['nltk_scores'] = df['lemma_tweet'].apply(lambda x: sia.polarity_scores(x))
df['nltk_cmp_score'] = df['nltk_scores'].apply(lambda scores: scores['compound'])
# Categorize scores into the sentiments of positive, neutral or negative
df['nltk_sentiment'] = df['nltk_cmp_score'].apply(_label_vader_score)
nltk_sentiment_df = get_value_counts('nltk_sentiment','NLTK Vader')
nltk_sentiment_df


In [None]:
#SA by TextBlob
neutral_thresh = 0.05

def _textblob_polarity(text):
    """Return the TextBlob polarity of ``text``."""
    return TextBlob(text).sentiment.polarity

def _label_textblob_score(c):
    """Map a polarity score to 'Positive', 'Negative' or 'Neutral' using neutral_thresh."""
    if c >= neutral_thresh:
        return 'Positive'
    if c <= -(neutral_thresh):
        return 'Negative'
    return 'Neutral'

# Polarity score per tweet, then its sentiment category
df['textblob_score'] = df['lemma_tweet'].apply(_textblob_polarity)
df['textblob_sentiment'] = df['textblob_score'].apply(_label_textblob_score)
textblob_sentiment_df = get_value_counts('textblob_sentiment','TextBlob')
textblob_sentiment_df

In [None]:
#SA with Stanza
def stanza_analyze(Text):
    """Return the mean Stanza sentence sentiment of ``Text``, shifted to [-1, 1].

    Text that yields no sentences scores 0.0 (neutral) rather than the NaN that
    ``np.mean([])`` would produce.
    """
    document = nlp(Text)
    # Minus 1 so as to bring score range of [0,2] to [-1,1]
    shifted_scores = [sentence.sentiment - 1 for sentence in document.sentences]
    if not shifted_scores:
        return 0.0
    return np.mean(shifted_scores)

# Running Stanza over every tweet is slow, so reuse the scores when an earlier cell
# already stored them; only build the pipeline and score when the column is missing.
if 'stanza_score' not in df.columns:
    nlp = stanza.Pipeline(lang='en', processors='tokenize,sentiment')
    # Obtain sentiment categorical score generated by Stanza
    df['stanza_score'] = df['lemma_tweet'].apply(stanza_analyze)
neutral_thresh = 0.05
# Convert average Stanza sentiment score into sentiment categories
df['stanza_sentiment'] = df['stanza_score'].apply(lambda c: 'Positive' if c >= neutral_thresh else ('Negative' if c <= -(neutral_thresh) else 'Neutral'))
stanza_sentiment_df = get_value_counts('stanza_sentiment','Stanza')
stanza_sentiment_df

# 5. Insights from SA

In [None]:
# Stack the three per-analyzer summaries into one long table with a fresh 0..n-1 index
summary_frames = [nltk_sentiment_df, textblob_sentiment_df, stanza_sentiment_df]
df_sentiments = pd.concat(summary_frames, ignore_index=True)
df_sentiments

In [None]:
# One row per sentiment, one column per analyzer, percentage in the cells
pivot_spec = dict(index='sentiment', columns='analyzer', values='percentage')
df_sentiments_pivot = df_sentiments.pivot(**pivot_spec)
df_sentiments_pivot

In [None]:
plt.figure(figsize=(10,6))
ax = sns.barplot(x="analyzer", y="percentage",
                 hue="sentiment", data=df_sentiments)

# Display annotations: write each bar's height, as a percentage, just below its top.
# The loop variable is named `bar` so it does not overwrite the `p` alias of the
# tweet preprocessor imported in the first cell.
for bar in ax.patches:
    ax.annotate(f"{round(bar.get_height(),1)}%",
                   (bar.get_x() + bar.get_width() / 2., bar.get_height()),
                   ha = 'center', va = 'center',
                   size=12,
                   xytext = (0, -12),
                   textcoords = 'offset points')
plt.savefig("./img/SA_all_BarChart")

In [None]:
# Average of the TextBlob, VADER and Stanza scores for every tweet
score_total = df['polarity'] + df['vader_sentiment']
score_total = score_total + df['stanza_score']
df['composite_scores'] = score_total / 3

In [None]:
# Summary statistics of the averaged score
df.loc[:, 'composite_scores'].describe()

In [None]:
# 'composite_vote' is only created by a later cell (and later overwritten with the
# majority vote), so reading it here fails on a fresh run. Label the averaged
# scores directly instead, using the same threshold rule.
composite_counts = df['composite_scores'].apply(
    lambda c: 'Positive' if c >= neutral_thresh else ('Negative' if c <= -(neutral_thresh) else 'Neutral')
).value_counts()
composite_counts

In [None]:
# Threshold for neutral sentiment
neutral_thresh = 0.05

def _label_composite_score(c):
    """Map an averaged score to 'Positive', 'Negative' or 'Neutral' using neutral_thresh."""
    if c >= neutral_thresh:
        return 'Positive'
    if c <= -(neutral_thresh):
        return 'Negative'
    return 'Neutral'

# Convert average sentiment score (from all 3 analyzers) into sentiment categories
df['composite_vote'] = df['composite_scores'].apply(_label_composite_score)

In [None]:
# Plot the composite counts themselves; the Stanza counts were being drawn under
# composite labels. Explode is sized to the number of slices present.
composite_explode = [0.1 if i == 0 else 0 for i in range(len(composite_counts))]
plt.figure(figsize=(10, 5))
plt.pie(composite_counts.values, labels = composite_counts.index, explode = composite_explode, autopct='%1.1f%%')
plt.legend()
plt.title("Composite Result")
plt.savefig('./img/composite')

In [None]:
# Make use of sentiments from NLTK Vader, TextBlob and Stanza
df['sentiment_votes'] =  df.apply(lambda x: list([x['analysis'], 
                                                                x['vader_analysis'], 
                                                                x['stanza_sentiment']]),axis=1) 

In [None]:
# Create function to get sentiment that appears most often amongst the 3 votes
def get_most_voted_senti(List):
    if len(List) == len(set(List)): # If all elements are different
        return 'Neutral'
    else:
        return max(set(List), key = List.count)

In [None]:
# Get composite sentiment vote
df['composite_vote'] = df['sentiment_votes'].apply(lambda x: get_most_voted_senti(x))

In [None]:
# Number of tweets per majority-vote label
composite_sentiment_count = df.composite_vote.value_counts()
composite_sentiment_count

In [None]:
# Take labels and values from the same Series; labels borrowed from a different
# count Series can be in another order and mislabel the slices. Explode is sized
# to the number of slices present.
vote_explode = [0.1 if i == 0 else 0 for i in range(len(composite_sentiment_count))]
plt.figure(figsize=(10, 5))
plt.pie(composite_sentiment_count.values, labels = composite_sentiment_count.index, explode = vote_explode, autopct='%1.1f%%')
plt.legend()
plt.title("Composite Sentiment Result")
plt.savefig('./img/SA_Composite')