In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re 
import matplotlib.pyplot as plt
import seaborn as sns 

# Cleaning Data Tools
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer 
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
stopwords = stopwords.words('english')

# Sentiment Analysis 
!pip install neattext
!pip install vaderSentiment
!pip install emoji
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import neattext.functions as nfx
from textblob import TextBlob
import emoji

from gensim.models.phrases import Phrases, Phraser

# Word Embedding
!pip install gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #for TF-IDF
from sklearn.feature_extraction.text import CountVectorizer  #For Bag of words
from gensim.models import Word2Vec  #For Word2Vec
from gensim.models import FastText  #For Fast Text

# Scaling and Evaluation Methods
from sklearn import preprocessing
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

# ML Models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

In [None]:
# Load the raw tweets from a CSV file given by a path relative to the working directory.
# NOTE(review): ISO-8859-1 decodes any byte sequence without raising, so if the file is
# actually UTF-8, emoji and accented characters would load as mojibake — confirm the
# file's real encoding.
tweet_data = pd.read_csv("vaccination_all_tweets.csv", encoding='ISO-8859-1')

In [None]:
tweet_data.head()

In [None]:
tweet_data.info()

In [None]:
# Columns removed from the raw tweet frame before the text is processed.
unnecessary_col = [
    'id',
    'user_name',
    'user_description',
    'user_created',
    'user_followers',
    'user_friends',
    'user_favourites',
    'user_verified',
    'date',
    'source',
    'retweets',
    'favorites',
    'is_retweet',
]

In [None]:
data = tweet_data.drop(unnecessary_col,axis=1)

In [None]:
len(data.loc[data.duplicated()])

In [None]:
# Keep the first occurrence of every duplicated row and renumber the index from 0.
data = data.drop_duplicates(keep="first").reset_index(drop=True)

In [None]:
def clean(string):
    """Collapse long runs of a repeated character in ``string``.

    A run of three or more identical consecutive characters is reduced to a
    single character ("coool" -> "col"). Single characters and runs of exactly
    two are kept unchanged wherever they occur, including at the very end of
    the text ("all" -> "all").

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        The collapsed text, or '' when ``string`` is empty or consists of one
        distinct character only (e.g. "a" or "!!!!").
    """
    if len(string) == 0:
        return ''
    # Text made of a single distinct character is discarded entirely.
    if set(string) == set(string[0]):
        return ''
    # (.)\1{2,} matches a character followed by at least two more copies of
    # itself; DOTALL lets "." also match newlines so every character is covered.
    return re.sub(r'(.)\1{2,}', r'\1', string, flags=re.DOTALL)

In [None]:
# Cleaning Text: Multiple hashtags
# `clean_tweet` starts from the raw `text` column; each step below rewrites it.
data['clean_tweet'] = data['text'].apply(nfx.remove_hashtags)

# Cleaning Text: userhandles
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_userhandles)

# Cleaning Text : Remove urls
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_urls)

# Cleaning Text : custom remove special characters (':', ',', ';', '.', '|','-','_','^', [&amp, &yen, ....])
# The pattern is a raw string: sequences such as \, \; \. are regex escapes, not
# valid Python string escapes, and a plain literal triggers invalid-escape warnings.
data['clean_tweet'] = data['clean_tweet'].apply(
    lambda x: nfx.remove_custom_pattern(x, r':+|\,+|\;+|\.+|\"+|\|+|\-+|\_+|\%+|\^|\*|\&[a-zA-Z]*')
)
# presumably strips newline characters — verify against neattext's remove_custom_words
data['clean_tweet'] = data['clean_tweet'].apply(lambda x: nfx.remove_custom_words(x, '\n'))

# Cleaning Text: Punctuations
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_puncts)
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_punctuations)

# Cleaning Text: dates
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_dates)

# Cleaning Text: Emails
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_emails)

# Cleaning Text: Numbers
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_numbers)

# Collapse long runs of a repeated character (see clean); missing values become ''.
data['clean_tweet'] = data['clean_tweet'].fillna('').map(clean)

# Cleaning Text: Multiple WhiteSpaces
data['clean_tweet'] = data['clean_tweet'].apply(nfx.remove_multiple_spaces)




In [None]:
data.head()

In [None]:
print(data.text[58])
print("=====")
print(data.clean_tweet[58])

In [None]:
vader_obj = SentimentIntensityAnalyzer()

In [None]:
def get_sentiment(tweet):
    """Label a tweet as 'Positive', 'Negative', 'Neutral' or 'Objective'.

    Emojis are replaced by their textual names (with underscores turned into
    spaces) before scoring. TextBlob supplies the subjectivity score and the
    module-level VADER analyzer ``vader_obj`` supplies the compound score.

    A tweet whose subjectivity is not at least 0.25 is labelled 'Objective'.
    Otherwise the compound score decides: >= 0.05 gives 'Positive',
    <= -0.05 gives 'Negative', anything in between gives 'Neutral'.
    """
    plain_text = emoji.demojize(tweet, delimiters=("", "")).replace("_", " ")

    subjectivity = TextBlob(plain_text).sentiment.subjectivity
    compound = vader_obj.polarity_scores(plain_text)['compound']

    # Guard clause first: too little subjectivity means no polarity label at all.
    if not subjectivity >= 0.25:
        return 'Objective'
    if compound >= 0.05:
        return 'Positive'
    if compound <= -0.05:
        return 'Negative'
    return 'Neutral'

In [None]:
ex1 = data['clean_tweet'][17]
ex1 

In [None]:
get_sentiment(ex1)

In [None]:
data['sentiment'] = data['clean_tweet'].apply(get_sentiment)

In [None]:
data = data.loc[data.sentiment != "Objective"]

In [None]:
final_data = data.drop(["text"],axis=1)

In [None]:
final_data.reset_index(drop=True , inplace = True)
final_data

In [None]:
# Stop-word removal, lower casing, stemming, tokenization.
# Shared objects read by preprocess_text: a WordNet lemmatizer, an English
# Snowball stemmer, and a regex matching every run of non-ASCII-letter characters.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')
tags = "[^A-Za-z]+"

In [None]:
# Lower-case weekday and month names; preprocess_text drops them together with the stop words.
# NOTE(review): the same two lists are defined again at the top of the next cell,
# so this cell is redundant.
days=['monday','tuesday','wednesday','thursday','friday','saturday','sunday']
months=['january','february','march', 'april','may','june','july','august','september','october','november','december']

In [None]:
days=['monday','tuesday','wednesday','thursday','friday','saturday','sunday']
months=['january','february','march', 'april','may','june','july','august','september','october','november','december']


def preprocess_text(sentence, stem = True):
    """Normalise a tweet into a space-separated string of tokens.

    The input is converted with ``str()`` and lower-cased, and every run of
    characters matched by the module-level ``tags`` pattern becomes a space.
    Tokens shorter than three characters and tokens found in the module-level
    ``stopwords``, ``days`` or ``months`` lists are dropped.

    Parameters
    ----------
    sentence : object
        Text to preprocess; it is passed through ``str()`` first.
    stem : bool, default True
        When true, each kept token is lemmatized with ``lemmatizer`` and then
        stemmed with ``stemmer``. When false, kept tokens are left unchanged.

    Returns
    -------
    str
        The kept tokens joined by single spaces ('' when nothing is kept).
    """
    sentence = re.sub(tags, ' ', str(sentence).lower()).strip()

    # Build the exclusion set once per call; set membership replaces
    # concatenating and scanning three lists for every single word.
    excluded = set(stopwords).union(days, months)

    kept = []
    for word in sentence.split():
        if len(word) < 3 or word in excluded:
            continue
        kept.append(stemmer.stem(lemmatizer.lemmatize(word)) if stem else word)

    return " ".join(kept)

In [None]:
print(f"Orignal Text : {final_data.clean_tweet[7]}")
print("\nAfter Preprocessed : \n")
print(f"Preprocessed Text : {preprocess_text(final_data.clean_tweet[7])}")

In [None]:
final_data.clean_tweet = final_data.clean_tweet.map(preprocess_text)
final_data.head()

In [None]:
from collections import defaultdict


def _unigram_counts(tweets):
    """Count word occurrences over an iterable of whitespace-separated tweets."""
    counts = defaultdict(int)
    for tweet in tweets:
        # split() without an argument skips empty strings, so empty tweets and
        # repeated spaces no longer add a '' entry to the counts.
        for word in tweet.split():
            counts[word] += 1
    return counts


def _by_frequency(counts):
    """Return a two-column frame (0: word, 1: count), most frequent first."""
    return pd.DataFrame(sorted(counts.items(), key=lambda x: x[1])[::-1])


# Word frequencies in tweets labelled 'Positive'.
sentiment_positive_unigrams = _unigram_counts(
    final_data.loc[final_data.sentiment == 'Positive'].clean_tweet
)
df_sentiment_positive_unigrams = _by_frequency(sentiment_positive_unigrams)
# NOTE: despite the `_100` suffix only the 20 most frequent words are kept.
unigrams_positive_100 = df_sentiment_positive_unigrams[:20]

# Word frequencies in tweets labelled 'Negative'.
sentiment_negative_unigrams = _unigram_counts(
    final_data.loc[final_data.sentiment == 'Negative'].clean_tweet
)
df_sentiment_negative_unigrams = _by_frequency(sentiment_negative_unigrams)
unigrams_negative_100 = df_sentiment_negative_unigrams[:20]

In [None]:
# Side-by-side horizontal bar charts of the most frequent words per sentiment
# (column 0 holds the word, column 1 its count).
fig, axes = plt.subplots(ncols=2, figsize=(18, 20//2), dpi=80)

sns.barplot(y=unigrams_positive_100[0], x=unigrams_positive_100[1], ax=axes[0], color='green')
sns.barplot(y=unigrams_negative_100[0], x=unigrams_negative_100[1], ax=axes[1], color='red')

for ax in axes:
    ax.spines['right'].set_visible(False)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.tick_params(axis='x', labelsize=13)
    ax.tick_params(axis='y', labelsize=13)

axes[0].set_title(f'The most common words used in positive tweets {20} ', fontsize=13)
axes[1].set_title(f'The most common words used in negative tweets {20} ', fontsize=13)

# Lay out only after bars, tick labels and titles exist, so their sizes are
# taken into account when the subplot spacing is computed.
plt.tight_layout()
plt.show()

In [None]:
from sklearn import preprocessing
# Replace the string sentiment labels in final_data with integer codes.
# NOTE(review): LabelEncoder numbers the sorted class names, so with the labels left
# after the 'Objective' filter this presumably yields Negative=0, Neutral=1,
# Positive=2 — confirm via le.classes_.
le = preprocessing.LabelEncoder()
final_data.sentiment = le.fit_transform(final_data.sentiment)

In [None]:
final_data.head()

In [None]:
print("Samples per class {}".format(np.bincount(final_data.sentiment)))

In [None]:
final_data.sentiment.value_counts().plot.pie(autopct='%1.1f%%', labels=None, legend=True)
plt.tight_layout ()

In [None]:
vect = CountVectorizer()

In [None]:
# Tweets with encoded sentiment 0, counted per user location.
nag_tweet_log = final_data.loc[final_data['sentiment'] == 0]
# rename_axis + reset_index(name=...) produces the columns 'user_location' and
# 'Count' regardless of how the installed pandas names the value_counts() result
# (renaming an 'index' column only works on pandas < 2.0).
nag_tweet_location = (
    nag_tweet_log['user_location']
    .value_counts()
    .rename_axis('user_location')
    .reset_index(name='Count')
)
# NOTE(review): [1:16] skips the single most frequent location — confirm this is intended.
sns.barplot(y = nag_tweet_location['user_location'][1:16], x= nag_tweet_location['Count'][1:16], palette=('icefire'))
plt.xticks(rotation=90)
fig = plt.gcf()
fig.set_size_inches(13,6)
plt.title('Negative tweets ordered by their locations ')
plt.show()

In [None]:
# Number of tweets per user location, most frequent first.
# rename_axis + reset_index(name=...) gives the columns 'user_location' and 'Count'
# on every pandas version (renaming an 'index' column only works on pandas < 2.0).
top_locations = (
    final_data['user_location']
    .value_counts()
    .rename_axis('user_location')
    .reset_index(name='Count')
)
top_locations[:10]

In [None]:
# All tweets counted per user location (the variable name is reused from the
# negative-tweet plot but here covers every sentiment).
# rename_axis + reset_index(name=...) gives the columns 'user_location' and 'Count'
# on every pandas version (renaming an 'index' column only works on pandas < 2.0).
nag_tweet_location = (
    final_data['user_location']
    .value_counts()
    .rename_axis('user_location')
    .reset_index(name='Count')
)
# NOTE(review): [1:16] skips the single most frequent location — confirm this is intended.
sns.barplot(y = nag_tweet_location['user_location'][1:16], x= nag_tweet_location['Count'][1:16], palette=('icefire'))
plt.xticks(rotation=90)
fig = plt.gcf()
fig.set_size_inches(13,6)
plt.title('Countries ordered by the number of tweets posted')
plt.show()

In [None]:
# Tweets with encoded sentiment 2, counted per user location (the `nag_` names
# are reused from the negative-tweet plot).
nag_tweet_log = final_data.loc[final_data['sentiment'] == 2]
# rename_axis + reset_index(name=...) gives the columns 'user_location' and 'Count'
# on every pandas version (renaming an 'index' column only works on pandas < 2.0).
nag_tweet_location = (
    nag_tweet_log['user_location']
    .value_counts()
    .rename_axis('user_location')
    .reset_index(name='Count')
)
# NOTE(review): [1:16] skips the single most frequent location — confirm this is intended.
sns.barplot(y = nag_tweet_location['user_location'][1:16], x= nag_tweet_location['Count'][1:16], palette=('icefire'))
plt.xticks(rotation=90)
fig = plt.gcf()
fig.set_size_inches(13,6)
plt.title('Positive tweets ordered by their locations')
plt.show()

In [None]:
# Keep only tweets whose location sits at positions 1..15 of the location
# frequency ranking (the most frequent location, position 0, is left out).
tweet_lococations = final_data[
    final_data['user_location'].isin(
        final_data['user_location'].value_counts().iloc[1:16].index
    )
]
# Stacked horizontal bars: one bar per location, one segment per sentiment code.
pd.crosstab(tweet_lococations.user_location, tweet_lococations.sentiment).plot.barh(
    stacked=True,
    width=1,
    color=sns.color_palette("icefire", 9),
)
fig = plt.gcf()
fig.set_size_inches(15, 7)
plt.show()

In [None]:
!pip install gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer #for TF-IDF
from gensim.models import Word2Vec  #For Word2Vec
from gensim.models import FastText  #For Fast Text

In [None]:
vect = CountVectorizer()

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(final_data.clean_tweet, final_data.sentiment, random_state=42, test_size=0.2)

print(X_train.shape, X_test.shape)

In [None]:
print("Samples per class in train {}".format(np.bincount(y_train)))
print("Samples per class in test {}".format(np.bincount(y_test)))

In [None]:
# Fit the CountVectorizer on the training tweets only, then encode both splits with it.
# NOTE: X_train / X_test are rebound from text to the vectorized matrices, so re-running
# this cell on its own would feed those matrices back into the vectorizer; re-run the
# train/test split cell first.
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [None]:
# Tabular view of the training bag-of-words matrix, one column per vocabulary term.
# Built through the pandas sparse accessor so the matrix is not expanded into a
# dense array, which can exhaust memory for a large vocabulary.
bow_df = pd.DataFrame.sparse.from_spmatrix(X_train, columns=vect.get_feature_names_out())

In [None]:
bow_df.head()

In [None]:
feature_names = vect.get_feature_names_out()
print("Number of features: {}".format(len(feature_names)))

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans


In [None]:
# SVM operations

In [None]:
from sklearn import svm

In [None]:
 clf = svm.SVC()

In [None]:
clf.fit(X_train, y_train)

In [None]:
prediction = clf.predict(X_test)

In [None]:
 # Accuracy plus macro-averaged precision, recall and F1 of `prediction` on the test split.
 print(
     "Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n"
     % (
         accuracy_score(y_test, prediction),
         precision_score(y_test, prediction, average='macro'),
         recall_score(y_test, prediction, average='macro'),
         f1_score(y_test, prediction, average='macro'),
     )
 )
    

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, prediction)

In [None]:
# Display wrapper for the confusion matrix computed in the previous cell.
# display_labels are applied in class order (presumably 0=negative, 1=neutral, 2=positive).
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['negative', 'neutral', 'positive'])


In [None]:
 cm_display.plot()
       

In [None]:
 plt.show()

In [None]:
# SVM with a linear kernel

In [None]:
linear_svc= svm.SVC(kernel='linear', C=1.0) 

In [None]:
linear_svc.fit(X_train,y_train)

In [None]:
y_pred_test=linear_svc.predict(X_test)

In [None]:
 # Accuracy plus macro-averaged precision, recall and F1 of `y_pred_test` (linear-kernel SVC).
 print(
     "Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n"
     % (
         accuracy_score(y_test, y_pred_test),
         precision_score(y_test, y_pred_test, average='macro'),
         recall_score(y_test, y_pred_test, average='macro'),
         f1_score(y_test, y_pred_test, average='macro'),
     )
 )
    

In [None]:
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_test)

In [None]:
# Display wrapper for the confusion matrix computed in the previous cell.
# display_labels are applied in class order (presumably 0=negative, 1=neutral, 2=positive).
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['negative', 'neutral', 'positive'])


In [None]:
 cm_display.plot()

In [None]:
 plt.show()

In [None]:
# SVM with a polynomial kernel

In [None]:
poly_svc=SVC(kernel='poly', C=1.0)

In [None]:
poly_svc.fit(X_train,y_train)

In [None]:
y_pred_poly=poly_svc.predict(X_test)

In [None]:
 # Accuracy plus macro-averaged precision, recall and F1 of `y_pred_poly` (polynomial-kernel SVC).
 print(
     "Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n"
     % (
         accuracy_score(y_test, y_pred_poly),
         precision_score(y_test, y_pred_poly, average='macro'),
         recall_score(y_test, y_pred_poly, average='macro'),
         f1_score(y_test, y_pred_poly, average='macro'),
     )
 )
    

In [None]:
# Confusion matrix of the polynomial-kernel SVC predictions on the test split.
# NOTE: the variable `confusion_matrix` shadows the function of the same name imported
# from sklearn.metrics in the first cell, hence the call through `metrics.`.
confusion_matrix = metrics.confusion_matrix(y_test, y_pred_poly)
# display_labels are applied in class order (presumably 0=negative, 1=neutral, 2=positive).
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['negative', 'neutral', 'positive'])
cm_display.plot()
plt.show()

In [None]:
# Multinomial logistic regression

In [None]:
# Multiclass logistic regression. `multi_class` is left at its default: with the
# default lbfgs solver scikit-learn already fits a multinomial model when there are
# more than two classes, and passing the argument explicitly is deprecated in recent
# releases. max_iter is raised far above the library default of 100.
model = LogisticRegression(max_iter=10000)

In [None]:
model.fit(X_train, y_train)

In [None]:
mlr_predict = model.predict(X_test)

In [None]:
 # Accuracy plus macro-averaged precision, recall and F1 of `mlr_predict` (logistic regression).
 print(
     "Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n"
     % (
         accuracy_score(y_test, mlr_predict),
         precision_score(y_test, mlr_predict, average='macro'),
         recall_score(y_test, mlr_predict, average='macro'),
         f1_score(y_test, mlr_predict, average='macro'),
     )
 )
    

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
# NOTE: the variable `confusion_matrix` shadows the function of the same name imported
# from sklearn.metrics in the first cell, hence the call through `metrics.`.
confusion_matrix = metrics.confusion_matrix(y_test, mlr_predict)
# display_labels are applied in class order (presumably 0=negative, 1=neutral, 2=positive).
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['negative', 'neutral', 'positive'])
cm_display.plot()
plt.show()

In [None]:
# Decision tree classifier

In [None]:
# Decision tree with a fixed seed (the same value the train/test split uses) so that
# the fitted tree and the reported metrics are reproducible across runs.
dt = DecisionTreeClassifier(random_state=42)

In [None]:
dt.fit(X_train, y_train)

In [None]:
dt_predict = dt.predict(X_test) 

In [None]:
 # Accuracy plus macro-averaged precision, recall and F1 of `dt_predict` (decision tree).
 print(
     "Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n"
     % (
         accuracy_score(y_test, dt_predict),
         precision_score(y_test, dt_predict, average='macro'),
         recall_score(y_test, dt_predict, average='macro'),
         f1_score(y_test, dt_predict, average='macro'),
     )
 )
    

In [None]:
# Confusion matrix of the decision-tree predictions on the test split.
# NOTE: the variable `confusion_matrix` shadows the function of the same name imported
# from sklearn.metrics in the first cell, hence the call through `metrics.`.
confusion_matrix = metrics.confusion_matrix(y_test, dt_predict)
# display_labels are applied in class order (presumably 0=negative, 1=neutral, 2=positive).
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['negative', 'neutral', 'positive'])
cm_display.plot()
plt.show()

In [None]:
# Random forest

In [None]:
# Random forest with a fixed seed (the same value the train/test split uses) so that
# the fitted ensemble and the reported metrics are reproducible across runs.
rf = RandomForestClassifier(random_state=42)

In [None]:
rf.fit(X_train, y_train)

In [None]:
rf_predict = rf.predict(X_test)

In [None]:
 # Accuracy plus macro-averaged precision, recall and F1 of `rf_predict` (random forest).
 print(
     "Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n"
     % (
         accuracy_score(y_test, rf_predict),
         precision_score(y_test, rf_predict, average='macro'),
         recall_score(y_test, rf_predict, average='macro'),
         f1_score(y_test, rf_predict, average='macro'),
     )
 )
    

In [None]:
# Confusion matrix of the random-forest predictions on the test split.
# NOTE: the variable `confusion_matrix` shadows the function of the same name imported
# from sklearn.metrics in the first cell, hence the call through `metrics.`.
confusion_matrix = metrics.confusion_matrix(y_test, rf_predict)
# display_labels are applied in class order (presumably 0=negative, 1=neutral, 2=positive).
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['negative', 'neutral', 'positive'])
cm_display.plot()
plt.show()

In [None]:
# Multinomial naive Bayes

In [None]:
nb = MultinomialNB()

In [None]:
nb.fit(X_train, y_train)

In [None]:
nb_predict = nb.predict(X_test)

In [None]:
 # Accuracy plus macro-averaged precision, recall and F1 of `nb_predict` (multinomial naive Bayes).
 print(
     "Accuracy: %1.3f \tPrecision: %1.3f \tRecall: %1.3f \t\tF1: %1.3f\n"
     % (
         accuracy_score(y_test, nb_predict),
         precision_score(y_test, nb_predict, average='macro'),
         recall_score(y_test, nb_predict, average='macro'),
         f1_score(y_test, nb_predict, average='macro'),
     )
 )
    

In [None]:
# Confusion matrix of the naive-Bayes predictions on the test split.
# NOTE: the variable `confusion_matrix` shadows the function of the same name imported
# from sklearn.metrics in the first cell, hence the call through `metrics.`.
confusion_matrix = metrics.confusion_matrix(y_test, nb_predict)
# display_labels are applied in class order (presumably 0=negative, 1=neutral, 2=positive).
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_matrix, display_labels = ['negative', 'neutral', 'positive'])
cm_display.plot()
plt.show()