# Twitter Sentiment Analysis
   * Dataset used : [Sentiment140 dataset](https://www.kaggle.com/kazanova/sentiment140).  It contains 1,600,000 tweets extracted using the twitter api . The tweets have been annotated (0 = negative, 4 = positive) and they can be used to detect sentiment .
   
   * We have tried to imitate the model architecture described in [this paper](https://www.itm-conferences.org/articles/itmconf/abs/2021/02/itmconf_icitsd2021_01012/itmconf_icitsd2021_01012.html).
   
   * This model will be used in further analysis of Twitter User Data to generate a measure of thier trust and affluence.

## Importing the libraries

In [1]:
# For working with data
import pandas as pd
import numpy as np

# Removing stopwords and Stemming
import nltk

# Time taken by functions
import time

# Cleaning Tweets
import re

## Importing the dataset
* Reading the Train CSV file
* Dropping a few useless columns
* Mapping Positive (4) to 1 and Negative (0) to 0
* Shuffling the data

In [2]:
# Reading the Train CSV file

raw_data = pd.read_csv("./train.csv" , header = None , encoding = 'latin')

# Dropping a few useless columns

raw_data.columns = ['Y', 'A', 'B', 'C', 'D', 'X']
raw_data = raw_data.drop(['A', 'B', 'C', 'D'], axis=1)

# Mapping Positive (4) to 1 and Negative (0) to 0

raw_data['Y'] = raw_data['Y'].map({4 : 1 , 0 : 0})
raw_data['Y'].value_counts()

# Shuffling the data

raw_data = raw_data.sample(frac=1).reset_index(drop=True)

## Preprocessing the Tweets
* Cleaning the Tweets
    * Lowercasing
    * Removing all the urls, user tags, hashtags and some punctuations.
    * Stemming and removing spaces
* Train-Test Split
* Removing very rare words ( <= 60 occurences ).
* Splitting into Targets and Features.
* Count Vectorizing and TF-IDF Vectorizing

In [3]:
# Declaring the function
def process_text(text):

    # Lowercasing
    text = text.lower()

    # Replacing all the urls
    text = re.sub('(?i)\\b((?:https?://|www\\d{0,3}[.]|[a-z0-9.\\-]+[.][a-z]{2,4}/)(?:[^\\s()<>]+|\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\))+(?:\\(([^\\s()<>]+|(\\([^\\s()<>]+\\)))*\\)|[^\\s`!()\\[\\]{};:\'\\".,<>?\xc2\xab\xc2\xbb\xe2\x80\x9c\xe2\x80\x9d\xe2\x80\x98\xe2\x80\x99]))'
               , '', text)

    # Replacing all user tags
    text = re.sub(r"@[^\s]+", '', text)

    # Replacing all hashtags
    text = re.sub(r"#[^\s]+", '', text)

    # Remove some punctuations
    text = re.sub(r"[!?,'\"*)@#%(&$_.^-]", '', text)

    # Splitting on spaces
    text = text.split(' ')

    # Stemming and removing spaces
    stemmer_ps = nltk.stem.PorterStemmer()  
    text = [stemmer_ps.stem(word) for word in text if len(word)]

    return text

start = time.time()

raw_data['X'] = raw_data['X'].map(process_text)

end = time.time()
print(end - start)

763.4864540100098


In [4]:
# Train Test Split

from sklearn.model_selection import train_test_split

Train, Test = train_test_split(raw_data , stratify = raw_data["Y"], test_size=0.02)

Train = Train.to_numpy()
Test = Test.to_numpy()

del raw_data

In [5]:
# Creating the word frequency dictionary to remove very rare words.
wordfreq = {}

start = time.time()

for _ , text in Train:
    for w in text:
        if w not in wordfreq:
            wordfreq[w] = 1
    else:
        wordfreq[w] += 1

end = time.time()
print(end - start)

9.388003587722778


In [6]:
# Getting words that have occured more than 75 times in tweets

words = [word for word in wordfreq if wordfreq[word] <= 60]
low_words = len(words)
total_words = len(wordfreq)
print(low_words , total_words - low_words , (low_words / total_words * 100) , ((total_words - low_words) / total_words * 100))

398352 2439 99.39145340089972 0.6085465991002792


In [7]:
# Removing the low occurence words from the wordfreq dictionaries

for word in words:
    if word in wordfreq: 
        del wordfreq[word]

In [8]:
# Removing the low occurence words from the tweets

start = time.time()

for i in range(len(Train)):
    Train[i][1] = [ word for word in Train[i][1] if word in wordfreq]

end = time.time()
print(end - start)

7.084949731826782


In [9]:
# Joining for Vectorizer

start = time.time()

for i in range(len(Train)):
    Train[i][1] = " ".join(Train[i][1])

end = time.time()
print(end - start)

3.3390800952911377


In [10]:
# Creating the targets and labels for train

X = []
Y = []

for i in range(len(Train)):
    if ( len(Train[i][1]) > 2 ):
        X.append(Train[i][1])
        Y.append(Train[i][0])

X = np.array(X)
Y = np.array(Y)

In [11]:
# Vectoritzing 

from sklearn.feature_extraction.text import CountVectorizer
vectorizer_CV = CountVectorizer()

X_CV = vectorizer_CV.fit_transform(X)


from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer_TFIDF = TfidfVectorizer()

X_TFIDF = vectorizer_TFIDF.fit_transform(X)

## Training the Model

In [12]:
# Importing the models

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

In [13]:
# Instantiating the models 

# Logistic Regression
lr_CV = LogisticRegression(random_state=0 , max_iter=1000)
lr_TFIDF = LogisticRegression(random_state=0 , max_iter=1000)

# Random Forests
rf_CV = RandomForestClassifier(n_estimators=200 , max_depth=2, random_state=0)
rf_TFIDF = RandomForestClassifier(n_estimators=200 , max_depth=2, random_state=0)

# Gradient Boosting 
gb_CV = GradientBoostingClassifier(n_estimators=20, max_depth=2, random_state=0)
gb_TFIDF = GradientBoostingClassifier(n_estimators=20, max_depth=2, random_state=0)

# ADABoost
adb_CV = AdaBoostClassifier(n_estimators=20, random_state=0)
adb_TFIDF = AdaBoostClassifier(n_estimators=20, random_state=0)

In [14]:
print("Logistic regression started!")
start = time.time()
clf_lr_CV = lr_CV.fit(X_CV, Y)
end = time.time()
print("Logistic regression ended in " + str(end - start) + " secs")

print("Random Forests started!")
start = time.time()
clf_rf_CV = rf_CV.fit(X_CV, Y)
end = time.time()
print("Random Forests ended in " + str(end - start) + " secs")

print("GB started!")
start = time.time()
clf_gb_CV = gb_CV.fit(X_CV, Y)
end = time.time()
print("GB ended in " + str(end - start) + " secs")

print("ADB started!")
start = time.time()
clf_adb_CV = adb_CV.fit(X_CV, Y)
end = time.time()
print("ADB ended in " + str(end - start) + " secs")

Logistic regression started!
Logistic regression ended in 100.65697360038757 secs
Random Forests started!
Random Forests ended in 95.21197438240051 secs
GB started!
GB ended in 123.39501428604126 secs
ADB started!
ADB ended in 90.28889489173889 secs


In [71]:
print("Logistic regression started!")
start = time.time()
clf_lr_TFIDF = lr_TFIDF.fit(X_TFIDF, Y)
end = time.time()
print("Logistic regression ended in " + str(end - start) + " secs")

print("Random Forests started!")
start = time.time()
clf_rf_TFIDF = rf_TFIDF.fit(X_TFIDF, Y)
end = time.time()
print("Random Forests ended in " + str(end - start) + " secs")

print("GB started!")
start = time.time()
clf_gb_TFIDF = gb_TFIDF.fit(X_TFIDF, Y)
end = time.time()
print("GB ended in " + str(end - start) + " secs")

print("ADB started!")
start = time.time()
clf_adb_TFIDF = adb_TFIDF.fit(X_TFIDF, Y)
end = time.time()
print("ADB ended in " + str(end - start) + " secs")

Logistic regression started!
Logistic regression ended in 52.55265784263611 secs
Random Forests started!
Random Forests ended in 105.24345660209656 secs
GB started!
GB ended in 225.63196730613708 secs
ADB started!
ADB ended in 162.3167896270752 secs


In [15]:
from sklearn.ensemble import VotingClassifier


# Count Vectorized Part
VC_CV_LR = LogisticRegression(random_state=0 , max_iter=1000)
VC_CV_GB = GradientBoostingClassifier(n_estimators=20, max_depth=2, random_state=0)
VC_CV_AB = AdaBoostClassifier(n_estimators=20, random_state=0)
VC_CV_RF = RandomForestClassifier(n_estimators=200 , max_depth=2, random_state=0)

VC_CV = VotingClassifier(estimators=[
    ('lr_cv', VC_CV_LR), 
    ('gb_cv', VC_CV_GB), 
    ('rf_cv', VC_CV_RF ), 
    ('ab_cv', VC_CV_AB) 
], voting='soft')

print("VC_CV started!")
start = time.time()
VC_CV = VC_CV.fit(X_CV, Y)
end = time.time()
print("VC_CV ended in " + str(end - start) + " secs")

VC_CV started!
VC_CV ended in 377.9691722393036 secs


In [16]:
# TFIDF Vectorized Part
VC_TFIDF_LR = LogisticRegression(random_state=0 , max_iter=1000)
VC_TFIDF_GB = GradientBoostingClassifier(n_estimators=20, max_depth=2, random_state=0)
VC_TFIDF_AB = AdaBoostClassifier(n_estimators=20, random_state=0)
VC_TFIDF_RF = RandomForestClassifier(n_estimators=200 , max_depth=2, random_state=0)

VC_TFIDF = VotingClassifier(estimators=[
    ('lr_tfidf', VC_TFIDF_LR), 
    ('gb_tfidf', VC_TFIDF_GB), 
    ('rf_tfidf', VC_TFIDF_RF ), 
    ('ab_tfidf', VC_TFIDF_AB) 
], voting='soft')

print("VC_TFIDF started!")
start = time.time()
VC_TFIDF = VC_TFIDF.fit(X_TFIDF, Y)
end = time.time()
print("VC_TFIDF ended in " + str(end - start) + " secs")

VC_TFIDF started!
VC_TFIDF ended in 508.7841799259186 secs


## Testing the model

In [17]:
# Preprocessing the test set

for i in range(len(Test)):
    Test[i][1] = [ word for word in Test[i][1] if word in wordfreq]

In [46]:
# Creating the targets and labels for train

X_test = []
Y_test = []

for i in range(len(Test)):
    X_test.append(" ".join(Test[i][1]))
    Y_test.append(Test[i][0])

X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [47]:
# Vectorizing

X_CV_test = vectorizer_CV.transform(X_test)
X_TFIDF_test = vectorizer_TFIDF.transform(X_test)

In [48]:
# Predicting for single models

Y_pred_lr_CV = clf_lr_CV.predict(X_CV_test)
Y_pred_rf_CV = clf_rf_CV.predict(X_CV_test)
Y_pred_gb_CV = clf_gb_CV.predict(X_CV_test)
Y_pred_adb_CV = clf_adb_CV.predict(X_CV_test)

In [74]:
Y_pred_lr_TFIDF = clf_lr_TFIDF.predict(X_TFIDF_test)
Y_pred_rf_TFIDF = clf_rf_TFIDF.predict(X_TFIDF_test)
Y_pred_gb_TFIDF = clf_gb_TFIDF.predict(X_TFIDF_test)
Y_pred_adb_TFIDF = clf_adb_TFIDF.predict(X_TFIDF_test)

In [83]:
VC_CV_pred = VC_CV.predict(X_CV_test)

In [85]:
VC_TFIDF_pred = VC_TFIDF.predict(X_TFIDF_test)

In [51]:
# Importing the metrics

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [82]:
def metrics (model_name , Y_test , Y_pred ):
    print("------ Metrics for " + model_name + "------")
    print("Accuracy : " + str(accuracy_score(Y_test, Y_pred)))
    print("Precision : " + str(precision_score(Y_test, Y_pred)))
    print("Recall : " + str(recall_score(Y_test, Y_pred)))
    print("F1 Score : " + str(f1_score(Y_test, Y_pred)))
    print("------------------------------------------- ")

In [53]:
metrics("Logistic Regression", Y_test ,Y_pred_lr_CV)
metrics("Random Forest", Y_test ,Y_pred_rf_CV,)
metrics("Gradient Boosting", Y_test ,Y_pred_gb_CV)
metrics("ADABoost", Y_test ,Y_pred_adb_CV)

------ Metrics for Logistic Regression------
Accuracy : 0.7798125
Precision : 0.7661079410366144
Recall : 0.8055625
F1 Score : 0.7853399951255179
------------------------------------------- 
------ Metrics for Random Forest------
Accuracy : 0.71059375
Precision : 0.7112671640855226
Recall : 0.709
F1 Score : 0.7101317725124418
------------------------------------------- 
------ Metrics for Gradient Boosting------
Accuracy : 0.6515
Precision : 0.6154835636017151
Recall : 0.8074375
F1 Score : 0.6985131116517977
------------------------------------------- 
------ Metrics for ADABoost------
Accuracy : 0.64775
Precision : 0.6045925139368198
Recall : 0.8540625
F1 Score : 0.7079944044350034
------------------------------------------- 


In [75]:
metrics("Logistic Regression", Y_test ,Y_pred_lr_TFIDF)
metrics("Random Forest", Y_test ,Y_pred_rf_TFIDF,)
metrics("Gradient Boosting", Y_test ,Y_pred_gb_TFIDF)
metrics("ADABoost", Y_test ,Y_pred_adb_TFIDF)

------ Metrics for Logistic Regression------
Accuracy : 0.781
Precision : 0.7731138379297777
Recall : 0.7954375
F1 Score : 0.7841168135050213
------------------------------------------- 
------ Metrics for Random Forest------
Accuracy : 0.7113125
Precision : 0.7135548256695301
Recall : 0.7060625
F1 Score : 0.709788891681327
------------------------------------------- 
------ Metrics for Gradient Boosting------
Accuracy : 0.60053125
Precision : 0.7290331767051118
Recall : 0.32
F1 Score : 0.4447726186856622
------------------------------------------- 
------ Metrics for ADABoost------
Accuracy : 0.64775
Precision : 0.604666607633047
Recall : 0.8535625
F1 Score : 0.7078733219302338
------------------------------------------- 


In [86]:
metrics("Voting Classifier Count Vectorized", Y_test ,Y_pred_adb_CV)
metrics("Voting Classifier TFIDF", Y_test ,Y_pred_adb_CV)

------ Metrics for Voting Classifier Count Vectorized------
Accuracy : 0.64775
Precision : 0.6045925139368198
Recall : 0.8540625
F1 Score : 0.7079944044350034
------------------------------------------- 
------ Metrics for Voting Classifier TFIDF------
Accuracy : 0.64775
Precision : 0.6045925139368198
Recall : 0.8540625
F1 Score : 0.7079944044350034
------------------------------------------- 
