# Tweets Sentiment Classification

### By: Soorya Parthiban

## Problem Statement: Build a machine learning model to classify the tweet sentiments.

## Importing Library & Dataset

In [62]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soory\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soory\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\soory\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
twitter_df = pd.read_csv(r"D:\001_Data\NLP\Data\twitter_training.csv")

In [10]:
twitter_df.head()

Unnamed: 0,ID,Sentiment,Tweet
0,2401,Positive,im getting on borderlands and i will murder yo...
1,2401,Positive,I am coming to the borders and I will kill you...
2,2401,Positive,im getting on borderlands and i will kill you ...
3,2401,Positive,im coming on borderlands and i will murder you...
4,2401,Positive,im getting on borderlands 2 and i will murder ...


In [11]:
twitter_df.tail()

Unnamed: 0,ID,Sentiment,Tweet
74677,9200,Positive,Just realized that the Windows partition of my...
74678,9200,Positive,Just realized that my Mac window partition is ...
74679,9200,Positive,Just realized the windows partition of my Mac ...
74680,9200,Positive,Just realized between the windows partition of...
74681,9200,Positive,Just like the windows partition of my Mac is l...


In [13]:
twitter_df['Sentiment'].describe().T

count        74682
unique           4
top       Negative
freq         22542
Name: Sentiment, dtype: object

In [14]:
twitter_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID         74682 non-null  int64 
 1   Sentiment  74682 non-null  object
 2   Tweet      73996 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.7+ MB


In [15]:
twitter_df.isnull().sum()

ID             0
Sentiment      0
Tweet        686
dtype: int64

In [16]:
twitter_df.dropna(inplace=True)

In [17]:
twitter_df.isnull().sum()

ID           0
Sentiment    0
Tweet        0
dtype: int64

In [19]:
twitter_df['Sentiment'].value_counts()

Negative      22358
Positive      20655
Neutral       18108
Irrelevant    12875
Name: Sentiment, dtype: int64

## Data Pre-Processing

In [20]:
twitter_df['Sentiment'] = twitter_df['Sentiment'].str.lower()


def remove_special_characters(text, remove_digits=True):
  pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
  text = re.sub(pattern, '', str(text))
  return text

twitter_df['Sentiment'] = twitter_df['Sentiment'].apply(remove_special_characters, remove_digits=False)

def f(r):
  wnl = WordNetLemmatizer()
  words = nltk.word_tokenize(r)
  lemmatized_words = [wnl.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
  return " ".join(lemmatized_words)

twitter_df['Sentiment'] = twitter_df['Sentiment'].apply(f)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
data_X = cv.fit_transform(twitter_df['Sentiment']).toarray()
data_X = pd.DataFrame(data_X, columns=cv.get_feature_names())

In [23]:
X = data_X
y = twitter_df.Sentiment

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4)

In [25]:
print(X_train.shape, X_val.shape)

(44397, 4) (29599, 4)


## Building the ML Models

In [59]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

### 1. Decision Tree  

In [27]:
from sklearn.tree import DecisionTreeClassifier

In [28]:
model1 = DecisionTreeClassifier()

In [29]:
model1.fit(X_train, y_train)

DecisionTreeClassifier()

In [30]:
y_preds_model1 = model1.predict(X_val)

In [31]:
print("Decision Tree model's accuracy: ", accuracy_score(y_val, y_preds_model1))

Decision Tree model's accuracy:  1.0


In [32]:
print("Decision Tree model's F1 Score: ", f1_score(y_val, y_preds_model1, average='weighted')) 

Decision Tree model's F1 Score:  1.0


In [33]:
print("Decision Tree model's Classification Report: \n", classification_report(y_val, y_preds_model1))

Decision Tree model's Classification Report: 
               precision    recall  f1-score   support

  irrelevant       1.00      1.00      1.00      5195
    negative       1.00      1.00      1.00      8967
     neutral       1.00      1.00      1.00      7179
    positive       1.00      1.00      1.00      8258

    accuracy                           1.00     29599
   macro avg       1.00      1.00      1.00     29599
weighted avg       1.00      1.00      1.00     29599



### 2. Random Forest  

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
model2 = RandomForestClassifier(n_estimators=3000)

In [36]:
model2.fit(X_train, y_train)

RandomForestClassifier(n_estimators=3000)

In [37]:
y_preds_model2 = model2.predict(X_val)

In [38]:
print("Random Forest model's accuracy: ", accuracy_score(y_val, y_preds_model2))

Random Forest model's accuracy:  1.0


In [39]:
print("Random Forest model's F1 Score: ", f1_score(y_val, y_preds_model2, average='weighted')) 

Random Forest model's F1 Score:  1.0


In [40]:
print("Random Forest model's Classification Report: \n", classification_report(y_val, y_preds_model2))

Random Forest model's Classification Report: 
               precision    recall  f1-score   support

  irrelevant       1.00      1.00      1.00      5195
    negative       1.00      1.00      1.00      8967
     neutral       1.00      1.00      1.00      7179
    positive       1.00      1.00      1.00      8258

    accuracy                           1.00     29599
   macro avg       1.00      1.00      1.00     29599
weighted avg       1.00      1.00      1.00     29599



### 3. Extra Tree  

In [41]:
from sklearn.ensemble import ExtraTreesClassifier

In [42]:
model3 = ExtraTreesClassifier(n_estimators=3000)

In [43]:
model3.fit(X_train, y_train)

ExtraTreesClassifier(n_estimators=3000)

In [44]:
y_preds_model3 = model3.predict(X_val)

In [45]:
print("Extra Tree model's accuracy: ", accuracy_score(y_val, y_preds_model3))

Extra Tree model's accuracy:  1.0


In [46]:
print("Extra Tree model's F1 Score: ", f1_score(y_val, y_preds_model3, average='weighted')) 

Extra Tree model's F1 Score:  1.0


In [47]:
print("Extra Tree model's Classification Report: \n", classification_report(y_val, y_preds_model3))

Extra Tree model's Classification Report: 
               precision    recall  f1-score   support

  irrelevant       1.00      1.00      1.00      5195
    negative       1.00      1.00      1.00      8967
     neutral       1.00      1.00      1.00      7179
    positive       1.00      1.00      1.00      8258

    accuracy                           1.00     29599
   macro avg       1.00      1.00      1.00     29599
weighted avg       1.00      1.00      1.00     29599



### 4. XGB Classifier  

In [52]:
from xgboost import XGBClassifier

In [53]:
model4 = XGBClassifier(n_estimators=500)

In [54]:
model4.fit(X_train, y_train)



XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=500, n_jobs=8, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [55]:
y_preds_model4 = model4.predict(X_val)

In [56]:
print("XGBoost model's accuracy: ", accuracy_score(y_val, y_preds_model4))

XGBoost model's accuracy:  1.0


In [57]:
print("XGBoost model's F1 Score: ", f1_score(y_val, y_preds_model4, average='weighted')) 

XGBoost model's F1 Score:  1.0


In [58]:
print("XGBoost model's Classification Report: \n", classification_report(y_val, y_preds_model4))

XGBoost model's Classification Report: 
               precision    recall  f1-score   support

  irrelevant       1.00      1.00      1.00      5195
    negative       1.00      1.00      1.00      8967
     neutral       1.00      1.00      1.00      7179
    positive       1.00      1.00      1.00      8258

    accuracy                           1.00     29599
   macro avg       1.00      1.00      1.00     29599
weighted avg       1.00      1.00      1.00     29599

