## Data Preprocessing

In [3]:
import pandas as pd
import numpy as np

In [4]:
train_df = pd.read_csv("data/Train.csv")
valid_df = pd.read_csv("data/Valid.csv")
test_df = pd.read_csv("data/Test.csv")

In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    40000 non-null  object
 1   label   40000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 625.1+ KB


In [6]:
valid_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.3+ KB


In [7]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5000 non-null   object
 1   label   5000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 78.3+ KB


In [8]:
# Convert label to 8 bit since it takes either 1 or 0 and int8 is the smallest.
train_df["label"].astype("int8")
valid_df["label"].astype("int8")
test_df["label"].astype("int8")

0       0
1       0
2       0
3       1
4       0
       ..
4995    1
4996    1
4997    0
4998    0
4999    0
Name: label, Length: 5000, dtype: int8

In [9]:
train_df["label"].value_counts()

label
0    20019
1    19981
Name: count, dtype: int64

> Data is balanced

### Text Cleaning

In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\dhiab\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dhiab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dhiab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
from re import sub

lemmatizer = WordNetLemmatizer()
stop_words = stopwords.words('english')

""" Function to remove puntctuation, numbers and stopwords then apply lemmatization"""
def clean_text(text):
    text = text.lower()
    text = sub(r'[^\w\s]', '', text) # Remove punctuation
    text = sub(r'[\d+]', '', text) # Remove numbers
    
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return " ".join(words)

In [10]:
train_df["cleaned_text"] = train_df["text"].apply(clean_text)
valid_df["cleaned_text"] = valid_df["text"].apply(clean_text)
test_df["cleaned_text"] = test_df["text"].apply(clean_text)

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - 'C:\\Users\\dhiab/nltk_data'
    - 'c:\\Users\\dhiab\\AppData\\Local\\Programs\\Python\\Python311\\nltk_data'
    - 'c:\\Users\\dhiab\\AppData\\Local\\Programs\\Python\\Python311\\share\\nltk_data'
    - 'c:\\Users\\dhiab\\AppData\\Local\\Programs\\Python\\Python311\\lib\\nltk_data'
    - 'C:\\Users\\dhiab\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


#### Feature Extraction
##### We will use BoW (Bag of Words) to represent each sentence as a vector.

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df["cleaned_text"])
X_test = vectorizer.transform(test_df["cleaned_text"])
X_valid = vectorizer.transform(valid_df["cleaned_text"])
y_train = train_df['label']
y_valid = valid_df['label']
y_test  = test_df['label']

In [13]:
np.shape(X_test)

(5000, 145288)

#### Model selection

> Our goal is to predict whether the text provided is positive or negative , so we have a binary classification problem , thus we will try some classifications models.

In [14]:
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

In [15]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
from sklearn.svm import LinearSVC
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)



### Evaluation

> Naive Bayes

In [None]:
from sklearn.metrics import accuracy_score, classification_report

y_pred = nb_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.8624
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      2495
           1       0.88      0.84      0.86      2505

    accuracy                           0.86      5000
   macro avg       0.86      0.86      0.86      5000
weighted avg       0.86      0.86      0.86      5000



> Logistic Regression

In [None]:
y_pred = lr_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.889
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      2495
           1       0.88      0.90      0.89      2505

    accuracy                           0.89      5000
   macro avg       0.89      0.89      0.89      5000
weighted avg       0.89      0.89      0.89      5000



> SVM

In [None]:
y_pred = svm_model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8746
              precision    recall  f1-score   support

           0       0.88      0.87      0.87      2495
           1       0.87      0.88      0.88      2505

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000



> Random Forest

> In conclusion, after evaluating four different models for sentiment analysis—Naive Bayes (NB), Logistic Regression (LR), Support Vector Machine (SVM) we found that all models performed similarly, with accuracy scores closely clustered around the 0.86–0.89 range. Logistic Regression achieved the highest accuracy at 0.889, indicating it may be the most effective model for this dataset. However, the difference in performance between the models is minimal, suggesting that each approach is fairly robust for this task. Given the computational efficiency and simplicity of Naive Bayes, it may be a strong candidate when quick predictions are needed with minimal resources. Meanwhile, Logistic Regression or SVM might be preferable for scenarios where slightly higher accuracy is prioritized.

In [None]:
# I will proceed with LR model for my app , so let's save it
import pickle
with open("twitter_sentiment_analysis_LR.pkl", "wb") as file:
    pickle.dump(lr_model, file)

In [None]:
# Save the vectorizer as well so we can use it later
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)