In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report
import re

In [None]:
# Load the dataset
# dataset link--> https://www.kaggle.com/datasets/arhamrumi/amazon-product-reviews/data
data=pd.read_csv('Datasets/Reviews.csv')
data.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


### Data Preprocessing

In [3]:
data.shape

(568454, 10)

In [4]:
data.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               26
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   27
Text                       0
dtype: int64

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568428 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


In [6]:
X=data['Text']
y=data['Score']

In [7]:
X

0         I have bought several of the Vitality canned d...
1         Product arrived labeled as Jumbo Salted Peanut...
2         This is a confection that has been around a fe...
3         If you are looking for the secret ingredient i...
4         Great taffy at a great price.  There was a wid...
                                ...                        
568449    Great for sesame chicken..this is a good if no...
568450    I'm disappointed with the flavor. The chocolat...
568451    These stars are small, so you can give 10-15 o...
568452    These are the BEST treats for training and rew...
568453    I am very satisfied ,product is as advertised,...
Name: Text, Length: 568454, dtype: object

In [None]:
# mapping the output feature
map_sentiment = lambda x: 0 if x <= 2 else 1 if x == 3 else 2
y=y.apply(map_sentiment)

In [13]:
y.value_counts()

Score
2    443777
0     82037
1     42640
Name: count, dtype: int64

In [None]:
# cleaning the text
def clean_text(text):
    text=text.lower()
    text=re.sub(r'<.*?>0-9', '', text)
    text=re.sub(r'[^a-z\s]','',text)
    return text

In [15]:
X_clean=X.apply(clean_text)

In [16]:
X_clean

0         i have bought several of the vitality canned d...
1         product arrived labeled as jumbo salted peanut...
2         this is a confection that has been around a fe...
3         if you are looking for the secret ingredient i...
4         great taffy at a great price  there was a wide...
                                ...                        
568449    great for sesame chickenthis is a good if not ...
568450    im disappointed with the flavor the chocolate ...
568451    these stars are small so you can give  of thos...
568452    these are the best treats for training and rew...
568453    i am very satisfied product is as advertised i...
Name: Text, Length: 568454, dtype: object

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=20000,   
    ngram_range=(1,2),
    min_df=5,
    stop_words='english',
    sublinear_tf=True
)

In [19]:
X_clean=vectorizer.fit_transform(X_clean)

In [20]:
X_clean

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 20663920 stored elements and shape (568454, 20000)>

In [21]:
y.value_counts()

Score
2    443777
0     82037
1     42640
Name: count, dtype: int64

In [None]:
# Splitting the data for training and testing
X_train,X_test,y_train,y_test=train_test_split(X_clean,y,test_size=0.20,random_state=42,stratify=y)

In [None]:
# model Implementation
multi_models = {
    "LogReg": LogisticRegression(
        solver='lbfgs',
        max_iter=1000
    ),
    "LinearSVM": LinearSVC()
}


In [None]:
# creating a function to calculate the metrics
from sklearn.metrics import classification_report, confusion_matrix, f1_score

def evaluate(model, X_test, y_test):
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print("Macro F1:", f1_score(y_test, y_pred, average='macro'))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


In [None]:
# Fitting the Model
for name,model in multi_models.items():
    model.fit(X_train,y_train)
    print(f"\n{name} — Multiclass")
    evaluate(model,X_test,y_test)



LogReg — Multiclass
              precision    recall  f1-score   support

           0       0.78      0.73      0.75     16407
           1       0.62      0.27      0.38      8528
           2       0.91      0.97      0.94     88756

    accuracy                           0.88    113691
   macro avg       0.77      0.66      0.69    113691
weighted avg       0.87      0.88      0.87    113691

Macro F1: 0.6900816475309665
Confusion Matrix:
 [[11911   670  3826]
 [ 1731  2330  4467]
 [ 1715   766 86275]]

LinearSVM — Multiclass
              precision    recall  f1-score   support

           0       0.77      0.75      0.76     16407
           1       0.66      0.28      0.40      8528
           2       0.92      0.97      0.94     88756

    accuracy                           0.89    113691
   macro avg       0.78      0.67      0.70    113691
weighted avg       0.88      0.89      0.88    113691

Macro F1: 0.6993902927032575
Confusion Matrix:
 [[12279   571  3557]
 [ 1808  240

In [None]:
# Finalized model
model=LinearSVC(class_weight='balanced')
model.fit(X_train,y_train)

evaluate(model=model,X_test=X_test,y_test=y_test)


              precision    recall  f1-score   support

           0       0.73      0.78      0.75     16407
           1       0.44      0.51      0.47      8528
           2       0.95      0.92      0.94     88756

    accuracy                           0.87    113691
   macro avg       0.70      0.74      0.72    113691
weighted avg       0.88      0.87      0.87    113691

Macro F1: 0.7196187839195267
Confusion Matrix:
 [[12827  1722  1858]
 [ 1791  4336  2401]
 [ 3036  3881 81839]]


In [None]:
# saving the model and TF-IDF as Pickle file
import pickle

with open('model.pkl','wb') as file:
    pickle.dump(model,file)

with open('vectorizer.pkl','wb') as file:
    pickle.dump(vectorizer,file)
    