In [52]:
# Libaries
import pandas as pd
import numpy as np
from b2_tokenizer_input import CustomTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [53]:
# Read Dataset In
reviews = pd.read_csv("product_review.csv")
reviews = reviews[["body", "rating"]]

In [54]:
# Sentiment based on rating
def preSentiment(ratings):
    sentiments = []

    for i in ratings:
        if i > 3.0:
            sentiment = 1
        else:
            sentiment = 0
            
        sentiments.append(sentiment)
    return sentiments

reviews["rating"] = pd.to_numeric(reviews["rating"], errors='coerce')
reviews["sentiment"] = preSentiment(reviews["rating"])


In [55]:
# Remove null values
reviews = reviews.dropna()

x = reviews["body"]
y = reviews["sentiment"]

token = CustomTokenizer()
vectorizer = TfidfVectorizer(tokenizer=token.text_data_cleaning)


In [56]:
# Model Training

xTrain, xTest, yTrain, yTest = train_test_split(x,y,stratify=reviews["rating"],random_state=0)


classifier = LinearSVC()
pipeline = Pipeline([("vctr", vectorizer), ("cls", classifier)])
pipeline.fit(xTrain, yTrain)



In [57]:
# Check performances

y_pred = pipeline.predict(xTest)
confusion_matrix(yTest, y_pred)
print(classification_report(yTest, y_pred))



              precision    recall  f1-score   support

           0       0.82      0.54      0.65        69
           1       0.92      0.98      0.95       353

    accuracy                           0.91       422
   macro avg       0.87      0.76      0.80       422
weighted avg       0.90      0.91      0.90       422

