In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
import numpy as np
import math

df = pd.read_csv("prg3.csv")
X = df['summary']
y = df['overall'].apply(lambda x: 1 if x >= 4 else (0 if x <= 2 else 2))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

stop_words = set(stopwords.words('english'))
preprocess = lambda text: ' '.join([word.lower() for word in word_tokenize(text) if word.isalpha() and word.lower() not in stop_words])
X_train = X_train.apply(preprocess)
X_test = X_test.apply(preprocess)

In [2]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A2IBPI20UZIR0U,1384719342,"cassandra tu ""Yeah, well, that's just like, u...","[0, 0]","Not much to write about here, but it does exac...",5.0,good,1393545600,"02 28, 2014"
1,A14VAT5EAX3D9S,1384719342,Jake,"[13, 14]",The product does exactly as it should and is q...,5.0,Jake,1363392000,"03 16, 2013"
2,A195EZSQDW3E21,1384719342,"Rick Bennette ""Rick Bennette""","[1, 1]",The primary job of this device is to block the...,5.0,It Does The Job Well,1377648000,"08 28, 2013"
3,A2C00NNG1ZQQG2,1384719342,"RustyBill ""Sunday Rocker""","[0, 0]",Nice windscreen protects my MXL mic and preven...,5.0,GOOD WINDSCREEN FOR THE MONEY,1392336000,"02 14, 2014"
4,A94QU4C90B1AX,1384719342,SEAN MASLANKA,"[0, 0]",This pop filter is great. It looks and perform...,5.0,No more pops when I record my vocals.,1392940800,"02 21, 2014"


In [3]:
def calculate_tf(term, document):
    words = document.split()
    return words.count(term) / (len(words) + 1)

def calculate_idf(term, documents):
    document_containing_term = sum(1 for document in documents if term in document.split())
    return math.log(len(documents) / (document_containing_term + 1)) if document_containing_term > 0 else 0

all_documents = X_train.tolist() + X_test.tolist()
idf_values = {term: calculate_idf(term, all_documents) for term in set(' '.join(all_documents).split())}

vocabulary = sorted(list(idf_values.keys()))

X_train_tfidf_manual = []
for document in X_train:
    tfidf_vector = [calculate_tf(term, document) * idf_values[term] for term in vocabulary]
    X_train_tfidf_manual.append(tfidf_vector)

X_test_tfidf_manual = []
for document in X_test:
    tfidf_vector = [calculate_tf(term, document) * idf_values[term] for term in vocabulary]
    X_test_tfidf_manual.append(tfidf_vector)

X_train_tfidf_manual = np.array(X_train_tfidf_manual)
X_test_tfidf_manual = np.array(X_test_tfidf_manual)

In [4]:
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf_manual, y_train)
y_pred = model.predict(X_test_tfidf_manual)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8840532640467684


In [5]:
new_text = "It's good"

new_text_tfidf_manual = [calculate_tf(term, preprocess(new_text)) * idf_values[term] for term in vocabulary]
predicted_sentiment = model.predict([new_text_tfidf_manual])

sentiment_mapping = {0: "Negative", 1: "Positive", 2: "Neutral"}
predicted_sentiment_label = sentiment_mapping.get(predicted_sentiment[0], "Unknown")

print("Predicted Sentiment:", predicted_sentiment_label)

Predicted Sentiment: Positive


In [6]:
new_text = "It's okay"

new_text_tfidf_manual = [calculate_tf(term, preprocess(new_text)) * idf_values[term] for term in vocabulary]
predicted_sentiment = model.predict([new_text_tfidf_manual])

sentiment_mapping = {0: "Negative", 1: "Positive", 2: "Neutral"}
predicted_sentiment_label = sentiment_mapping.get(predicted_sentiment[0], "Unknown")

print("Predicted Sentiment:", predicted_sentiment_label)

Predicted Sentiment: Neutral


In [11]:

    # Predict on new text
    new_text = "It's worst"
    new_text_tfidf_manual = [calculate_tf(term, preprocess(new_text)) * idf_values[term] for term in vocabulary]
    predicted_sentiment = model.predict([new_text_tfidf_manual])
    print("Predicted Sentiment:", "Positive" if predicted_sentiment[0] == 1 else "Negative")

Predicted Sentiment: Negative
