### Import module

In [19]:
import pandas as pd
import requests
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import gzip
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

### Import data

In [None]:
url='http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Digital_Music_5.json.gz'

output = url.split('/')[-1]
with open(output, 'wb') as f:
    f.write(requests.get(url).content)
records = []

with gzip.open(output, 'rt', encoding='utf-8') as f:
    for line in f:
        records.append(json.loads(line))
df = pd.DataFrame(records)
print(df.head())

       reviewerID        asin          reviewerName helpful  \
0  A3EBHHCZO6V2A4  5555991584  Amaranth "music fan"  [3, 3]   
1   AZPWAXJG9OJXV  5555991584             bethtexas  [0, 0]   
2  A38IRL0X2T4DPF  5555991584           bob turnley  [2, 2]   
3  A22IK3I6U76GX0  5555991584                 Calle  [1, 1]   
4  A1AISPOIIHTHXX  5555991584           Cloud "..."  [1, 1]   

                                          reviewText  overall  \
0  It's hard to believe "Memory of Trees" came ou...      5.0   
1  A clasically-styled and introverted album, Mem...      5.0   
2  I never thought Enya would reach the sublime h...      5.0   
3  This is the third review of an irish album I w...      5.0   
4  Enya, despite being a successful recording art...      4.0   

                        summary  unixReviewTime   reviewTime  
0       Enya's last great album      1158019200  09 12, 2006  
1      Enya at her most elegant       991526400   06 3, 2001  
2               The best so far      1058

### Preprocessing

In [20]:
# Map ratings to binary sentiment (positive 4-5, negative 1-2, neutral 3)
reviews = df['reviewText'].fillna('').str.lower().str.replace(r'[^a-z0-9 ]', ' ', regex=True).str.replace(r'\s+', ' ', regex=True).str.strip()  
ratings = df['overall']
labels = ratings.apply(lambda x: 1 if x >= 4 else 0)

In [12]:
X_train, X_test, y_train, y_test = train_test_split(reviews, labels, test_size=0.3)

vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

### Building Model

In [21]:
# (Logistic Regression)
lg_model = LogisticRegression(max_iter=2000)
lg_model.fit(X_train_vec, y_train)

### Evaluation

In [22]:
y_pred = lg_model.predict(X_test_vec)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8706985369874305
Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.45      0.58      3772
           1       0.88      0.97      0.92     15640

    accuracy                           0.87     19412
   macro avg       0.84      0.71      0.75     19412
weighted avg       0.86      0.87      0.86     19412



### Other models

#### Random Forest

In [23]:
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=42
)
rf_model.fit(X_train_vec, y_train)

y_pred_rf = rf_model.predict(X_test_vec)

print("RandomForest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))


RandomForest Accuracy: 0.8421594889758912
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.24      0.37      3772
           1       0.84      0.99      0.91     15640

    accuracy                           0.84     19412
   macro avg       0.83      0.61      0.64     19412
weighted avg       0.84      0.84      0.81     19412



#### SVM

In [24]:
svm_model = LinearSVC(class_weight='balanced', max_iter=2000)
svm_model.fit(X_train_vec, y_train)

y_pred_svm = svm_model.predict(X_test_vec)

print("LinearSVC Accuracy:", accuracy_score(y_test, y_pred_svm))
print("Classification Report:\n", classification_report(y_test, y_pred_svm))


LinearSVC Accuracy: 0.8120234906243561
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.74      0.60      3772
           1       0.93      0.83      0.88     15640

    accuracy                           0.81     19412
   macro avg       0.72      0.78      0.74     19412
weighted avg       0.85      0.81      0.82     19412



#### K-Nearest neighbors

In [25]:
knn_model = KNeighborsClassifier(
    n_neighbors=5,   
    weights='uniform',  # 'uniform'
    n_jobs=-1
)

knn_model.fit(X_train_vec, y_train)
y_pred_knn = knn_model.predict(X_test_vec)

print("KNN Accuracy:", accuracy_score(y_test, y_pred_knn))
print("Classification Report:\n", classification_report(y_test, y_pred_knn))


KNN Accuracy: 0.7276426952400576
Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.40      0.36      3772
           1       0.85      0.81      0.83     15640

    accuracy                           0.73     19412
   macro avg       0.59      0.60      0.59     19412
weighted avg       0.75      0.73      0.74     19412

