In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

import nltk

# Add /usr/share/nltk_data to the list of locations NLTK searches for data.
nltk.data.path.append("/usr/share/nltk_data")
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
# Load the book ratings CSV from the Kaggle input dataset and preview the first rows.
br = pd.read_csv('/kaggle/input/amazon-books-reviews/Books_rating.csv')
br.head()

In [None]:
# Load the book metadata CSV from the same dataset and preview the first rows.
bd = pd.read_csv('/kaggle/input/amazon-books-reviews/books_data.csv')
bd.head()

In [None]:
# Inner-join the ratings with the book metadata on the shared 'Title' column
# (inner is the default join type), then show the merged frame's dimensions.
books = br.merge(bd, how='inner', on='Title')
books.shape

In [None]:
# Restrict the merged frame to the four columns used by the rest of the notebook.
keep_cols = ['Title', 'review/score', 'review/text', 'categories']
df = books[keep_cols]
df.head()

In [None]:
# Remove exact duplicate rows. Reassigning the result (instead of
# inplace=True) avoids mutating a frame that was derived from `books`,
# which triggers pandas' SettingWithCopyWarning.
df = df.drop_duplicates()
df.shape

In [None]:
# Drop every row that has a missing value in any column. Reassigning the
# result (instead of inplace=True) avoids pandas' SettingWithCopyWarning on
# a derived frame.
df = df.dropna()
# Per-column count of remaining missing values.
df.isna().sum()

In [None]:
# Display the current (rows, columns) of df.
df.shape

In [None]:
# Keep rows whose category string contains "fiction" in any letter case.
# Note that this substring test also matches "Nonfiction".
is_fiction = df['categories'].str.contains('Fiction', case=False, na=False)
df = df[is_fiction]
df.shape

In [None]:
# Discard rows whose category string contains "nonfiction" in any letter case.
is_nonfiction = df['categories'].str.contains('Nonfiction', case=False, na=False)
df = df[~is_nonfiction]
df.shape

In [None]:
# Replace each category string with the text between its first and last
# single quote (the pattern is greedy). expand=False makes str.extract return
# a Series rather than a one-column DataFrame, and assign() builds a new frame
# instead of writing into a filtered one (avoids SettingWithCopyWarning).
df = df.assign(categories=df['categories'].str.extract(r"'(.*)'", expand=False))
df.head()

In [None]:
# Count whitespace-separated words per review. str.split() with no argument
# splits on any run of whitespace, so repeated spaces, tabs and newlines do
# not distort the count the way split(' ') does (which yields empty strings
# for consecutive spaces and does not split on newlines).
# assign() returns a new frame instead of writing into a filtered one.
df = df.assign(word_count=df['review/text'].str.split().str.len())
df.head()

In [None]:
# Negative class: 1 or 2 stars
neg_df = df[df['review/score'].isin([1, 2])]

# Neutral class: 3 stars
neu_df = df[df['review/score'] == 3]

# Positive class: 4 or 5 stars
pos_df = df[df['review/score'].isin([4, 5])]

# Class sizes, in order: negative, neutral, positive
print(len(neg_df), len(neu_df), len(pos_df))

In [None]:
# Draw 10,000 rows from each sentiment class using the same fixed seed.
neg_10k, neu_10k, pos_10k = (
    class_df.sample(n=10000, random_state=42)
    for class_df in (neg_df, neu_df, pos_df)
)

In [None]:
# Stack the three class samples, shuffle all rows with a fixed seed,
# and renumber the index from 0.
balanced_df = (
    pd.concat([neg_10k, neu_10k, pos_10k])
    .sample(frac=1, random_state=42)  # Shuffle
    .reset_index(drop=True)
)

In [None]:
def map_label(score):
    """Convert a star rating into a sentiment class id.

    Args:
        score: Review score; 1 and 2 are negative, 3 is neutral.

    Returns:
        int: 0 (negative), 1 (neutral) or 2 (positive). Any value that is
        not 1, 2 or 3 is mapped to 2.
    """
    if score == 3:
        return 1  # Neutral
    # Negative for 1-2 stars, otherwise positive
    return 0 if score in (1, 2) else 2

# Derive the class id for every review from its star rating.
balanced_df['label'] = balanced_df['review/score'].map(map_label)

In [None]:
# Preview the balanced frame, including the newly added 'label' column.
balanced_df.head()

In [None]:
# Write the balanced frame to the working directory, without the index column.
balanced_df.to_csv('balanced30k.csv', index=False)

In [None]:
import nltk

# nltk.download() returns True on success and False on failure, so collect the
# failures instead of unconditionally announcing success.
nltk_resources = ['averaged_perceptron_tagger_eng', 'punkt', 'wordnet']
failed_resources = [name for name in nltk_resources if not nltk.download(name)]

if failed_resources:
    print(f"Failed to download NLTK data: {failed_resources}")
else:
    print("All NLTK data downloaded!")


In [None]:
import re
from nltk.stem import WordNetLemmatizer
# Single shared lemmatizer instance, used by clean_text_ml below.
lemmatizer = WordNetLemmatizer()

def clean_text_ml(text):
    """Normalize one raw review into a lowercase, lemmatized token string.

    Processing order: lowercase; replace HTML tags with a space; remove URLs;
    remove @mentions and '#' characters; remove remaining punctuation
    (underscores are kept because they count as word characters); remove
    digits; split on whitespace; lemmatize every token with the module-level
    ``lemmatizer``; re-join with single spaces.

    Args:
        text: Raw review text. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        str: The cleaned, space-separated tokens ('' for missing input).
    """
    # Missing values would otherwise turn into the literal tokens 'none'/'nan'.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Substitute a space (not '') so that words separated only by a tag,
    # e.g. "great<br />book", are not fused into a single token.
    text = re.sub(r'<.*?>', ' ', text)                     # Remove HTML tags
    text = re.sub(r'http\S+|www\S+', '', text)             # Remove URLs ('http\S+' also matches https)
    text = re.sub(r'@\w+|\#', '', text)                    # Remove @mentions and '#' signs (the hashtag word is kept)
    text = re.sub(r'[^\w\s]', '', text)                    # Remove punctuation except underscore
    text = re.sub(r'\d+', '', text)                        # Remove digits/numbers
    # Lemmatize. split() with no argument discards leading/trailing whitespace
    # and collapses runs of whitespace, so no separate normalization is needed.
    tokens = text.split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

In [None]:
# Clean every review and store the result in a new column, then preview.
balanced_df['cleaned_text_ml'] = balanced_df['review/text'].map(clean_text_ml)
balanced_df.head()

In [None]:
# Write the frame, now including the cleaned text column, without the index column.
balanced_df.to_csv('balanced30k_SVM_cleaned.csv', index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review texts; targets are the class ids.
X, y = balanced_df['cleaned_text_ml'], balanced_df['label']

# Stage 1: hold out 30% of the rows, stratified by label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Stage 2: split the hold-out part 2:1 into validation and test sets,
# again stratified by label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=1/3,
    stratify=y_temp,
    random_state=42,
)

# Report each split's size and its share of the full data set.
print(f"Train size: {len(X_train)} ({len(X_train)/len(X):.1%})")
print(f"Val size:   {len(X_val)} ({len(X_val)/len(X):.1%})")
print(f"Test size:  {len(X_test)} ({len(X_test)/len(X):.1%})")

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Define grid manually: each dict is one TF-IDF + LinearSVC configuration.
# The commented-out entries are alternative configurations that are disabled.
param_grid = [
    #{'C': 0.01, 'stop_words': 'english', 'ngram_range': (1, 2), 'max_features': 20000},
    #{'C': 0.005, 'stop_words': 'english', 'ngram_range': (1, 2), 'max_features': 20000},
    #{'C': 0.005, 'stop_words': None, 'ngram_range': (1, 2), 'max_features': 15000},
    #{'C': 0.01,   'stop_words': None, 'ngram_range': (1, 2), 'max_features': 20000},
    {'C': 0.01,   'stop_words': None, 'ngram_range': (1, 2), 'max_features': 20000}
]

# Start below any attainable macro-F1 (which is >= 0) so the first evaluated
# configuration is always recorded and best_model is never left as None.
best_f1 = float('-inf')
best_model = None
best_config = None

for params in param_grid:
    print(f"Testing config: {params}")

    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=params['max_features'],
            ngram_range=params['ngram_range'],
            stop_words=params['stop_words']
        )),
        # Fixed seed so repeated runs give the same model; 42 matches the
        # seed used for sampling and splitting elsewhere in the notebook.
        ('svm', LinearSVC(C=params['C'], random_state=42))
    ])

    # Fit on the training split only; model selection uses the validation split.
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)

    report = classification_report(y_val, y_val_pred, output_dict=True, zero_division=0)
    macro_f1 = report['macro avg']['f1-score']
    print(f"Macro F1: {macro_f1:.4f}")

    # Keep the configuration with the highest validation macro-F1.
    if macro_f1 > best_f1:
        best_f1 = macro_f1
        best_model = pipeline
        best_config = params

print("\n Best config found:")
print(best_config)



In [None]:
# Import before first use so this cell does not depend on an earlier cell
# having already imported classification_report.
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the training set using the best pipeline
y_train_pred = best_model.predict(X_train)

# Print classification report
print("Training Performance:")
print(classification_report(y_train, y_train_pred, zero_division=0))

# Predict on validation set using the best pipeline
y_val_pred = best_model.predict(X_val)

# Print classification report
print("Validation Performance:")
print(classification_report(y_val, y_val_pred, zero_division=0))

In [None]:
# Final evaluation on the test set
# Final evaluation on the test set
# y_test_pred is reused by the confusion-matrix cell below.
y_test_pred = best_model.predict(X_test)
print("Test Set Classification Report (final):")
print(classification_report(y_test, y_test_pred, zero_division=0))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

class_names = ['Negative', 'Neutral', 'Positive']

# Row-normalized matrix: each row (true class) sums to 1
cm_norm = confusion_matrix(y_test, y_test_pred, labels=[0, 1, 2], normalize='true')
disp_norm = ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=class_names)
disp_norm.plot(values_format='.2f', cmap='Blues')

# Use the display's own Axes/Figure rather than pyplot's implicit current ones
disp_norm.ax_.set_title('Confusion Matrix (Test Set)')
disp_norm.figure_.tight_layout()
disp_norm.figure_.savefig('confusion_matrix_normalized.png', dpi=300, bbox_inches='tight')
plt.show()