In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Fetch the NLTK resources this notebook relies on (stopword list and WordNet data).
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

In [None]:
# Load the raw review table and the product-description table.
reviews_df = pd.read_csv('/content/clothing_reviews.csv')
description_df = pd.read_csv('/content/clothing_description.csv')

# Only the join key and the sub-category are taken from the descriptions.
product_categories = description_df[['product_id', 'sub_category']]

# Left join on 'product_id': every review row is kept, and reviews without a
# matching description end up with NaN in 'sub_category'.
df = reviews_df.merge(product_categories, on='product_id', how='left')

df.head()

In [None]:
# Discard rows that have no review text; they cannot be cleaned or vectorized.
df = df.dropna(subset=['Review Text'])

In [None]:
# Keep only rows whose rating is one of 1-5 (guards against outliers).
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['Rating'].isin([1, 2, 3, 4, 5])].copy()

# Binary sentiment label derived from the rating:
# Rating 1 or 2 -> 0 (negative), 3 to 5 -> 1 (positive)
df['sentiment'] = (df['Rating'] >= 3).astype('int64')

In [None]:
# Clean review text
def clean_text(text):
    """Normalize one review string to lowercase letters and whitespace.

    Steps, in order:
      1. delete every ``<...>`` span (shortest match, so each tag is removed
         on its own),
      2. lowercase the text,
      3. delete every character that is not ``a``-``z`` or whitespace,
      4. strip leading and trailing whitespace.

    Removed characters are deleted, not replaced by a space, so the letters on
    either side of them are joined (e.g. ``"don't"`` becomes ``"dont"``).

    Args:
        text: The raw review text (a ``str``).

    Returns:
        The cleaned string.
    """
    without_tags = re.sub(r'<.*?>', '', text)
    lowered = without_tags.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return letters_only.strip()

# Run clean_text over every review and store the result in a new column.
df['cleaned_review'] = df['Review Text'].map(clean_text)

In [None]:
# Lemmatize and remove stopwords
# Built once and shared by every preprocess() call.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set for fast membership tests


def preprocess(text):
    """Remove English stopwords from ``text`` and lemmatize the remaining words.

    The text is split on whitespace; words found in ``stop_words`` are dropped
    and each remaining word is passed through ``lemmatizer.lemmatize``.

    Args:
        text: An already-cleaned review string.

    Returns:
        The kept, lemmatized words joined by single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)


df['final_review'] = df['cleaned_review'].map(preprocess)

In [None]:
# Hold out 20% of the reviews for testing before any vectorization happens.
# Stratifying on the label keeps the class ratio the same in both splits.
labels = df['sentiment']
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['final_review'],
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# TF-IDF features with the vocabulary capped at 500 terms.
tfidf = TfidfVectorizer(max_features=500)

# The vectorizer is fitted on the training text only, so nothing about the
# test text influences the learned vocabulary or weights.
xv_train = tfidf.fit_transform(X_train_text)

# The test text is only transformed with the already-fitted vectorizer.
xv_test = tfidf.transform(X_test_text)

In [None]:
from collections import Counter

# Keep a copy of the training labels as they are at this point in the notebook.
y_train_original = y_train.copy()

# Before SMOTE
print("Class distribution before SMOTE:", Counter(y_train_original), sep="\n")

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training split with SMOTE; the test split is not touched.
# Note: xv_train and y_train are rebound to the resampled data from here on.
smote = SMOTE(random_state=42)
xv_train, y_train = smote.fit_resample(xv_train, y_train)

# After SMOTE
print("\nClass distribution after SMOTE:", Counter(y_train), sep="\n")

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
# One instance per candidate model, otherwise using library-default hyperparameters.
# The randomized tree/ensemble estimators are seeded with 42 (the seed already
# used for the train/test split, SMOTE and the later ExtraTreesClassifier) so
# the model comparison gives the same numbers on every Restart & Run All.
svc = SVC()
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(random_state=42)
lrc = LogisticRegression()
rfc = RandomForestClassifier(random_state=42)
abc = AdaBoostClassifier(random_state=42)
bc = BaggingClassifier(random_state=42)
etc = ExtraTreesClassifier(random_state=42)
gbdt = GradientBoostingClassifier(random_state=42)

In [None]:
# Short display name -> classifier instance; insertion order is the order in
# which the models are trained and reported.
clfs = dict(
    SVC=svc,
    KN=knc,
    NB=mnb,
    DT=dtc,
    LR=lrc,
    RF=rfc,
    AdaBoost=abc,
    BgC=bc,
    ETC=etc,
    GBDT=gbdt,
)

In [None]:
from sklearn.metrics import accuracy_score, precision_score

# (name, accuracy, precision) for every model that trained successfully.
results = []

# Fit each candidate on the training matrix and score it on the held-out test set.
for name, clf in clfs.items():
    print(f"\nTraining {name} ...")
    try:
        # fit() returns the estimator itself, so prediction can be chained.
        y_pred = clf.fit(xv_train, y_train).predict(xv_test)
        acc = accuracy_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred, pos_label=1)
    except Exception as err:
        # Best effort: report the failure and move on to the next model.
        print(f"Error training {name}: {err}")
    else:
        print(f"Accuracy: {acc:.4f}, Precision: {prec:.4f}")
        results.append((name, acc, prec))

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve, f1_score, recall_score
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Models to inspect in more detail (names are keys of `clfs`).
top_models = ['SVC', 'ETC', 'RF']

for model_name in top_models:
    clf = clfs[model_name]
    print(f"\n=== {model_name} ===")

    # Predictions of the already-fitted model on the test set
    y_pred = clf.predict(xv_test)

    # 2x2 confusion matrix: first row is (tn, fp), second row is (fn, tp)
    cm = confusion_matrix(y_test, y_pred)
    (tn, fp), (fn, tp) = cm

    # Metrics, all computed with class 1 as the positive class
    specificity = tn / (tn + fp)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, pos_label=1)
    rec = recall_score(y_test, y_pred, pos_label=1)
    f1 = f1_score(y_test, y_pred, pos_label=1)
    sensitivity = rec  # sensitivity is recall of the positive class

    # Report
    print("Confusion Matrix:")
    print(cm)
    print("\nMetrics:")
    for line in (
        f"Accuracy   : {acc:.4f}",
        f"Precision  : {prec:.4f}",
        f"Recall     : {rec:.4f}",
        f"F1 Score   : {f1:.4f}",
        f"Sensitivity: {sensitivity:.4f}",
        f"Specificity: {specificity:.4f}",
    ):
        print(line)


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Step 1: Train a fresh, seeded Extra Trees model on the training data
etc = ExtraTreesClassifier(random_state=42).fit(xv_train, y_train)

# Step 2: Predict sentiment for every review in the full dataset.
# NOTE(review): `df` also contains the rows the model was trained on, so these
# predictions are not purely out-of-sample.
all_reviews_vectorized = tfidf.transform(df['final_review'])
df['predicted_sentiment'] = etc.predict(all_reviews_vectorized)

# Step 3: Sentiment score per product = mean predicted sentiment of its reviews
sentiment_scores = (
    df.groupby(['sub_category', 'product_id', 'product_name'])['predicted_sentiment']
    .mean()
    .reset_index()
    .rename(columns={'predicted_sentiment': 'sentiment_score'})
)

# Step 4: Dense rank within each sub_category (rank 1 = highest score; ties share a rank)
sentiment_scores['rank_in_subcategory'] = (
    sentiment_scores.groupby('sub_category')['sentiment_score']
    .rank(method='dense', ascending=False)
)

# Step 5: Order by rank inside each sub_category and keep the first 5 rows of each
sentiment_scores = sentiment_scores.sort_values(['sub_category', 'rank_in_subcategory'])
top5_per_subcategory = sentiment_scores.groupby('sub_category').head(5)

# Step 6: Display the results
display_columns = ['product_id', 'sub_category', 'product_name', 'sentiment_score', 'rank_in_subcategory']
print(top5_per_subcategory[display_columns])


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# How many of the best-scoring products to keep for display and export.
TOP_N_PRODUCTS = 300

# Train the Extra Trees Classifier (seeded, so refitting gives the same model)
etc = ExtraTreesClassifier(random_state=42)
etc.fit(xv_train, y_train)

# Predict sentiment for all reviews
all_reviews_vectorized = tfidf.transform(df['final_review'])
df['predicted_sentiment'] = etc.predict(all_reviews_vectorized)

# Sentiment score per product = mean predicted sentiment over its reviews
overall_scores = (
    df.groupby(['product_id', 'product_name'])['predicted_sentiment']
    .mean()
    .reset_index()
    .rename(columns={'predicted_sentiment': 'sentiment_score'})
)

# Dense rank over all products (rank 1 = highest score; ties share a rank)
overall_scores['overall_rank'] = overall_scores['sentiment_score'].rank(method='dense', ascending=False)

# Sort by score and keep the TOP_N_PRODUCTS highest-scoring products
top_products = overall_scores.sort_values('sentiment_score', ascending=False).head(TOP_N_PRODUCTS)

# Display result with product_id
print(top_products[['product_id', 'product_name', 'sentiment_score', 'overall_rank']])



In [None]:
# Write the ranked products to CSV without the DataFrame index.
export_columns = ['product_id', 'product_name', 'sentiment_score', 'overall_rank']
top_products[export_columns].to_csv('top_ranked_products.csv', index=False)