In [None]:
# ============================================
# 👗 StyleSense Product Recommendation Project
# ============================================

# 1. Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [None]:
# 2. Load Dataset
# Read the reviews table from disk, print its column/dtype/non-null summary,
# and finish the cell with a preview of the first five rows.
df = pd.read_csv(filepath_or_buffer='data/reviews.csv')
df.info()
df.head(n=5)

In [None]:
# 3. Feature Engineering: Add Review Length
# 'Review Length' is the number of whitespace-separated tokens in 'Review Text'.
# Missing reviews count as 0 words: coercing NaN through str() would yield the
# literal token "nan" and report a length of 1, which misstates an empty review.
# Non-string, non-missing values are still coerced with str() before splitting.
df['Review Length'] = df['Review Text'].apply(
    lambda text: len(str(text).split()) if pd.notna(text) else 0
)

In [None]:
# 4. Preparing Features (X) and Target (y)
# y is an independent copy of the label column; X is the frame with that
# column removed (every other column is kept at this stage).
y = df['Recommended IND'].copy()
X = df.drop(columns='Recommended IND')

In [None]:
# 5. Train-Test Split
# Hold out 10% of the rows for final evaluation. The split is shuffled with a
# fixed seed so it is reproducible, and stratified on y so the train and test
# sets keep the same recommended / not-recommended ratio as the full data
# (an unstratified 10% sample can drift from that ratio by chance).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=27,
    stratify=y,
)

In [None]:
# 6. Data Exploration
# Each figure is drawn on its own explicitly created Axes and shown
# immediately, so the plots appear one after another in the cell output.

# How the target classes are distributed
fig, ax = plt.subplots()
sns.countplot(x=y, ax=ax)
ax.set_title('Distribution of Recommended IND (Target)')
plt.show()

# Reviewer age histogram with a KDE overlay
fig, ax = plt.subplots()
sns.histplot(df['Age'], bins=30, kde=True, ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age')
ax.set_ylabel('Count')
plt.show()

# Words-per-review histogram with a KDE overlay
fig, ax = plt.subplots()
sns.histplot(df['Review Length'], bins=50, kde=True, ax=ax)
ax.set_title('Distribution of Review Length')
ax.set_xlabel('Word Count')
plt.show()

# Horizontal count plots for each product-hierarchy column,
# with bars ordered from most to least frequent level
for category_col in ['Division Name', 'Department Name', 'Class Name']:
    levels_by_frequency = df[category_col].value_counts().index
    fig, ax = plt.subplots()
    sns.countplot(y=df[category_col], order=levels_by_frequency, ax=ax)
    ax.set_title(f'Distribution of {category_col}')
    plt.show()

In [None]:
# 7. Building the Pipeline
# Three column groups get their own preprocessing branch; a ColumnTransformer
# joins the branches and feeds the result to a random forest. Columns not
# listed in any group are not passed to a branch.

# Column groups (the text column is a plain string, not a list, so the
# vectorizer receives a 1-D sequence of documents)
numerical_features = ['Age', 'Positive Feedback Count', 'Review Length']
categorical_features = ['Division Name', 'Department Name', 'Class Name']
text_features = 'Review Text'

# Numeric branch: fill gaps with the median, then standardize
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the mode, then one-hot encode;
# categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Text branch: TF-IDF over the review text, English stop words removed,
# vocabulary capped at 5000 terms
text_transformer = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
])

# Route each column group to its branch
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
    ('text', text_transformer, text_features),
])

# End-to-end model: preprocessing followed by the classifier. The step names
# are part of the interface -- hyperparameter grids address them by prefix
# (e.g. 'classifier__n_estimators').
clf = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])


In [None]:
# 8. Training Pipeline
# Fit the preprocessing steps and the classifier on the training split only;
# the fitted pipeline is the cell's displayed value.
clf.fit(X_train, y=y_train)


In [None]:
# 9. Evaluation on Test Data
# Score the initial (untuned) pipeline on the held-out split.
y_pred = clf.predict(X_test)

# Compute the confusion matrix once and reuse it for both the text and the plot
initial_cm = confusion_matrix(y_test, y_pred)
initial_report = classification_report(y_test, y_pred)

print("Classification Report:\n", initial_report)
print("Confusion Matrix:\n", initial_cm)

# Heatmap of the confusion matrix (rows: actual class, columns: predicted class)
ax = sns.heatmap(initial_cm, annot=True, fmt='d', cmap='Blues')
ax.set_title("Confusion Matrix - Initial Model")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
# 10. Fine-Tuning Pipeline with Grid Search
# Candidate hyperparameters for the 'classifier' step of the pipeline
# (2 x 3 x 2 = 12 combinations).
param_grid = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [None, 10, 20],
    'classifier__min_samples_split': [2, 5],
}

# Exhaustive search over the grid: 3-fold cross-validation on the training
# split, ranked by F1, using all available cores
grid_search = GridSearchCV(
    estimator=clf,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

# Keep the pipeline configured with the winning parameters and report them
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

In [None]:
# 11. Final Evaluation
# Score the tuned pipeline on the same held-out split used for the initial model.
y_pred_best = best_model.predict(X_test)
tuned_report = classification_report(y_test, y_pred_best)
print("Fine-Tuned Model Report:\n", tuned_report)

# Heatmap of the tuned model's confusion matrix
# (rows: actual class, columns: predicted class)
tuned_cm = confusion_matrix(y_test, y_pred_best)
ax = sns.heatmap(tuned_cm, annot=True, fmt='d', cmap='Greens')
ax.set_title("Confusion Matrix - Fine-Tuned Model")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()