# Collaborative Based Filtering

In [1]:
import pandas as pd
import numpy as np
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split, GridSearchCV, cross_validate
from collections import defaultdict

In [2]:
customer_data = pd.read_csv('../Dataset/customer_data_final.csv')
purchase_history = pd.read_csv('../Dataset/purchase_history.csv')
df = pd.read_csv('../Dataset/Item_data2.csv')

In [3]:
purchase_history.head()

Unnamed: 0,ID,ProdID_List,Rating
0,0.0,585.0,5
1,0.0,6.0,3
2,0.0,1.0,4
3,0.0,532.0,5
4,0.0,20.0,3


In [4]:
def train_svd_model_with_tuning(ground_truth_clean):
    reader = Reader()
    data = Dataset.load_from_df(ground_truth_clean[['ID', 'ProdID_List', 'Rating']], reader)

    param_grid = {
        'n_factors': [50, 100, 150],     
        'n_epochs': [10, 20, 30],         
        'lr_all': [0.005, 0.01, 0.02],    
        'reg_all': [0.02, 0.05, 0.1]      
    }

    grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)
    
    grid_search.fit(data)
    
    best_params = grid_search.best_params['rmse']
    print("Best RMSE score:", grid_search.best_score['rmse'])
    print("Best parameters:", best_params)
    
    best_svd = grid_search.best_estimator['rmse']
    trainset, testset = train_test_split(data, test_size=0.2)
    best_svd.fit(trainset)
    
    results = cross_validate(best_svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    mean_rmse = results['test_rmse'].mean()
    mean_mae = results['test_mae'].mean()
    
    print(f"Mean RMSE across folds: {mean_rmse}")
    print(f"Mean MAE across folds: {mean_mae}")
    
    return best_svd

svd = train_svd_model_with_tuning(purchase_history)

Best RMSE score: 1.2718735911485932
Best parameters: {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.005, 'reg_all': 0.02}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2689  1.2540  1.2977  1.2480  1.2578  1.2653  0.0176  
MAE (testset)     1.0031  0.9644  1.0221  0.9735  0.9792  0.9885  0.0211  
Fit time          0.06    0.05    0.04    0.04    0.04    0.05    0.01    
Test time         0.00    0.01    0.00    0.00    0.00    0.00    0.00    
Mean RMSE across folds: 1.265288781775472
Mean MAE across folds: 0.988460718738924


In [57]:
reader = Reader()
data = Dataset.load_from_df(purchase_history[['ID', 'ProdID_List', 'Rating']], reader)

def get_top_n(predictions, N=10):
    """Return the top-N recommendation for each user from a set of predictions."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:N]

    return top_n

def model_based_recommender(user, N):
    top_n = get_top_n(predictions, N)
    user_top_n = top_n[user]
    user_top_n_df = pd.DataFrame(user_top_n, columns=['ProdID_List', 'predicted_rating'])
    
    user_top_n_df = user_top_n_df.merge(df[['ProdID', 'Name']], how='inner', left_on='ProdID_List', right_on='ProdID')
    user_top_n_df['predicted_rating'] = round(user_top_n_df['predicted_rating'], 2)
    return user_top_n_df[['ProdID_List', 'Name', 'predicted_rating']]

algo = SVD(n_factors=150, n_epochs=20, lr_all=0.01, reg_all=0.02)

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

In [58]:
model_based_recommender(1,10)

Unnamed: 0,ProdID_List,Name,predicted_rating
0,39.0,Revlon ColorStay Gel Envy Longwear Nail Polish...,4.68
1,39.0,"6 Pack - 3D White Brilliance Toothpaste, Vibra...",4.68
2,39.0,"COVERGIRL Clean Matte BB Cream, 510 Fair",4.68
3,39.0,Nibblers Tingle Balm | Flavored Edible Gel,4.68
4,39.0,Fructis Power Gel Go Loco Slick,4.68
5,39.0,"Maybelline Color Sensational The Creams, Cream...",4.68
6,39.0,Olay Complete Daily Moisturizer for Sensitive ...,4.68
7,39.0,4 Pack - Teen Spirit Anti-Perspirant Deodorant...,4.68
8,28.0,Sally Hansen Insta Dri Black and Insta Dri Top...,4.52
9,28.0,"Hard Candy Top Ten Eyeshadow, 1185 Lap of Luxu...",4.52


In [59]:
model_based_recommender(2,10)

Unnamed: 0,ProdID_List,Name,predicted_rating
0,7507.0,Dreft Odor Eliminator & Fabric Refresher 22 oz,4.12
1,342482.0,Unscented Cetaphil Moisturizing Cream,3.93
2,342482.0,Unscented Cetaphil Moisturizing Cream,3.93
3,323853.0,"Bragg Organic Apple Cider Vinegar, Raw & Unfil...",3.92
4,425.0,"Old Spice Ultra Smooth Deodorant, Fresh Start,...",3.92
5,425.0,Platinum Care Pads Standard Washable Underpad ...,3.92
6,56014.0,"Black Radiance Perfect Tone Lip Color, Vintage...",3.86
7,705.0,Womens Maternity Belly Support Belt Pregnancy ...,3.85
8,705.0,"2 Pack - REACH Unflavored Waxed Dental Floss, ...",3.85
9,9.962804e+17,Neova After Sun Body Repair 250ml 8.5oz,3.78
