In [None]:
import pandas as pd

# Mathematical calculation
import numpy as np
from scipy.sparse.linalg import svds
from sklearn import model_selection
from sklearn.metrics.pairwise import cosine_similarity

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-surprise recommender package
!pip install surprise
from surprise import SVD, KNNWithMeans
from surprise import Dataset, Reader, accuracy
from surprise.model_selection import train_test_split, GridSearchCV

# Load the ratings file; the path is relative to the notebook's working directory
df = pd.read_csv("CDs_and_Vinyl.csv")

# Preview the first rows of the loaded frame
df.head()


Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (pyproject.toml) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.4-cp310-cp310-linux_x86_64.whl size=2357255 sha256=c9f06149c42972ffe609f26ebc7fc1e522ef13960ab01f5fd2d5840a65e8e41b
  Stored in directory: /root/.cache/pip/wheels/4b/3f/df/6acbf0a40397d9bf3ff97f582cc22fb9ce66adde75bc71fd54
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully inst

Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B000002AGY,5.0,1358286606000
1,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B002HMHR7S,4.0,1402778050000
2,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B0000062P5,5.0,1504898965457
3,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B00004NKAK,5.0,1524768111415
4,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B07Z76Y18X,5.0,1576100171173


In [None]:
# Summarize the frame: row count, column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1552764 entries, 0 to 1552763
Data columns (total 4 columns):
 #   Column       Non-Null Count    Dtype  
---  ------       --------------    -----  
 0   user_id      1552764 non-null  object 
 1   parent_asin  1552764 non-null  object 
 2   rating       1552764 non-null  float64
 3   timestamp    1552764 non-null  int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 47.4+ MB


In [None]:
# Count the missing values in each column of the dataset
df.isnull().sum()

user_id        0
parent_asin    0
rating         0
timestamp      0
dtype: int64

In [None]:
# Check for duplicated rows: count how many rows are flagged as a repeat of an
# earlier row (True) versus not flagged (False)
df.duplicated().value_counts()

False    1552764
Name: count, dtype: int64

Observations

    The dataset comprises 1552764 rows and 4 columns.
    There are no null values and no duplicated rows.

In [None]:
# Drop the timestamp column and rename parent_asin to product_id.
# A new frame is built instead of mutating with inplace=True, and
# errors='ignore' is passed to drop, so the cell is idempotent: re-running it
# after the column has already been removed no longer raises a KeyError
# (rename already ignores labels that are not present).
df = (
    df
    .drop(columns='timestamp', errors='ignore')
    .rename(columns={'parent_asin': 'product_id'})
)

df.head()

Unnamed: 0,user_id,product_id,rating
0,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B000002AGY,5.0
1,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B002HMHR7S,4.0
2,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B0000062P5,5.0
3,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B00004NKAK,5.0
4,AFWHJ6O3PV4JC7PVOJH6CPULO2KQ,B07Z76Y18X,5.0


In [None]:
# Report how many distinct users and products appear in the ratings
for label, column in (
    ('Count of unique Users    :', 'user_id'),
    ('Count of unique Products :', 'product_id'),
):
    print(label, df[column].nunique())

Count of unique Users    : 123876
Count of unique Products : 89370


In [None]:
# Check the distribution of ratings: number of rows per distinct rating value
print('Rating distribution:')
df['rating'].value_counts()

Rating distribution:


rating
5.0    1088270
4.0     264151
3.0     107369
2.0      46595
1.0      46379
Name: count, dtype: int64

In [None]:
# Set the rating scale (lowest and highest possible rating)
reader = Reader(rating_scale=(1, 5))

# Load data from the ratings data frame into a surprise Dataset
# (at this point df holds the user_id, product_id and rating columns)
df_surprise = Dataset.load_from_df(df, reader)

# Divide the dataset in an 80:20 train/test ratio; random_state fixes the split
trainset, testset = train_test_split(df_surprise, test_size=0.2, random_state=123)


In [None]:
def svd_model(train_data, test_data, random_state=None):
  """Fit an SVD model with default hyperparameters and return its test predictions.

  The RMSE of the predictions is printed as a side effect.

  Parameters
  ----------
  train_data
      Training set the model is fitted on (passed to ``SVD.fit``).
  test_data
      Test set to predict (passed to ``SVD.test``).
  random_state : optional
      Seed forwarded to ``SVD`` so repeated runs give the same factors.
      Defaults to ``None``, which keeps the original unseeded behavior.

  Returns
  -------
  pandas.DataFrame
      One row per prediction, sorted by ``uid`` and then ``est`` in
      descending order.
  """
  # Use a distinct local name so the model does not shadow this function
  model = SVD(random_state=random_state)
  model.fit(train_data)

  # Make predictions for test set
  pred = model.test(test_data)
  print('RMSE of SVD approach is:', accuracy.rmse(pred, verbose=False))

  # Convert the predictions into a pandas dataframe
  svd_pred = pd.DataFrame(pred)

  # Sort in descending order; return the sorted copy instead of mutating in place
  return svd_pred.sort_values(by=['uid', 'est'], ascending=False)


In [None]:
# Fit the default SVD on the training split and predict the test split
svd_pred = svd_model(trainset,testset)
# Preview the first ten rows of the sorted predictions
svd_pred.head(10)

RMSE of SVD approach is: 0.8445170285670276


Unnamed: 0,uid,iid,r_ui,est,details
49646,AHZZYAJWA7JY4KHZCOBTXCWDQMMQ,B00V4ZBVLC,5.0,4.546363,{'was_impossible': False}
6088,AHZZXAWBVWME2D72MMSB6HU7TCAA,B008PUXMP2,5.0,4.910049,{'was_impossible': False}
168598,AHZZXAWBVWME2D72MMSB6HU7TCAA,B000C4A20M,5.0,4.585327,{'was_impossible': False}
4850,AHZZWMAF55GQDSJEPN7M7KARU7WA,B00006CTHW,5.0,5.0,{'was_impossible': False}
124572,AHZZWMAF55GQDSJEPN7M7KARU7WA,B07NHQZPFZ,5.0,4.938848,{'was_impossible': False}
263590,AHZZWMAF55GQDSJEPN7M7KARU7WA,B003W77U2E,5.0,4.811777,{'was_impossible': False}
55746,AHZZWMAF55GQDSJEPN7M7KARU7WA,B001D0EI3Q,5.0,4.772335,{'was_impossible': False}
290021,AHZZWMAF55GQDSJEPN7M7KARU7WA,B000ICLT4Q,5.0,4.653589,{'was_impossible': False}
248822,AHZZR54QOYNVJBV2QHB7SHASZPCA,B0000DZDTG,5.0,3.688721,{'was_impossible': False}
177614,AHZZQDZH6PG5PIW2YOXTMFKZAOSA,B000001FKH,3.0,4.755865,{'was_impossible': False}


In [None]:
# User id(s) whose recommendations are printed by the cells below
user_sample = ['AHZZWMAF55GQDSJEPN7M7KARU7WA']

def top_n_rec(pred_data,users,n):
  """Print the first ``n`` predictions of each requested user.

  Parameters
  ----------
  pred_data : pandas.DataFrame
      Predictions with at least a ``uid`` column. Rows are taken in their
      existing order, so the frame should already be sorted by estimated
      rating (descending) within each user for the result to be a "top n".
  users : list of str, or str
      User id(s) to report. A single id given as a plain string is accepted.
  n : int
      Number of rows to show per user.

  Returns
  -------
  None
      The recommendations are printed, one section per user.
  """
  # A bare string would otherwise be iterated character by character
  if isinstance(users, str):
    users = [users]

  # First n rows of every user, re-indexed from 0
  top_n = pred_data.groupby('uid').head(n).reset_index(drop=True)

  for user in users:
    print("Top %d recommendations for : %s" %(n,user))
    # Filter on the current user only; filtering on the whole `users` list
    # made every iteration print the same rows when several users were given
    print(top_n[top_n['uid'] == user])

# Print the first 4 predictions of the sample user from the baseline SVD output
top_n_rec(pred_data=svd_pred,users=user_sample,n=4)

Top 4 recommendations for : AHZZWMAF55GQDSJEPN7M7KARU7WA
                            uid         iid  r_ui       est  \
3  AHZZWMAF55GQDSJEPN7M7KARU7WA  B00006CTHW   5.0  5.000000   
4  AHZZWMAF55GQDSJEPN7M7KARU7WA  B07NHQZPFZ   5.0  4.938848   
5  AHZZWMAF55GQDSJEPN7M7KARU7WA  B003W77U2E   5.0  4.811777   
6  AHZZWMAF55GQDSJEPN7M7KARU7WA  B001D0EI3Q   5.0  4.772335   

                     details  
3  {'was_impossible': False}  
4  {'was_impossible': False}  
5  {'was_impossible': False}  
6  {'was_impossible': False}  


In [None]:
##############################
# MODEL TUNING
##############################

# Hyperparameter candidates: every combination of epoch count and learning rate
param_grid = {
    'n_epochs': [10, 20],
    'lr_all': [0.001, 0.01, 0.1],
}

# 5-fold cross-validated grid search over SVD, scored on RMSE and MAE,
# running on all available cores.
# NOTE(review): the search runs on the full dataset `df_surprise`, which
# includes the rows held out in `testset` - confirm this is intended.
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=5,
    n_jobs=-1,
    joblib_verbose=True,
)
gs.fit(df_surprise)

# Best cross-validated RMSE found by the search (about 0.85 in the recorded run)
gs.best_score['rmse']


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 10.8min finished


0.8483439784416301

In [None]:
# Hyperparameter combination that achieved the best RMSE in the grid search
gs.best_params['rmse']

{'n_epochs': 10, 'lr_all': 0.01}

In [None]:
##############################
# Final model and Prediction
##############################

best_params = gs.best_params['rmse']

# Evaluate the tuned configuration with a model fitted on the training split
# only. Fitting on the full dataset and then scoring on `testset` (a subset of
# that same data) leaks the held-out ratings into training and reports an
# optimistically low RMSE.
svd_eval = SVD(**best_params)
svd_eval.fit(trainset)

# Make predictions for test set
pred_final = svd_eval.test(testset)

# Evaluate predictions
print('RMSE of SVD approach is:', accuracy.rmse(pred_final, verbose=False))

# Convert the predictions into a pandas dataframe, sorted in descending order
svd_pred_final = pd.DataFrame(pred_final).sort_values(by=['uid','est'], ascending=False)

# Final model: refit the tuned configuration on every available rating.
# NOTE(review): this rebinds `svd_model`, which earlier in the notebook names
# the baseline helper function; re-running that earlier call after this cell
# will fail until the function cell is executed again.
svd_model = SVD(**best_params)
data_final = df_surprise.build_full_trainset()
svd_model.fit(data_final)

# NOTE(review): `users` is defined here but the call below passes
# `user_sample`; confirm which user list is intended.
users = ['AE2OCN7VKZLOTA747ZLKKKGFL2OQ']

top_n_rec(pred_data=svd_pred_final,users=user_sample,n=4)


RMSE of SVD approach is: 0.5992079973971989
Top 4 recommendations for : AHZZWMAF55GQDSJEPN7M7KARU7WA
                            uid         iid  r_ui       est  \
3  AHZZWMAF55GQDSJEPN7M7KARU7WA  B00006CTHW   5.0  5.000000   
4  AHZZWMAF55GQDSJEPN7M7KARU7WA  B003W77U2E   5.0  4.991193   
5  AHZZWMAF55GQDSJEPN7M7KARU7WA  B07NHQZPFZ   5.0  4.967933   
6  AHZZWMAF55GQDSJEPN7M7KARU7WA  B000ICLT4Q   5.0  4.858641   

                     details  
3  {'was_impossible': False}  
4  {'was_impossible': False}  
5  {'was_impossible': False}  
6  {'was_impossible': False}  


In [None]:
# Print the baseline (default-parameter SVD) top 4 for the sample user again,
# presumably for comparison with the tuned model's output
top_n_rec(pred_data=svd_pred,users=user_sample,n=4)

Top 4 recommendations for : AHZZWMAF55GQDSJEPN7M7KARU7WA
                            uid         iid  r_ui       est  \
3  AHZZWMAF55GQDSJEPN7M7KARU7WA  B00006CTHW   5.0  5.000000   
4  AHZZWMAF55GQDSJEPN7M7KARU7WA  B07NHQZPFZ   5.0  4.938848   
5  AHZZWMAF55GQDSJEPN7M7KARU7WA  B003W77U2E   5.0  4.811777   
6  AHZZWMAF55GQDSJEPN7M7KARU7WA  B001D0EI3Q   5.0  4.772335   

                     details  
3  {'was_impossible': False}  
4  {'was_impossible': False}  
5  {'was_impossible': False}  
6  {'was_impossible': False}  
