In [2]:
import pandas as pd
import numpy as np

from scipy.sparse import coo_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import joblib
import os

In [3]:
# Location of the cleaned reviews CSV. The default is the original author's
# local path; set the AMAZON_REVIEWS_CSV environment variable to point the
# notebook at the file on another machine without editing this cell.
reviews_csv_path = os.environ.get(
    "AMAZON_REVIEWS_CSV",
    r"C:\Users\giris\Documents\datasets for practice\cleaned_amazon_reviews.csv",
)

# Every column is kept exactly as stored in the file.
df = pd.read_csv(reviews_csv_path)
df.head()


Unnamed: 0,UserId,ProductId,Score,Time,Summary,Text
0,A3SGXH7AUHU8GW,B001E4KFG0,5,2011-04-27,good quality dog food,i have bought several of the vitality canned d...
1,A1D87F6ZCVE5NK,B00813GRG4,1,2012-09-07,not as advertised,product arrived labeled as jumbo salted peanut...
2,ABXLMWJIXXAIN,B000LQOCH0,4,2008-08-18,delight says it all,this is a confection that has been around a fe...
3,A395BORC6FGVXV,B000UA0QIQ,2,2011-06-13,cough medicine,if you are looking for the secret ingredient i...
4,A1UQRSCLF8GW1T,B006K2ZZ7K,5,2012-10-21,great taffy,great taffy at a great price there was a wide ...


In [4]:
# Give every raw user / product ID an integer code by converting the ID
# column to a pandas categorical and taking its codes.
for id_col, idx_col in (('UserId', 'user_idx'), ('ProductId', 'item_idx')):
    df[idx_col] = df[id_col].astype('category').cat.codes

# Count the distinct codes present in each index column.
num_users, num_items = (
    df[idx_col].nunique() for idx_col in ('user_idx', 'item_idx')
)

num_users, num_items


(254889, 73924)

In [5]:
# Parse the review dates and put the frame in chronological order.
df = df.assign(Time=pd.to_datetime(df['Time'])).sort_values(by="Time")

# Time-based hold-out: rows dated on or before the 80th-percentile timestamp
# go to training, strictly later rows go to test.
split_point = df['Time'].quantile(0.8)
in_train = df['Time'].le(split_point)
in_test = df['Time'].gt(split_point)

train_df = df.loc[in_train]
test_df = df.loc[in_test]

train_df.shape, test_df.shape

((449437, 8), (111751, 8))

In [6]:
# Collapse repeat reviews to one rating per (user, item) pair before building
# the matrix. SciPy sums duplicate (row, col) entries when a COO matrix is
# converted to CSR, so two 5-star reviews of the same product by the same
# user would otherwise be stored as a single "rating" of 10. Averaging keeps
# the value on the original score scale and does not depend on row order.
train_ratings = (
    train_df
    .groupby(['user_idx', 'item_idx'], as_index=False)['Score']
    .mean()
)

# Sparse user x item matrix: rows are user codes, columns are item codes,
# stored values are the (averaged) review scores.
R_train = coo_matrix(
    (train_ratings['Score'].to_numpy(),
     (train_ratings['user_idx'].to_numpy(), train_ratings['item_idx'].to_numpy())),
    shape=(num_users, num_items)
).tocsr()

R_train.shape, R_train.nnz

((254889, 73924), 443395)

In [7]:
# Two-step factorisation pipeline:
#   "scaler" - scales each item column without centering (with_mean=False),
#              so the sparse matrix is never densified;
#   "svd"    - rank-50 truncated SVD with a fixed seed for repeatable factors.
scaler_step = ("scaler", StandardScaler(with_mean=False))
svd_step = ("svd", TruncatedSVD(n_components=50, random_state=42))
svd = Pipeline(steps=[scaler_step, svd_step])

# fit_transform returns each user's row projected onto the 50 components
# (for TruncatedSVD this is U_k * Sigma_k of the scaled matrix, not bare U_k).
user_factors = svd.fit_transform(R_train)

# components_ is (n_components, n_items); transpose to one row per item (V_k).
fitted_svd = svd.named_steps["svd"]
item_factors = fitted_svd.components_.T


In [8]:
# Make sure the output directory exists before writing any artifact.
os.makedirs("../model", exist_ok=True)

# NOTE: joblib.dump writes joblib's own pickle format whatever the file
# suffix, so the two ".npy" files below must be read back with joblib.load
# (they are not NumPy .npy files).
for artifact, target in (
    (svd, "../model/svd_pipeline.pkl"),
    (user_factors, "../model/user_factors.npy"),
    (item_factors, "../model/item_factors.npy"),
):
    joblib.dump(artifact, target)

# Integer code -> original ID lookups, built from the category order of the
# ID columns, so factor rows can be translated back to raw IDs later.
user_categories = df['UserId'].astype('category').cat.categories
item_categories = df['ProductId'].astype('category').cat.categories
user_map = {code: raw_id for code, raw_id in enumerate(user_categories)}
item_map = {code: raw_id for code, raw_id in enumerate(item_categories)}

for mapping, target in (
    (user_map, "../model/user_map.pkl"),
    (item_map, "../model/item_map.pkl"),
):
    joblib.dump(mapping, target)

print("SVD model & mappings saved.")


SVD model & mappings saved.
