In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [2]:
def load_data(file_name):
  '''
    Read a CSV file into a DataFrame.

      Parameters:
        file_name (str): Path of the CSV file to read

      Return:
        The contents of the file (DataFrame)
  '''
  # pandas parses the file with its default CSV settings
  return pd.read_csv(file_name)

In [3]:
# Load the raw product ratings (path is relative to the working directory)
# and preview the first rows
df = load_data('data/fashion_products.csv')
df.head()

Unnamed: 0,User ID,Product ID,Product Name,Brand,Category,Price,Rating,Color,Size
0,19,1,Dress,Adidas,Men's Fashion,40,1.043159,Black,XL
1,97,2,Shoes,H&M,Women's Fashion,82,4.026416,Black,L
2,25,3,Dress,Adidas,Women's Fashion,44,3.337938,Yellow,XL
3,57,4,Shoes,Zara,Men's Fashion,23,1.049523,White,S
4,79,5,T-shirt,Adidas,Men's Fashion,79,4.302773,Black,M


In [4]:
# Keep only the columns used to build the utility matrix:
# the user, the product and the rating
df_rec = df[['User ID', 'Product ID', 'Rating']]
df_rec.head()

Unnamed: 0,User ID,Product ID,Rating
0,19,1,1.043159
1,97,2,4.026416
2,25,3,3.337938
3,57,4,1.049523
4,79,5,4.302773


In [5]:
# Load library
from scipy.sparse import coo_matrix

In [6]:
# Prepare the data: the user IDs are used as row indices, the product IDs
# as column indices and the ratings as the stored values
row = df_rec['User ID'].values
col = df_rec['Product ID'].values
data = df_rec['Rating'].values

In [8]:
# Create the utility matrix in COO format
# No shape is passed, so scipy infers it from the index arrays
coo_data = coo_matrix((data, (row, col)))
coo_data

<101x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1000 stored elements in COOrdinate format>

In [11]:
def get_utility_matrix(df_rec, shape=None):
    """
    Get a COO format utility matrix

    Parameters
    ----------
    df_rec : pandas DataFrame
        The sample of rating data. Must contain the columns
        'User ID', 'Product ID' and 'Rating'.

    shape : tuple of (int, int), default=None
        The (n_rows, n_cols) of the utility matrix. If None, scipy infers
        the shape from the largest 'User ID' and 'Product ID' found in
        `df_rec`, so two samples of the same rating data may produce
        matrices of different shapes. Pass an explicit shape to keep
        them aligned.

    Returns
    -------
    coo_data : scipy COO format
        The utility matrix in COO format, with 'User ID' as the row
        index and 'Product ID' as the column index
    """
    # Prepare the data: the raw IDs are used directly as coordinates
    user_idx = df_rec['User ID'].values
    product_idx = df_rec['Product ID'].values
    ratings = df_rec['Rating'].values

    # Create the utility matrix in COO format
    coo_data = coo_matrix((ratings, (user_idx, product_idx)), shape=shape)

    return coo_data


In [13]:
# Build the utility matrix again, this time through the reusable helper
coo_data = get_utility_matrix(df_rec=df_rec)
coo_data

<101x1001 sparse matrix of type '<class 'numpy.float64'>'
	with 1000 stored elements in COOrdinate format>

In [9]:
# Define the test size, i.e. the fraction of ratings held out for testing
test_size = 0.2

In [14]:
def split_train_test(df_rec, test_size=0.2, random_state=42):
    """
    Function to create train & test utility matrix in COO format

    Parameters
    ----------
    df_rec : pandas DataFrame
        The rating data. Must contain the columns 'User ID',
        'Product ID' and 'Rating'.

    test_size : float, default=0.2
        The fraction of ratings held out for the test matrix.
        Must be strictly between 0 and 1.

    random_state : int, default=42
        For reproducibility

    Returns
    -------
    coo_data_train : scipy COO format
        The utility matrix built from the train ratings

    coo_data_test : scipy COO format
        The utility matrix built from the test ratings. It has the same
        shape as `coo_data_train`.

    Raises
    ------
    ValueError
        If `test_size` is not strictly between 0 and 1
    """
    # A fraction outside (0, 1) gives an empty or nonsensical split
    if not 0 < test_size < 1:
        raise ValueError(
            'test_size must be strictly between 0 and 1, got %r' % (test_size,)
        )

    # Generate random seed for reproducibility
    # (this seeds numpy's global random generator)
    np.random.seed(random_state)

    # Shuffle rating data (tolist() already returns a fresh list)
    raw_index = df_rec.index.tolist()
    np.random.shuffle(raw_index)

    # Define the threshold
    threshold = int((1-test_size) * len(raw_index))

    # Split the index
    train_index = raw_index[:threshold]
    test_index = raw_index[threshold:]

    # Both matrices must cover the same users & products. Left to itself,
    # scipy infers each shape from the largest ID in that subset only,
    # so take the shape from the whole rating data instead.
    full_shape = (int(df_rec['User ID'].max()) + 1,
                  int(df_rec['Product ID'].max()) + 1)

    def _to_full_shape(coo):
        # Rebuild the matrix with the common shape, keeping every entry
        return coo_matrix((coo.data, (coo.row, coo.col)), shape=full_shape)

    # Next, extract the train & test data based on the split index
    coo_data_train = _to_full_shape(get_utility_matrix(df_rec = df_rec.loc[train_index]))
    coo_data_test = _to_full_shape(get_utility_matrix(df_rec = df_rec.loc[test_index]))

    # validate
    print('Train, Test shape:', (coo_data_train.nnz, coo_data_test.nnz))

    return coo_data_train, coo_data_test


In [16]:
# Split the data, reusing the test_size defined in the cell above
# instead of repeating the literal
coo_data_train, coo_data_test = split_train_test(df_rec = df_rec,
                                                 test_size = test_size,
                                                 random_state = 42)

Train, Test shape: (800, 200)


In [18]:
# Import some library
from surprise import Dataset, Reader

In [19]:
# The reader tells Surprise the scale of the ratings (here 1 to 5)
reader = Reader(rating_scale = (1, 5))
reader

<surprise.reader.Reader at 0x7fccd61f1df0>

In [20]:
# Build the Surprise dataset from the rating data.
# The columns are passed in the order user, item, rating.
utility_data = Dataset.load_from_df(
                    df = df_rec[['User ID', 'Product ID', 'Rating']].copy(),
                    reader = reader
                )

utility_data

<surprise.dataset.DatasetAutoFolds at 0x7fccd663c9a0>

In [21]:
# Preview the DataFrame held by the Surprise dataset
utility_data.df.head()

Unnamed: 0,User ID,Product ID,Rating
0,19,1,1.043159
1,97,2,4.026416
2,25,3,3.337938
3,57,4,1.049523
4,79,5,4.302773


In [23]:
import copy

In [24]:
# Create a function
def train_test_split(utility_data, test_size=0.2, random_state=42):
    """
    Train test split the data
    ref: https://surprise.readthedocs.io/en/stable/FAQ.html#split-data-for-unbiased-estimation-py

    Parameters
    ----------
    utility_data : Surprise utility data
        The sample of whole data set. It is deep-copied, so the object
        passed in is left untouched.

    test_size : float, default=0.2
        The test size

    random_state : int, default=42
        For reproducibility

    Returns
    -------
    full_data : Surprise utility data
        The new utility data. Its raw ratings hold only the train
        portion of the shuffled ratings.

    train_data : Surprise format
        The train data

    test_data : Surprise format
        The test data
    """
    # Deep copy the utility_data so the shuffle & reassignment below
    # do not modify the caller's object
    full_data = copy.deepcopy(utility_data)

    # Generate random seed (this seeds numpy's global random generator)
    np.random.seed(random_state)

    # Shuffle the raw_ratings in place; the seed makes it reproducible
    raw_ratings = full_data.raw_ratings
    np.random.shuffle(raw_ratings)

    # Define the threshold
    threshold = int((1-test_size) * len(raw_ratings))

    # Split the data
    train_raw_ratings = raw_ratings[:threshold]
    test_raw_ratings = raw_ratings[threshold:]

    # Get the data: full_data keeps only the train ratings from here on
    full_data.raw_ratings = train_raw_ratings
    train_data = full_data.build_full_trainset()
    test_data = full_data.construct_testset(test_raw_ratings)

    return full_data, train_data, test_data


In [25]:
# Split the data, reusing the test_size defined earlier in the notebook
# instead of repeating the literal
full_data, train_data, test_data = train_test_split(utility_data,
                                                    test_size = test_size,
                                                    random_state = 42)

In [26]:
# Load the model library
# i.e. Baseline, KNN, and SVD
from surprise import AlgoBase, KNNBasic, SVD

In [27]:
class MeanPrediction(AlgoBase):
    '''Baseline prediction. Return global mean as prediction'''
    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        '''Fit the train data.

        Parameters
        ----------
        trainset : Surprise format
            The train data

        Returns
        -------
        self : MeanPrediction
            The fitted model, so calls such as `algo.fit(...).test(...)`
            can be chained
        '''
        # The base class does the fitting; estimate() reads the result
        # back from self.trainset
        AlgoBase.fit(self, trainset)

        return self

    def estimate(self, u, i):
        '''Perform the estimation/prediction.

        The user `u` and the item `i` are ignored: every prediction is
        the global mean of the train data.
        '''
        return self.trainset.global_mean

In [28]:
# Create baseline model: it predicts the global mean for every rating
model_baseline = MeanPrediction()
model_baseline

<__main__.MeanPrediction at 0x7fccd688bfa0>

In [29]:
# Create Neighbor-based model -- K-Nearest Neighbor
# KNNBasic takes k, min_k, sim_options and verbose. It has no
# random_state parameter, so none is passed.
model_knn = KNNBasic()
model_knn

<surprise.prediction_algorithms.knns.KNNBasic at 0x7fccd61ffc10>

In [30]:
# Create matrix factorization model -- SVD-like
# with 100 latent factors and a fixed seed
model_svd = SVD(n_factors=100, random_state=42)
model_svd

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fccd61ffbe0>

In [31]:
# Import the cross validation module
from surprise.model_selection import cross_validate

In [32]:
# Use full_data for cross validation
# (after the split above, full_data holds only the train ratings).
# Your results could be different because
# no random seed is passed to cross_validate here
cv_baseline = cross_validate(algo = model_baseline,
                             data = full_data,
                             cv = 5,
                             measures = ['rmse'])

In [33]:
# Extract CV results: the mean test RMSE of the baseline model
cv_baseline_rmse = cv_baseline['test_rmse'].mean()
cv_baseline_rmse

1.1604906235618127

In [34]:
# Cross validate the KNN model with the same data, cv and measure
# as the baseline
cv_knn = cross_validate(algo = model_knn,
                        data = full_data,
                        cv = 5,
                        measures = ['rmse'])

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [35]:
# Extract CV results: the mean test RMSE of the KNN model
cv_knn_rmse = cv_knn['test_rmse'].mean()
cv_knn_rmse

1.1608164174801414

In [36]:
# Cross validate the SVD model with the same data, cv and measure
# as the baseline
cv_svd = cross_validate(algo = model_svd,
                        data = full_data,
                        cv = 5,
                        measures = ['rmse'])

In [37]:
# Extract CV results: the mean test RMSE of the SVD model
cv_svd_rmse = cv_svd['test_rmse'].mean()
cv_svd_rmse

1.1791790593249174

In [38]:
# Put the mean CV RMSE of every candidate model side by side,
# one (model, score) record per row
summary_df = pd.DataFrame(
    data = [('Baseline', cv_baseline_rmse),
            ('KNN', cv_knn_rmse),
            ('SVD', cv_svd_rmse)],
    columns = ['Model', 'CV Performance - RMSE']
)

summary_df

Unnamed: 0,Model,CV Performance - RMSE
0,Baseline,1.160491
1,KNN,1.160816
2,SVD,1.179179


In [52]:
# Create object
# KNNBasic takes k, min_k, sim_options and verbose. n_factors is an SVD
# hyperparameter and KNNBasic has no random_state, so neither is passed.
# NOTE(review): in the CV summary above the Baseline RMSE is marginally
# lower than the KNN RMSE -- confirm KNN is the intended final model.
model_best = KNNBasic()

# Retrain on whole train dataset
model_best.fit(train_data)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x7fccd5f916d0>

In [53]:
# import performance library
from surprise import accuracy

In [54]:
# Predict the held-out test data with the retrained model, then score it.
# accuracy.rmse prints the RMSE and also returns it.
test_pred = model_best.test(test_data)
test_rmse = accuracy.rmse(test_pred)
test_rmse

RMSE: 1.1207


1.1207447203587535