In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator
import requests
import zipfile
from io import BytesIO

from surprise import Dataset, Reader, KNNBasic, SVD
from surprise.model_selection import cross_validate, split

In [2]:
import warnings
warnings.simplefilter('ignore')

## Вспомогательные функции и классы

In [3]:
rmse = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))

def train_test_split(X, ratio=0.2, user_col='reviewerID', item_col='asin',
                     rating_col='overall', time_col='unixReviewTime'):
    X.sort_values(by=[time_col], inplace=True)
    
    userIds = X[user_col].unique()
    X_train_data = []
    X_test_data = []
    y_train = []
    y_test = []
    for userId in tqdm(userIds):
        curUser = X[X[user_col] == userId]
        idx = int(curUser.shape[0] * (1 - ratio))
        X_train_data.append(curUser[[user_col, item_col]].iloc[:idx, :].values)
        X_test_data.append(curUser[[user_col, item_col]].iloc[idx:, :].values)
        y_train.append(curUser[rating_col].values[:idx])
        y_test.append(curUser[rating_col].values[idx:])
        
    X_train = pd.DataFrame(np.vstack(X_train_data), columns=[user_col, item_col])
    X_test = pd.DataFrame(np.vstack(X_test_data), columns=[user_col, item_col])
    y_train = np.hstack(y_train)
    y_test = np.hstack(y_test)
    return X_train, X_test, y_train, y_test

In [4]:
class ItemBased(BaseEstimator):
    def fit(self, X, y, user_col='reviewerID', item_col='asin'):
        X = X.copy()
        self.users = X[user_col].unique()
        self.items = X[item_col].unique()
        
        X['y'] = y
        self.mean_y_user = X.groupby(user_col)['y'].mean()
        self.mean_y_item = X.groupby(item_col)['y'].mean()

        X['y'] -= X[item_col].apply(lambda x: self.mean_y_item[x])
        

        self.item_ratings = pd.pivot_table(X, values='y', index=item_col,
                                           columns=user_col, fill_value=0)

        self.item_sim = cosine_similarity(self.item_ratings)

        self.item_pos = dict()
        for item in self.items:
            self.item_pos[item] = np.argwhere(self.item_ratings.index.values == item)[0][0]
        return self
    
    def predict_rating(self, pr_user, pr_item):
        
        if not pr_item in self.items or not pr_user in self.users:
            return 0
        numerator = self.item_sim[self.item_pos[pr_item]].dot(
                        self.item_ratings.loc[:, pr_user])   
        denominator = np.abs(self.item_sim[self.item_pos[pr_item]]).sum() - 1
        
        return self.mean_y_item[pr_item] + numerator / denominator
    
    def predict(self, X, user_col='reviewerID', item_col='asin'):
        y = X[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        return y

## Загрузка данных

In [5]:
df = pd.read_json('Digital_Music_5.json', lines=True)

In [6]:
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,5,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,5,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,4,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,5,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


## EDA

In [47]:
print(f'Length of data set = {len(df)}')

Length of data set = 169781


In [48]:
print(f"Unique ratings {list(df['overall'].unique())}") # Какие бывают рейтинги

Unique ratings [5, 4, 2, 3, 1]


In [49]:
print(f"Unique products {len(df['asin'].unique())}") # Количество уникальных  треков

Unique products 11797


In [50]:
print(f"Unique users {len(df['reviewerID'].unique())}") # Количество уникальных  треков

Unique users 16566


In [62]:
from datetime import datetime

def show_date(value):
    return datetime.utcfromtimestamp(value).strftime('%Y-%m-%d')

print(f"Ratings from {show_date(min(df['unixReviewTime']))} to {show_date(max(df['unixReviewTime']))}")

Ratings from 1998-07-09 to 2018-09-26


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df)

100%|██████████| 16566/16566 [07:05<00:00, 38.94it/s]


In [8]:
%%time
print('start fitting...')
ib = ItemBased().fit(X_train, y_train)
print('start predicting...')
print('rmse = {}'.format(rmse(y_test, ib.predict(X_test))))

start fitting...
start predicting...
rmse = 0.7262795807059841
CPU times: total: 1min 59s
Wall time: 52 s


In [13]:
df = df.rename(columns={"reviewerID": "userID", "asin": "itemID", "overall": "rating", "unixReviewTime": "timestamp"})

In [14]:
df[["userID", "itemID", "rating", "timestamp"]].head()

Unnamed: 0,userID,itemID,rating,timestamp
0,A2TYZ821XXK2YZ,3426958910,5,1370217600
1,A3OFSREZADFUDY,3426958910,5,1412985600
2,A2VAMODP8M77NG,3426958910,5,1392076800
3,AAKSLZ9IDTEH0,3426958910,4,1386374400
4,A3OH43OZJLKI09,5557706259,5,1465689600


## Имплементируем модель на surprise

In [15]:
reader = Reader(line_format="user item rating timestamp", rating_scale=(1, 5))
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

In [8]:
sim_options = {
    'name': 'pearson_baseline',
    'user_based': False
}

## Оценка модели

In [17]:
rec_algo = KNNBasic(sim_options=sim_options)
trainset = data.build_full_trainset()
rec_algo.fit(trainset)
kf = split.KFold(random_state=42, n_splits=5)
print(f'\nMean test RMSE ' + str(np.mean(cross_validate(rec_algo, data, measures=['RMSE', 'MAE'], 
                                                  cv=kf, verbose=1)['test_rmse']))[:4])

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5963  0.5746  0.5932  0.5969  0.5895  0.5901  0.0082  
MAE (testset)     0.3036  0.2973  0.3032  0.3037  0.3006  0.3017  0.0025  
Fit time      

## Попробуем SVD

In [16]:
rec_algo = SVD()
trainset = data.build_full_trainset()
rec_algo.fit(trainset)
kf = split.KFold(random_state=42, n_splits=5)
print(f'\nMean test RMSE ' + str(np.mean(cross_validate(rec_algo, data, measures=['RMSE', 'MAE'], 
                                                  cv=kf, verbose=1)['test_rmse']))[:4])

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5618  0.5519  0.5636  0.5681  0.5625  0.5616  0.0053  
MAE (testset)     0.3350  0.3314  0.3377  0.3369  0.3348  0.3352  0.0022  
Fit time          1.27    1.26    1.38    1.26    1.31    1.30    0.05    
Test time         0.12    0.12    0.12    0.13    0.12    0.12    0.00    

Mean test RMSE 0.56
