In [1]:
import pandas as pd
import warnings
warnings.simplefilter('ignore')
import numpy as np
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.base import BaseEstimator

In [2]:
rmse = lambda y_true, y_pred: np.sqrt(mean_squared_error(y_true, y_pred))

def train_test_split(X, ratio=0.2, user_col='reviewerID', item_col='asin',
                     rating_col='overall', time_col='unixReviewTime'):
    # сортируем оценки по времени
    X.sort_values(by=[time_col], inplace=True)
    # список всех юзеров
    userIds = X[user_col].unique()
    X_train_data = []
    X_test_data = []
    y_train = []
    y_test = []
    for userId in tqdm(userIds):
        curUser = X[X[user_col] == userId]
        # определяем позицию, по которой делим выборку и размещаем данные по массивам
        idx = int(curUser.shape[0] * (1 - ratio))
        X_train_data.append(curUser[[user_col, item_col]].iloc[:idx, :].values)
        X_test_data.append(curUser[[user_col, item_col]].iloc[idx:, :].values)
        y_train.append(curUser[rating_col].values[:idx])
        y_test.append(curUser[rating_col].values[idx:])
    # cтекуем данные по каждому пользователю в общие массивы
    X_train = pd.DataFrame(np.vstack(X_train_data), columns=[user_col, item_col])
    X_test = pd.DataFrame(np.vstack(X_test_data), columns=[user_col, item_col])
    y_train = np.hstack(y_train)
    y_test = np.hstack(y_test)
    return X_train, X_test, y_train, y_test

In [3]:
class ItemBased(BaseEstimator):
    def fit(self, X, y, user_col='reviewerID', item_col='asin'):
        X = X.copy()
        # сохраним текущих пользователей и имеющиеся предметы
        self.users = X[user_col].unique()
        self.items = X[item_col].unique()
        
        X['y'] = y
        # рассчитаем среднее значение рейтинга для пользователя и предмета
        self.mean_y_user = X.groupby(user_col)['y'].mean()
        self.mean_y_item = X.groupby(item_col)['y'].mean()
        
        # вычитаем среднюю оценку предмета
        X['y'] -= X[item_col].apply(lambda x: self.mean_y_item[x])
        
        # создаём векторы для каждого фильма с оценками пользователя
        # если пользователь не поставил оценку, то ставим 0
        self.item_ratings = pd.pivot_table(X, values='y', index=item_col,
                                           columns=user_col, fill_value=0)
        
        # считаем попарную схожесть между фильмами
        self.item_sim = cosine_similarity(self.item_ratings)
        
        # также сделаем словарь {значение item_col: index в item_ratings}
        self.item_pos = dict()
        for item in self.items:
            self.item_pos[item] = np.argwhere(self.item_ratings.index.values == item)[0][0]
        return self
    
    def predict_rating(self, pr_user, pr_item):
        # если в обучающей выборке нет такого предмета
        # или пользователя, то вернём 0
        if not pr_item in self.items or not pr_user in self.users:
            return 0
        
        # считаем числитель и знаменатель дроби из формулы предсказания
        numerator = self.item_sim[self.item_pos[pr_item]].dot(
                        self.item_ratings.loc[:, pr_user])   
        # вычитаем 1, так как схожесть предмета с самим собой равна 1,
        # но модель не должна это учитывать
        denominator = np.abs(self.item_sim[self.item_pos[pr_item]]).sum() - 1
        
        return self.mean_y_item[pr_item] + numerator / denominator
    
    def predict(self, X, user_col='reviewerID', item_col='asin'):
        y = X[[user_col, item_col]].apply(lambda row: self.predict_rating(row[0], row[1]), axis=1)
        return y

In [4]:
df = pd.read_json('Digital_Music_5.json', lines=True)

In [5]:
df.head()

Unnamed: 0,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,image
0,5,3.0,True,"06 3, 2013",A2TYZ821XXK2YZ,3426958910,{'Format:': ' Audio CD'},Garrett,"This is awesome to listen to, A must-have for ...",Slayer Rules!,1370217600,
1,5,,True,"10 11, 2014",A3OFSREZADFUDY,3426958910,{'Format:': ' Audio CD'},Ad,bien,Five Stars,1412985600,
2,5,,True,"02 11, 2014",A2VAMODP8M77NG,3426958910,{'Format:': ' Audio CD'},JTGabq,It was great to hear the old stuff again and I...,SLAYER!!!!!!!!!!!!!!!!!!!!!,1392076800,
3,4,3.0,False,"12 7, 2013",AAKSLZ9IDTEH0,3426958910,{'Format:': ' Audio CD'},john F&#039;n doe,well best of's are a bit poison normally but t...,slayer greatest hits! you mean everything righ...,1386374400,
4,5,,True,"06 12, 2016",A3OH43OZJLKI09,5557706259,{'Format:': ' Audio CD'},melinda a goodman,What can I say? This is Casting Crowns!!!This ...,"This is a good, blessing filled",1465689600,


In [6]:
df['overall'].unique()

array([5, 4, 2, 3, 1], dtype=int64)

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df)

100%|██████████| 16566/16566 [07:05<00:00, 38.94it/s]


In [8]:
%%time
print('start fitting...')
ib = ItemBased().fit(X_train, y_train)
print('start predicting...')
print('rmse = {}'.format(rmse(y_test, ib.predict(X_test))))

start fitting...
start predicting...
rmse = 0.7262795807059841
CPU times: total: 1min 59s
Wall time: 52 s


In [7]:
import requests
import zipfile
from io import BytesIO

from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate, split

In [8]:
df = df.rename(columns={"reviewerID": "userID", "asin": "itemID", "overall": "rating", "unixReviewTime": "timestamp"})

In [9]:
df[["userID", "itemID", "rating", "timestamp"]]

Unnamed: 0,userID,itemID,rating,timestamp
0,A2TYZ821XXK2YZ,3426958910,5,1370217600
1,A3OFSREZADFUDY,3426958910,5,1412985600
2,A2VAMODP8M77NG,3426958910,5,1392076800
3,AAKSLZ9IDTEH0,3426958910,4,1386374400
4,A3OH43OZJLKI09,5557706259,5,1465689600
...,...,...,...,...
169776,A1SR2T84IXOMAQ,B01HJ91MTW,5,1534636800
169777,A2SR3DWJR1PYR6,B01HJ91MTW,5,1521331200
169778,A24V7X30NIMOIY,B01HJ91MTW,5,1520380800
169779,A1LW10GYP2EYM1,B01HJ91MTW,5,1518393600


In [10]:
# Определение формата
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

In [13]:
# Используем `KNNBasic` коллаборативную фильтрацию для user-based системы с корреляцией Пирсона
# https://surprise.readthedocs.io/en/stable/knn_inspired.html
    
sim_options = {
    'name': 'pearson_baseline',
    'user_based': False
}

# Тренируем модель
algo = KNNBasic(sim_options=sim_options)
trainset = data.build_full_trainset()
algo.fit(trainset)

# Делаем предикт
uid = str(196)
itemid = str(302)
actual_rating = 4
algo.predict(uid=uid, iid=itemid, r_ui=actual_rating, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
user: 196        item: 302        r_ui = 4.00   est = 4.70   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}


Prediction(uid='196', iid='302', r_ui=4, est=4.700019436803883, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})

In [14]:
kf = split.KFold(random_state=0, n_splits=5)
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=kf, verbose=True)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5828  0.5922  0.5946  0.5989  0.5842  0.5905  0.0061  
MAE (testset)     0.2986  0.3010  0.3038  0.3048  0.2999  0.3016  0.0023  
Fit time          2.23    2.15    2.12    2.04    2.08    2.13    0.07    
Test time         0.64    0.55    0.56    0.68    0.61  

{'test_rmse': array([0.58277994, 0.5922181 , 0.59462831, 0.59885764, 0.58421507]),
 'test_mae': array([0.2985965 , 0.300995  , 0.30378179, 0.30475714, 0.2998991 ]),
 'fit_time': (2.23476505279541,
  2.1527230739593506,
  2.1230201721191406,
  2.0404253005981445,
  2.0810375213623047),
 'test_time': (0.6439990997314453,
  0.5489993095397949,
  0.5560047626495361,
  0.6760079860687256,
  0.6090219020843506)}