In [10]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
import pandas as pd
from surprise import KNNWithMeans
from surprise import accuracy
from surprise import BaselineOnly

In [3]:
df = pd.read_csv(r'C:\Users\Sveta\projects\dataset\ratings.csv', sep=',', error_bad_lines=False, encoding="latin-1")
df.columns = ['userId', 'movieId', 'rating', 'timestamp']
df.drop(['timestamp'], axis=1, inplace=True)
df.head()



  df = pd.read_csv(r'C:\Users\Sveta\projects\dataset\ratings.csv', sep=',', error_bad_lines=False, encoding="latin-1")


Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [4]:
'''
Чтобы уменьшить размерность набора и не нарваться на «ошибку памяти», 
мы будем отфильтровывать редко оцениваемые фильмы и редко оценивающих пользователей.
'''
min_film_ratings = 25000
filter_film = df['movieId'].value_counts() > min_film_ratings
filter_film = filter_film[filter_film].index.tolist()

min_user_ratings = 500
filter_users = df['userId'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

df_new = df[(df['movieId'].isin(filter_film)) & (df['userId'].isin(filter_users))]
print('Исходная форма набора данных:', df.shape)
print('Новая форма набора данных:', df_new.shape)

Исходная форма набора данных: (20000263, 3)
Новая форма набора данных: (517167, 3)


In [5]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_df(df_new[['userId', 'movieId', 'rating']], reader)

train, test = train_test_split(data, test_size=0.2)
full_trainset = data.build_full_trainset

print('Число юзеров: ', train.n_users, '\n')
print('Число фильмов: ', train.n_items, '\n')

Число юзеров:  7441 

Число фильмов:  92 



In [6]:
train_iids = list(train.all_items())
iid_converter = lambda x: train.to_raw_iid(x)
train_raw_iids = list(map(iid_converter, train_iids))

Основные алгоритмы: NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, SVDpp, NMF, SlopeOne, Coclustering

In [7]:
sim_options = {'name':'pearson', 'user_based':False}
algo = KNNWithMeans(k = 10, min_k = 2, sim_option = sim_options)

algo.fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x1bc45770be0>

In [8]:
predictions = algo.test(test)
accuracy.rmse(predictions)

RMSE: 0.7925


0.792472915364479

In [9]:
results = cross_validate(algo = algo, 
data = data, measures=['RMSE'],
cv=5, n_jobs=-1, verbose=True)

Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7900  0.7886  0.7895  0.7876  0.7867  0.7885  0.0012  
Fit time          178.76  185.50  183.94  180.23  177.57  181.20  3.04    
Test time         648.56  648.84  644.43  648.24  649.62  647.94  1.81    


In [18]:
print('Using ALS')
bsl_options = {'method': 'als',
               'n_epochs': 10,
               'reg_u': 10,
               'reg_i': 7
               }
algo1 = BaselineOnly(bsl_options=bsl_options)
results1 = cross_validate(algo1, data, measures=['RMSE'], n_jobs=-1, cv=5, verbose=True)
print('Среднее rmse для первого алгоритма:', results1["test_rmse"].mean())

print('Using SGD')
bsl_options = {'method': 'sgd',
               'learning_rate': .005,
               }
algo2 = BaselineOnly(bsl_options=bsl_options)
results2 = cross_validate(algo2, data, measures=['RMSE'], n_jobs=-1, cv=5, verbose=True)
print('Среднее rmse для второго алгоритма:',results2["test_rmse"].mean())

Using ALS
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8085  0.7996  0.8049  0.8035  0.7997  0.8033  0.0033  
Fit time          0.45    0.55    0.45    0.48    0.46    0.48    0.04    
Test time         0.98    0.88    0.73    0.66    0.48    0.75    0.18    
Среднее rmse для первого алгоритма: 0.8032567327033716
Using SGD
Evaluating RMSE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8094  0.8015  0.8014  0.8038  0.8045  0.8041  0.0029  
Fit time          0.91    1.04    1.06    1.01    0.91    0.99    0.06    
Test time         0.79    0.77    0.66    0.59    0.41    0.64    0.14    
Среднее rmse для второго алгоритма: 0.8040993767528366


In [None]:
'''
Из-за того, что мы уменьшили исходный датасет, мы можем получить предполагаемую оценку только для пользователей, 
у которых более 500 оценок в данных. Выше мы обозначили их, как filter_users
'''
filter_users

[118205,
 8405,
 82418,
 121535,
 125794,
 74142,
 34576,
 131904,
 83090,
 59477,
 130767,
 79159,
 8963,
 15617,
 92011,
 71975,
 20132,
 46470,
 88820,
 63147,
 130459,
 120575,
 9544,
 31122,
 18611,
 125978,
 18138,
 91193,
 111549,
 68026,
 41267,
 51703,
 92269,
 70201,
 35128,
 105580,
 14705,
 54465,
 114406,
 136268,
 12131,
 53346,
 24688,
 107326,
 131347,
 26867,
 27469,
 119048,
 123606,
 86529,
 67346,
 22901,
 129583,
 131894,
 91867,
 7201,
 24219,
 62812,
 61168,
 51558,
 68063,
 97853,
 32344,
 80092,
 103223,
 107640,
 128258,
 79531,
 128309,
 92956,
 118754,
 76630,
 106441,
 59414,
 113668,
 122995,
 116189,
 50297,
 52260,
 72008,
 33736,
 52009,
 43194,
 117144,
 3907,
 137202,
 27053,
 31404,
 42929,
 119531,
 135425,
 66763,
 116317,
 64843,
 131961,
 2261,
 42204,
 903,
 69793,
 73611,
 49554,
 58953,
 95301,
 23173,
 4358,
 80920,
 16676,
 72983,
 4222,
 133811,
 55765,
 101044,
 34651,
 99754,
 52636,
 110758,
 134567,
 32514,
 75810,
 60159,
 57735,
 2139

In [19]:
algo.fit(train)
algo.predict(uid = 'TestUser1', iid = '130459')

Computing the msd similarity matrix...
Done computing similarity matrix.


Prediction(uid='TestUser1', iid='130459', r_ui=None, est=3.7342488996526746, details={'was_impossible': True, 'reason': 'User and/or item is unknown.'})