метрика

In [None]:
RMSE и MAE являются наиболее популярными и широко используемыми метриками: они измеряют отклонение предсказанной оценки от фактической оценки, поставленной пользователем. Чем ниже значения MAE и RMSE, тем точнее рекомендательная система прогнозирует пользовательские рейтинги. Эти метрики удобны, когда рекомендации основаны на прогнозировании рейтинга или количества транзакций. Они показывают, насколько точны наши прогнозы и, следовательно, насколько точны наши рекомендации.

Library

In [2]:
import pandas as pd
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

Data

In [3]:
# Frames used by the cells of this notebook.
movie = pd.read_csv('data/movie.csv')
rating = pd.read_csv('data/rating.csv')

# Remaining tables of the dataset; no other cell shown in this notebook reads them.
tag = pd.read_csv('data/tag.csv')
link = pd.read_csv('data/link.csv')
genome_tags = pd.read_csv('data/genome_tags.csv')
genome_score = pd.read_csv('data/genome_scores.csv')

Collaboration 

In [3]:
rating.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39


In [4]:
movie.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [4]:
# Left join on movieId: every movie row is kept, so a title without any rating
# still appears once, with NaN in the columns that come from `rating`.
colabb_data = pd.merge(movie, rating, on="movieId", how="left")

In [6]:
colabb_data.head(3)

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51


In [5]:
colabb_data.shape

(20000797, 6)

In [6]:
colabb_data["title"].nunique()

27262

In [15]:
colabb_data

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1999-12-11 13:36:47
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,6.0,5.0,1997-03-13 17:50:52
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,1996-06-05 13:37:51
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,10.0,4.0,1999-11-25 02:44:47
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,11.0,4.5,2009-01-02 01:13:41
...,...,...,...,...,...,...
20000792,131254,Kein Bund für's Leben (2007),Comedy,79570.0,4.0,2015-03-30 19:32:59
20000793,131256,"Feuer, Eis & Dosenbier (2002)",Comedy,79570.0,4.0,2015-03-30 19:48:08
20000794,131258,The Pirates (2014),Adventure,28906.0,2.5,2015-03-30 19:56:32
20000795,131260,Rentun Ruusu (2001),(no genres listed),65409.0,3.0,2015-03-30 19:57:46


In [None]:
# Hand-picked sample of movies. The two lists are parallel: movies[i] is the
# title of movie_ids[i].
# NOTE(review): pairing follows the MovieLens catalogue (356 = Forrest Gump,
# 4422 = Cries and Whispers, 541 = Blade Runner) — confirm against movie.csv.
movie_ids = [130219, 356, 4422, 541]
movies = ["The Dark Knight (2011)",
          "Forrest Gump (1994)",
          "Cries and Whispers (Viskningar och rop) (1972)",
          "Blade Runner (1982)"]

In [19]:
# Work on the hand-picked titles only. Pivoting the full merged frame into a
# users x titles matrix is not feasible, and the recorded `user_movie_df`
# output of this notebook contains exactly these four movies.
# Rows created by the left join for unrated movies carry NaN userId/rating and
# must not be handed to Surprise, so they are dropped as well.
sample_df = (
    colabb_data[colabb_data["movieId"].isin(movie_ids)]
    .dropna(subset=["userId", "rating"])
)
user_movie_df = sample_df.pivot_table(index=["userId"], columns=["title"], values="rating")
# Half-star ratings are visible in the displayed rows; a lower bound of 1 would
# clip predictions for ratings below it.
# NOTE(review): MovieLens documents the scale as 0.5-5.0 — confirm for this dump.
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(sample_df[['userId', 'movieId', 'rating']], reader)

In [None]:
user_movie_df

In [21]:
# Fixed seeds make the train/test split and the SVD initialisation repeatable,
# so the RMSE reported below can be reproduced after a kernel restart.
train, test = train_test_split(data, test_size=0.2, random_state=42)
svd_model = SVD(random_state=42)
svd_model.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd8171e10a0>

In [22]:
predictions = svd_model.test(test)

In [23]:
accuracy.rmse(predictions)

RMSE: 0.9355


0.9354773625778203

In [34]:
# 5-fold cross-validation of the SVD estimator; verbose=True prints the
# per-fold RMSE/MAE table.
cross_validate(
    svd_model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)
# Last expression of the cell: preview of the user x title rating matrix.
user_movie_df.head()

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9388  0.9390  0.9358  0.9231  0.9370  0.9347  0.0060  
MAE (testset)     0.7255  0.7269  0.7214  0.7162  0.7251  0.7230  0.0039  
Fit time          5.13    5.47    5.49    6.14    9.32    6.31    1.54    
Test time         0.12    0.11    0.17    0.13    0.15    0.14    0.02    


title,Blade Runner (1982),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4.0,,,
2.0,5.0,,,
3.0,5.0,,,
4.0,,,4.0,
7.0,,,4.0,


In [29]:
# Search space: number of epochs x learning rate (5 x 4 combinations),
# each evaluated with 3-fold cross-validation.
param_grid = {
    'n_epochs': [1, 3, 5, 7, 10],
    'lr_all': [0.001, 0.002, 0.004, 0.005],
}

gs = GridSearchCV(
    SVD,
    param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=True,
)
gs.fit(data)

# Best mean RMSE found over the grid.
gs.best_score['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.8s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:  1.0min finished


0.9302564268834194

In [27]:
gs.best_params['rmse']

{'n_epochs': 5, 'lr_all': 0.002}

Content-Based

In [None]:
# NOTE(review): `movie_details` is not defined in any cell shown in this
# notebook. The columns used here match `colabb_data`, so it is presumably that
# frame — confirm and define it explicitly.

# Drop the timestamp on a new frame instead of in place, so re-running this
# cell does not fail on an already-removed column.
details = movie_details.drop(columns=['timestamp'])

# Sum of all ratings per movie. Only the `rating` column is aggregated; the
# other columns are not needed for this total.
total_ratings = details.groupby(['movieId', 'genres'], as_index=False)['rating'].sum()

# One row per (title, genres) pair, with the per-movie rating total attached.
# The merge suffixes the overlapping columns with _x (left) and _y (right).
df = (
    details.drop_duplicates(['title', 'genres'])
    .merge(total_ratings, on='movieId')
    .drop(columns=['userId', 'rating_x', 'genres_y'])
    .rename(columns={'genres_x': 'genres', 'rating_y': 'rating'})
)
df.head()

In [None]:
# `rating` holds the summed rating per movie at this point; cast it to int and
# keep only movies whose total exceeds 100.
df['rating'] = df['rating'].astype(int)
# Reset the index after filtering: later cells build a similarity matrix from
# the rows of `df` in order, so index labels must equal row positions.
df = df[df['rating'] > 100].reset_index(drop=True)
df['genres'].value_counts()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level unigrams and bigrams over the genre strings; min_df=1 keeps every
# term that occurs at least once.
tfv = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 2),
    min_df=1,
)
x = tfv.fit_transform(df['genres'])

In [None]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Pairwise sigmoid-kernel similarity between all rows of the TF-IDF matrix;
# row/column k of `model` corresponds to the k-th row passed to the vectorizer.
model = sigmoid_kernel(x, Y=x)

In [None]:
# Short titles: keep only the text before the first " (", which removes the
# release year and any bracketed alternate title.
ti = [full_title.split(' (')[0] for full_title in df['title']]
# Same rows and columns as `df`, with the title column replaced.
df1 = df.assign(title=ti)

In [None]:
def recommendations(title, top_n=7):
    """Print the full titles of the movies most similar to ``title``.

    Similarity is read from the module-level ``model`` matrix. Row ``k`` of
    ``model`` belongs to the ``k``-th row of ``df`` / ``df1`` because the
    matrix was built from ``df['genres']`` in row order, so every lookup here
    uses row positions and never index labels.

    Args:
        title: Short title as stored in ``df1['title']`` (the text before the
            first " (" of the full title).
        top_n: Maximum number of recommendations to print. Defaults to 7, the
            number the original loop printed.

    Raises:
        KeyError: If no row of ``df1`` carries this short title.
    """
    short_titles = list(df1['title'])
    if title not in short_titles:
        raise KeyError(title)
    # Several films can share a short title; the first matching row is used.
    query_pos = short_titles.index(title)

    scores = model[query_pos]
    # Exclude the query movie by position: with tied scores it is not
    # guaranteed to be ranked first, so slicing off element 0 is not reliable.
    candidates = [pos for pos in range(len(scores)) if pos != query_pos]
    # Stable sort: among equal scores the earlier row wins.
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    full_titles = df['title']
    # Slicing prints fewer lines when fewer candidates exist, without raising.
    for pos in candidates[:top_n]:
        print(full_titles.iloc[pos])