### Домашнее задание по теме «Рекомендации на основе содержания»

1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:
  TF-IDF на тегах и жанрах
  Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [210]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

%matplotlib inline

In [211]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [212]:
movies_with_rating = ratings.join(movies.set_index('movieId'), on='movieId') ### фильмы с рейтингами  

In [213]:
movies_with_tags = tags.movieId.unique()
movies_with_rating_tags=movies_with_rating[movies_with_rating.movieId.isin(movies_with_tags)] ### фильмы с тегами 

In [214]:
movies_with_rating_tags.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller
6,1,101,5.0,964980868,Bottle Rocket (1996),Adventure|Comedy|Crime|Romance


In [215]:
movies_agg = movies_with_rating_tags.groupby(by='movieId').agg(['mean', 'count', 'var']).rating.reset_index()

In [216]:
movies_agg.head()

Unnamed: 0,movieId,mean,count,var
0,1,3.92093,215,0.69699
1,2,3.431818,110,0.777419
2,3,3.259615,52,1.112651
3,5,3.071429,49,0.822917
4,7,3.185185,54,0.955625


In [217]:
movies_agg.fillna(0, inplace = True)
movies_with_rating_agg = movies_agg.merge(movies, on='movieId', how='left')
movies_with_rating_agg.head()

Unnamed: 0,movieId,mean,count,var,title,genres
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure|Children|Fantasy
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy|Romance
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy|Romance


In [218]:
grouped_tags = tags.groupby(by='movieId')

film_tags = {}
for key, value in grouped_tags.groups.items():
    film_tags[key] = tags.loc[value.values].tag.tolist()

In [219]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: film_tags[x.movieId], axis=1)

In [220]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [221]:
movies_with_rating_agg['genres'] = [change_string(g) for g in movies_with_rating_agg.genres.values]

In [222]:
movies_with_rating_agg['tags'] = movies_with_rating_agg.apply(lambda x: ' '.join(x.tags), axis=1)

In [223]:
movies_with_rating_agg['genres_tags'] = movies_with_rating_agg.apply(lambda x: x.genres + ' ' + x.tags, axis=1)

In [224]:
movies_with_rating_agg.drop(columns=['genres', 'tags'], inplace=True)

In [225]:
movies_with_rating_agg.head(10)

Unnamed: 0,movieId,mean,count,var,title,genres_tags
0,1,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,2,3.431818,110,0.777419,Jumanji (1995),Adventure Children Fantasy fantasy magic board...
2,3,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy Romance moldy old
3,5,3.071429,49,0.822917,Father of the Bride Part II (1995),Comedy pregnancy remake
4,7,3.185185,54,0.955625,Sabrina (1995),Comedy Romance remake
5,11,3.671429,70,0.810766,"American President, The (1995)",Comedy Drama Romance politics president
6,14,3.833333,18,0.5,Nixon (1995),Drama politics president
7,16,3.926829,82,0.784703,Casino (1995),Crime Drama Mafia
8,17,3.776119,67,1.312754,Sense and Sensibility (1995),Drama Romance Jane Austen
9,21,3.494382,89,0.752809,Get Shorty (1995),Comedy Crime Thriller Hollywood


In [226]:
genres_tags = movies_with_rating_agg.genres_tags.tolist()

In [227]:
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(genres_tags)

In [228]:
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [229]:
X_tfidf = X_tfidf.toarray()

In [230]:
df_X_tfidf = pd.DataFrame(X_tfidf, index=movies_with_rating_agg.movieId)

In [231]:
df = movies_with_rating_agg.merge(df_X_tfidf, on='movieId')

#### Сделаем рекомендацию для одного из пользователей 

In [232]:
first_user_ratings = ratings[(ratings.userId == 1) & ratings.movieId.isin(movies_with_tags)]

In [233]:
df = df.set_index('movieId')

In [234]:
first_user_ratings = first_user_ratings.join(df, on = 'movieId')

In [235]:
first_user_ratings.head(10)

Unnamed: 0,userId,movieId,rating,timestamp,mean,count,var,title,genres_tags,0,...,1736,1737,1738,1739,1740,1741,1742,1743,1744,1745
0,1,1,4.0,964982703,3.92093,215,0.69699,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,4.0,964981247,3.259615,52,1.112651,Grumpier Old Men (1995),Comedy Romance moldy old,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,47,5.0,964983815,3.975369,203,0.850875,Seven (a.k.a. Se7en) (1995),Mystery Thriller mystery twist ending serial k...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,50,5.0,964982931,4.237745,204,0.641475,"Usual Suspects, The (1995)",Crime Mystery Thriller mindfuck suspense thril...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,1,101,5.0,964980868,3.782609,23,1.086957,Bottle Rocket (1996),Adventure Comedy Crime Romance crime off-beat ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1,110,4.0,964982176,4.031646,237,0.936494,Braveheart (1995),Action Drama War beautiful scenery epic histor...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,1,216,5.0,964981208,3.326531,49,1.13074,Billy Madison (1995),Comedy school Adam Sandler stop looking at me ...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,1,223,3.0,964980985,3.855769,104,0.896471,Clerks (1994),Comedy cynical hilarious independent film quir...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,1,235,4.0,964980908,3.678571,70,0.826346,Ed Wood (1994),Comedy Drama movie business,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,1,260,5.0,964981680,4.231076,251,0.76039,Star Wars: Episode IV - A New Hope (1977),Action Adventure SciFi classic space action ac...,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [236]:
X = first_user_ratings.drop(columns=['userId', 'rating', 'timestamp', 'title','genres_tags']).set_index('movieId')
y = first_user_ratings.loc[:, first_user_ratings.columns.isin(['movieId', 'rating'])].set_index('movieId')

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [238]:
model = Lasso().fit(X_train, y_train)

In [239]:
y_test_predict = model.predict(X_test)

###### Оцениваем RMSE

In [242]:
rmse = sqrt(mean_squared_error(y_test, y_test_predict))
print('RMSE: ', rmse)

RMSE:  0.9662821754658361


###### Предсказанные оценки 

In [253]:
reco_TOP10_for_user = df_for_reco[['mean', 'predicted_score']].sort_values('predicted_score', ascending=False)[:10]

In [254]:
reco_TOP10_for_user.sort_values('mean',ascending = False)

Unnamed: 0_level_0,mean,predicted_score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
7023,4.5,4.533655
6768,4.0,4.533655
7062,4.0,4.533655
7061,4.0,4.533655
7055,4.0,4.533655
7053,4.0,4.533655
6984,4.0,4.533655
193565,3.5,4.533655
929,3.5,4.533655
7049,3.0,4.533655


In [256]:
# Сортировка по средней оценке фильмов 
reco_TOP10_for_user.merge(movies.set_index('movieId'), on='movieId').sort_values('mean',ascending = False)

Unnamed: 0_level_0,mean,predicted_score,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7023,4.5,4.533655,"Wedding Banquet, The (Xi yan) (1993)",Comedy|Drama|Romance
6768,4.0,4.533655,Luther (2003),Drama
7062,4.0,4.533655,Birdman of Alcatraz (1962),Drama
7061,4.0,4.533655,Dark Victory (1939),Drama|Romance
7055,4.0,4.533655,Swing Time (1936),Comedy|Musical|Romance
7053,4.0,4.533655,Roberta (1935),Comedy|Musical|Romance
6984,4.0,4.533655,"Tale of Two Cities, A (1935)",Drama
193565,3.5,4.533655,Gintama: The Movie (2010),Action|Animation|Comedy|Sci-Fi
929,3.5,4.533655,Foreign Correspondent (1940),Drama|Film-Noir|Mystery|Thriller
7049,3.0,4.533655,Flying Down to Rio (1933),Comedy|Musical|Romance
