# Домашнее задание по теме «Рекомендации на основе содержания»
1. Использовать dataset MovieLens
2. Построить рекомендации (регрессия, предсказываем оценку) на фичах:

TF-IDF на тегах и жанрах

Средние оценки (+ median, variance, etc.) пользователя и фильма
3. Оценить RMSE на тестовой выборке

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from tqdm import tqdm_notebook

In [2]:
movies = pd.read_csv('../lecture-1/movies.csv')
ratings = pd.read_csv('../lecture-1/ratings.csv')
tags = pd.read_csv('../lecture-1/tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [6]:
movies.movieId.nunique(), tags.movieId.nunique()

(9742, 1572)

In [7]:
n_tags = tags.movieId.nunique()
n_movies = movies.movieId.nunique()
print(f'Тегов поставлено на {(n_tags/n_movies):.1%} фильмов')

Тегов поставлено на 16.1% фильмов


In [8]:
# приводим списки жанров к правильному виду
movies['genres'] = movies['genres'].apply(lambda row: row.replace('|', ' ').replace('-', ''))
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy Romance
3,4,Waiting to Exhale (1995),Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy


In [9]:
tags[tags['movieId'] == 2]

Unnamed: 0,userId,movieId,tag,timestamp
33,62,2,fantasy,1528843929
34,62,2,magic board game,1528843932
35,62,2,Robin Williams,1528843907
982,474,2,game,1137375552


In [10]:
# группируем теги и приводим их к такому же виду как и жанры, 
# учитываем только уникальные теги с помощью set()
grouped_tags = tags.groupby('movieId', as_index=False).agg({'tag' : lambda x: ' '.join(set(x))})
grouped_tags

Unnamed: 0,movieId,tag
0,1,fun pixar
1,2,magic board game Robin Williams fantasy game
2,3,moldy old
3,5,pregnancy remake
4,7,remake
...,...,...
1567,183611,Rachel McAdams funny Comedy
1568,184471,video game adaptation Alicia Vikander adventure
1569,187593,sarcasm Josh Brolin Ryan Reynolds
1570,187595,Emilia Clarke star wars


In [11]:
# объединяем таблицы
movies_with_tags = grouped_tags.merge(movies, how='inner', on='movieId')
movies_with_tags

Unnamed: 0,movieId,tag,title,genres
0,1,fun pixar,Toy Story (1995),Adventure Animation Children Comedy Fantasy
1,2,magic board game Robin Williams fantasy game,Jumanji (1995),Adventure Children Fantasy
2,3,moldy old,Grumpier Old Men (1995),Comedy Romance
3,5,pregnancy remake,Father of the Bride Part II (1995),Comedy
4,7,remake,Sabrina (1995),Comedy Romance
...,...,...,...,...
1567,183611,Rachel McAdams funny Comedy,Game Night (2018),Action Comedy Crime Horror
1568,184471,video game adaptation Alicia Vikander adventure,Tomb Raider (2018),Action Adventure Fantasy
1569,187593,sarcasm Josh Brolin Ryan Reynolds,Deadpool 2 (2018),Action Comedy SciFi
1570,187595,Emilia Clarke star wars,Solo: A Star Wars Story (2018),Action Adventure Children SciFi


In [12]:
count_vect = CountVectorizer()
tags_counts = count_vect.fit_transform(movies_with_tags.tag.values)
genres_counts = count_vect.fit_transform(movies_with_tags.genres.values)

In [13]:
# TF-IDF на тегах и жанрах
tfidf_transformer = TfidfTransformer()
tags_tfidf = tfidf_transformer.fit_transform(tags_counts)
genres_tfidf = tfidf_transformer.fit_transform(genres_counts)

In [14]:
tfidf_tags = pd.DataFrame(tags_tfidf.toarray())
tfidf_tags

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1734,1735,1736,1737,1738,1739,1740,1741,1742,1743
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# для жанров колонки назваем с префиксом g, чтобы в последствии не было одинаковых колонок
tfidf_genres = pd.DataFrame(genres_tfidf.toarray(), columns=[f'g{i}' for i in range(genres_tfidf.shape[1])])
tfidf_genres

Unnamed: 0,g0,g1,g2,g3,g4,g5,g6,g7,g8,g9,...,g12,g13,g14,g15,g16,g17,g18,g19,g20,g21
0,0.000000,0.398613,0.521641,0.511277,0.282182,0.000000,0.0,0.0,0.477459,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1,0.000000,0.495081,0.000000,0.635009,0.000000,0.000000,0.0,0.0,0.593008,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.643145,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.765744,0.000000,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.643145,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.765744,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,0.478136,0.000000,0.000000,0.000000,0.347747,0.498981,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1568,0.529263,0.543757,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.651313,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0
1569,0.596506,0.000000,0.000000,0.000000,0.433837,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.675252,0.0,0.0,0.0
1570,0.443963,0.456122,0.000000,0.585039,0.000000,0.000000,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.502572,0.0,0.0,0.0


In [17]:
# TF-IDF на тегах и жанрах
tfidf_data = pd.concat([tfidf_tags, tfidf_genres], axis=1)

#### Из-за большого количества колонок применяем метод понижения размерности PCA

In [18]:
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA

In [19]:
pca = PCA(n_components = 40)
XPCAreduced = pca.fit_transform(tfidf_data)
XPCAreduced.shape

(1572, 40)

In [20]:
tfidf_PCA_reduced = pd.concat([movies_with_tags.drop(columns=['tag', 'title', 'genres']), pd.DataFrame(XPCAreduced)], axis=1)
tfidf_PCA_reduced

Unnamed: 0,movieId,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,1,-0.395022,-0.136141,-0.052732,0.430401,-0.227363,0.166900,0.190362,0.550754,0.061262,...,-0.006591,0.022034,0.017001,-0.018609,0.009186,-0.015259,0.022762,0.004906,0.017760,-0.020658
1,2,-0.251831,-0.309053,0.083605,0.462880,-0.189076,0.157017,0.217463,0.517007,0.039395,...,0.005555,0.022548,-0.004432,-0.006092,-0.001145,-0.039174,0.025418,-0.000439,0.049073,-0.014335
2,3,-0.513119,0.560377,0.331783,-0.124493,0.128102,-0.056590,-0.023996,-0.038514,-0.025694,...,-0.006454,-0.012465,0.018367,0.010055,-0.016084,-0.017807,0.003875,-0.001694,0.018905,0.001676
3,5,-0.619736,0.443189,-0.432697,-0.093907,-0.164717,0.027934,-0.124273,-0.062908,0.014130,...,0.006645,-0.002750,0.046805,0.017156,-0.052259,-0.057343,0.021955,0.041050,0.097098,-0.024738
4,7,-0.521227,0.556943,0.329675,-0.131812,0.128344,-0.048201,-0.039094,-0.030386,-0.023373,...,0.002013,0.000657,0.036388,0.032417,-0.061971,-0.066908,0.017394,0.018423,0.093331,-0.008913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1567,183611,-0.379994,-0.240710,-0.193019,-0.225109,-0.034736,-0.147409,0.137132,-0.183439,0.105720,...,0.010066,-0.033168,-0.196513,0.095211,0.022034,0.015063,-0.108496,-0.057834,-0.135085,0.055240
1568,184471,-0.277189,-0.433265,0.133270,0.446732,-0.201900,-0.202172,0.125815,0.145400,0.048769,...,0.005053,0.012679,-0.011362,0.003709,0.010970,-0.000898,0.020265,0.002581,0.038923,-0.002332
1569,187593,-0.453584,-0.190704,-0.067727,0.192239,-0.239909,-0.381651,-0.209372,-0.335770,-0.118119,...,-0.003687,0.009707,0.007828,0.011323,0.022931,0.019278,0.011549,-0.003608,0.025866,-0.006810
1570,187595,-0.291184,-0.480607,0.133104,0.475614,-0.254749,-0.244193,0.011886,0.047647,-0.088762,...,-0.003976,0.024228,0.004569,0.001310,0.022535,-0.005713,0.000753,-0.011475,0.020203,-0.007739


In [21]:
# группируем таблицу рейтингов и считаем количество оценок
ratings.groupby('userId', as_index=False).rating.count().sort_values('rating', ascending=False)

Unnamed: 0,userId,rating
413,414,2698
598,599,2478
473,474,2108
447,448,1864
273,274,1346
...,...,...
441,442,20
568,569,20
319,320,20
575,576,20


#### возьмем пользователя, который поставил больше всего оценок

In [22]:
one_userId = 414
ratings_by_one_user = ratings[ratings['userId'] == one_userId]
ratings_by_one_user = ratings_by_one_user.drop(columns=['userId', 'timestamp'])

In [23]:
tfidf_PCA_reduced_with_user_rating  = ratings_by_one_user.merge(tfidf_PCA_reduced, on='movieId', how='inner')
tfidf_PCA_reduced_with_user_rating

Unnamed: 0,movieId,rating,0,1,2,3,4,5,6,7,...,30,31,32,33,34,35,36,37,38,39
0,1,4.0,-0.395022,-0.136141,-0.052732,0.430401,-0.227363,0.166900,0.190362,0.550754,...,-0.006591,0.022034,0.017001,-0.018609,0.009186,-0.015259,0.022762,0.004906,0.017760,-0.020658
1,2,3.0,-0.251831,-0.309053,0.083605,0.462880,-0.189076,0.157017,0.217463,0.517007,...,0.005555,0.022548,-0.004432,-0.006092,-0.001145,-0.039174,0.025418,-0.000439,0.049073,-0.014335
2,3,4.0,-0.513119,0.560377,0.331783,-0.124493,0.128102,-0.056590,-0.023996,-0.038514,...,-0.006454,-0.012465,0.018367,0.010055,-0.016084,-0.017807,0.003875,-0.001694,0.018905,0.001676
3,5,2.0,-0.619736,0.443189,-0.432697,-0.093907,-0.164717,0.027934,-0.124273,-0.062908,...,0.006645,-0.002750,0.046805,0.017156,-0.052259,-0.057343,0.021955,0.041050,0.097098,-0.024738
4,7,3.0,-0.521227,0.556943,0.329675,-0.131812,0.128344,-0.048201,-0.039094,-0.030386,...,0.002013,0.000657,0.036388,0.032417,-0.061971,-0.066908,0.017394,0.018423,0.093331,-0.008913
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
822,174055,5.0,0.072806,-0.391621,0.077012,-0.065671,-0.034912,-0.089952,-0.150979,-0.312136,...,-0.032906,-0.000916,-0.018103,-0.005627,0.033163,0.028739,0.016861,0.014639,0.013740,-0.029447
823,176371,5.0,-0.209291,-0.374026,0.116027,0.233026,-0.158492,-0.109357,-0.215583,-0.178333,...,0.002909,-0.040302,-0.071898,0.081226,-0.008865,-0.029427,-0.035648,-0.064251,-0.018037,0.023045
824,180031,4.0,0.030708,-0.224051,0.074841,0.393919,-0.238058,0.018822,0.109182,0.359324,...,0.004802,-0.017647,-0.056403,0.040151,-0.003540,-0.036315,0.004153,-0.025388,0.025598,-0.008168
825,180985,3.5,0.700376,0.131924,-0.059763,0.017443,-0.256609,0.042818,-0.014251,-0.041220,...,-0.059717,0.059270,-0.007813,0.030876,-0.042077,0.017275,-0.057299,-0.006464,0.135692,-0.393993


In [24]:
X = tfidf_PCA_reduced_with_user_rating.drop(['rating', 'movieId'], axis=1)
y = tfidf_PCA_reduced_with_user_rating['rating']

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=50)

In [26]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression()

In [27]:
rms = np.sqrt(mean_squared_error(y_test, lr.predict(X_test)))
rms

0.7904986560333003