In [82]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm import tqdm_notebook

import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [3]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


Объединим тэги, жанры и средние оценки.

In [6]:
movies.shape

(9742, 3)

In [7]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [8]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [9]:
count_vect = CountVectorizer()
X_genre_counts = count_vect.fit_transform(movie_genres)

In [10]:
tfidf_transformer = TfidfTransformer()
X_genre_tfidf = tfidf_transformer.fit_transform(X_genre_counts)

In [11]:
X_genre_tfidf.shape

(9742, 20)

In [39]:
genres_tfidf = pd.concat([pd.DataFrame({'movieId': movies['movieId']}),
                          pd.DataFrame.sparse.from_spmatrix(X_genre_tfidf).add_prefix('g_')], axis=1)
genres_tfidf.head()

Unnamed: 0,movieId,g_0,g_1,g_2,g_3,g_4,g_5,g_6,g_7,g_8,...,g_10,g_11,g_12,g_13,g_14,g_15,g_16,g_17,g_18,g_19
0,1,0.0,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.0,0.48299,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.512361,0.0,0.620525,0.0,0.0,0.0,0.0,0.593662,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.570915,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.505015,0.0,0.0,0.466405,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Теперь теги

In [12]:
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [13]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [14]:
movies_with_tags.dropna(inplace=True)

In [15]:
movies_with_tags.title.unique().shape

(1572,)

In [40]:
tag_strings = []
movies_ = []

for movie, group in tqdm_notebook(movies_with_tags.groupby('movieId')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies_.append(movie)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


  0%|          | 0/1572 [00:00<?, ?it/s]

In [41]:
tag_strings[:5]

['pixar pixar fun',
 'fantasy magicboardgame RobinWilliams game',
 'moldy old',
 'pregnancy remake',
 'remake']

In [42]:
movies_[:5]

[1, 2, 3, 5, 7]

In [43]:
len(movies_)

1572

In [44]:
count_vect = CountVectorizer()
X_tag_counts = count_vect.fit_transform(tag_strings)

In [45]:
tfidf_transformer = TfidfTransformer()
X_tag_tfidf = tfidf_transformer.fit_transform(X_tag_counts)

In [47]:
X_tag_tfidf.shape

(1572, 1472)

In [48]:
tags_tfidf = pd.concat([pd.DataFrame({'movieId': movies_}),
                        pd.DataFrame.sparse.from_spmatrix(X_tag_tfidf).add_prefix('t_')], axis=1)
tags_tfidf.head()

Unnamed: 0,movieId,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_1462,t_1463,t_1464,t_1465,t_1466,t_1467,t_1468,t_1469,t_1470,t_1471
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Теперь добавим оценки

In [49]:
ave_ratings=pd.DataFrame({'mean_r' : pd.Series(ratings.groupby('movieId').rating.mean()),
                          'median_r' : pd.Series(ratings.groupby('movieId').rating.median())})
ave_ratings.head()

Unnamed: 0_level_0,mean_r,median_r
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.92093,4.0
2,3.431818,3.5
3,3.259615,3.0
4,2.357143,3.0
5,3.071429,3.0


In [55]:
all_about_movies=movies[['movieId','title']].join(ave_ratings, on='movieId')
all_about_movies.head()

Unnamed: 0,movieId,title,mean_r,median_r
0,1,Toy Story (1995),3.92093,4.0
1,2,Jumanji (1995),3.431818,3.5
2,3,Grumpier Old Men (1995),3.259615,3.0
3,4,Waiting to Exhale (1995),2.357143,3.0
4,5,Father of the Bride Part II (1995),3.071429,3.0


In [61]:
all_about_movies=all_about_movies.merge(genres_tfidf, how='inner', on='movieId')
all_about_movies.head()

Unnamed: 0,movieId,title,mean_r,median_r,g_0_x,g_1_x,g_2_x,g_3_x,g_4_x,g_5_x,...,g_10_y,g_11_y,g_12_y,g_13_y,g_14_y,g_15_y,g_16_y,g_17_y,g_18_y,g_19_y
0,1,Toy Story (1995),3.92093,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),3.431818,3.5,0.0,0.512361,0.0,0.620525,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),3.259615,3.0,0.0,0.0,0.0,0.0,0.570915,0.0,...,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale (1995),2.357143,3.0,0.0,0.0,0.0,0.0,0.505015,0.0,...,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II (1995),3.071429,3.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [62]:
all_about_movies=all_about_movies.merge(tags_tfidf, how='inner', on='movieId')
all_about_movies.head()

Unnamed: 0,movieId,title,mean_r,median_r,g_0_x,g_1_x,g_2_x,g_3_x,g_4_x,g_5_x,...,t_1462,t_1463,t_1464,t_1465,t_1466,t_1467,t_1468,t_1469,t_1470,t_1471
0,1,Toy Story (1995),3.92093,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),3.431818,3.5,0.0,0.512361,0.0,0.620525,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),3.259615,3.0,0.0,0.0,0.0,0.0,0.570915,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,5,Father of the Bride Part II (1995),3.071429,3.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,Sabrina (1995),3.185185,3.0,0.0,0.0,0.0,0.0,0.570915,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
all_about_movies_users = pd.merge(all_about_movies,ratings[['movieId','userId','rating']], on='movieId')
all_about_movies_users.head()

Unnamed: 0,movieId,title,mean_r,median_r,g_0_x,g_1_x,g_2_x,g_3_x,g_4_x,g_5_x,...,t_1464,t_1465,t_1466,t_1467,t_1468,t_1469,t_1470,t_1471,userId,rating
0,1,Toy Story (1995),3.92093,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,4.0
1,1,Toy Story (1995),3.92093,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,4.0
2,1,Toy Story (1995),3.92093,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,4.5
3,1,Toy Story (1995),3.92093,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,15,2.5
4,1,Toy Story (1995),3.92093,4.0,0.0,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,17,4.5


In [72]:
ratings.userId.unique().shape

(610,)

In [81]:
ratings.groupby('userId').rating.count().sort_values(ascending=False)[:10]

userId
414    2698
599    2478
474    2108
448    1864
274    1346
610    1302
68     1260
380    1218
606    1115
288    1055
Name: rating, dtype: int64

In [79]:
user_414=all_about_movies_users[all_about_movies_users['userId']==414]

In [85]:
user_414.drop(columns=['rating','userId','title'])

Unnamed: 0,movieId,title,mean_r,median_r,g_0_x,g_1_x,g_2_x,g_3_x,g_4_x,g_5_x,...,t_1462,t_1463,t_1464,t_1465,t_1466,t_1467,t_1468,t_1469,t_1470,t_1471
146,1,Toy Story (1995),3.920930,4.00,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
281,2,Jumanji (1995),3.431818,3.50,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
361,3,Grumpier Old Men (1995),3.259615,3.00,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
407,5,Father of the Bride Part II (1995),3.071429,3.00,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
462,7,Sabrina (1995),3.185185,3.00,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48219,174055,Dunkirk (2017),3.423077,4.00,0.449869,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48237,176371,Blade Runner 2049 (2017),3.805556,4.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48256,180031,The Shape of Water (2017),3.687500,3.75,0.000000,0.609257,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48261,180985,The Greatest Showman (2017),3.000000,3.00,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [89]:
X_train, X_test, y_train, y_test = train_test_split(user_414.drop(columns=['rating','userId','title']),
                                                    user_414['rating'],test_size=0.3, random_state=42)

In [90]:
LR = LinearRegression().fit(X_train, y_train)

  "pandas.DataFrame with sparse columns found."


In [93]:
mean_squared_error(y_test,LR.predict(X_test))

  "pandas.DataFrame with sparse columns found."


96902524653665.38

Видимо линейно оценки пользователя не описываются. Кроме того, в данных присутствует лик, т.к. мы тренируем tfidf на полных данных.