In [103]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

%matplotlib inline

In [104]:
links = pd.read_csv('../1_introduction/links.csv')
movies = pd.read_csv('../1_introduction/movies.csv')
ratings = pd.read_csv('../1_introduction/ratings.csv')
tags = pd.read_csv('../1_introduction/tags.csv')

In [105]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [106]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [107]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [108]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [109]:
d=datetime.fromtimestamp(tags['timestamp'][1])

In [110]:
#Посмотрим на топ пользователей который ставят рейтинг и теги

In [111]:
ratings.groupby('userId')['rating'].count().sort_values(ascending=False)[:10]

userId
414    2698
599    2478
474    2108
448    1864
274    1346
610    1302
68     1260
380    1218
606    1115
288    1055
Name: rating, dtype: int64

In [112]:
tags.groupby("userId")["tag"].count().sort_values(ascending=False)[:10]

userId
474    1507
567     432
62      370
599     323
477     280
424     273
537     100
125      48
357      45
318      41
Name: tag, dtype: int64

In [113]:
#Пользователь 474 засветился и там и там - будем использовать его

In [114]:
tagsChosen = tags[tags["userId"] == 474][["movieId","tag","timestamp"]]

In [115]:
ratingsChosen = ratings[ratings["userId"] == 474][["movieId","rating","timestamp"]]

In [116]:
# Векторизируем теги

In [117]:
tagsChosen.tag = tagsChosen['tag'].apply(lambda x: ' '.join(x.replace(' ', '').replace('-','').split()))

In [118]:
countvec = CountVectorizer()
tagsChosenVec = countvec.fit_transform(tagsChosen.tag)
tfidf = TfidfTransformer()
tagsChosenTfidf = tfidf.fit_transform(tagsChosenVec)

tagsChosenTfidf = tagsChosenTfidf.toarray()

In [119]:
tagsChosenTfidf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [120]:
for column in range(tagsChosenTfidf.shape[1]):
    col_name = 't'+ str(column)
    tagsChosen[col_name] = pd.Series(tagsChosenTfidf[:,column])

In [121]:
# Векторизируем жанры

In [122]:
movies['genres'] = movies.apply(lambda x: ' '.join(x['genres'].replace(' ', '').replace('-','').split('|')), axis=1)

In [123]:
def extract_year(s):
    try:
        return int(s.rstrip()[-5:-1])
    except:
        return 0

In [124]:
movies['year'] = movies['title'].apply(extract_year)

In [125]:
movies = pd.get_dummies(movies, columns=['year'])

In [126]:
tfidfvec = TfidfVectorizer()
tfidfgenres = tfidfvec.fit_transform(movies.genres)
tfidfgenres = tfidfgenres.toarray()

In [127]:
for column in range(tfidfgenres.shape[1]):
    col_name = 'g'+ str(column)
    movies[col_name] = pd.Series(tfidfgenres[:,column])

In [128]:
#Средний рейтинг

In [129]:
rating_mean = ratings.groupby('movieId')[['rating']].mean()

In [130]:
movies=movies.join(rating_mean, on='movieId')

In [131]:
movies.rename(columns={'rating':'mean_rating'}, inplace=True)

In [132]:
userChosen = ratingsChosen.join(tagsChosen.set_index('movieId'), on='movieId', lsuffix='_rating', rsuffix='_tag')


In [133]:
userChosen = userChosen.join(movies.set_index('movieId'), on='movieId')

In [134]:
userChosen.drop(labels=['tag', 'title', 'genres', 'movieId','timestamp_rating', 'timestamp_tag'], axis=1, inplace=True)

In [135]:
userChosen.fillna(value=0, inplace=True)

In [136]:
userChosen.head()

Unnamed: 0,rating,t0,t1,t2,t3,t4,t5,t6,t7,t8,...,g11,g12,g13,g14,g15,g16,g17,g18,g19,mean_rating
73092,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093
73093,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.431818
73094,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
73094,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.071429
73095,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.542042,0.0,0.0,3.946078


In [137]:
#строим модель

In [138]:
X = userChosen.drop(labels=['rating'], axis=1)
y = userChosen.rating

In [139]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

In [147]:
knn = KNeighborsRegressor( 14, weights = 'distance' )

In [148]:
grid = GridSearchCV(model, {'n_neighbors':range(13,15,1)}, cv=5, scoring='neg_mean_squared_error')

In [149]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [143]:
grid.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                           metric='minkowski',
                                           metric_params=None, n_jobs=None,
                                           n_neighbors=5, p=2,
                                           weights='uniform'),
             iid='warn', n_jobs=None, param_grid={'n_neighbors': range(13, 15)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [144]:
grid.best_score_
grid.best_params_

{'n_neighbors': 14}

In [150]:
model.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

In [151]:
y = model.predict(X_test)

In [152]:
mean_squared_error(y, y_test)

0.5058417508417509

In [153]:
mean_absolute_error(y, y_test)

0.5486531986531986

In [154]:
# получили среднюю ошибку MSE - 0.5