In [109]:
import pandas as pd
import numpy as np

from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Подготовка данных

In [110]:
# Read the movie catalogue and the ratings table from the relative Datasets/ directory.
movies, ratings = map(pd.read_csv, ('Datasets/movies.csv', 'Datasets/ratings.csv'))

In [111]:
# Drop the genres column, which is not used by the recommender below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError for the column
# that is already gone.
movies = movies.drop(columns=['genres'], errors='ignore')
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [112]:
# Drop the timestamp column, which is not used by the recommender below.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second run no longer raises KeyError for the column
# that is already gone.
ratings = ratings.drop(columns=['timestamp'], errors='ignore')
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [113]:
# Wide ratings table: one row per movieId, one column per userId, cell value is
# the rating (NaN where the user did not rate the movie). Note that the rows
# are movies, despite the variable name.
user_item_matrix = ratings.pivot(
    values='rating',
    index='movieId',
    columns='userId',
)
user_item_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,,,4.0,,4.5,,,,...,4.0,,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,,,,,,4.0,,4.0,,,...,,4.0,,5.0,3.5,,,2.0,,
3,4.0,,,,,5.0,,,,,...,,,,,,,,2.0,,
4,,,,,,3.0,,,,,...,,,,,,,,,,
5,,,,,,5.0,,,,,...,,,,3.0,,,,,,


In [114]:
# Encode "no rating" as 0 so the table is fully numeric.
# Rebinding instead of inplace=True: the result is the same, but the cell no
# longer mutates a frame that an earlier cell has already displayed.
user_item_matrix = user_item_matrix.fillna(0)
user_item_matrix.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0


In [115]:
# Number of ratings per user (users_votes) and per movie (movies_votes),
# each indexed by the corresponding id.
users_votes, movies_votes = (
    ratings.groupby(key)['rating'].count()
    for key in ('userId', 'movieId')
)

In [116]:
# Build the selections used to filter out users and movies with few ratings.
# Despite the "mask" names these are label indexes: userIds with more than 50
# ratings and movieIds with more than 10 ratings.
user_mask = users_votes.loc[users_votes.gt(50)].index
movie_mask = movies_votes.loc[movies_votes.gt(10)].index

In [117]:
# Keep only the sufficiently rated movies (rows) and sufficiently active users
# (columns) in a single label-based selection.
user_item_matrix = user_item_matrix.loc[movie_mask, user_mask]

In [118]:
# (number of movies kept, number of users kept) after the filtering above.
user_item_matrix.shape

(2121, 378)

In [119]:
# Sparse CSR copy of the filtered ratings; its rows follow the current row
# order of user_item_matrix (one row per movie).
csr_data = csr_matrix(user_item_matrix.to_numpy())

In [120]:
# Remove the 'userId' label from the column axis and move movieId out of the
# index into a regular column. The new default integer index follows the same
# row order as the matrix handed to csr_matrix, so later cells can translate
# between row positions and movieId values.
# NOTE(review): not idempotent -- re-running this cell on its own output looks
# like it would insert a spurious 'index' column; confirm before re-running.
user_item_matrix = user_item_matrix.rename_axis(None, axis=1).reset_index()
user_item_matrix.head()

Unnamed: 0,movieId,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0


# Обучение модели

In [121]:
# Brute-force nearest-neighbour search over the movie rows using cosine
# distance. n_neighbors=20 is only the default neighbour count; the query cell
# passes its own n_neighbors. n_jobs=-1 asks for all available cores.
knn = NearestNeighbors(
    n_neighbors=20,
    algorithm='brute',
    metric='cosine',
    n_jobs=-1,
)

knn.fit(csr_data)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [122]:
# Query settings: how many titles to recommend, and the title fragment used to
# look up the reference movie.
recommendations, search_word = 10, 'Prisoners'

In [123]:
# Case-sensitive, literal substring match on the title.
# regex=False stops characters such as '(' , '+' or '.' in search_word from
# being interpreted as a regular expression; na=False treats missing titles as
# non-matches so the selector is always a clean boolean mask.
movie_search = movies[movies['title'].str.contains(search_word, regex=False, na=False)]
movie_search

Unnamed: 0,movieId,title
591,731,Heaven's Prisoners (1996)
8255,104879,Prisoners (2013)


In [124]:
# NOTE: position 1 selects the second row of movie_search; this hard-coded
# choice has to be revisited whenever search_word changes.
selected_movie_id = movie_search['movieId'].iloc[1]

# Translate the movieId into its row position in user_item_matrix (and
# therefore in csr_data). A separate name is used for the movieId so that one
# variable no longer holds two different kinds of value.
matching_rows = user_item_matrix.loc[user_item_matrix['movieId'] == selected_movie_id].index
if len(matching_rows) == 0:
    # Same exception type as before, but with an explanation attached.
    raise IndexError(
        f'movieId {selected_movie_id} is not present in user_item_matrix '
        '(it may have been removed by the rating-count filter)'
    )

# Kept under the original name because the following cells index csr_data with it.
movie_id = matching_rows[0]
movie_id

2023

In [125]:
# Query one neighbour more than the number of recommendations wanted, since
# the reference movie is itself part of the fitted data and is expected to
# come back as its own nearest neighbour.
distances, indices = knn.kneighbors(
    X=csr_data[movie_id],
    n_neighbors=recommendations + 1,
    return_distance=True,
)

indices

array([[2023, 2065, 1754, 2058, 2089, 2064, 1965, 2061, 2101, 2004, 1867]],
      dtype=int64)

In [126]:
# Cosine distances returned by the kneighbors query, aligned element by
# element with `indices`.
distances

array([[0.        , 0.38642028, 0.42489575, 0.51788606, 0.52341248,
        0.53575014, 0.5468749 , 0.55082756, 0.55320773, 0.56154367,
        0.56307652]])

In [127]:
# The single-query result arrays have shape (1, k). Take row 0 explicitly
# rather than squeeze(): squeeze() collapses to a scalar when k == 1, which
# makes tolist() return a bare number and zip() fail.
indices_list = indices[0].tolist()
distances_list = distances[0].tolist()

# (row position in the matrix, cosine distance) pairs.
indices_distances = list(zip(indices_list, distances_list))

In [128]:
# Closest first, with the reference movie removed by its row position rather
# than by assuming it is always the first element: when several movies tie at
# distance 0 the reference movie is not guaranteed to come first, and a blind
# [1:] would drop a real neighbour and keep the reference movie.
# The slice caps the result at `recommendations` entries either way.
indices_distances_sorted = [
    (row, dist)
    for row, dist in sorted(indices_distances, key=lambda pair: pair[1])
    if row != movie_id
][:recommendations]

indices_distances_sorted

[(2065, 0.386420276881549),
 (1754, 0.42489575290696524),
 (2058, 0.5178860644002119),
 (2089, 0.5234124772500012),
 (2064, 0.5357501388654846),
 (1965, 0.5468749009790038),
 (2061, 0.5508275566068768),
 (2101, 0.553207731490914),
 (2004, 0.5615436745611626),
 (1867, 0.563076523894052)]

In [129]:
# Turn (row position, distance) pairs into {'Title', 'Distance'} records.
recom_list = []

for row_position, dist in indices_distances_sorted:
    # Row position in the matrix -> movieId stored in that row.
    matrix_movie_id = user_item_matrix['movieId'].iloc[row_position]

    # Label-based title lookup: does not rely on `movies` having a default
    # 0..n-1 index (the previous iloc-with-index-labels form did), and avoids
    # binding the name `id`, which shadowed the builtin for the rest of the
    # notebook session.
    title = movies.loc[movies['movieId'] == matrix_movie_id, 'title'].iloc[0]

    recom_list.append({'Title': title, 'Distance': dist})

In [130]:
# Present the recommendations as a table ranked 1..recommendations instead of
# using the default 0-based index.
recom_df = pd.DataFrame(
    data=recom_list,
    index=range(1, 1 + recommendations),
)

recom_df

Unnamed: 0,Title,Distance
1,Nightcrawler (2014),0.38642
2,Zodiac (2007),0.424896
3,Gone Girl (2014),0.517886
4,The Hateful Eight (2015),0.523412
5,Fury (2014),0.53575
6,In Time (2011),0.546875
7,"Maze Runner, The (2014)",0.550828
8,The Revenant (2015),0.553208
9,Django Unchained (2012),0.561544
10,Inglourious Basterds (2009),0.563077
