# `Рекомендательная система`

### Этап 1. Подготовка данных

In [27]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [7]:
# Load the movie catalogue and the ratings table, discarding straight away
# the one column of each that is not used further down:
# 'genres' from movies and 'timestamp' from ratings.
# drop(columns=...) returns a new frame without the named column.
movies = pd.read_csv('c:\\Python\\ML\\DataBases\\movies.csv').drop(columns='genres')
ratings = pd.read_csv('c:\\Python\\ML\\DataBases\\ratings.csv').drop(columns='timestamp')

In [8]:
# Display the movies frame (the 'genres' column has already been dropped).
movies

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)
...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017)
9738,193583,No Game No Life: Zero (2017)
9739,193585,Flint (2017)
9740,193587,Bungo Stray Dogs: Dead Apple (2018)


In [9]:
# Display the ratings frame (the 'timestamp' column has already been dropped).
ratings

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


In [10]:
# Build the rating matrix as a pivot table:
#   rows    -> movieId
#   columns -> userId
#   cells   -> the rating that user gave to that movie
# (movie, user) pairs without a rating come out of pivot() as NaN;
# they are replaced by 0.
user_item_matrix = (
    ratings
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
user_item_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,4.0,0.0,4.5,0.0,0.0,0.0,...,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Надо убрать неактивных пользователей и непопулярные фильмы

In [11]:
# Number of ratings submitted by each user (indexed by userId) ...
user_votes = ratings.groupby('userId')['rating'].count()
# ... and number of ratings received by each movie (indexed by movieId).
rating_votes = ratings.groupby('movieId')['rating'].count()

In [12]:
# Build the filters: the ids that are kept in the rating matrix.
is_active_user = user_votes > 50      # users with more than 50 ratings
is_popular_movie = rating_votes > 10  # movies with more than 10 ratings

user_mask = user_votes.loc[is_active_user].index
movie_mask = rating_votes.loc[is_popular_movie].index

In [13]:
# Apply both filters in a single selection:
# rows (movies) listed in movie_mask, columns (users) listed in user_mask.
user_item_matrix = user_item_matrix.loc[movie_mask, user_mask]
user_item_matrix

userId,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,3.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
176371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
177765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
179819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Преобразование разреженной матрицы (sparse matrix) в формат сжатого построчного хранения (compressed sparse row, CSR)

In [14]:
# Convert the (mostly zero) rating matrix to CSR sparse format.
dense_ratings = user_item_matrix.values
csr_data = csr_matrix(dense_ratings)

# Peek at the top-left corner (first 2 rows, first 5 columns);
# only the non-zero cells are printed, as (row, column) value.
print(csr_data[:2, :5])

  (0, 0)	4.0
  (0, 3)	4.5
  (1, 2)	4.0


In [15]:
# Step 1: clear the name of the column axis.
without_axis_name = user_item_matrix.rename_axis(None, axis=1)
# Step 2: turn movieId from the index into an ordinary column, which leaves
# the rows with a sequential 0..n-1 index.
user_item_matrix = without_axis_name.reset_index()
user_item_matrix

Unnamed: 0,movieId,1,4,6,7,10,11,15,16,17,...,600,601,602,603,604,605,606,607,608,610
0,1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
1,2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
2,3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
3,5,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
4,6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2116,174055,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2117,176371,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2118,177765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2119,179819,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Этап 2. Обучение модели

In [16]:
# Nearest-neighbour search configuration.
knn_params = {
    'metric': 'cosine',     # distance between two rating vectors: cosine distance
    'algorithm': 'brute',   # brute-force search over all rows
    'n_neighbors': 20,      # default number of neighbours returned by a query
    'n_jobs': -1,           # number of parallel jobs; -1 means all processors
}
knn = NearestNeighbors(**knn_params)

# Fit on the sparse rating matrix: every row (movie) is one sample.
knn.fit(csr_data)

### Этап 3. Составление рекомендаций

In [17]:
recommendations = 10  # how many recommendations to produce
search_word = 'Matrix'  # title fragment of the movie the recommendations are based on

In [18]:
# Find every movie whose title contains search_word.
#   regex=False - treat the query as a literal substring; by default
#                 str.contains() interprets it as a regular expression, so a
#                 query such as "(1999)" or "C++" would be misread or raise.
#   na=False    - a missing title counts as "no match" instead of putting a
#                 NaN into the mask, which boolean indexing would reject.
title_matches = movies['title'].str.contains(search_word, regex=False, na=False)
movie_search = movies[title_matches]
movie_search

Unnamed: 0,movieId,title
1939,2571,"Matrix, The (1999)"
4351,6365,"Matrix Reloaded, The (2003)"
4639,6934,"Matrix Revolutions, The (2003)"


In [19]:
# Resolve the searched movie to its row position in the rating matrix.
# Both failure modes raise IndexError (the type the unguarded lookups raised),
# but with a message that says what actually went wrong.
if movie_search.empty:
    raise IndexError('no movie title contains {!r}'.format(search_word))

# Take the first match from the search result.
searched_movie_id = movie_search.iloc[0]['movieId']

# Find the row of the rating matrix that holds this movieId. The movie may be
# missing there: rows were filtered by number of ratings.
matrix_rows = user_item_matrix[user_item_matrix['movieId'] == searched_movie_id].index
if matrix_rows.empty:
    raise IndexError(
        'movieId {} is not in the rating matrix '
        '(it was removed by the popularity filter)'.format(searched_movie_id)
    )

# From here on movie_id is the row index in user_item_matrix / csr_data,
# not the movieId from the movies table.
movie_id = matrix_rows[0]
movie_id

901

In [20]:
# Query the fitted model with the rating vector of the chosen movie.
# One extra neighbour is requested on top of `recommendations`.
query_vector = csr_data[movie_id]
distances, indices = knn.kneighbors(query_vector, n_neighbors=recommendations + 1)

In [21]:
indices  # row positions (in csr_data) of the neighbours returned by kneighbors

array([[ 901, 1002,  442,  454,  124,  735,  954, 1362, 1157, 1536,  978]],
      dtype=int64)

In [22]:
distances  # distances from the query movie to each returned neighbour

array([[0.        , 0.22982441, 0.25401128, 0.27565617, 0.27760886,
        0.28691008, 0.29111012, 0.31393358, 0.31405926, 0.31548004,
        0.31748544]])

In [23]:
# kneighbors() returns 2-D arrays with one row per query; there is a single
# query here, so take row 0 explicitly. (squeeze() would also collapse a
# one-neighbour result to a 0-d array, tolist() would then return a bare
# scalar and zip() below would fail.)
indices_list = indices[0].tolist()
distances_list = distances[0].tolist()

# Pair every neighbour's row position with its distance: (position, distance).
indices_distances = list(zip(indices_list, distances_list))
print(indices_distances[:3])

[(901, 0.0), (1002, 0.22982440568634488), (442, 0.25401128310081567)]


In [24]:
# Drop the query movie itself and order the remaining neighbours from the
# closest to the farthest.
# The query is removed by its row position (movie_id) rather than by assuming
# it is the first element: when another movie is at the same (zero) distance,
# the query is not guaranteed to sort first, and cutting element [0] would
# discard a real recommendation while keeping the query in the list.
neighbours_only = [pair for pair in indices_distances if pair[0] != movie_id]
indices_distances_sorted = sorted(neighbours_only, key=lambda pair: pair[1])

# One extra neighbour was requested; keep at most `recommendations` of them.
indices_distances_sorted = indices_distances_sorted[:recommendations]
indices_distances_sorted

[(1002, 0.22982440568634488),
 (442, 0.25401128310081567),
 (454, 0.27565616686043737),
 (124, 0.2776088577731709),
 (735, 0.2869100842838125),
 (954, 0.2911101181714415),
 (1362, 0.31393358217709477),
 (1157, 0.31405925934381695),
 (1536, 0.3154800434449465),
 (978, 0.31748544046311844)]

In [25]:
# Translate every (row position, distance) pair into a {'Title', 'Distance'}
# record; the records become the rows of the final table.
recom_list = []
for row_position, dist in indices_distances_sorted:

    # movieId stored in that row of the rating matrix
    matrix_movie_id = user_item_matrix['movieId'].iloc[row_position]

    # Title of that movie. The row is selected with a boolean mask through
    # .loc, so the lookup does not depend on movies having a default 0..n-1
    # index (feeding index labels to .iloc would pick wrong rows otherwise).
    title = movies.loc[movies['movieId'] == matrix_movie_id, 'title'].iloc[0]

    recom_list.append({'Title': title, 'Distance': dist})

# Show the closest recommendation, if there is any.
if recom_list:
    print(recom_list[0])

{'Title': 'Fight Club (1999)', 'Distance': 0.22982440568634488}


In [26]:
# Final table of recommendations, ranked from 1 as a rating list should be.
# The index length is taken from recom_list itself, so the frame is built
# correctly even if the list does not hold exactly `recommendations` entries;
# the columns are named explicitly so that an empty result keeps its schema.
recom_df = pd.DataFrame(
    recom_list,
    index=range(1, len(recom_list) + 1),
    columns=['Title', 'Distance'],
)
recom_df  # list of recommended movies

Unnamed: 0,Title,Distance
1,Fight Club (1999),0.229824
2,Star Wars: Episode V - The Empire Strikes Back...,0.254011
3,Star Wars: Episode VI - Return of the Jedi (1983),0.275656
4,Star Wars: Episode IV - A New Hope (1977),0.277609
5,Saving Private Ryan (1998),0.28691
6,"Sixth Sense, The (1999)",0.29111
7,"Lord of the Rings: The Fellowship of the Ring,...",0.313934
8,Gladiator (2000),0.314059
9,"Lord of the Rings: The Return of the King, The...",0.31548
10,American Beauty (1999),0.317485
