# 1-2 Create spare matrix

In [6]:
import pandas as pd
import numpy as np

In [7]:
movies_raw = pd.read_csv('./data/movies_big.csv', dtype={'movieId': 'int32'})
ratings_raw = pd.read_csv('./data/ratings_big.csv', usecols=['userId', 'movieId', 'rating'], dtype={'movieId': 'int32', 'userId':'int32', 'rating':'float16'})

In [8]:
ratings_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float16
dtypes: float16(1), int32(2)
memory usage: 264.7 MB


In [9]:
all_movieId = ratings_raw['movieId'].unique()
all_userId = ratings_raw['userId'].unique()

all_movieId, all_userId

(array([   307,    481,   1091, ..., 117857, 133409, 142855], dtype=int32),
 array([     1,      2,      3, ..., 283226, 283227, 283228], dtype=int32))

In [26]:
def create_id_and_csr_index_dictionary(data_frame: pd.DataFrame, column: str, csr_id_first: bool = False) -> dict:
    unique_id_dict = pd.DataFrame(data_frame[column].unique()).to_dict()[0]
    if not csr_id_first:
        return dict(zip(unique_id_dict.values(), unique_id_dict.keys()))
    return unique_id_dict

movieId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'movieId')
userId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'userId')

In [14]:
ratings_raw_row_lists = [[row.userId, row.movieId, row.rating] for row in ratings_raw.itertuples()]
ratings_raw_row_lists[:5]

[[1, 307, 3.5], [1, 481, 3.5], [1, 1091, 1.5], [1, 1257, 4.5], [1, 1449, 4.5]]

In [15]:
len(ratings_raw_row_lists)

27753444

In [16]:
converted_into_rows_and_columns_list = [[userId_dict[row_list[0]], movieId_dict[row_list[1]], row_list[2]] for row_list in ratings_raw_row_lists]

In [17]:
from scipy.sparse import csr_matrix
ratings_csr_matrix = csr_matrix(converted_into_rows_and_columns_list)
ratings_csr_matrix

<27753444x3 sparse matrix of type '<class 'numpy.float64'>'
	with 83252358 stored elements in Compressed Sparse Row format>

In [20]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(ratings_csr_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [18]:
movies_raw.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


# 1-3 Recommendation system

In [29]:
csr_movieId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'movieId', True)

In [35]:
movies_raw.iloc[0]

movieId                                              1
title                                 Toy Story (1995)
genres     Adventure|Animation|Children|Comedy|Fantasy
Name: 0, dtype: object

In [59]:
from fuzzywuzzy import process

def recommend_movies(movie_name, number_of_recommendations):
    search_index = process.extractOne(movie_name, movies_raw['title'])
    search_movieId = movies_raw.iloc[search_index[2]]['movieId']
    search_csr_index = movieId_dict[search_movieId]
    print(search_movieId)
    print(search_csr_index)
    _distances, csr_indices = model_knn.kneighbors(
        ratings_csr_matrix[search_index[2]], n_neighbors=number_of_recommendations
    )
    print(csr_indices)
    recomendation_list = [
        movies_raw.query(f'movieId == {csr_movieId_dict[csr_index]}')['title'].values[0]
        + str(csr_movieId_dict[csr_index])
        + str(movies_raw.query(f'movieId == {csr_movieId_dict[csr_index]}')['movieId'].values)
        for csr_index in csr_indices[0]
        if csr_index != search_index[2]
    ]
    return recomendation_list

recommend_movies('toy story', 10)

1
42
[[  0   1   3 148   4 870   2   7 268 269]]


['Kalifornia (1993)481[481]',
 'Better Off Dead... (1985)1257[1257]',
 'Executive Decision (1996)494[494]',
 'Waiting for Guffman (1996)1449[1449]',
 'Citizen Kane (1941)923[923]',
 "Weekend at Bernie's (1989)1091[1091]",
 'Weird Science (1985)2134[2134]',
 'Air Force One (1997)1608[1608]',
 'Hunt for Red October, The (1990)1610[1610]']

In [None]:
# def main():
#     user_title_input = str(input('Please input a movie title'))
#     user_number_of_recomendations_input = int(input('Please amount of recomendations'))
#     print('Naive film search recomendations are:')
#     for recomendation in recommend_movies(user_title_input, user_number_of_recomendations_input):
#         print(recomendation)

# main()