# 1-2 Create spare matrix

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_raw = pd.read_csv('./data/movies_big.csv', dtype={'movieId': 'int32'})
ratings_raw = pd.read_csv('./data/ratings_big.csv', usecols=['userId', 'movieId', 'rating'], dtype={'movieId': 'int32', 'userId':'int32', 'rating':'float32'})

In [3]:
def create_id_and_csr_index_dictionary(data_frame: pd.DataFrame, column: str, csr_id_first: bool = False) -> dict:
    unique_id_dict = pd.DataFrame(data_frame[column].sort_values().unique()).to_dict()[0]
    if csr_id_first:
        return unique_id_dict
    return dict(zip(unique_id_dict.values(), unique_id_dict.keys()))

movieId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'movieId')
userId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'userId')

In [4]:
ratings_raw.sort_values(['movieId', 'userId']).head()

Unnamed: 0,userId,movieId,rating
42,4,1,4.0
939,10,1,5.0
1117,14,1,4.5
1291,15,1,4.0
1869,22,1,4.0


In [5]:
ratings_raw_row_lists = [[row.movieId, row.userId, row.rating] for row in ratings_raw.sort_values(['movieId', 'userId']).itertuples()]
ratings_raw_row_lists[:5]

[[1, 4, 4.0], [1, 10, 5.0], [1, 14, 4.5], [1, 15, 4.0], [1, 22, 4.0]]

In [6]:
ratings_raw_converted_into_rows_and_columns = [
    [movieId_dict[row_list[0]], userId_dict[row_list[1]], row_list[2]]
    for row_list in ratings_raw_row_lists
]
ratings_raw_converted_into_rows_and_columns[:5]

[[0, 3, 4.0], [0, 9, 5.0], [0, 13, 4.5], [0, 14, 4.0], [0, 21, 4.0]]

In [7]:
column = np.array([item[0] for item in ratings_raw_converted_into_rows_and_columns])
row = np.array([item[1] for item in ratings_raw_converted_into_rows_and_columns])
data = np.array([item[2] for item in ratings_raw_converted_into_rows_and_columns])
column[:5], row[:5], data[:5]

(array([0, 0, 0, 0, 0]),
 array([ 3,  9, 13, 14, 21]),
 array([4. , 5. , 4.5, 4. , 4. ]))

In [8]:
len(movieId_dict), len(userId_dict), len(column), len(row), len(data), 13466 * 278651

(53889, 283228, 27753444, 27753444, 27753444, 3752314366)

In [9]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

ratings_csr_matrix = csr_matrix((data, (row, column))).T

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(ratings_csr_matrix)

ratings_csr_matrix, ratings_csr_matrix.shape

(<53889x283228 sparse matrix of type '<class 'numpy.float64'>'
 	with 27753444 stored elements in Compressed Sparse Column format>,
 (53889, 283228))

# 1-3 Recommendation system

In [10]:
csr_movieId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'movieId', True)

In [11]:
from fuzzywuzzy import process

def recommend_movies(movie_name, number_of_recommendations):
    search_index = process.extractOne(movie_name, movies_raw['title'])
    _distances, csr_indices = model_knn.kneighbors(
        ratings_csr_matrix[search_index[2]], n_neighbors=number_of_recommendations+1
    )
    recomendation_list_movieIds = [
        csr_movieId_dict[csr_index]
        for csr_index in csr_indices[0]
        if csr_index != search_index[2]
    ]

    return movies_raw.query(f'movieId == {recomendation_list_movieIds}')

recommend_movies('toy story', 10)

Unnamed: 0,movieId,title,genres
257,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
352,356,Forrest Gump (1994),Comedy|Drama|Romance|War
360,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
476,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
582,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
640,648,Mission: Impossible (1996),Action|Adventure|Mystery|Thriller
767,780,Independence Day (a.k.a. ID4) (1996),Action|Adventure|Sci-Fi|Thriller
1184,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
1242,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
3028,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
