# 1-2 Create spare matrix

In [157]:
import pandas as pd
import numpy as np

In [158]:
movies_raw = pd.read_csv('./data/movies_big.csv', dtype={'movieId': 'int32'})
ratings_raw = pd.read_csv('./data/ratings_big.csv', usecols=['userId', 'movieId', 'rating'], dtype={'movieId': 'int32', 'userId':'int32', 'rating':'float32'})

In [159]:
avarage_rating = ratings_raw.groupby('movieId')['rating'].mean()
movies_with_rating_over_x = pd.DataFrame(avarage_rating).query('rating > 3.5').index
ratings_cleaned = ratings_raw.query(f'movieId == {list(movies_with_rating_over_x)}')
ratings_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16314048 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 311.2 MB


In [160]:
ratings_cleaned.head()

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
3,1,1257,4.5
4,1,1449,4.5
11,1,3020,4.0
12,1,3424,4.5


In [161]:
movies_cleaned = movies_raw.loc[
    movies_raw["movieId"].isin(list(ratings_cleaned["movieId"].values))
].reset_index().drop(columns=['index'])
movies_cleaned.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,6,Heat (1995),Action|Crime|Thriller
2,11,"American President, The (1995)",Comedy|Drama|Romance
3,16,Casino (1995),Crime|Drama
4,17,Sense and Sensibility (1995),Drama|Romance


In [162]:
def create_id_and_csr_index_dictionary(data_frame: pd.DataFrame, column: str, csr_id_first: bool = False) -> dict:
    unique_id_dict = pd.DataFrame(data_frame[column].sort_values().unique()).to_dict()[0]
    if csr_id_first:
        return unique_id_dict
    return dict(zip(unique_id_dict.values(), unique_id_dict.keys()))

movieId_dict = create_id_and_csr_index_dictionary(ratings_cleaned, 'movieId')
userId_dict = create_id_and_csr_index_dictionary(ratings_cleaned, 'userId')

In [163]:
ratings_cleaned.sort_values(['movieId', 'userId']).head()

Unnamed: 0,userId,movieId,rating
42,4,1,4.0
939,10,1,5.0
1117,14,1,4.5
1291,15,1,4.0
1869,22,1,4.0


In [164]:
ratings_cleaned_row_lists = [[row.movieId, row.userId, row.rating] for row in ratings_cleaned.sort_values(['movieId', 'userId']).itertuples()]
ratings_cleaned_row_lists[:5]

[[1, 4, 4.0], [1, 10, 5.0], [1, 14, 4.5], [1, 15, 4.0], [1, 22, 4.0]]

In [165]:
converted_into_rows_and_columns_list = [
    [movieId_dict[row_list[0]], userId_dict[row_list[1]], row_list[2]]
    for row_list in ratings_cleaned_row_lists
]
converted_into_rows_and_columns_list[:5]

[[0, 3, 4.0], [0, 9, 5.0], [0, 13, 4.5], [0, 14, 4.0], [0, 20, 4.0]]

In [166]:
column = np.array([item[0] for item in converted_into_rows_and_columns_list])
row = np.array([item[1] for item in converted_into_rows_and_columns_list])
data = np.array([item[2] for item in converted_into_rows_and_columns_list])
column[:5], row[:5], data[:5]

(array([0, 0, 0, 0, 0]),
 array([ 3,  9, 13, 14, 20]),
 array([4. , 5. , 4.5, 4. , 4. ]))

In [167]:
len(movieId_dict), len(userId_dict), len(column), len(row), len(data), 13466 * 278651

(13466, 278651, 16314048, 16314048, 16314048, 3752314366)

In [168]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

ratings_csr_matrix = csr_matrix((data, (row, column))).T

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(ratings_csr_matrix)

ratings_csr_matrix, ratings_csr_matrix.shape

(<13466x278651 sparse matrix of type '<class 'numpy.float64'>'
 	with 16314048 stored elements in Compressed Sparse Column format>,
 (13466, 278651))

# 1-3 Recommendation system

In [169]:
csr_movieId_dict = create_id_and_csr_index_dictionary(ratings_cleaned, 'movieId', True)

In [170]:
from fuzzywuzzy import process

def recommend_movies(movie_name, number_of_recommendations):
    search_index = process.extractOne(movie_name, movies_cleaned['title'])
    _distances, csr_indices = model_knn.kneighbors(
        ratings_csr_matrix[search_index[2]], n_neighbors=number_of_recommendations+1
    )
    recomendation_list_movieIds = [
        csr_movieId_dict[csr_index]
        for csr_index in csr_indices[0]
        if csr_index != search_index[2]
    ]

    return movies_cleaned.query(f'movieId == {recomendation_list_movieIds}')

recommend_movies('toy story', 10)

Unnamed: 0,movieId,title,genres
78,260,Star Wars: Episode IV - A New Hope (1977),Action|Adventure|Sci-Fi
115,356,Forrest Gump (1994),Comedy|Drama|Romance|War
118,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX
139,480,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
170,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
335,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
407,1210,Star Wars: Episode VI - Return of the Jedi (1983),Action|Adventure|Sci-Fi
464,1270,Back to the Future (1985),Adventure|Comedy|Sci-Fi
559,1580,Men in Black (a.k.a. MIB) (1997),Action|Comedy|Sci-Fi
1005,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
