# 1-2 Create spare matrix

In [1]:
import pandas as pd 

In [2]:
movies_raw = pd.read_csv('./data/movies_big.csv', dtype={'movieId': 'int32'})
ratings_raw = pd.read_csv('./data/ratings_big.csv', usecols=['userId', 'movieId', 'rating'], dtype={'movieId': 'int32', 'userId':'int32', 'rating':'float32'})

In [3]:
avarage_rating = ratings_raw.groupby('movieId')['rating'].mean()
avarage_rating.info()

<class 'pandas.core.series.Series'>
Int64Index: 53889 entries, 1 to 193886
Series name: rating
Non-Null Count  Dtype  
--------------  -----  
53889 non-null  float32
dtypes: float32(1)
memory usage: 631.5 KB


In [4]:
movies_with_rating_over_x = pd.DataFrame(avarage_rating).query('rating > 3.5').index
movies_with_rating_over_x

Int64Index([     1,      6,     11,     16,     17,     21,     25,     26,
                28,     29,
            ...
            193765, 193785, 193795, 193799, 193801, 193835, 193843, 193868,
            193872, 193874],
           dtype='int64', name='movieId', length=13466)

In [5]:
ratings_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 317.6 MB


In [6]:
ratings_cleaned = ratings_raw.query(f'movieId == {list(movies_with_rating_over_x)}')
ratings_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16314048 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 311.2 MB


In [7]:
ratings_pivot_table = ratings_cleaned.pivot(
    index="movieId", columns="userId", values="rating"
).fillna(0);



In [8]:
from scipy.sparse import csr_matrix
ratings_csr_matrix = csr_matrix(ratings_pivot_table.values)
ratings_csr_matrix

<13466x278651 sparse matrix of type '<class 'numpy.float32'>'
	with 16314048 stored elements in Compressed Sparse Row format>

In [9]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(ratings_csr_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [10]:
movies_raw.iloc[0]

movieId                                              1
title                                 Toy Story (1995)
genres     Adventure|Animation|Children|Comedy|Fantasy
Name: 0, dtype: object

In [11]:
ratings_pivot_table[:10]

userId,1,2,3,4,5,6,7,8,9,10,...,283218,283219,283220,283221,283222,283223,283224,283225,283227,283228
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
6,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
16,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
25,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [65]:
print(ratings_csr_matrix)

  (0, 3)	4.0
  (0, 9)	5.0
  (0, 13)	4.5
  (0, 14)	4.0
  (0, 20)	4.0
  (0, 23)	4.0
  (0, 25)	5.0
  (0, 29)	5.0
  (0, 30)	4.5
  (0, 35)	5.0
  (0, 39)	4.0
  (0, 40)	5.0
  (0, 48)	4.0
  (0, 51)	3.0
  (0, 52)	5.0
  (0, 53)	2.5
  (0, 55)	4.0
  (0, 59)	4.0
  (0, 63)	4.0
  (0, 66)	4.0
  (0, 67)	4.0
  (0, 69)	4.0
  (0, 70)	5.0
  (0, 75)	5.0
  (0, 95)	4.5
  :	:
  (13444, 156006)	4.0
  (13445, 156006)	4.0
  (13446, 156006)	5.0
  (13447, 156006)	5.0
  (13448, 115960)	4.0
  (13449, 73686)	4.0
  (13450, 183510)	5.0
  (13451, 189826)	5.0
  (13452, 53257)	4.0
  (13453, 56396)	5.0
  (13454, 226115)	4.0
  (13455, 31034)	5.0
  (13456, 175021)	4.5
  (13457, 80612)	4.0
  (13458, 115596)	5.0
  (13459, 212435)	4.0
  (13459, 261781)	5.0
  (13460, 212435)	4.0
  (13460, 261781)	5.0
  (13461, 74178)	4.5
  (13462, 4160)	4.0
  (13463, 73241)	4.0
  (13463, 136186)	4.5
  (13464, 228745)	4.0
  (13465, 178554)	5.0


In [69]:
movies_cleaned = movies_raw.loc[
    movies_raw["movieId"].isin(list(ratings_cleaned["movieId"].values))
].reset_index().drop(columns=['index'])
movies_cleaned.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,6,Heat (1995),Action|Crime|Thriller
2,11,"American President, The (1995)",Comedy|Drama|Romance
3,16,Casino (1995),Crime|Drama
4,17,Sense and Sensibility (1995),Drama|Romance


# 1-3 Recommendation system

In [71]:
from fuzzywuzzy import process

def recommend_movies(movie_name, number_of_recommendations):
    search_index = process.extractOne(movie_name, movies_cleaned["title"])
    _distances, csr_indices = model_knn.kneighbors(
        ratings_csr_matrix[search_index[2]], n_neighbors=number_of_recommendations
    )
    recomendation_list = [
        movies_cleaned.iloc[csr_index]["title"]
        for csr_index in csr_indices[0]
        if csr_index != search_index[2]
    ]
    for recomendation in recomendation_list:
        print(recomendation)

recommend_movies("toy story", 10)

Star Wars: Episode IV - A New Hope (1977)
Toy Story 2 (1999)
Back to the Future (1985)
Jurassic Park (1993)
Forrest Gump (1994)
Lion King, The (1994)
Star Wars: Episode VI - Return of the Jedi (1983)
Aladdin (1992)
Willy Wonka & the Chocolate Factory (1971)
