# 1-2 Create spare matrix

In [1]:
import pandas as pd
import numpy as np

In [2]:
movies_raw = pd.read_csv('./data/movies_big.csv', dtype={'movieId': 'int32'})
ratings_raw = pd.read_csv('./data/ratings_big.csv', usecols=['userId', 'movieId', 'rating'], dtype={'movieId': 'int32', 'userId':'int32', 'rating':'float32'})

In [5]:
all_movieId = ratings_raw['movieId'].unique()
all_userId = ratings_raw['userId'].unique()

all_movieId, all_userId

(array([   307,    481,   1091, ..., 117857, 133409, 142855], dtype=int32),
 array([     1,      2,      3, ..., 283226, 283227, 283228], dtype=int32))

In [6]:
def create_id_and_csr_index_dictionary(data_frame: pd.DataFrame, column: str, csr_id_first: bool = False) -> dict:
    unique_id_dict = pd.DataFrame(data_frame[column].unique()).to_dict()[0]
    if csr_id_first:
        return unique_id_dict
    return dict(zip(unique_id_dict.values(), unique_id_dict.keys()))

movieId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'movieId')
userId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'userId')

In [7]:
ratings_raw_row_lists = [[row.userId, row.movieId, row.rating] for row in ratings_raw.itertuples()]
ratings_raw_row_lists[:5]

[[1, 307, 3.5], [1, 481, 3.5], [1, 1091, 1.5], [1, 1257, 4.5], [1, 1449, 4.5]]

In [8]:
len(ratings_raw_row_lists)

27753444

In [9]:
converted_into_rows_and_columns_list = [[userId_dict[row_list[0]], movieId_dict[row_list[1]], row_list[2]] for row_list in ratings_raw_row_lists]
converted_into_rows_and_columns_list[:5]

[[0, 0, 3.5], [0, 1, 3.5], [0, 2, 1.5], [0, 3, 4.5], [0, 4, 4.5]]

In [10]:
column = np.array([item[0] for item in converted_into_rows_and_columns_list])
row = np.array([item[1] for item in converted_into_rows_and_columns_list])
data = np.array([item[2] for item in converted_into_rows_and_columns_list])
column[:5], row[:5], data[:5]

(array([0, 0, 0, 0, 0]),
 array([0, 1, 2, 3, 4]),
 array([3.5, 3.5, 1.5, 4.5, 4.5]))

In [11]:
len(column), len(row), len(data)

(27753444, 27753444, 27753444)

In [12]:
len(all_movieId), len(all_userId)

(53889, 283228)

In [20]:
from scipy.sparse import csr_matrix
ratings_csr_matrix = csr_matrix((data, (row, column)), shape=(27753444, 27753444))
ratings_csr_matrix

<27753444x27753444 sparse matrix of type '<class 'numpy.float64'>'
	with 27753444 stored elements in Compressed Sparse Row format>

In [21]:
print(ratings_csr_matrix)

  (0, 0)	3.5
  (0, 5)	4.0
  (0, 55)	4.0
  (0, 70)	5.0
  (0, 83)	3.0
  (0, 140)	5.0
  (0, 212)	4.0
  (0, 213)	4.0
  (0, 238)	4.0
  (0, 267)	4.0
  (0, 276)	3.0
  (0, 333)	4.0
  (0, 334)	4.0
  (0, 359)	4.0
  (0, 364)	5.0
  (0, 398)	3.0
  (0, 407)	2.0
  (0, 414)	5.0
  (0, 427)	4.0
  (0, 462)	3.0
  (0, 494)	2.5
  (0, 496)	4.0
  (0, 548)	5.0
  (0, 565)	4.0
  (0, 567)	5.0
  :	:
  (53864, 282118)	3.5
  (53865, 282118)	2.5
  (53866, 282118)	3.0
  (53867, 282118)	3.0
  (53868, 282118)	3.0
  (53869, 282118)	3.0
  (53870, 282118)	3.5
  (53871, 282118)	3.0
  (53872, 282118)	3.0
  (53873, 282118)	3.5
  (53874, 282373)	2.5
  (53875, 282373)	4.0
  (53876, 282373)	1.5
  (53877, 282373)	3.0
  (53878, 282373)	3.5
  (53879, 282373)	3.0
  (53880, 282373)	4.0
  (53881, 282373)	4.0
  (53882, 282373)	2.5
  (53883, 282373)	4.5
  (53884, 282402)	1.0
  (53885, 282731)	3.5
  (53886, 282999)	3.5
  (53887, 282999)	3.5
  (53888, 282999)	3.5


In [22]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=5)
model_knn.fit(ratings_csr_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

# 1-3 Recommendation system

In [23]:
csr_movieId_dict = create_id_and_csr_index_dictionary(ratings_raw, 'movieId', True)

In [25]:
csr_movieId_dict

{0: 307,
 1: 481,
 2: 1091,
 3: 1257,
 4: 1449,
 5: 1590,
 6: 1591,
 7: 2134,
 8: 2478,
 9: 2840,
 10: 2986,
 11: 3020,
 12: 3424,
 13: 3698,
 14: 3826,
 15: 3893,
 16: 170,
 17: 849,
 18: 1186,
 19: 1235,
 20: 1244,
 21: 1296,
 22: 1663,
 23: 1962,
 24: 2108,
 25: 2243,
 26: 2352,
 27: 2707,
 28: 2746,
 29: 2915,
 30: 3363,
 31: 640,
 32: 828,
 33: 960,
 34: 1221,
 35: 1321,
 36: 1645,
 37: 1825,
 38: 1985,
 39: 2024,
 40: 2028,
 41: 3171,
 42: 1,
 43: 2,
 44: 5,
 45: 6,
 46: 10,
 47: 11,
 48: 16,
 49: 19,
 50: 20,
 51: 23,
 52: 25,
 53: 31,
 54: 32,
 55: 34,
 56: 36,
 57: 39,
 58: 41,
 59: 44,
 60: 45,
 61: 47,
 62: 50,
 63: 61,
 64: 62,
 65: 65,
 66: 66,
 67: 70,
 68: 76,
 69: 85,
 70: 95,
 71: 100,
 72: 104,
 73: 110,
 74: 141,
 75: 145,
 76: 150,
 77: 151,
 78: 153,
 79: 158,
 80: 160,
 81: 161,
 82: 163,
 83: 165,
 84: 169,
 85: 172,
 86: 173,
 87: 175,
 88: 180,
 89: 181,
 90: 185,
 91: 186,
 92: 193,
 93: 198,
 94: 204,
 95: 216,
 96: 223,
 97: 231,
 98: 253,
 99: 255,
 100: 25

In [27]:
from fuzzywuzzy import process

def recommend_movies(movie_name, number_of_recommendations):
    search_index = process.extractOne(movie_name, movies_raw['title'])
    search_movieId = movies_raw.iloc[search_index[2]]['movieId']
    search_csr_index = movieId_dict[search_movieId]
    print(search_movieId)
    print(search_csr_index)
    _distances, csr_indices = model_knn.kneighbors(
        ratings_csr_matrix[search_index[2]], n_neighbors=number_of_recommendations
    )
    print(csr_indices)
    recomendation_list = [
        # movies_raw.query(f'movieId == {csr_movieId_dict[csr_index]}')['title'].values[0]
        # + str(csr_movieId_dict[csr_index])
        # + str(movies_raw.query(f'movieId == {csr_movieId_dict[csr_index]}')['movieId'].values)
        movies_raw.iloc[csr_index]['title']
        for csr_index in csr_indices[0]
        # if csr_index != search_index[2]
    ]
    return recomendation_list

recommend_movies('toy story', 10)

1
42
[[   0 2016 1701 2285 3147 1213 1328 1130 1698 1135]]


['Toy Story (1995)',
 'Song of the South (1946)',
 'B. Monkey (1998)',
 'Desperately Seeking Susan (1985)',
 'Smashing Time (1967)',
 'Dead Alive (Braindead) (1992)',
 'Sling Blade (1996)',
 'Raw Deal (1948)',
 'Music From Another Room (1998)',
 'Here Comes Cookie (1935)']

In [None]:
# def main():
#     user_title_input = str(input('Please input a movie title'))
#     user_number_of_recomendations_input = int(input('Please amount of recomendations'))
#     print('Naive film search recomendations are:')
#     for recomendation in recommend_movies(user_title_input, user_number_of_recomendations_input):
#         print(recomendation)

# main()