In [8]:
import pandas as pd
# Load the interaction log, keeping only the columns at positions 1-3
# (userId, rating, movieId); the column at position 0 is not read.
df = pd.read_csv(
    '/kaggle/input/netflix-prize-lite/netflix_interactions_lite.csv',
    usecols=[1, 2, 3],
)
df.head()

Unnamed: 0,userId,rating,movieId
0,785314,1.0,8
1,243963,3.0,8
2,1447783,4.0,8
3,1912665,1.0,8
4,1744889,1.0,8


In [9]:
def load_movie_titles(path, encoding="ISO-8859-1"):
    """Read the ``movieId,year,title`` catalogue without truncating titles.

    Each line is split on its first two commas only, so everything after the
    second comma is kept as the title. Reading the file with
    ``pd.read_csv(..., usecols=[0, 1, 2])`` instead makes the parser drop any
    extra fields silently, which cuts a title off at its first comma.

    NOTE(review): this assumes the file is unquoted, headerless CSV (as the
    original ``header=None`` read implies) -- confirm against the data set.

    Parameters
    ----------
    path : str
        Location of the titles file.
    encoding : str, optional
        Text encoding of the file.

    Returns
    -------
    pandas.DataFrame
        Columns ``movieId`` (int), ``year`` (float, NaN when not numeric)
        and ``title`` (str).
    """
    with open(path, encoding=encoding) as titles_file:
        rows = [
            line.rstrip("\r\n").split(",", 2)
            for line in titles_file
            if line.strip()  # ignore blank lines
        ]

    titles = pd.DataFrame(rows, columns=['movieId', 'year', 'title'])
    titles['movieId'] = titles['movieId'].astype(int)
    # Non-numeric year markers become NaN, so the column stays float.
    titles['year'] = pd.to_numeric(titles['year'], errors='coerce')
    return titles


movies_df = load_movie_titles('/kaggle/input/netflix-prize-data/movie_titles.csv')

movies_df.sample(20)

Unnamed: 0,movieId,year,title
9842,9843,1979.0,Dallas: Season 3
11154,11155,2000.0,UFC Hits: Ultimate Fighting Championship
7613,7614,1963.0,Any Number Can Win
15430,15431,1954.0,Creature from the Black Lagoon: Special Edition
15705,15706,1989.0,Going Overboard
6437,6438,1951.0,The Man in the White Suit
11331,11332,1955.0,Blackboard Jungle
17166,17167,1994.0,Dennis Miller - Live from Washington
2012,2013,2002.0,Borderline
7010,7011,2002.0,A Loving Father


In [10]:
# Titles are looked up by string id later on, so store the id column as text
# before building the id -> title mapping.
movies_df["movieId"] = movies_df["movieId"].astype(str)
movie_tite_dict = {
    movie_key: movie_title
    for movie_key, movie_title in zip(movies_df["movieId"], movies_df["title"])
}

In [11]:
# The cells shown after this point only use movie_tite_dict, so unbind the
# titles frame to let its memory be reclaimed.
del movies_df

In [12]:
# Wide table: one row per user, one column per movie, cell = mean rating.
# User/movie pairs without a rating are filled with 0.
piv_df = (
    df
    .pivot_table(index="userId", columns='movieId', values="rating", aggfunc='mean')
    .fillna(0)
)
piv_df.head()

movieId,8,28,30,58,77,83,108,111,118,143,...,17622,17627,17671,17672,17692,17697,17703,17709,17762,17764
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,4.0,0.0,2.0,5.0,0.0,0.0
7,5.0,4.0,5.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
79,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,0.0,4.0,4.0,1.0,0.0,0.0,0.0,4.0
97,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,5.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
134,0.0,5.0,0.0,5.0,4.0,0.0,0.0,0.0,5.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,4.0,0.0


In [21]:
def item_based_corr(movie_id, ratings=None):
    """Rank every movie by how strongly its ratings correlate with ``movie_id``.

    Parameters
    ----------
    movie_id : int or str
        Id of the query movie; anything ``int()`` accepts, e.g. ``28`` or
        ``'28'``.
    ratings : pandas.DataFrame, optional
        User x movie rating table whose column labels are integer movie ids.
        Defaults to the notebook-level ``piv_df``.

    Returns
    -------
    pandas.DataFrame
        A single ``corr`` column indexed by movie id, sorted from the most to
        the least correlated movie. The query movie itself is part of the
        result.

    Raises
    ------
    KeyError
        If the movie has no column in the rating table.
    ValueError
        If ``movie_id`` cannot be converted to an integer.
    """
    if ratings is None:
        ratings = piv_df

    # Plain int() handles both '28' and 28; the previous ``.lower()`` call was
    # a no-op for numeric strings and raised AttributeError for integer ids.
    movie_key = int(movie_id)
    if movie_key not in ratings.columns:
        raise KeyError(f"movie id {movie_key} is not present in the rating table")

    # Vector of the ratings every user gave to the query movie.
    movie_rating = ratings[movie_key]

    # Column-wise correlation of each movie's rating vector with the query's.
    similar_movies = ratings.corrwith(movie_rating)

    return similar_movies.to_frame(name='corr').sort_values(by='corr', ascending=False)

**Note: not every movie in the movies dataframe may be available for recommendation. Only 39% of the training data is used here, because the full data set is too large to process on Kaggle notebook hardware.**

In [24]:
movie_id = '28'

# Look the title up through ``movie_id`` (not a repeated literal) so the
# banner always matches the movie that is actually queried below.
print("Query Movie : ", movie_tite_dict[movie_id])

# Top 11 rows = the query movie itself plus its 10 most correlated movies.
reco_df = item_based_corr(movie_id).head(11).reset_index()
# Title keys are strings; ids without a known title show up as NaN instead of
# aborting the cell with a KeyError.
reco_df['title'] = reco_df['movieId'].astype(str).map(movie_tite_dict)
reco_df

Query Movie :  Lilo and Stitch


Unnamed: 0,movieId,corr,title
0,28,1.0,Lilo and Stitch
1,2690,0.473073,The Emperor's New Groove
2,16303,0.431418,Atlantis: The Lost Empire
3,2171,0.424684,Brother Bear (Theatrical Widescreen Version)
4,9160,0.423905,Treasure Planet
5,940,0.421193,Hercules
6,8743,0.40281,Ice Age
7,6362,0.385231,Spirit: Stallion of the Cimarron
8,3414,0.383469,Pocahontas
9,5607,0.379307,The Hunchback of Notre Dame


## Recommender System based on Cosine Similarity

In [25]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

# Transpose so that each row is one movie and each column one user, then store
# the result in compressed sparse row form.
csr_matrix = sparse.csr_matrix(piv_df.to_numpy().T)
csr_matrix



<1800x65000 sparse matrix of type '<class 'numpy.float64'>'
	with 37193851 stored elements in Compressed Sparse Row format>

In [26]:
from sklearn.neighbors import NearestNeighbors

# Cosine-distance neighbour index over the movie rows of the sparse matrix;
# 11 neighbours is the default returned when a query does not ask otherwise.
knn = NearestNeighbors(
    metric='cosine',
    n_neighbors=11,
)
knn.fit(csr_matrix)

In [30]:
def nearest_neighbors_reco(movie_id, n_reco=6):
    """Print the movies nearest to ``movie_id`` under the fitted cosine KNN.

    One line is printed per neighbour: the movie title followed by its cosine
    distance to the query movie. The query movie's own rating vector is what
    gets looked up, so it is normally the first line with distance 0.

    Parameters
    ----------
    movie_id : int or str
        Id of the query movie; anything ``int()`` accepts, e.g. ``8`` or
        ``'8'``.
    n_reco : int, optional
        Number of neighbours to fetch and print (default 6, the number of
        lines the function has always printed).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If the movie has no column in ``piv_df`` or no entry in
        ``movie_tite_dict``.
    ValueError
        If ``movie_id`` cannot be converted to an integer.
    """
    # Plain int() handles both '8' and 8; the previous ``.lower()`` call was a
    # no-op for numeric strings and raised AttributeError for integer ids.
    movie_key = int(movie_id)

    # Vector of the ratings every user gave to the query movie.
    movie_rating = piv_df[movie_key]

    # Fetch exactly as many neighbours as are printed, instead of asking for
    # 11 and then showing a hard-coded 6.
    distances, indices = knn.kneighbors(
        movie_rating.to_numpy().reshape(1, -1), n_neighbors=n_reco
    )

    # The index was fitted on the transposed table, so a neighbour's row
    # position is also its column position in piv_df.
    for col_pos, distance in zip(indices.ravel(), distances.ravel()):
        print(movie_tite_dict[str(piv_df.columns[col_pos])], distance)

In [31]:
# The same id drives both the banner lookup and the recommendation call, so
# changing it here is enough to query a different movie.
movie_id = "8"
print("Query Movie : ", movie_tite_dict[movie_id])
nearest_neighbors_reco(movie_id)

Query Movie :  What the #$*! Do We Know!?
What the #$*! Do We Know!? 0.0
I Heart Huckabees 0.7074333097751231
Eternal Sunshine of the Spotless Mind 0.7079928018521593
Bowling for Columbine 0.7115794921466648
Fahrenheit 9/11 0.7151674736244626
Being John Malkovich 0.7166976144049864
