In [12]:
import pandas as pd

# Read and concatenate the data files
file_paths = ['/kaggle/input/netflix-prize-data/combined_data_1.txt'
#               , '/kaggle/input/netflix-prize-data/combined_data_2.txt'
#               ,'/kaggle/input/netflix-prize-data/combined_data_3.txt'
#               ,'/kaggle/input/netflix-prize-data/combined_data_4.txt'
             ]

dfs = []
for file_path in file_paths:
    df = pd.read_csv(file_path, header=None, names=['userId', 'rating'], usecols=[0, 1])
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

# Identify indices of NaN values in 'rating' column
nan_indices = df[df['rating'].isna()].index.to_list()

# Extract ratings data
ratings = []
for i in range(len(nan_indices)):
    start_index = nan_indices[i] + 1
    end_index = nan_indices[i + 1] - 1 if i + 1 < len(nan_indices) else None
    temp_df = df.loc[start_index:end_index].reset_index(drop=True)
    temp_df['movieId'] = df.loc[nan_indices[i], 'userId'][:-1]
    ratings.append(temp_df)

df = pd.concat(ratings, ignore_index=True)
del ratings, dfs

In [13]:
df = df.sample(100000)
df.head()

Unnamed: 0,userId,rating,movieId
11096271,9393,2.0,2152
21456795,1924584,1.0,4040
23245450,1429446,4.0,4356
16669888,2622198,5.0,3239
10566073,877113,4.0,2077


In [14]:
movies_df = pd.read_csv('/kaggle/input/netflix-prize-data/movie_titles.csv', header=None, encoding="ISO-8859-1",
                        usecols=[0, 1, 2], names=['movieId', 'year', 'title'])

movies_df.sample(20)

Unnamed: 0,movieId,year,title
2524,2525,1942.0,Sullivan's Travels
596,597,1977.0,The Brady Bunch Variety Hour
7293,7294,1996.0,Fools Rush In
5409,5410,2000.0,Shooting War: World War II Combat Cameramen
9510,9511,1987.0,Someone to Watch Over Me
11548,11549,1992.0,The Commish: Season 2
11419,11420,1990.0,Miami Blues
13167,13168,1972.0,Beware! The Blob
14657,14658,2000.0,Wrestlemania Anthology: Vol. 4
5725,5726,1980.0,Ram Balram


In [15]:
movies_df.movieId = movies_df.movieId.astype(str)
movie_tite_dict = dict(zip(movies_df.movieId, movies_df.title))

In [16]:
del movies_df

In [17]:
piv_df = pd.pivot_table(df, index="userId", columns='movieId', values="rating", aggfunc='mean')
piv_df.fillna(0, inplace=True)
piv_df.head()

movieId,1,10,1000,1001,1002,1004,1006,1008,1009,101,...,988,989,99,990,992,993,994,996,997,999
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1000033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000105,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1000195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def item_based_corr(movie_id):
    # get the movie columns 
    # gives the vector ratings given by all users for this movie
    movie_id = movie_id.lower()
    movie_rating = piv_df[movie_id]

    # understand the relation with this movie
    similar_movies = piv_df.corrwith(movie_rating)
    similar_movies_df = pd.DataFrame(similar_movies, columns=['corr'])

    return similar_movies_df.sort_values(by=['corr'], ascending=False)

**There is a chance that all the movies from movies dataframe might not be available for recommendation, because we have used only 10% of the training data, because the data set is very big to run on kaggle notebook hardware configurations**

In [19]:
movie_id = '3282'

print("Query Movie : ", movie_tite_dict[movie_id])

reco_df = item_based_corr(movie_id).head(5)
# reco_df['title'] = reco_df.movieId.apply(lambda x : movie_tite_dict[x])
reco_df.reset_index(inplace=True)
reco_df['title'] = reco_df.movieId.apply(lambda x : movie_tite_dict[x])
reco_df

Query Movie :  Sideways


Unnamed: 0,movieId,corr,title
0,3282,1.0,Sideways
1,642,0.021848,Mystery Science Theater 3000: The Hellcats
2,2083,0.021658,Blood Alley
3,1039,0.020067,Lawn Dogs
4,1191,0.01949,Most Wanted


## Recommender System based on Cosine Similarity

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

csr_matrix = sparse.csr_matrix(piv_df.T.values)
csr_matrix

<3483x77014 sparse matrix of type '<class 'numpy.float64'>'
	with 100000 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=5, metric='cosine')
knn.fit(csr_matrix)

In [26]:
def  nearest_neighbors_reco(movie_id):
    # get the movie columns 
    # gives the vector ratings given by all users for this movie
    movie_id = movie_id.lower()
    movie_rating = piv_df[movie_id]

    d, ind = knn.kneighbors(movie_rating.values.reshape(1, -1), n_neighbors=6)

    for i in range(6):
        print(movie_tite_dict[piv_df.columns[ind.flatten()[i]]] , d.flatten()[i])

In [28]:
movie_id = "3282"
print("Query Movie : ", movie_tite_dict[movie_id])
nearest_neighbors_reco(movie_id)

Query Movie :  Sideways
Sideways 5.773159728050814e-15
Mystery Science Theater 3000: The Hellcats 0.9777017145839646
Blood Alley 0.9779352109179656
Lawn Dogs 0.9793046265100074
Most Wanted 0.9799272718779143
The Big Hit 0.9814673191612203
