In [93]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import time
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

In [94]:
df = pd.read_csv('../ml-25m/ratings.csv')
df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [95]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000095 entries, 0 to 25000094
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 762.9 MB


In [96]:
df.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [97]:
df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,25000100.0,25000100.0,25000100.0,25000100.0
mean,81189.28,21387.98,3.533854,1215601000.0
std,46791.72,39198.86,1.060744,226875800.0
min,1.0,1.0,0.5,789652000.0
25%,40510.0,1196.0,3.0,1011747000.0
50%,80914.0,2947.0,3.5,1198868000.0
75%,121557.0,8623.0,4.0,1447205000.0
max,162541.0,209171.0,5.0,1574328000.0


In [98]:
df.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [99]:
df1 = df[['userId', 'movieId', 'rating']]
df1_rating_tm = pd.DataFrame(df1.groupby('rating').size(), columns = ['count'])
df1_rating_tm

Unnamed: 0_level_0,count
rating,Unnamed: 1_level_1
0.5,393068
1.0,776815
1.5,399490
2.0,1640868
2.5,1262797
3.0,4896928
3.5,3177318
4.0,6639798
4.5,2200539
5.0,3612474


In [100]:
def shrinking_data(no_of_top_users):
    user_likes = df1.groupby('userId')['movieId'].count()
    top_users = user_likes.nlargest(no_of_top_users).index
    print(f"Top Users Index Values: {top_users}")
    top_users_df = df1[df1['userId'].isin(top_users)].reset_index(drop=True)
    print(f'Top {no_of_top_users} Dataframe')
    return top_users_df

In [101]:
no_of_top_users = 10000
df1 = shrinking_data(no_of_top_users)
df1 

Top Users Index Values: Index([ 72315,  80974, 137293,  33844,  20055, 109731,  92046,  49403,  30879,
       115102,
       ...
        49556,  50562,  52524,  65645,  67013,  68282,  69884,  85677, 100158,
       105868],
      dtype='int64', name='userId', length=10000)
Top 10000 Dataframe


Unnamed: 0,userId,movieId,rating
0,3,1,4.0
1,3,29,4.5
2,3,32,4.5
3,3,50,5.0
4,3,111,4.0
...,...,...,...
9102110,162519,4748,2.0
9102111,162519,4789,1.0
9102112,162519,4855,5.0
9102113,162519,4857,5.0


In [102]:
len(df1)

9102115

In [103]:
df1.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

In [104]:
num_of_user = len(df1['userId'].unique())
num_of_movie = len(df1['movieId'].unique())
print(f'There are {num_of_user} users and {num_of_movie} movies')
print(f"Min value of 'userId': {df1['userId'].min()} and Max value of 'userId': {df1['userId'].max()}")
print(f"Min value of 'movieId': {df1['movieId'].min()} and Max value of 'movieId': {df1['movieId'].max()}")
print(f"Min value of 'rating': {df1['rating'].min()} and Max value of 'rating': {df1['rating'].max()}")

There are 10000 users and 57606 movies
Min value of 'userId': 3 and Max value of 'userId': 162519
Min value of 'movieId': 1 and Max value of 'movieId': 209163
Min value of 'rating': 0.5 and Max value of 'rating': 5.0


In [105]:
df1.movieId.values

array([   1,   29,   32, ..., 4855, 4857, 5060], dtype=int64)

In [106]:
x_train, x_test, y_train, y_test = train_test_split(df1[['userId', 'movieId']], df1['rating'], test_size=0.2, random_state=42, stratify=df1['rating'])
df_train = pd.concat([x_train, y_train], axis=1).reset_index(drop=True)
df_test = pd.concat([x_test, y_test], axis=1).reset_index(drop=True)
print(df_train)
print(df_test)

         userId  movieId  rating
0        108943      480     2.0
1        151793     2579     3.5
2         54631     1912     4.0
3        138566      762     1.0
4        104048     2291     4.0
...         ...      ...     ...
7281687  111183    76093     4.0
7281688   78438     3683     0.5
7281689   95133     4246     3.5
7281690  140703   170705     4.5
7281691  144980     3252     4.0

[7281692 rows x 3 columns]
         userId  movieId  rating
0        132375     1997     5.0
1         42202    30810     4.5
2        118906      783     2.5
3         47153   134120     0.5
4        141831     2379     4.0
...         ...      ...     ...
1820418  134386     5277     3.5
1820419  112101    27808     3.0
1820420  132649     4018     3.0
1820421   93913    53550     4.0
1820422   37636     1233     4.0

[1820423 rows x 3 columns]


In [107]:
def create_mappings(dataframe_name):
    # user to movie mapping
    user2movie = dataframe_name.groupby('userId')['movieId'].unique().to_dict()

    # movie to user mapping
    movie2user = dataframe_name.groupby('movieId')['userId'].unique().to_dict()

    return user2movie, movie2user
user2movie, movie2user = create_mappings(df1)

In [108]:
print(len(user2movie))
print(len(movie2user))

10000
57606


In [109]:
def usermovierating_mappings(dataframe_name):
    # start time
    start_time = time.time() 
    # User-Movie to Rating Mapping
    usermovie2rating = dataframe_name.pivot_table(index='userId', columns='movieId', values='rating')
    
    # Fill NaN values with 0
    usermovie2rating.fillna(0, inplace=True)
    
    # end time
    end_time = time.time() 
    
    # Calculate the processing time
    processing_time = end_time - start_time  
    print(f"Processing time: {processing_time} seconds")

    return usermovie2rating

In [110]:
usermovie_to_rating_train = usermovierating_mappings(df_train)
usermovie_to_rating_train

Processing time: 53.51198053359985 seconds


movieId,1,2,3,4,5,6,7,8,9,10,...,209119,209129,209131,209135,209147,209151,209153,209155,209159,209163
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,4.0,2.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,0.0,0.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,1.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162484,0.0,3.0,0.0,3.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162495,3.0,3.0,3.5,0.0,2.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162508,4.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162516,4.5,2.5,0.5,2.0,0.0,4.5,2.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [111]:
usermovie_to_rating_test = usermovierating_mappings(df_test)
usermovie_to_rating_test

Processing time: 24.381258010864258 seconds


movieId,1,2,3,4,5,6,7,8,9,10,...,208909,208911,208939,208941,208955,209051,209073,209085,209121,209123
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
72,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
162484,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162508,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
162516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
# Define a KNN model on cosine similarity
cf_knn_model= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10, n_jobs=-1)


# Fitting the model on our matrix
cf_knn_model.fit(usermovie_to_rating_train)

In [113]:
movie_metadata = pd.read_csv("../ml-25m/movies.csv")

movie_metadata.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [114]:
def datamerge(df1, df2):
 
    merging = df1.merge(df2, on='movieId')
    
    return merging

In [115]:
movie_data = datamerge(df_train, movie_metadata)
movie_data.head(5)

Unnamed: 0,userId,movieId,rating,title,genres
0,108943,480,2.0,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
1,29113,480,4.0,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
2,18021,480,4.0,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
3,64189,480,4.0,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller
4,22774,480,1.0,Jurassic Park (1993),Action|Adventure|Sci-Fi|Thriller


In [116]:
movie_data.shape

(7281692, 5)

In [117]:
movie = pd.read_csv("../ml-25m/movies.csv")
movie

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [146]:
# Function to get recommendations for a movie
def movie_recommendation(movie_id, no_of_nearest_neighbors):
    # Find the nearest neighbors of the given movie
    distances, indices = cf_knn_model.kneighbors(usermovie_to_rating_train.T.iloc[:, movie_id].values.reshape(1, -1))
    # Get the indices of similar movies
    similar_movies_indices = indices.flatten()[1:]  # Exclude the first index (which is the movie itself)
    # Get the movie IDs of similar movies
    similar_movies_ids = usermovie_to_rating_train.columns[similar_movies_indices].tolist()
    print(f'similar_movie_ids: {similar_movies_ids}')
    print(f'indices:{indices}')
    print(f'indices.squeeze():{indices.squeeze()}')
    print(f'indices.squeeze().tolist():{indices.squeeze().tolist()}')
    print(f'distances:{distances}')
    print(f'distances.squeeze():{distances.squeeze()}')
    print(f'distances.squeeze().tolist():{distances.squeeze().tolist()}')
    print(f'zip(indices.squeeze().tolist(),distances.squeeze().tolist()):{list(zip(indices.squeeze().tolist(),distances.squeeze().tolist()))}')
    similar_movies_ids = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1][::-1]
    
    cf_recs = []
    for i in similar_movies_ids:
        cf_recs.append({'Title':movie['title'][i[0]],'Distance':i[1]})
        # cf_recs.append({'Movie Id': i, 'Title':movie['title'][i]})
    
    # Select top number of recommendations needed
    df = pd.DataFrame(cf_recs, index = range(1,no_of_nearest_neighbors))
    
    return  df

# Example usage:
chosen_movie_id = 1
recommended_movies = movie_recommendation(chosen_movie_id, 10)  # Replace 123 with the movie ID for which you want recommendations
print('chosen movie based on movie id: ', movie['title'][chosen_movie_id])
print("Recommended movies:") 
recommended_movies

similar_movie_ids: [6107, 6726, 5848, 3308, 345, 26018, 26107, 492, 6831]
indices:[[   1 5982 6590 5724 3207  340 8507 8564  487 6694]]
indices.squeeze():[   1 5982 6590 5724 3207  340 8507 8564  487 6694]
indices.squeeze().tolist():[1, 5982, 6590, 5724, 3207, 340, 8507, 8564, 487, 6694]
distances:[[0.         0.55519423 0.62416634 0.62437951 0.62786083 0.62895396
  0.63186772 0.63536878 0.63692959 0.63989957]]
distances.squeeze():[0.         0.55519423 0.62416634 0.62437951 0.62786083 0.62895396
 0.63186772 0.63536878 0.63692959 0.63989957]
distances.squeeze().tolist():[0.0, 0.5551942270541257, 0.6241663393745089, 0.6243795129073715, 0.6278608296284365, 0.6289539564229432, 0.6318677150849188, 0.6353687844503045, 0.636929585124733, 0.6398995737215138]
zip(indices.squeeze().tolist(),distances.squeeze().tolist()):[(1, 0.0), (5982, 0.5551942270541257), (6590, 0.6241663393745089), (5724, 0.6243795129073715), (3207, 0.6278608296284365), (340, 0.6289539564229432), (8507, 0.6318677150849188),

Unnamed: 0,Title,Distance
1,Liquid Sky (1982),0.555194
2,Millennium Actress (Sennen joyû) (2001),0.624166
3,Houseboat (1958),0.62438
4,Pitch Black (2000),0.627861
5,"Adventures of Priscilla, Queen of the Desert, ...",0.628954
6,Samurai I: Musashi Miyamoto (Miyamoto Musashi)...,0.631868
7,Eegah (1962),0.635369
8,Manhattan Murder Mystery (1993),0.63693
9,"White Hunter, Black Heart (1990)",0.6399


In [120]:
def movie_recommender_engine(movie_name, matrix, model_name, no_of_nearest_neighbors):
    
    # Extract input movie ID
    movie_id = process.extractOne(movie_name, movie['title'])[2]
    
    # Calculate neighbour distances
    distances, indices =  distances, indices = model_name.kneighbors(matrix.iloc[:, movie_id].values.reshape(1, -1))
    
    similar_movies_ids = sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),key=lambda x: x[1])[:0:-1]
    # List to store recommendations
    cf_recs = []
    for i in similar_movies_ids:
        cf_recs.append({'Title':movie['title'][i[0]],'Distance':i[1]})
    
    # Select top number of recommendations needed
    df = pd.DataFrame(cf_recs, index = range(1,no_of_nearest_neighbors))
     
    return df

In [122]:
movie_recommender_engine('Toy Story', usermovie_to_rating_train.T, cf_knn_model, 10)

Unnamed: 0,Title,Distance
1,Signs of Life (Lebenszeichen) (1968),0.537263
2,"Pride of the Yankees, The (1942)",0.535089
3,"Matter of Life and Death, A (Stairway to Heave...",0.53382
4,Duck Soup (1933),0.533171
5,Carrie (2002),0.530177
6,To Sir with Love (1967),0.527006
7,"Man of the Year, The (O Homem do Ano) (2003)",0.523993
8,Cronos (1993),0.520136
9,"Big Knife, The (1955)",0.504154
