In [279]:
import pandas as pd
import sys
import numpy as np
import warnings
from sklearn.cluster import MeanShift, estimate_bandwidth
warnings.filterwarnings('ignore')
np.set_printoptions(threshold=sys.maxsize)
pd.set_option('display.max_columns', None)

In [280]:
movies=pd.read_csv('movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [281]:
ratings=pd.read_csv('ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [282]:
unique_genres=[]
each_movie_genres = movies["genres"].to_numpy()
for genres in each_movie_genres.tolist():
    movie_genre_list=genres.split('|')
    unique_genres.append(movie_genre_list)
unique_genres = list(set([genre for movie_genre_list in unique_genres for genre in movie_genre_list]))
unique_genres

['Children',
 'Thriller',
 'Horror',
 'Animation',
 '(no genres listed)',
 'IMAX',
 'Fantasy',
 'Documentary',
 'Drama',
 'Adventure',
 'Crime',
 'Western',
 'Action',
 'Comedy',
 'Romance',
 'Film-Noir',
 'Musical',
 'Mystery',
 'Sci-Fi',
 'War']

In [283]:
for genre in unique_genres:
    movies[genre]=''
for genre in unique_genres:
    movies[genre] = pd.np.where(movies['genres'].str.contains(genre), 1, 0)
movies.head()

Unnamed: 0,movieId,title,genres,Children,Thriller,Horror,Animation,(no genres listed),IMAX,Fantasy,Documentary,Drama,Adventure,Crime,Western,Action,Comedy,Romance,Film-Noir,Musical,Mystery,Sci-Fi,War
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1,0,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [284]:
users = list(set(ratings["userId"].to_numpy()))

In [285]:
def build_ratings_vector_of_user(userId):
    user_ratings_df = ratings.loc[ratings['userId']==userId]
    user_ratings_vector = user_ratings_df['rating'].to_numpy().reshape(user_ratings_df.shape[0],1)
    return user_ratings_vector

In [286]:
def build_movies_df_of_user(userId):
    user_ratings_df = ratings.loc[ratings['userId']==userId]
    movies_watched_list = user_ratings_df['movieId'].tolist()
    user_movies_df = movies.loc[movies['movieId'].isin(movies_watched_list)]
    user_movies_df.drop(['movieId','title','genres'], inplace=True, axis=1)
    return user_movies_df

In [287]:
def build_genre_count_of_all_movies(user_movies_df):
    return user_movies_df.sum(axis=0)

In [288]:
def build_user_profile(user_ratings_vector, user_movies_df, genre_count_of_all_movies):
    user_ratings_genres = user_ratings_vector * user_movies_df.values
    user_ratings_genres_total = user_ratings_genres.sum(axis=0)
    user_profile = user_ratings_genres_total / np.array(genre_count_of_all_movies)
    user_profile = pd.DataFrame({'Genre':genre_count_of_all_movies.index, 'Rating':user_profile, 'Count': build_genre_count_of_all_movies(user_movies_df)}) 
    user_profile['Percentage'] = user_profile['Count'] * (100/user_profile['Count'].sum())
    
    return user_profile

In [289]:
def get_user_profile_by_id(userId):
    user_ratings_vector = build_ratings_vector_of_user(userId)
    user_movies_df = build_movies_df_of_user(userId)
    genre_count_of_all_movies = build_genre_count_of_all_movies(user_movies_df)
    user_profile = build_user_profile(user_ratings_vector, user_movies_df, genre_count_of_all_movies)
    return user_profile

In [290]:
def get_unwatched_movies_sample_df(sample_size, userId):
    user_movies_df = build_movies_df_of_user(userId)
    movies_watched = user_movies_df.index.tolist()
    movies_unwatched = movies[~movies.index.isin(movies_watched)]
    movies_unwatched_df = movies_unwatched.sample(n = sample_size)
    movies_unwatched_df.drop(['movieId','title','genres'], inplace=True, axis=1)
    return movies_unwatched_df

In [291]:
def build_user_profile_ratings_vector(user_profile):
    user_profile = user_profile.reindex(['Children', 'Thriller', 'Horror', 'Animation', '(no genres listed)', 'IMAX', 'Fantasy', 'Documentary', 'Drama', 'Adventure', 'Crime', 'Western', 'Action', 'Comedy', 'Romance', 'Film-Noir', 'Musical', 'Mystery', 'Sci-Fi', 'War'])
    user_profile_ratings_vector = user_profile[['Rating']].T.to_numpy()
    return user_profile_ratings_vector

In [292]:
def predict_ratings_sampled_movies(user_profile_ratings_vector, movies_unwatched_df):
    movies_unwatched = movies_unwatched_df.values
    num_features = np.sum(movies_unwatched,axis=1)
    user_profile_ratings_vector = np.nan_to_num(user_profile_ratings_vector)
    movie_ratings = user_profile_ratings_vector * movies_unwatched
    movie_ratings = np.sum(movie_ratings,axis=1)
    movie_ratings = movie_ratings / num_features
    return movie_ratings

In [293]:
def show_predicted_ratings_of_sample(movies_unwatched_df,movie_ratings):
    unwatched_movies_list = movies_unwatched_df.index.tolist()
    predicted_ratings_df = movies.loc[movies.index.isin(unwatched_movies_list)]
    predicted_ratings_df = predicted_ratings_df.reindex(np.array(unwatched_movies_list))
    predicted_ratings_df = predicted_ratings_df[['movieId','title','genres']]
    predicted_ratings_df = predicted_ratings_df.assign(predicted_rating=movie_ratings)
    return predicted_ratings_df

In [294]:
def predict_from_random_sample(sample_size, userId, user_profile):
    movies_unwatched_df = get_unwatched_movies_sample_df(sample_size, userId)
    user_profile_ratings_vector = build_user_profile_ratings_vector(user_profile)
    movie_ratings = predict_ratings_sampled_movies(user_profile_ratings_vector, movies_unwatched_df)
    predicted_ratings_df = show_predicted_ratings_of_sample(movies_unwatched_df,movie_ratings)
    return predicted_ratings_df

In [295]:
def generate_clusters(scores):
    prev = None
    cluster = []
    for score in numbers:
        if not prev or score - prev <= 15:
            cluster.append(score)
        else:
            yield cluster
            cluster = [score]
        prev = score
    if cluster:
        yield cluster

In [296]:
def target_movies(user_profile, userId):
    user_profile['genre_preference_score'] = user_profile['Rating'] * user_profile['Percentage']
    user_profile = user_profile.sort_values(by=['genre_preference_score'], ascending=False)
    scores = np.nan_to_num(user_profile['genre_preference_score']).tolist()
    scores.reverse()
    clusters = dict(enumerate(generate_clusters(scores)))
    #print(clusters)
    num_most_preferred_genres = len(clusters[len(clusters)-1])
    most_preferred_genres = user_profile.head(num_most_preferred_genres).index.tolist()
    print('Most preferred Genres:',most_preferred_genres)
    
    user_ratings_df = ratings.loc[ratings['userId']==userId]
    movies_watched_list = user_ratings_df['movieId'].tolist()
    user_movies_df = movies.loc[movies['movieId'].isin(movies_watched_list)]
    user_movies_unwatched_df = movies.drop(user_movies_df.index)
    
    movies_preferred_df = pd.DataFrame(columns = list(user_movies_unwatched_df.columns))
    for genre in most_preferred_genres:
        movies_preferred_sample = user_movies_unwatched_df.loc[(user_movies_unwatched_df[genre] == 1)]
        movies_preferred_df = movies_preferred_df.append(movies_preferred_sample)
    
    movies_preferred_df['number_of_preferred_genres'] = movies_preferred_df[most_preferred_genres].sum(axis=1).astype(int)
    
    movies_preferred_df = movies_preferred_df.sort_values(by=['number_of_preferred_genres'], ascending=False)
    highest_number_of_preferred_genres = movies_preferred_df.iloc[0]['number_of_preferred_genres']
    movies_preferred_df = movies_preferred_df.loc[movies_preferred_df['number_of_preferred_genres']==highest_number_of_preferred_genres]
    
    movies_preferred_df.drop(['movieId','title','genres','number_of_preferred_genres'], inplace=True, axis=1)
    
    user_profile_ratings_vector = build_user_profile_ratings_vector(user_profile)
    movie_ratings = predict_ratings_sampled_movies(user_profile_ratings_vector, movies_preferred_df)
    predicted_ratings_df = show_predicted_ratings_of_sample(movies_preferred_df,movie_ratings).sort_values(by=['predicted_rating'], ascending=False).head(25)
    
    return predicted_ratings_df

In [297]:
userId = 345
sample_size = 10

In [298]:
user_profile = get_user_profile_by_id(userId)
user_profile

Unnamed: 0,Genre,Rating,Count,Percentage
Children,Children,3.75,8,0.940071
Thriller,Thriller,3.788136,118,13.86604
Horror,Horror,3.486111,108,12.690952
Animation,Animation,2.428571,7,0.822562
(no genres listed),(no genres listed),,0,0.0
IMAX,IMAX,3.0,1,0.117509
Fantasy,Fantasy,3.580645,31,3.642773
Documentary,Documentary,3.0,6,0.705053
Drama,Drama,4.06962,158,18.566392
Adventure,Adventure,3.676471,34,3.9953


In [299]:
predicted_ratings_df = predict_from_random_sample(sample_size, userId, user_profile)
predicted_ratings_df

Unnamed: 0,movieId,title,genres,predicted_rating
5756,5868,This Is Elvis (1981),Documentary|Drama|Musical,3.282466
59025,199924,Vicious (2019),Thriller,3.788136
45423,170355,Mulholland Dr. (1999),Drama|Mystery|Romance,4.025156
52219,184817,Lunch Meat (1987),Drama|Horror,3.777866
15832,83435,"Hatful of Rain, A (1957)",Drama,4.06962
54287,189225,Love And Other Cults (2017),Comedy|Drama,3.787717
36272,149820,Corporate Affairs (2008),Comedy,3.505814
28948,132832,The Chase (1946),Crime,3.982143
25197,123196,Ritual (2002),Horror,3.486111
472,477,What's Love Got to Do with It? (1993),Drama|Musical,3.423699


In [300]:
target_movies(user_profile,userId)

Most preferred Genres: ['Drama', 'Thriller', 'Horror']


Unnamed: 0,movieId,title,genres,predicted_rating
15389,81132,Rubber (2010),Action|Adventure|Comedy|Crime|Drama|Film-Noir|...,3.96102
15389,81132,Rubber (2010),Action|Adventure|Comedy|Crime|Drama|Film-Noir|...,3.96102
15389,81132,Rubber (2010),Action|Adventure|Comedy|Crime|Drama|Film-Noir|...,3.96102
9587,31636,"Bunker, The (2001)",Drama|Horror|Mystery|Thriller|War,3.92499
9587,31636,"Bunker, The (2001)",Drama|Horror|Mystery|Thriller|War,3.92499
9587,31636,"Bunker, The (2001)",Drama|Horror|Mystery|Thriller|War,3.92499
19682,102138,"Black Camel, The (Charlie Chan in the Black Ca...",Crime|Drama|Horror|Mystery|Thriller,3.89415
19682,102138,"Black Camel, The (Charlie Chan in the Black Ca...",Crime|Drama|Horror|Mystery|Thriller,3.89415
15004,79498,"Town That Dreaded Sundown, The (1976)",Crime|Drama|Horror|Mystery|Thriller,3.89415
23852,119695,The Evictors (1979),Crime|Drama|Horror|Mystery|Thriller,3.89415
