In [58]:
import ast

import numpy as np
import pandas as pd

In [59]:
# Load the raw movie metadata table and preview it.
meta = pd.read_csv(
    filepath_or_buffer='/kaggle/input/the-movies-dataset/movies_metadata.csv',
)
meta

In [60]:
# Keep only the columns the rest of the notebook uses: id, title, language, genres.
meta = meta.loc[:, ['id', 'original_title', 'original_language', 'genres']]
meta

In [61]:
# Shorter column names; `movieId` is the key used for the merges further down.
meta = meta.rename(
    columns={
        'id': 'movieId',
        'original_title': 'title',
        'original_language': 'language',
    }
)
meta

In [62]:
# Keep English-language movies only.  The explicit copy makes `meta` an
# independent frame, so the column assignments in the following cells do not
# write into a slice of the unfiltered table (which pandas reports with
# SettingWithCopyWarning).
meta = meta.loc[meta['language'] == 'en', :].copy()
meta

In [63]:
# Convert the movie ids to a numeric dtype, then show the converted column.
meta['movieId'] = pd.to_numeric(meta['movieId'])
meta['movieId']

In [64]:
def str_to_set(x):
    """Parse a stringified list of dicts into the set of their ``'name'`` values.

    Parameters
    ----------
    x : str
        Text of a Python list literal whose items are dicts with a ``'name'``
        key, e.g. ``"[{'id': 18, 'name': 'Drama'}]"``.  ``None`` or NaN is
        treated as "no entries".

    Returns
    -------
    set
        The ``'name'`` value of every item in the list.

    Raises
    ------
    ValueError, SyntaxError
        If ``x`` is not a valid Python literal.
    """
    # NaN is the only float that is not equal to itself; a missing cell has no names.
    if x is None or (isinstance(x, float) and x != x):
        return set()
    # literal_eval only accepts literals, so text read from a file cannot run
    # arbitrary code the way eval() would.
    return {item['name'] for item in ast.literal_eval(x)}

In [65]:
# Replace each stringified genre list with a set of genre names.
meta['genres'] = meta['genres'].apply(str_to_set)
meta

In [66]:
# Load the per-movie keyword table and preview it.
keywords = pd.read_csv(
    filepath_or_buffer='/kaggle/input/the-movies-dataset/keywords.csv',
)
keywords

In [67]:
# Replace each stringified keyword list with a set of keyword names.
keywords['keywords'] = keywords['keywords'].apply(str_to_set)
keywords

In [68]:
# Give the key column the same name and a numeric dtype, ready for the merge with `meta`.
keywords = (
    keywords
    .rename(columns={'id': 'movieId'})
    .assign(movieId=lambda frame: pd.to_numeric(frame['movieId']))
)
keywords

In [69]:
# Attach the keyword sets to the metadata; the inner join keeps only movies present in both tables.
meta = meta.merge(keywords, on='movieId', how='inner')
meta

In [70]:
# First matching row for each of the two titles, displayed one under the other.
dk = meta[meta['title'] == 'The Dark Knight'].iloc[0]
dkr = meta[meta['title'] == 'The Dark Knight Rises'].iloc[0]
pd.concat((dk, dkr), axis='columns').transpose()

In [71]:
def jaccard_similarity(s1, s2):
    """Return the Jaccard index of two sets: size of intersection over size of union.

    When both sets are empty the union is empty as well; 0 is returned in that
    case rather than dividing by zero.
    """
    union_size = len(s1 | s2)
    if not union_size:
        return 0
    return len(s1 & s2) / union_size

In [72]:
# Content similarity of the two films: Jaccard index over genres and keywords combined.
jaccard_similarity(
    dk['genres'] | dk['keywords'],
    dkr['genres'] | dkr['keywords'],
)

In [73]:
# Load the small ratings sample and preview it.
ratings = pd.read_csv(
    filepath_or_buffer='/kaggle/input/the-movies-dataset/ratings_small.csv',
)
ratings

In [74]:
# Make the join key numeric so it matches the dtype of meta's movieId.
ratings['movieId'] = pd.to_numeric(ratings['movieId'])


In [75]:
# Attach titles to the ratings; ratings for movies that are not in `meta` are dropped by the inner join.
ratings = ratings.merge(meta[['movieId', 'title']], on='movieId', how='inner')


In [76]:
# User x title table of ratings.  pivot_table aggregates repeated
# (userId, title) pairs with its default aggregation, the mean.
matrix = pd.pivot_table(ratings, index='userId', columns='title', values='rating')
matrix

In [77]:
def pearson_similarity(u1, u2):
    """Pearson-style similarity of two rating vectors, each centered on its own mean.

    The result is the sum of the elementwise products of the centered vectors
    divided by the square root of the product of their sums of squares.
    Returns 0 when that denominator is zero (for example, a constant vector).

    NOTE: with pandas Series inputs that contain NaN, pandas reductions skip
    NaN by default, so each sum runs over the entries that are non-missing in
    that particular term; the two norms are not restricted to the positions
    present in both vectors.
    """
    dev1 = u1 - u1.mean()
    dev2 = u2 - u2.mean()
    sum_sq1 = np.sum(dev1 ** 2)
    sum_sq2 = np.sum(dev2 ** 2)
    norm = np.sqrt(sum_sq1 * sum_sq2)
    if norm == 0:
        return 0
    return np.sum(dev1 * dev2) / norm

In [78]:
# Rating-based similarity between two titles, using their columns of the user x title matrix.
dk_rating = matrix.loc[:, 'The Dark Knight']
pk_rating = matrix.loc[:, 'Prom Night']
pearson_similarity(dk_rating, pk_rating)

In [79]:
def find_similar_movies(input_title, matrix, n, alpha):
    """Rank the titles in ``matrix`` by blended rating/content similarity to one title.

    For every other column of ``matrix`` the score is
    ``alpha * pearson + (1 - alpha) * jaccard``, where ``pearson`` compares the
    two rating columns with ``pearson_similarity`` and ``jaccard`` compares the
    genre-plus-keyword sets, read from the notebook-level ``meta`` frame, with
    ``jaccard_similarity``.

    Parameters
    ----------
    input_title : str
        Title to find neighbours for; must appear in ``meta['title']`` and be
        a column of ``matrix``.
    matrix : pandas.DataFrame
        Ratings table with one column per title.
    n : int
        Maximum number of results to return.
    alpha : float
        Weight of the rating similarity; ``1 - alpha`` goes to content similarity.

    Returns
    -------
    list of tuple
        Up to ``n`` tuples ``(title, pearson, jaccard, score)``, highest score first.

    Raises
    ------
    IndexError
        If ``input_title`` or a column of ``matrix`` has no row in ``meta``.
    KeyError
        If ``input_title`` is not a column of ``matrix``.
    """
    # One row per title (the first one, which is what a `.iloc[0]` lookup on a
    # title filter picks), indexed by title so each candidate is a direct
    # lookup instead of a scan over the whole frame.
    meta_by_title = meta.drop_duplicates(subset='title', keep='first').set_index('title')

    def feature_set(title):
        """Union of the genre and keyword sets of the first ``meta`` row with this title."""
        try:
            row = meta_by_title.loc[title]
        except KeyError:
            # A filter-then-`.iloc[0]` lookup raises IndexError for an unknown
            # title; keep that exception type for existing callers.
            raise IndexError(f'no row in meta for title {title!r}') from None
        return row.genres | row.keywords

    input_set = feature_set(input_title)
    # Loop-invariant: fetch the reference ratings column once.
    input_ratings = matrix[input_title]

    result = []
    for this_title in matrix.columns:
        if this_title == input_title:
            continue
        this_set = feature_set(this_title)
        pearson = pearson_similarity(matrix[this_title], input_ratings)
        jaccard = jaccard_similarity(this_set, input_set)
        score = alpha * pearson + (1 - alpha) * jaccard
        result.append((this_title, pearson, jaccard, score))

    result.sort(key=lambda r: r[3], reverse=True)
    return result[:n]


In [80]:
# Ten nearest titles to "The Dark Knight", weighting ratings 0.3 and content 0.7.
result = find_similar_movies(input_title='The Dark Knight', matrix=matrix, n=10, alpha=0.3)
pd.DataFrame(result, columns=['title', 'pearson', 'jaccard', 'score'])
