# Imports

In [2]:
#import libraries and data
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Directory holding the MovieLens CSV files; change it here to point the
# notebook at a different copy of the dataset.
DATA_DIR = 'ml-25m'

# Tag genome: tag vocabulary and per-(movie, tag) relevance scores.
tags = pd.read_csv(DATA_DIR + '/genome-tags.csv')
tag_scores = pd.read_csv(DATA_DIR + '/genome-scores.csv')

# Movie metadata and user ratings.
movies = pd.read_csv(DATA_DIR + '/movies.csv')
ratings = pd.read_csv(DATA_DIR + '/ratings.csv')

# NOTE(review): user_tags and links are not referenced by the cells visible
# in this notebook -- confirm whether they are still needed before removing.
user_tags = pd.read_csv(DATA_DIR + '/tags.csv')
links = pd.read_csv(DATA_DIR + '/links.csv')

# Data Preprocessing

In [3]:
# Denormalised lookup tables shared by the later cells.
# Every rating row, carrying the columns of the movie it refers to.
movies_and_ratings = movies.merge(ratings, on='movieId')
# Every genome score row, carrying the columns of its tag.
tags_and_tag_scores = tags.merge(tag_scores, on='tagId')

# Wide relevance matrix: one row per movieId, one column per tag.
# Missing (movie, tag) pairs are filled with 0.0.
tats_ = pd.pivot_table(
    tags_and_tag_scores,
    values='relevance',
    index='movieId',
    columns='tag',
    fill_value=0.0,
)

In [4]:
# Movie feature matrix: the tag-relevance columns of tats_ plus each movie's
# title. Titles come from the distinct (title, movieId) pairs found in
# movies_and_ratings.
tats = tats_.merge(
    movies_and_ratings.loc[:, ['title', 'movieId']].drop_duplicates(),
    on='movieId',
)
tats

Unnamed: 0,movieId,007,007 (series),18th century,1920s,1930s,1950s,1960s,1970s,1980s,...,world war i,world war ii,writer's life,writers,writing,wuxia,wwii,zombie,zombies,title
0,1,0.02875,0.02375,0.06250,0.07575,0.14075,0.14675,0.06350,0.20375,0.20200,...,0.01425,0.03050,0.03500,0.14125,0.05775,0.03900,0.02975,0.08475,0.02200,Toy Story (1995)
1,2,0.04125,0.04050,0.06275,0.08275,0.09100,0.06125,0.06925,0.09600,0.07650,...,0.01575,0.01250,0.02000,0.12225,0.03275,0.02100,0.01100,0.10525,0.01975,Jumanji (1995)
2,3,0.04675,0.05550,0.02925,0.08700,0.04750,0.04775,0.04600,0.14275,0.02850,...,0.01950,0.02225,0.02300,0.12200,0.03475,0.01700,0.01800,0.09100,0.01775,Grumpier Old Men (1995)
3,4,0.03425,0.03800,0.04050,0.03100,0.06500,0.03575,0.02900,0.08650,0.03200,...,0.02800,0.01675,0.03875,0.18200,0.07050,0.01625,0.01425,0.08850,0.01500,Waiting to Exhale (1995)
4,5,0.04300,0.05325,0.03800,0.04100,0.05400,0.06725,0.02775,0.07650,0.02150,...,0.02050,0.01425,0.02550,0.19225,0.02675,0.01625,0.01300,0.08700,0.01600,Father of the Bride Part II (1995)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13811,205072,0.02050,0.01775,0.11400,0.03650,0.31225,0.03675,0.10700,0.37925,0.01725,...,0.03650,0.03025,0.12900,0.13975,0.42425,0.03400,0.02350,0.41725,0.09100,Zombieland: Double Tap (2019)
13812,205076,0.03825,0.03150,0.03200,0.05325,0.20850,0.07050,0.06625,0.27825,0.00950,...,0.03225,0.04675,0.03175,0.23025,0.06300,0.04175,0.04125,0.07275,0.02350,Downton Abbey (2019)
13813,205383,0.04100,0.04025,0.02750,0.07850,0.19750,0.17825,0.17125,0.30475,0.16825,...,0.03250,0.02400,0.03575,0.20400,0.08525,0.04600,0.02900,0.11725,0.03925,El Camino: A Breaking Bad Movie (2019)
13814,205425,0.04525,0.04125,0.04250,0.07425,0.11550,0.10500,0.08275,0.13575,0.16125,...,0.04550,0.01425,0.03925,0.21700,0.06000,0.07250,0.01500,0.11050,0.02850,Dave Chappelle: Sticks & Stones (2019)


# Hybrid Filtering

In [5]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

# Tag-relevance columns: everything between the leading 'movieId' column and
# the trailing 'title' column of tats.
features = list(tats.columns[1:-1])
user_Ids = list(movies_and_ratings['userId'].drop_duplicates())
RMSE = []

# Upper bound of the predicted score range.
# NOTE(review): presumably the top of the rating scale -- confirm.
MAX_PREDICTED = 5

# Users to evaluate. Replace with user_Ids to run over every user.
eval_user_ids = [9171]

# Loop-invariant views of the movie feature matrix, extracted once.
movie_matrix = tats[features].to_numpy(dtype=float)
movie_ids = tats['movieId'].tolist()
movie_titles = tats['title'].tolist()

for userId in eval_user_ids:

    # Only the movies this user has rated that also have tag features.
    user_movies = pd.merge(
        movies_and_ratings[movies_and_ratings['userId'] == userId],
        tats,
        on='movieId',
    )
    if user_movies.empty:
        # No rated movie has tag features: the profile below would be all-NaN
        # and the similarity computation would fail, so skip this user.
        continue

    # User profile: mean of the rated movies' feature vectors, each weighted
    # by the rating the user gave that movie.
    user_v_ = user_movies[features].mul(user_movies['rating'], axis=0).mean()

    # Cosine similarity of the profile against every movie. Only this single
    # row is needed, so compute it directly instead of building the full
    # movie-by-movie similarity matrix.
    sims = cosine_similarity(
        user_v_.to_numpy(dtype=float).reshape(1, -1), movie_matrix
    )[0]

    # One stable descending sort shared by similarities, ids and titles
    # (ties keep their order from tats).
    order = np.argsort(-sims, kind='stable')
    sorted_sims = sims[order]
    sorted_ids = [movie_ids[i] for i in order]
    sorted_recs = [movie_titles[i] for i in order]

    # Log-transform the similarities, then min-max scale to [0, MAX_PREDICTED].
    scaler = MinMaxScaler()
    scaled = scaler.fit_transform(np.log(sorted_sims).reshape(-1, 1)).ravel() * MAX_PREDICTED

    # Predictions table. Row 0 is a placeholder for the user's own profile
    # (similarity 1 with itself); it is kept so positional slices such as
    # predicted[1:21] still return the top-ranked movies.
    predicted = pd.DataFrame({
        'sim': [1.0] + sorted_sims.tolist(),
        'movieId': ['user'] + sorted_ids,
        'rec': ['user'] + sorted_recs,
        'predicted': [float(MAX_PREDICTED)] + scaled.tolist(),
    })

    # RMSE between actual and predicted scores on the user's rated movies.
    # Note this is measured on the same ratings that built the profile, so it
    # is an in-sample figure.
    comparison = pd.merge(
        user_movies[['rating', 'title_x', 'movieId']],
        predicted,
        on='movieId',
    ).sort_values('rating', ascending=False)
    rmse = ((comparison.rating - comparison.predicted) ** 2).mean() ** .5
    RMSE.append(rmse)

    

In [431]:
# Mean of the per-user RMSE values collected in RMSE.
# NOTE(review): the value shown below comes from a run over many users (the
# next cell reports 4676 entries), while the Hybrid Filtering loop as written
# evaluates only user 9171 -- re-run with the full user list to reproduce it.
np.mean(RMSE)

0.972560574367145

In [430]:
# Number of per-user RMSE values collected (one is appended per evaluated user).
len(RMSE)

4676

In [11]:
# RMSE of the most recently processed user in the Hybrid Filtering loop.
rmse

0.7043220959602238

In [17]:
# Top 20 movies by similarity for the most recently processed user.
# The slice starts at 1 because row 0 of predicted is the placeholder row for
# the user's own profile, not a movie.
predicted[1:21]

Unnamed: 0,sim,movieId,rec,predicted
1,0.958624,179135,Blue Planet II (2017),5.0
2,0.953039,176371,Blade Runner 2049 (2017),4.963714
3,0.950226,204698,Joker (2019),4.945365
4,0.949715,202439,Parasite (2019),4.942019
5,0.947231,184257,Making a Murderer (2015),4.925758
6,0.943733,185029,A Quiet Place (2018),4.902783
7,0.939081,192245,Bad Times at the El Royale (2018),4.8721
8,0.938705,176601,Black Mirror,4.869615
9,0.937441,196997,Black Mirror: Bandersnatch (2018),4.861247
10,0.936535,192379,First Man (2018),4.855241
