# Content-based Movie Rating Estimation

Import required packages.

In [None]:
%run liblecture.py

import math
import numpy as np
from numpy import linalg as LA
import pandas as pd

### Movies Weight Matrix on Genres

Read movie metadata from a csv file.

In [None]:
# Load the movie metadata table. The path is relative, so the notebook must
# be run from the directory that contains the `movielens/` folder.
moviesCsvPath = 'movielens/movies_w_imgurl.csv'
movies = pd.read_csv(moviesCsvPath)

# Preview the first rows as a quick check that the file was parsed.
movies.head()

Split genres and stack genres into one column.

In [None]:
# Build a long-format frame with one row per (movie, genre) pair.
#
# `genres` holds a '|'-separated string per movie. `str.split` turns it into a
# list and `explode` emits one row per list element while repeating the
# original row label of `movies`, so the result can be joined back on the index.
#
# This replaces `.apply(pd.Series, 1).stack()`: the positional `1` there was
# passed to `Series.apply`'s `convert_dtype` parameter (deprecated in newer
# pandas), not an axis, and building one Series per row is slow.
# `dropna()` mirrors `stack()`, which discarded missing values.
movieGenres = (
    movies['genres']
    .str.split('|')
    .explode()
    .dropna()
    .rename('genre')
    .to_frame()
)

In [None]:
# Display the long-format frame: a single 'genre' column with one row per
# (movie, genre) pair, indexed by the movie's row label in `movies`.
movieGenres

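Count the movies that have each genre, then compute each genre's IDF as $idf_g = \log_{10}(N / n_g)$, where $N$ is the total number of movies and $n_g$ is the number of movies with genre $g$.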

In [None]:
# Number of (movie, genre) rows per genre, i.e. how many movies carry it.
genreCounts = movieGenres.groupby('genre')['genre'].count()
genres = genreCounts.rename('movieCount').to_frame()

# Total number of movies in the catalogue (N in the IDF formula).
totalItems = len(movies.index)

# Inverse document frequency: log10(N / n_g). Rare genres get larger weights.
genres['idf'] = [
    math.log10(totalItems / movieCount) for movieCount in genres['movieCount']
]

genres.head()

Join genre's IDF to movie genre DataFrame.

In [None]:
# Look up each row's genre in the IDF table (indexed by genre name) and attach
# the value as a new 'idf' column; row order and index are left unchanged.
idfByGenre = genres['idf']
movieGenreWeights = movieGenres.assign(idf=movieGenres['genre'].map(idfByGenre))
movieGenreWeights

In [None]:
# Pivot the long (movie, genre, idf) rows into a wide matrix: one row per
# movie, the 'movieId' column first, then one column per genre holding that
# genre's IDF (0 where the movie does not have the genre).
movieWeights = movies[['movieId']]

for genreName in genres.index:
    # IDF values of the movies tagged with this genre, keyed by movie row label.
    hasGenre = movieGenreWeights['genre'] == genreName
    genreColumn = movieGenreWeights.loc[hasGenre, 'idf'].rename(genreName)
    # Index-aligned left join: movies without the genre get NaN for now.
    movieWeights = movieWeights.join(genreColumn)

# A missing genre contributes no weight.
movieWeights = movieWeights.fillna(0)

### Movie-Movie Cosine Similarity Matrix

Compute $l_2$-norm of movies.

In [None]:
# Genre-weight matrix without the leading 'movieId' column.
genreWeightMatrix = movieWeights.iloc[:, 1:].values

# Euclidean (l2) length of every movie's genre vector, one value per row.
movieNorms = pd.DataFrame(
    {'norm2': LA.norm(genreWeightMatrix, ord=2, axis=1)},
    index=movieWeights.index,
)
movieNorms

Normalize each movie vector to unit length so that cosine similarity can be computed simply as the inner product between vectors.

$$ cosine(u, v)=\frac{\sum_{\forall i}{u_i v_i}}{||u||_2||v||_2}=\sum_{\forall i}{\frac{u_i v_i}{||u||_2||v||_2}}=\sum_{\forall i}{\frac{u_i}{||u||_2}\frac{v_i}{||v||_2}}=u'\cdot v'$$

In [None]:
# Scale every movie's genre vector (all columns after 'movieId') by its own
# l2 norm; the division is aligned row by row on the shared index.
genreWeights = movieWeights.iloc[:, 1:]
normalizedMovieWeights = genreWeights.div(movieNorms['norm2'], axis='index')

normalizedMovieWeights

Create item-item similarity matrix

In [None]:
# The rows are already divided by their l2 norm, so the product of the matrix
# with its own transpose is the movie-movie cosine-similarity matrix.
unitWeightMatrix = normalizedMovieWeights.values
movieIds = movieWeights['movieId']

# Label both axes with movie IDs so similarities can be looked up by ID.
# NOTE: this is a dense (movies x movies) matrix, so memory grows quadratically.
sims = pd.DataFrame(
    data=np.matmul(unitWeightMatrix, unitWeightMatrix.T),
    index=movieIds,
    columns=movieIds,
)

sims

## Recommend Movies based on Predicted Ratings

Read ratings as train and test datasets.

In [None]:
# Load all ratings; the 'type' column marks each row as 'train' or 'test'.
ratings = pd.read_csv('ratings-9_1.csv')

# Keep only the columns needed for prediction and evaluation.
ratingColumns = ['userId', 'movieId', 'rating']
train = ratings.loc[ratings['type'] == 'train', ratingColumns]
test = ratings.loc[ratings['type'] == 'test', ratingColumns]

Set test user ID

In [None]:
# ID of the single user whose training ratings drive the recommendations
# below and whose test ratings are used to compute the error metrics.
userId = 33

Check top rated movies of the test user

In [None]:
# Training ratings of the selected user, reduced to (movieId, rating).
trainUserMask = train['userId'] == userId
userRatings = train.loc[trainUserMask, ['movieId', 'rating']]

# The user's 20 highest-rated movies.
topRatings = userRatings.sort_values(by='rating', ascending=False).head(20)

# Render them with their ratings; `displayMovies` is not defined in this
# notebook (presumably provided by liblecture.py -- confirm).
displayMovies(movies, topRatings['movieId'].values, topRatings['rating'].values)

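Predict item ratings for the test user. Each movie's predicted rating is the similarity-weighted sum of the user's training ratings, divided by the sum of those similarities plus one.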

In [None]:
# Similarity rows of the movies the user has rated: shape (rated, all movies).
ratedMovieSims = sims.loc[userRatings['movieId'].values, :]

# Denominator: for every candidate movie, the total similarity to the rated
# movies, plus 1. NOTE(review): the +1 presumably guards against a zero
# denominator and damps predictions backed by little similarity -- confirm.
recSimSums = ratedMovieSims.sum().values + 1

# Numerator: the user's ratings weighted by similarity to each candidate movie.
recWeightedRatingSums = np.matmul(ratedMovieSims.T.values, userRatings['rating'].values)

# Predicted rating per movie ID = weighted rating sum / (similarity sum + 1).
recItemRatings = pd.DataFrame(
    {'pred': recWeightedRatingSums / recSimSums},
    index=sims.index,
)

recItemRatings

# np.matmul()

Check recommended items

In [None]:
# Rank all movies by predicted rating, highest first, and keep the best 30.
rankedPredictions = recItemRatings.sort_values(by='pred', ascending=False)
top30Movies = rankedPredictions.head(30)

# The frame is indexed by movie ID, so the index supplies the IDs to display.
displayMovies(movies, top30Movies.index, top30Movies['pred'].values)

Compute MAE and RMSE for the test user.

In [None]:
# Held-out ratings of the selected user.
testUserMask = test['userId'] == userId
userTestRatings = test.loc[testUserMask]

# Attach the predicted rating to every test row by movie ID.
testPredictions = recItemRatings.loc[userTestRatings['movieId']]
temp = userTestRatings.join(testPredictions, on='movieId')

# Error between actual and predicted ratings. `getMAE` / `getRMSE` are not
# defined in this notebook (presumably provided by liblecture.py -- confirm).
actualRatings = temp['rating']
predictedRatings = temp['pred']
mae = getMAE(actualRatings, predictedRatings)
rmse = getRMSE(actualRatings, predictedRatings)

print(f"MAE : {mae:.4f}")
print(f"RMSE: {rmse:.4f}")