In [81]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.base import BaseEstimator

### Load Data

> $Y, R \in \mathbb{R}^{1682 \times 943}$

In [2]:
data = loadmat('ex8_movies.mat')
Y, R = data['Y'].astype(float), data['R'].astype(float)

### Item-Based Adjusted Cosine Similarity

> $\displaystyle s_{i,j} = \frac{\sum_{x} (r_{xi} - \bar{r}_x) (r_{xj} - \bar{r}_x)}
{\sqrt{\sum_{x}(r_{xi} - \bar{r}_x)^2} \sqrt{\sum_{x}(r_{xj} - \bar{r}_x)^2}}$

$r_{xi}$ is a rating of user $x$ on item $i$.

$r_{xj}$ is a rating of user $x$ on item $j$.

$\bar{r}_{x}$ is the mean of all the rating of user $x$.

In [3]:
Y_meaned = np.zeros_like(Y)
users_mean_rating = np.average(Y, axis=0, weights=R)
Y_meaned[R == 1] = (Y - users_mean_rating)[R == 1]
cosine_similarity(Y_meaned)

array([[ 1.        , -0.05836654, -0.06555991, ...,  0.        ,
         0.05337722, -0.03320282],
       [-0.05836654,  1.        ,  0.03355092, ...,  0.        ,
         0.00192627,  0.03645238],
       [-0.06555991,  0.03355092,  1.        , ...,  0.        ,
         0.        ,  0.03323839],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.05337722,  0.00192627,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.03320282,  0.03645238,  0.03323839, ...,  0.        ,
         0.        ,  1.        ]])

### Collaborative Filtering
> $\displaystyle \hat{r}_{xi} = \frac{\sum_{j \in N(i,x)} s_{ij} \cdot r_{xj}}{\sum_{j \in N(i,x)} s_{ij}}$

$s_{ij}$ is the **similarity** of items $i$ and $j$.

$r_{xj}$ is the **rating** of user $x$ on item $j$.

$N(i,x)$ is the set of **items similar** to item $i$ that were **rated** by user $x$.

In [4]:
def co_filtering(item, user, k=10):
    Y_meaned_rated_by_user = Y_meaned[R[:, user] == 1]
    nn = NearestNeighbors(n_neighbors=k, metric='cosine')
    nn.fit(Y_meaned_rated_by_user)
    nn_distances, nn_indices = nn.kneighbors(Y[[item],:], n_neighbors=k)
    nn_ratings = Y_meaned_rated_by_user[nn_indices, user]
    return (nn_distances[0] @ nn_ratings[0]) / nn_distances.sum()

co_filtering(3,10)

0.3629208348880235

### Including Global Effects

> $\displaystyle \hat{r}_{xi} = b_{xi} + \frac{\sum_{j \in N(i,x)} s_{ij} \cdot (r_{xj} - b_{xj})}{\sum_{j \in N(i,x)} s_{ij}}$

Where we use the **baseline** $b_{xi}$.

$b_{xi} = \mu + b_x + b_i$

$\mu$ is the **overall mean rating**.

$b_x$ is the **rating deviation** of user $x$:
> $b_x$ = (average rating of user $x$) - $\mu$

$b_i$ is the **rating deviation** of item $i$:
> $b_i$ = (average rating of item $i$) - $\mu$

In [73]:
overall_mean_rating = np.average(Y, weights=R)
users_mean_rating = np.average(Y, axis=0, weights=R)
users_mean_deviation = users_mean_rating - overall_mean_rating
items_mean_rating = np.average(Y, axis=1, weights=R)
items_mean_deviation = items_mean_rating - overall_mean_rating

def baseline(item, user):
    return overall_mean_rating + users_mean_deviation[user] + items_mean_deviation[item]

def co_filtering_baselined(item, user, k=10):
    Y_meaned_rated_by_user = Y_meaned[R[:, user] == 1] # items - users_mean_rating filtered by user_rating
    Y_rated_by_user = Y[R[:, user] == 1] # items filtered by user_rating
    nn = NearestNeighbors(n_neighbors=k, metric='cosine')
    nn.fit(Y_meaned_rated_by_user)
    nn_distances, nn_indices = nn.kneighbors(Y_meaned[[item],:], n_neighbors=k)
    nn_baselines = overall_mean_rating + users_mean_deviation[user] + items_mean_deviation[R[:,user]==1][nn_indices]
    nn_ratings = Y_rated_by_user[nn_indices, user] - nn_baselines
    return baseline(item, user) + (nn_distances[0] @ nn_ratings[0]) / nn_distances.sum()

co_filtering_baselined(3,10)

2.6563863076335448

In [75]:
Y

array([[5., 4., 0., ..., 5., 0., 0.],
       [3., 0., 0., ..., 0., 0., 5.],
       [4., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [78]:
co_filtering_baselined(1,0,k=10)

3.3193893724887324

### Root Mean Squared Error (RMSE)

> $\displaystyle RMSE = \sqrt{\frac{\sum_{(i,x)}(\hat{r}_{xi} - r_{xi})^2}{|R|}}$

$R$ is the ratings matrix, $R \in \mathbb{R}^{m \times n}$.

$\hat{r}_{xi}$ is a **Predicted** rating of user $x$ on item $i$.

$r_{xi}$ is a **True** rating of user $x$ on item $i$.

$|R|$ is the total number of ratings.