In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
from scipy.io import loadmat
from scipy.spatial.distance import pdist, squareform

### Load Data

> $Y, R \in \mathbb{R}^{1682 \times 943}$

In [2]:
data = loadmat('ex8_movies.mat')
Y, R = data['Y'].astype(float), data['R'].astype(float)

### Item-Based Adjusted Cosine Similarity

> $\displaystyle s_{i,j} = \frac{\sum_{x} (r_{xi} - \bar{r}_x) (r_{xj} - \bar{r}_x)}
{\sqrt{\sum_{x}(r_{xi} - \bar{r}_x)^2} \sqrt{\sum_{x}(r_{xj} - \bar{r}_x)^2}}$

$r_{xi}$ is a rating of user $x$ on item $i$.

$r_{xj}$ is a rating of user $x$ on item $j$.

$\bar{r}_{x}$ is the mean of all the rating of user $x$.

In [3]:
def adjusted_cosine_matrix(Y, R):
    sum_user_ratings = np.sum(R, axis=0)
    mean_user_ratings = np.sum(Y, axis=0) / np.where(sum_user_ratings > 0, sum_user_ratings, np.ones_like(sum_user_ratings))
    Y_meaned = np.where(R == 1, Y - mean_user_ratings, np.zeros_like(Y))
    Y_norm = np.linalg.norm(Y_meaned, axis=1, keepdims=True)
    return (Y_meaned @ Y_meaned.T) / (Y_norm @ Y_norm.T)

In [4]:
adjusted_cosine_matrix(Y,R)

array([[ 1.        , -0.05836654, -0.06555991, ...,  0.        ,
         0.05337722, -0.03320282],
       [-0.05836654,  1.        ,  0.03355092, ...,  0.        ,
         0.00192627,  0.03645238],
       [-0.06555991,  0.03355092,  1.        , ...,  0.        ,
         0.        ,  0.03323839],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  1.        ,
         0.        ,  0.        ],
       [ 0.05337722,  0.00192627,  0.        , ...,  0.        ,
         1.        ,  0.        ],
       [-0.03320282,  0.03645238,  0.03323839, ...,  0.        ,
         0.        ,  1.        ]])

In [4]:
# Too Slow
def adj_cos(Y, R):
    sum_user_ratings = np.sum(R, axis=0)
    mean_user_ratings = np.sum(Y, axis=0) / np.where(sum_user_ratings > 0, sum_user_ratings, np.ones_like(sum_user_ratings))
    Y_meaned = np.where(R == 1, Y - mean_user_ratings, np.zeros_like(Y))
    return 1-squareform(pdist(Y_meaned, metric='cosine'))

### Collaborative Filtering
> $\displaystyle \hat{r}_{xi} = \frac{\sum_{j \in N(i,x)} s_{ij} \cdot r_{xj}}{\sum_{j \in N(i,x)} s_{ij}}$

$s_{ij}$ is the **similarity** of items $i$ and $j$.

$r_{xj}$ is the **rating** of user $x$ on item $j$.

$N(i,x)$ is the set of **items similar** to item $i$ that were **rated** by user $x$.

In [5]:
def co_filtering(Y, R, S, x, i, k=10):
    user_rated_indexes = np.nonzero(R[:, x] == 1)[0]
    user_rated_sims = S[user_rated_indexes, i]
    knn_indexes = np.argsort(user_rated_sims)[:-k-1:-1]
    knn_sims = user_rated_sims[knn_indexes]
    knn_ratings = Y[user_rated_indexes[knn_indexes], x]
    return (knn_sims @ knn_ratings) / knn_sims.sum()

### Including Global Effects

> $\displaystyle \hat{r}_{xi} = b_{xi} + \frac{\sum_{j \in N(i,x)} s_{ij} \cdot (r_{xj} - b_{xj})}{\sum_{j \in N(i,x)} s_{ij}}$

Where we use the **baseline** $b_{xi}$.

$b_{xi} = \mu + b_x + b_i$

$\mu$ is the **overall mean rating**.

$b_x$ is the **rating deviation** of user $x$:
> $b_x$ = (average rating of user $x$) - $\mu$

$b_i$ is the **rating deviation** of item $i$:
> $b_i$ = (average rating of item $i$) - $\mu$

In [6]:
def baseline_matrix(Y, R):
    sum_global_ratings = np.sum(R)
    mean_global_ratings = np.sum(Y) / np.where(sum_global_ratings > 0,
                                                     sum_global_ratings,
                                                     np.ones_like(sum_global_ratings))
    sum_user_ratings = np.sum(R, axis=0)
    mean_user_ratings = np.sum(Y, axis=0) / np.where(sum_user_ratings > 0,
                                                     sum_user_ratings,
                                                     np.ones_like(sum_user_ratings))
    sum_item_ratings = np.sum(R, axis=1)
    mean_item_ratings = np.sum(Y, axis=1) / np.where(sum_item_ratings > 0,
                                                     sum_item_ratings,
                                                     np.ones_like(sum_item_ratings))
    user_deviations = np.reshape(mean_user_ratings - mean_global_ratings, (1,-1))
    item_deviations = np.reshape(mean_item_ratings - mean_global_ratings, (-1,1))
    return mean_global_ratings + item_deviations + user_deviations

In [7]:
def co_filtering_with_baseline(Y, R, B, S, x, i, k=10):
    user_rated_indexes = np.nonzero(R[:, x] == 1)[0]
    user_rated_sims = S[user_rated_indexes, i]
    knn_indexes = np.argsort(user_rated_sims)[:-k-1:-1]
    knn_sims = user_rated_sims[knn_indexes]
    knn_ratings = Y[user_rated_indexes[knn_indexes], x]
    knn_deviations = B[user_rated_indexes[knn_indexes], x]
    return (knn_sims @ (knn_ratings - knn_deviations)) / knn_sims.sum()

In [8]:
def predict(Y, R, x, i, baseline=True, k=10):
    S = adjusted_cosine_matrix(Y, R)
    if baseline:
        B = baseline_matrix(Y, R)
        return B[i, x] + co_filtering_with_baseline(Y, R, B, S, x, i, k)
    return co_filtering(Y, R, S, x, i, k)

In [9]:
def predict_matrix(Y, R, baseline=True, k=10):
    S = adjusted_cosine_matrix(Y, R)
    if baseline:
        B = baseline_matrix(Y, R)
    predictions = np.zeros_like(Y)
    for i in np.arange(Y.shape[0]):
        for x in np.arange(Y.shape[1]):
            if baseline:
                predictions[i, x] = B[i, x] + co_filtering_with_baseline(Y, R, B, S, x, i, k)
            else:
                predictions[i, x] = co_filtering(Y, R, S, x, i, k)
    return predictions

In [10]:
# !!! Virer element initial des rated_indexes !!!
predict(Y,R,0,0,k=1)

5.0

### Root Mean Squared Error (RMSE)

> $\displaystyle RMSE = \frac{1}{|R|} \sqrt{\sum_{(i,x)}(\hat{r}_{xi} - r_{xi})^2}$

$R$ is the ratings matrix, $R \in \mathbb{R}^{m \times n}$.

$\hat{r}_{xi}$ is a **Predicted** rating of user $x$ on item $i$.

$r_{xi}$ is a **True** rating of user $x$ on item $i$.

$|R|$ is the total number of ratings.

In [11]:
def cost(Y, R, coords, k=10):
    S = adjusted_cosine_matrix(Y, R)
    B = baseline_matrix(Y, R)
    R_mod = R.copy()
    Y_mod = Y.copy()
    total = 0
    
    for i,x in coords:
        R_mod[i,x] = 0
        Y_mod[i,x] = 0
        
    for i,x in coords:
        prediction = B[i, x] + co_filtering_with_baseline(Y_mod, R_mod, B, S, x, i, k)
        rating = Y[i, x]
        total += np.square(prediction - rating)
        
    return np.sqrt(total) / len(coords)

In [12]:
coords = list(zip(*np.nonzero(R==1)))
np.random.shuffle(coords)
coords = coords[:10000]
cost(Y, R, coords, k=15)

0.0195511361746001

In [13]:
coords = list(zip(*np.nonzero(R==1)))
np.random.shuffle(coords)
test = coords[:10]
R_mod = R.copy()
Y_mod = Y.copy()
for i,x in test:
    R_mod[i,x] = 0
    Y_mod[i,x] = 0
S = adjusted_cosine_matrix(Y_mod, R_mod)
B = baseline_matrix(Y_mod, R_mod)
for i, x in test:
    p = B[i, x] + co_filtering_with_baseline(Y_mod, R_mod, B, S, x, i, k=1)
    v = Y[i, x]
    print(p, v)

2.9379116465863455 2.0
4.587160594896984 2.0
3.13093463913136 3.0
3.7041781631848636 5.0
3.9254043349103314 5.0
4.727683178534571 3.0
2.386843853820598 3.0
3.9686034392912974 4.0
1.001655172413793 5.0
3.2151162790697674 3.0
