# Recommendation systems


# Collaborative Filtering with Gradient Descent 

In this part, I will build a collaborative filtering model that predicts movie ratings, trained with gradient descent (with momentum), following the steps below:

    - Encoding rating data
    - Initializing parameters
    - Calculating the cost function
    - Calculating gradient
    - Using an optimization algorithm (gradient descent) 
    - Predicting on new data
    - Putting it all together.

In [2]:
import numpy as np
import pandas as pd

## Data
We will use the MovieLens dataset. For demonstration we will use the small one, which has 100,000 ratings and 1,300 tag applications applied to 9,000 movies by 700 users. Last updated 10/2016.

In [4]:
# Relative folder that holds the MovieLens "ml-latest-small" files.
path = "ml-latest-small/"
# ratings.csv has one row per rating: userId, movieId, rating, timestamp.
data = pd.read_csv(path + "ratings.csv")
# Last expression of the cell: displays (number of rows, number of columns).
data.shape

(100004, 4)

In [5]:
# Preview the first rows of the raw ratings (original user and movie ids).
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


## Encoding rating data

In [6]:
# Encodes a pandas column with contiguous integer ids (0, 1, 2, ...).
def proc_col(col):
    """
    Map the values of a pandas column to contiguous integer ids.

    Ids are handed out in order of first appearance, starting at 0.

    Returns a tuple (name2idx, codes, n_unique):
      - name2idx: dict from original value to its id,
      - codes: numpy array with the id of every element of col,
      - n_unique: number of distinct values in col.
    """
    distinct = col.unique()
    name2idx = dict(zip(distinct, range(len(distinct))))
    codes = np.array([name2idx[value] for value in col])
    return name2idx, codes, len(distinct)

In [8]:
# Encodes rating data with contiguous user and movie ids
def encode_data(df):
    """
    Re-index users and movies with contiguous integer ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with at least the columns "userId" and "movieId".

    Returns
    -------
    (encoded_df, num_users, num_movies)
        encoded_df is a copy of df whose "userId" and "movieId" columns hold
        0-based ids assigned in order of first appearance; num_users and
        num_movies are the numbers of distinct users and movies in df.
    """
    # Work on a copy so the caller's frame keeps its original ids.
    df = df.copy()
    _, user_col, num_users = proc_col(df["userId"])
    _, movie_col, num_movies = proc_col(df["movieId"])
    df["userId"] = user_col
    df["movieId"] = movie_col
    return df, num_users, num_movies

In [17]:
# Small sandbox: encode only the first ten ratings so the result is easy to inspect.
df, num_users, num_movies = encode_data(data.head(10).copy())

In [18]:
# userId and movieId now hold 0-based codes; rating and timestamp are unchanged.
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,2.5,1260759144
1,0,1,3.0,1260759179
2,0,2,3.0,1260759182
3,0,3,2.0,1260759185
4,0,4,4.0,1260759205


## Initializing parameters

In [12]:
def create_embedings(n, K, seed=3):
    """
    Create a reproducible random embedding matrix of shape (n, K).

    Entries are drawn uniformly from [0, 6/K).

    Parameters
    ----------
    n : int
        Number of rows (one per user or per movie).
    K : int
        Embedding dimension.
    seed : int, optional
        Seed of the private generator. The default (3) reproduces the values
        obtained from np.random.seed(3) followed by np.random.random((n, K)).

    Returns
    -------
    numpy.ndarray of shape (n, K)

    Notes
    -----
    Two calls with the same seed draw the same random stream, so for equal K
    their leading rows are identical. Pass different seeds when independent
    initializations are wanted.
    """
    # A private RandomState keeps the draw reproducible without resetting
    # NumPy's global random state for the rest of the notebook.
    rng = np.random.RandomState(seed)
    emb = 6 * rng.random_sample((n, K)) / K
    return emb

# Example: the dense prediction matrix U V^T for 7 users and 5 movies with K = 3.
create_embedings(7, 3).dot(create_embedings(5, 3).T)

array([[ 3.55790894,  4.69774849,  0.92361109,  1.58739544,  3.00593239],
       [ 4.69774849,  7.44656163,  1.18135616,  2.64524868,  4.74559066],
       [ 0.92361109,  1.18135616,  0.24548062,  0.34025121,  0.69616965],
       [ 1.58739544,  2.64524868,  0.34025121,  1.61561   ,  2.41361975],
       [ 3.00593239,  4.74559066,  0.69616965,  2.41361975,  3.82505541],
       [ 2.02000808,  3.29656257,  0.43174569,  2.065911  ,  3.07264619],
       [ 2.07691001,  3.02887291,  0.53270924,  1.02482544,  1.90251125]])

## Encoding Y as a sparse matrix
This code encodes the rating matrix $Y$ as a sparse matrix built from the dataframe.

In [14]:
from scipy import sparse
def df2matrix(df, nrows, ncols, column_name="rating"):
    """
    Build a sparse (CSC) user-by-movie matrix from a dataframe.

    df must provide the columns "userId" (used as row index), "movieId"
    (used as column index) and `column_name` (the stored value).
    The returned matrix has shape (nrows, ncols).
    """
    entries = df[column_name].values
    cols = df["movieId"].values
    rows = df["userId"].values
    coordinates = (rows, cols)
    return sparse.csc_matrix((entries, coordinates), shape=(nrows, ncols))

In [19]:
# Re-running the encoder on the already-encoded sample leaves its ids unchanged;
# Y then stores each rating at position (userId, movieId).
df, num_users, num_movies = encode_data(df)
Y = df2matrix(df, nrows=num_users, ncols=num_movies)

In [20]:
# Printing a sparse matrix lists its stored entries as "(row, col) value".
print(Y)

  (0, 0)	2.5
  (0, 1)	3.0
  (0, 2)	3.0
  (0, 3)	2.0
  (0, 4)	4.0
  (0, 5)	2.0
  (0, 6)	2.0
  (0, 7)	2.0
  (0, 8)	3.5
  (0, 9)	2.0


In [21]:
def sparse_multiply(df, emb_user, emb_movie):
    """
    Predicted ratings for the (user, movie) pairs listed in df, as a sparse matrix.

    For every row of df the prediction is the dot product of
    emb_user[userId] and emb_movie[movieId]. Only those entries are stored,
    so the dense product U V^T is never formed.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded ratings whose integer "userId" and "movieId" columns index the
        rows of emb_user and emb_movie. df is not modified.
    emb_user : numpy.ndarray
        User embeddings, one row per user.
    emb_movie : numpy.ndarray
        Movie embeddings, one row per movie.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix of shape (emb_user.shape[0], emb_movie.shape[0]).
    """
    ind_user = df["userId"].values
    ind_movie = df["movieId"].values
    # Row-wise dot product of the matching user and movie embeddings.
    predictions = np.sum(emb_user[ind_user] * emb_movie[ind_movie], axis=1)
    # Build the matrix directly instead of storing a "Prediction" column on df:
    # writing to the caller's frame is a hidden side effect and can trigger
    # pandas' SettingWithCopyWarning when df is a filtered slice.
    return sparse.csc_matrix(
        (predictions, (ind_user, ind_movie)),
        shape=(emb_user.shape[0], emb_movie.shape[0]),
    )

## Calculating the cost function

In [22]:
def cost(df, emb_user, emb_movie):
    """
    Mean squared error of the predictions over the ratings in df.

    The prediction for user i and movie j is the dot product of emb_user[i]
    and emb_movie[j]; only the (user, movie) pairs present in df contribute.
    """
    n_users, n_movies = emb_user.shape[0], emb_movie.shape[0]
    observed = df2matrix(df, n_users, n_movies)
    predicted = sparse_multiply(df, emb_user, emb_movie)
    residual = observed - predicted
    squared_error = residual.multiply(residual).sum()
    return squared_error / len(df)

## Calculating gradient

In [27]:
def gradient(df, Y, emb_user, emb_movie):
    """
    Gradient of the mean squared error with respect to both embedding matrices.

    Y is the sparse matrix of observed ratings for the pairs in df. The
    prediction for user i and movie j is the dot product of emb_user[i] and
    emb_movie[j].

    Returns (grad_user, grad_movie).
    """
    residual = Y - sparse_multiply(df, emb_user, emb_movie)
    n_ratings = len(df)
    grad_user = (-2 * residual.dot(emb_movie)) / n_ratings
    grad_movie = (-2 * residual.T.dot(emb_user)) / n_ratings
    return grad_user, grad_movie

## Using gradient descent with momentum

In [25]:
def gradient_descent(df, emb_user, emb_movie, iterations=100, learning_rate=0.01,
                     df_val=None, momentum=0.9, print_every=100):
    """
    Run full-batch gradient descent with momentum on the embeddings.

    Parameters
    ----------
    df : pandas.DataFrame
        Encoded training ratings ("userId", "movieId", "rating").
    emb_user, emb_movie : numpy.ndarray
        Initial user and movie embeddings; the inputs are not modified.
    iterations : int
        Number of update steps.
    learning_rate : float
        Step size applied to the momentum-averaged gradient.
    df_val : pandas.DataFrame or None
        Optional encoded validation ratings; when given, the validation cost
        is reported next to the training cost.
    momentum : float
        Weight of the previous velocity in the running average (default 0.9).
    print_every : int or None
        Print "iteration train_cost val_cost" whenever the iteration index is
        a multiple of this value (default 100). Pass 0 or None to print nothing.

    Returns
    -------
    (emb_user, emb_movie)
        The trained user and movie embeddings.
    """
    Y = df2matrix(df, emb_user.shape[0], emb_movie.shape[0])
    # The velocities start at the initial gradient rather than at zero.
    v_user, v_movie = gradient(df, Y, emb_user, emb_movie)
    for i in range(iterations):
        grad_user, grad_movie = gradient(df, Y, emb_user, emb_movie)
        v_user = momentum * v_user + (1 - momentum) * grad_user
        v_movie = momentum * v_movie + (1 - momentum) * grad_movie
        emb_user = emb_user - learning_rate * v_user
        emb_movie = emb_movie - learning_rate * v_movie
        # The costs are only needed for reporting, so evaluate them only on
        # the iterations that actually print.
        if print_every and i % print_every == 0:
            train_cost = cost(df, emb_user, emb_movie)
            val_cost = None
            if df_val is not None:
                val_cost = cost(df_val, emb_user, emb_movie)
            print(i, train_cost, val_cost)
    return emb_user, emb_movie

## Predicting on new data
To predict ratings on new data, we first write a function that encodes the new data with the ids learned from the training set. Rows that contain a user or an item not seen during training are removed, because collaborative filtering cannot handle new users or new items well.

In [23]:
def proc_col2(col_v, col_t):
    """
    Encode col_v with the id mapping derived from col_t.

    Ids are assigned to the distinct values of col_t in order of first
    appearance, starting at 0. Values of col_v that do not occur in col_t
    are encoded as -1.

    Returns a tuple (name2idx, codes) where name2idx is the mapping built
    from col_t and codes is a numpy array with one id per element of col_v.
    """
    known = col_t.unique()
    name2idx = dict(zip(known, range(len(known))))
    codes = np.array([name2idx.get(value, -1) for value in col_v])
    return name2idx, codes

def encode_new_data(df_val, df_train):
    """
    Encode df_val with the same user and movie ids as df_train.

    Parameters
    ----------
    df_val : pandas.DataFrame
        New ratings with raw "userId" and "movieId" values. Not modified.
    df_train : pandas.DataFrame
        Training ratings with the raw (un-encoded) "userId" and "movieId"
        values; ids are numbered by order of first appearance in this frame.

    Returns
    -------
    pandas.DataFrame
        A new frame with encoded "userId" and "movieId" columns. Rows whose
        user or movie does not occur in df_train are dropped; the remaining
        rows keep their original index labels.
    """
    _, user_col = proc_col2(df_val["userId"], df_train["userId"])
    _, movie_col = proc_col2(df_val["movieId"], df_train["movieId"])
    # assign() builds a new frame, so the caller's df_val keeps its raw ids.
    encoded = df_val.assign(userId=user_col, movieId=movie_col)
    # proc_col2 marks unseen users / movies with -1.
    seen = (encoded["userId"] >= 0) & (encoded["movieId"] >= 0)
    # Return an independent copy so later column assignments on the result
    # are not treated by pandas as writes to a slice of another frame.
    return encoded[seen].copy()

## Implement the algorithm on the MovieLens data

In [24]:
# Randomize data to get a training and validation set
np.random.seed(3)
# Boolean mask: each rating goes to the training set with probability 0.8.
msk = np.random.rand(len(data)) < 0.8
train = data[msk].copy()
val = data[~msk].copy()
# encode_data gets its own copy so that `train` keeps the raw ids, which
# encode_new_data needs in order to rebuild the same id mapping for validation.
df_train, num_users, num_movies = encode_data(train.copy())
df_val = encode_new_data(val.copy(), train.copy())
# Validation rows before vs. after dropping users / movies absent from train.
print(len(val), len(df_val))

20205 19506


In [28]:
# K is the embedding dimension (number of latent factors per user and per movie).
K = 50
emb_user = create_embedings(num_users, K)
emb_movie = create_embedings(num_movies, K)
# Long-running cell: 5000 full-batch updates over df_train; progress lines
# show the iteration index, the training cost and the validation cost.
emb_user, emb_movie = gradient_descent(df_train, emb_user, emb_movie, iterations=5000, learning_rate=1, df_val=df_val)

0 12.3912697565 12.4972528597
100 7.19647749397 7.32717811981
200 4.06667286733 4.18644911311
300 2.82714856064 2.9265984566
400 2.18432213988 2.27357112634
500 1.80386825989 1.89004707348
600 1.55677647499 1.64321239751
700 1.38498356468 1.4733369701
800 1.25936026021 1.3505219879
900 1.16390352791 1.25838191288
1000 1.08913509711 1.18723943047
1100 1.02909524861 1.13103033855
1200 0.979855196014 1.08577372753
1300 0.938723808957 1.04875711586
1400 0.90379751299 1.01807454809
1500 0.873691981655 0.992351389558
1600 0.847375863071 0.970573656415
1700 0.824064547736 0.951978612296
1800 0.803150671359 0.935982622827
1900 0.784157825718 0.92213237287
2000 0.766709291974 0.910071121032
2100 0.750506620938 0.899514854707
2200 0.735314627737 0.89023509517
2300 0.720950423194 0.882046251785
2400 0.707274806458 0.874796145991
2500 0.694184885924 0.86835879239
2600 0.681607265684 0.86262883366
2700 0.669491535695 0.857517232152
2800 0.657804089977 0.852947953691
2900 0.646522430493 0.8488554558

In [29]:
# Final mean squared error on the training and validation ratings.
train_mse, val_mse = [cost(split, emb_user, emb_movie) for split in (df_train, df_val)]
print(train_mse, val_mse)

0.458364441491 0.816002721257
