# Collaborative Filtering with Netflix Data

## Imports

In [3]:

%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np
import torch
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
import random
from pathlib import Path
PATH = Path("./data/")
list(PATH.iterdir())

[PosixPath('data/clean_train.csv'),
 PosixPath('data/combined_data_1.txt'),
 PosixPath('data/combined_data_3.txt'),
 PosixPath('data/README'),
 PosixPath('data/clean_data.csv'),
 PosixPath('data/qualifying.txt'),
 PosixPath('data/probe.txt'),
 PosixPath('data/movie_titles.csv'),
 PosixPath('data/clean_test.csv'),
 PosixPath('data/combined_data_2.txt'),
 PosixPath('data/combined_data_4.txt')]

## Data Cleaning

In [2]:
from pathlib import Path
import pandas as pd
import numpy as np

In [3]:
#PATH = Path("/data2/yinterian/ml-latest-small/")
PATH = Path("./data/")
list(PATH.iterdir())

[PosixPath('data/clean_train.csv'),
 PosixPath('data/combined_data_1.txt'),
 PosixPath('data/combined_data_3.txt'),
 PosixPath('data/README'),
 PosixPath('data/clean_data.csv'),
 PosixPath('data/qualifying.txt'),
 PosixPath('data/probe.txt'),
 PosixPath('data/movie_titles.csv'),
 PosixPath('data/clean_test.csv'),
 PosixPath('data/combined_data_2.txt'),
 PosixPath('data/combined_data_4.txt')]

In [4]:
! head -n 5 ./data/combined_data_1.txt

1:
1488844,3,2005-09-06
822109,5,2005-05-13
885013,4,2005-10-19
30878,4,2005-12-26


In [5]:
df1 = pd.read_csv(PATH/"combined_data_1.txt", names = ['userId','rating','date'], index_col = None)
df2 = pd.read_csv(PATH/"combined_data_2.txt", names = ['userId','rating','date'], index_col = None)
df3 = pd.read_csv(PATH/"combined_data_3.txt", names = ['userId','rating','date'], index_col = None)
df4 = pd.read_csv(PATH/"combined_data_4.txt", names = ['userId','rating','date'], index_col = None)

KeyboardInterrupt: 

In [None]:
def append_frames(*args):
    df = args[0]
    for frame in args[1:]: 
        df.append(frame)
        # del frame -- TODO: needed or not?
    return df

In [None]:
df = df1
del df1 #deallocation
df = df.append(df2)
del df2 #deallocation
df = df.append(df3)
del df3 #deallocation
df = df.append(df4)
del df4 #deallocation

In [None]:
df.shape

In [None]:
movie_list = []
for i, row in df.iterrows():
    if pd.isna(row['rating']):
        movie_id = row['userId'].strip(':')    
    movie_list.append(movie_id)

In [None]:
len(movie_list)

In [None]:
df['movieId'] = np.asarray(movie_list)

In [None]:
df_nonull = df[pd.notnull(df['rating'])]

In [None]:
df_nonull.isnull().values.any()

In [None]:
df_nonull.to_csv(PATH/"clean_data.csv",index=False)

In [None]:
df = pd.read_csv(PATH/"clean_data.csv")

In [None]:
df.head()

## Encode Data

In [None]:
# split train and validation before encoding
df['date'] =pd.to_datetime(df.date)
df = df.sort_values(by='date')
df.head()

In [None]:
test_size = int(len(df)-len(df)*0.8)
train_size = len(df) - test_size
test_df = df.tail(test_size)
train_df = df.head(train_size)
del df

In [None]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Encodes a pandas column with continous ids. 
    """
    if train_col is not None:
        uniq = train_col.unique()
    else:
        uniq = col.unique()
    name2idx = {o:i for i,o in enumerate(uniq)}
    return name2idx, np.array([name2idx.get(x, -1) for x in col]), len(uniq)

def encode_data(df, train=None):
    """ Encodes rating data with continous user and movie ids. 
    If train is provided, encodes df with the same encoding as train.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None
        if train is not None:
            train_col = train[col_name]
        _,col,_ = proc_col(df[col_name], train_col)
        df[col_name] = col
        df = df[df[col_name] >= 0]
    return df

In [None]:
train_df = encode_data(train_df)
test_df = encode_data(test_df, train_df)
train_df.head()

In [None]:
# train_df.to_csv(PATH/"clean_train.csv",index=False)
# test_df.to_csv(PATH/"clean_test.csv",index=False)

## Restart Point

In [35]:
train_df = pd.read_csv(PATH/"clean_train.csv")
test_df = pd.read_csv(PATH/"clean_test.csv")

In [41]:
train_df.head()

Unnamed: 0,userId,rating,movieId
0,0,4,0
1,0,5,1
2,0,3,2
3,0,2,3
4,0,2,4


In [42]:
train_df.drop('date',axis=1,inplace=True)
test_df.drop('date',axis=1,inplace=True)

ValueError: labels ['date'] not contained in axis

In [43]:
type(train_df['userId'][0])

numpy.int64

In [44]:
train_df = train_df.astype('int32',copy=False)
test_df = test_df.astype('int32', copy=False)

In [45]:
type(train_df['userId'][0])

numpy.int32

## DataSet and DataLoader

In [54]:
# TODO rewrite to not read in whole dataset
from torch.utils import data
class NetflixDataset(data.Dataset):
    def __init__(self, df, transform=None):
        self.length = len(df)
        self.y = torch.FloatTensor(df['rating'].values)
        self.x = torch.IntTensor(df.drop('rating',axis=1).values)
        self.transform = transform
        
    def __len__(self):
        return self.length
    
    def __getitem__(self,index):
        sample = {'x':self.x[index], 'y':self.y[index]}
        return sample

In [55]:
train_ds = NetflixDataset(train_df)
test_ds = NetflixDataset(test_df)

In [76]:
from torch.utils.data import Dataset, DataLoader

batch_size = 100000
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)
# for test we use shuffle=False
valid_dl = DataLoader(test_ds, batch_size=batch_size, shuffle=False,num_workers=4)

## Neural Network Model

In [77]:
# Note here there is no matrix multiplication, we could potentially make the embeddings of different sizes.
# Here we could get better results by keep playing with regularization.
    
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super().__init__()
        self.embs = nn.ModuleList([nn.Embedding(num_users, emb_size), nn.Embedding(num_items, emb_size)])
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        self.drop2 = nn.Dropout(0.0)
        
    def forward(self, u, v):
        U = self.embs[0](u.long())
        V = self.embs[1](v.long())
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.drop2(x)
        x = self.lin2(x)
        return x

## Training

In [82]:
def train_loop(model, epochs, lr=0.01, wd=0.0):
    optim = get_optimizer(model, lr = lr, wd = wd)
    for i in range(epochs): 
        loss = train_model(model, optim, train_dl)
        print("loss ", loss)
        val_loss(model, valid_dl)
        
def get_optimizer(model, lr = 0.01, wd = 0.0):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optim = torch.optim.Adam(parameters, lr=lr, weight_decay=wd)
    return optim
        
def train_model(model, optim, train_dl=train_dl, verbose=False):
    model.train()
    total = 0
    sum_loss = 0
    for sample in train_dl:
        x = sample['x'].to(device)
        y = sample['y'].unsqueeze(1).to(device)
        batch = y.shape[0]
        y_hat = model(x[:,0], x[:,1])
        loss = F.mse_loss(y_hat, y)
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
        if verbose: print(sum_loss/total)
    return sum_loss/total

def val_loss(model, valid_dl):
    model.eval()
    total = 0
    sum_loss = 0
    correct = 0
    for sample in valid_dl:
        x = sample['x'].to(device)
        y = sample['y'].unsqueeze(1).to(device)
        batch = y.shape[0]
        y_hat = model(x[:,0], x[:,1])
        loss = F.mse_loss(y_hat, y)
        sum_loss += batch*(loss.item())
        total += batch
    print("val loss", sum_loss/total)
    return sum_loss/total

In [83]:
num_users = len(train_df.userId.unique())
num_items = len(train_df.movieId.unique())

In [84]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = CollabFNet(num_users, num_items, emb_size=100)
if torch.cuda.device_count()>1:
    print("Let's use", torch.cuda.device_count(), "GPUs!")
    model = nn.DataParallel(model)
model.to(device)

CollabFNet(
  (embs): ModuleList(
    (0): Embedding(405041, 100)
    (1): Embedding(17424, 100)
  )
  (lin1): Linear(in_features=200, out_features=10, bias=True)
  (lin2): Linear(in_features=10, out_features=1, bias=True)
  (drop1): Dropout(p=0.1)
  (drop2): Dropout(p=0.0)
)

In [85]:
# TODO rewrite to use triangular learning rate 
train_loop(model, epochs=10, lr=0.01, wd=0.00001)

loss  1.043450249876293
val loss 1.67005981919454
loss  0.9444056959106168
val loss 1.6107172830865173
loss  0.9390826001260538
val loss 1.5945229202252087
loss  0.9343954729525075
val loss 1.6046662632691466
loss  0.9311020747321468
val loss 1.5703325326673567
loss  0.9246867347840229
val loss 1.5993807762712664
loss  0.9151195001520052
val loss 1.5178861931799699
loss  0.8938262929312832
val loss 1.5120143849815928
loss  0.8890971845925336
val loss 1.524979538207603
loss  0.8806401561676023
val loss 1.5263998119163806


# References
* This notebook is based on [lesson 5 of Jeremy Howard's Deep Learning Course](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson5-movielens.ipynb)