In [None]:
"""
Loads the train and test data.
Builds the respective ratings matrices.
Can run the code (with the proper lines uncommented) for both the bias and the bias-removed model.
Trains and validates the model.
Gives the top 5 recommendations for the test users from the model.
"""

In [1]:
import numpy as np
from scipy.sparse import rand as sprand
from scipy.sparse import lil_matrix
import scipy
import torch
from torch.autograd import Variable
import pandas as pd

In [2]:
#Load the train and test splits of the ratings data.
#To run a different experiment, change the file names below (for example r5.train -> r4.train
#and r5.test -> r4.test).
#The files have no header row and use '::' as separator, so the column names are supplied here;
#a multi-character separator requires pandas' python parsing engine.
names = ['user_id', 'item_id', 'rating', 'timestamp']
df_train = pd.read_csv('ml-10M100K/r5.train', sep='::', names=names,engine='python')
df_test = pd.read_csv('ml-10M100K/r5.test', sep='::', names=names,engine='python')

In [3]:
#preview the first rows of the training data to check that the columns were parsed as expected
df_train.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,122,5.0,838985046
1,1,185,5.0,838983525
2,1,231,5.0,838983392
3,1,292,5.0,838983421
4,1,316,5.0,838983392


In [4]:
#builds the sparse user x item ratings matrix, sized from the largest ids found in the data
def get_movielens_ratings(df, shape=None):
    """Build a sparse user-by-item ratings matrix from a ratings DataFrame.

    Row ``user_id - 1`` / column ``item_id - 1`` holds the rating, i.e. the
    1-based ids in ``df`` are mapped to 0-based matrix indices.

    Args:
        df: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
            Columns are looked up by name, so their order does not matter.
        shape: optional ``(n_users, n_items)`` tuple. When omitted the matrix
            is sized from the largest user and item id present in ``df``.
            Pass the training matrix's shape to get a test matrix that lines
            up with it.

    Returns:
        scipy.sparse.lil_matrix of dtype float. If the same (user, item) pair
        occurs more than once, the last occurrence wins.

    Raises:
        ValueError: if ``df`` is empty and no ``shape`` is given.
        IndexError: if an id does not fit inside the requested ``shape``.
    """
    user_idx = df['user_id'].values - 1
    item_idx = df['item_id'].values - 1

    if shape is None:
        # max() on an empty array raises ValueError, as the size is undefined.
        shape = (int(user_idx.max()) + 1, int(item_idx.max()) + 1)

    interactions = lil_matrix(shape, dtype=float)
    if len(df):
        # One vectorized assignment instead of a Python-level loop over rows.
        interactions[user_idx, item_idx] = df['rating'].values
    return interactions

In [5]:
#used for dataset r5 because the r5 train and r5 test files do not cover the same users,
#so the matrix is built with a fixed shape instead of one derived from the data
def get_movielens_ratings_other(df, shape=(71567, 65133)):
    """Build a sparse user-by-item ratings matrix with a fixed shape.

    Row ``user_id - 1`` / column ``item_id - 1`` holds the rating, i.e. the
    1-based ids in ``df`` are mapped to 0-based matrix indices.

    Args:
        df: DataFrame with ``user_id``, ``item_id`` and ``rating`` columns.
            Columns are looked up by name, so their order does not matter.
        shape: ``(n_users, n_items)`` of the returned matrix. Defaults to
            ``(71567, 65133)``, the size this notebook uses for the r5 split.

    Returns:
        scipy.sparse.lil_matrix of dtype float with the given shape. If the
        same (user, item) pair occurs more than once, the last occurrence wins.

    Raises:
        IndexError: if an id in ``df`` does not fit inside ``shape``.
    """
    interactions = lil_matrix(shape, dtype=float)
    if len(df):
        # One vectorized assignment instead of a Python-level loop over rows.
        user_idx = df['user_id'].values - 1
        item_idx = df['item_id'].values - 1
        interactions[user_idx, item_idx] = df['rating'].values
    return interactions

In [6]:
#training ratings matrix, built with the fixed-shape variant; the shape is displayed as a check
ratings = get_movielens_ratings_other(df_train)
ratings.shape

(71567, 65133)

In [7]:
#test ratings matrix, sized from the largest ids found in the test file
#NOTE(review): run_validation pairs rows of test_ratings with columns taken from ratings.shape[1],
#so this shape is expected to equal ratings.shape - confirm whenever the dataset is changed
test_ratings = get_movielens_ratings(df_test)
test_ratings.shape

(71567, 65133)

In [8]:
#used to make the model
#forward() comes in two variants, selected with the use_item_bias constructor argument
#(instead of commenting code in and out):
#  use_item_bias=True  (default): factor dot product plus the learned item bias
#                                 (the variant the notebook calls "bias removal")
#  use_item_bias=False:           factor dot product only
#                                 (the variant the notebook calls "bias incorporated")
class MatrixFactorization(torch.nn.Module):
    """Matrix-factorization model with user/item embeddings and bias terms.

    Args:
        n_users: number of rows in the user embedding tables.
        n_items: number of rows in the item embedding tables.
        n_factors: size of each user/item factor vector.
        use_item_bias: when True (default) ``forward`` adds the per-item bias
            to every prediction; when False ``forward`` returns the plain
            factor dot products.
    """

    def __init__(self, n_users, n_items, n_factors=5, use_item_bias=True):
        super().__init__()
        self.use_item_bias = use_item_bias
        # Embedding tables are created in a fixed order so that seeded
        # initialisation stays reproducible.
        self.user_factors = torch.nn.Embedding(n_users,
                                               n_factors,
                                               sparse=False)
        self.item_factors = torch.nn.Embedding(n_items,
                                               n_factors,
                                               sparse=False)
        # user_biases is only read by predict(); forward() does not use it.
        self.user_biases = torch.nn.Embedding(n_users,
                                              1,
                                              sparse=False)
        self.item_biases = torch.nn.Embedding(n_items,
                                              1,
                                              sparse=False)
        # TODO: consider also fitting an overall bias (a 1x1 mu term) next to
        # the user bias (one value per user) and item bias (one value per item).

    def predict(self, user, item):
        """Score user-item pairs element-wise.

        Args:
            user: 1-D LongTensor of user indices.
            item: 1-D LongTensor of item indices, same length as ``user``.

        Returns:
            Tensor of shape ``(len(user), 1)``: user bias + item bias + dot
            product of the user and item factors for each pair.
        """
        pred = self.user_biases(user) + self.item_biases(item)
        # keepdim keeps the dot products as a column so they line up with the
        # (N, 1) bias terms; an in-place add of an (N,) tensor only worked
        # for a single pair.
        dot = (self.user_factors(user) * self.item_factors(item)).sum(1, keepdim=True)
        return pred + dot

    def forward(self, users, items):
        """Score every given user against every given item (batch operator).

        This is the efficient form and should be used for training.

        Args:
            users: 1-D LongTensor of user indices.
            items: 1-D LongTensor of item indices.

        Returns:
            Tensor of shape ``(len(users), len(items))`` with the factor dot
            products, plus the item bias of each column when
            ``use_item_bias`` is True.
        """
        pred = torch.mm(self.user_factors(users),
                        torch.transpose(self.item_factors(items), 0, 1))
        if self.use_item_bias:
            # (n_items, 1) -> (1, n_items), repeated for every user row.
            item_bias = torch.transpose(self.item_biases(items), 0, 1)
            pred = pred + item_bias.expand_as(pred)
        return pred


In [9]:
#one embedding row per row/column of the training matrix, with 2 latent factors per user and item
model = MatrixFactorization(ratings.shape[0], ratings.shape[1], n_factors=2)

In [10]:
#mean squared error between the predicted and the actual ratings block
loss_func = torch.nn.MSELoss()

In [11]:
#Adam optimizer over all model parameters (despite its name, reg_loss_func is the optimizer, not a loss)
#change weight_decay to adjust the regularization of the model while training
reg_loss_func = torch.optim.Adam(model.parameters(), lr=1e-6, weight_decay=1e-1)

In [12]:
def get_batch(batch_size,ratings):
    """Yield shuffled batches of row indices covering every row exactly once.

    Args:
        batch_size: maximum number of row indices per batch; must be positive.
        ratings: 2-D matrix (anything with a two-element ``.shape``); only its
            number of rows is used.

    Yields:
        numpy arrays of row indices. Every batch has ``batch_size`` entries
        except possibly the last one, which holds the remainder. Nothing is
        yielded when ``ratings`` has no rows.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised when iteration
            starts).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    rows, _ = ratings.shape
    # Random visiting order of the rows
    p = np.random.permutation(rows)

    # Slice the permutation itself for every batch, including the last one,
    # so that no row is visited twice or skipped.
    for sindex in range(0, rows, batch_size):
        yield p[sindex:sindex + batch_size]

In [13]:
#training configuration
#EPOCH: number of passes over the training matrix
EPOCH = 2
#BATCH_SIZE: number of user rows handled per batch
BATCH_SIZE = 1000 #50
#NOTE(review): LR is not passed to the optimizer, which is created earlier with a hard-coded
#lr=1e-6 - confirm which learning rate is intended
LR = 0.001

In [14]:
#runs validation in batches to handle some memory issues
#combines the per-batch MSE values into the overall MSE on the test set
def run_validation():
    """Compute the model's mean squared error on ``test_ratings``.

    The test matrix is scored in batches of ``BATCH_SIZE`` user rows against
    all item columns. The target is the dense ratings block, so cells without
    a rating count as 0. Batch losses are weighted by their number of rows,
    which makes the result the MSE over all scored rows even when the last
    batch is smaller than the others.

    If the model cannot score a batch (for example a row index outside the
    user embedding table), validation stops at that batch, the reason is
    printed, and the loss over the batches scored so far is returned.

    Returns:
        float: the overall MSE, or ``nan`` when no batch could be scored.
    """
    # The item columns are the same for every batch, so build them once.
    cols = Variable(torch.LongTensor(np.arange(ratings.shape[1])))

    weighted_loss = 0.0
    rows_scored = 0
    for batch in get_batch(BATCH_SIZE, test_ratings):
        # Turn data into variables
        interactions = Variable(torch.FloatTensor(test_ratings[batch, :].toarray()))
        rows = Variable(torch.LongTensor(batch))
        try:
            predictions = model(rows, cols)
        except (IndexError, RuntimeError) as err:
            # Best effort: keep what has been scored, but say why we stopped.
            print("validation stopped early: " + str(err))
            break

        batch_loss = loss_func(predictions, interactions)
        # Current PyTorch returns a 0-dim tensor that must be read with
        # .item(); older releases return a 1-element tensor read with [0].
        if hasattr(batch_loss, "item"):
            batch_loss = batch_loss.item()
        else:
            batch_loss = batch_loss.data[0]

        weighted_loss += batch_loss * len(batch)
        rows_scored += len(batch)

    if rows_scored == 0:
        return float("nan")
    return weighted_loss / rows_scored

In [15]:
#one training pass over the ratings matrix, followed by a validation run
def run_epoch():
    """Train the model for one epoch and report train and test loss.

    The training matrix is visited in batches of ``BATCH_SIZE`` user rows.
    For every batch the gradients are reset, all items are scored for the
    batch's users, the MSE against the dense ratings block is backpropagated
    and the optimizer takes one step.

    Prints the training loss of the last batch of the epoch, then the loss
    returned by ``run_validation()``.
    """
    # Every batch is scored against the full set of item columns.
    all_items = Variable(torch.LongTensor(np.arange(ratings.shape[1])))

    for user_batch in get_batch(BATCH_SIZE, ratings):
        # Start each step from clean gradients
        reg_loss_func.zero_grad()

        # Dense target block and user indices for this batch
        target = Variable(torch.FloatTensor(ratings[user_batch, :].toarray()))
        user_rows = Variable(torch.LongTensor(user_batch))

        # Forward pass, loss, backward pass, parameter update
        train_loss = loss_func(model(user_rows, all_items), target)
        train_loss.backward()
        reg_loss_func.step()

    print("train loss is "+str(train_loss))

    validation_loss = run_validation()
    print("test loss is "+str(validation_loss))

In [16]:
#train for EPOCH epochs; the epoch number is printed before that epoch's train/test losses
for i in range(EPOCH):
    print(i)
    run_epoch()

0
train loss is Variable containing:
 2.8232
[torch.FloatTensor of size 1]

test loss is 2.4715017312102847
1
train loss is Variable containing:
 2.8227
[torch.FloatTensor of size 1]

test loss is 2.47257090277142


In [26]:
#part 3
#scores all items for the test users, batch by batch
#takes the 5 highest-scoring items of each user
#writes the user index and its list of recommendations to file
def recommend():
    """Write the top-5 recommended items of every test user to a TSV file.

    Users are processed in ascending row order in chunks of ``BATCH_SIZE``,
    so the index written at the start of a line always belongs to the user
    whose scores produced that line. Each line of ``assign5_r5results.tsv``
    holds the user's row index followed by five item column indices,
    tab-separated. The five items are the highest scoring ones, in no
    particular order.

    NOTE(review): the values written are 0-based matrix indices (the ratings
    matrices store id - 1), not the original ids - confirm this is the
    format expected by whatever reads the file.
    """
    n_users = test_ratings.shape[0]
    # The item columns are the same for every batch, so build them once.
    cols = Variable(torch.LongTensor(np.arange(ratings.shape[1])))

    with open('assign5_r5results.tsv', 'w') as file:
        # No shuffling here: a shuffled batch would not line up with a
        # sequential user counter.
        for start in range(0, n_users, BATCH_SIZE):
            batch = np.arange(start, min(start + BATCH_SIZE, n_users))
            rows = Variable(torch.LongTensor(batch))
            scores = model(rows, cols).data.numpy()

            for user_index, user_scores in zip(batch, scores):
                # Indices of the 5 largest scores (unordered)
                top_items = np.argpartition(user_scores, -5)[-5:]
                file.write(str(user_index))
                for item_index in top_items:
                    file.write("\t"+str(item_index))
                file.write("\n")
    
    

In [27]:
#calls the recommendation function, which writes the results to assign5_r5results.tsv
recommend()