### 1. Linear User Integration

In [34]:
import torch, torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

class LinearGRU(nn.Module):
    """GRU over the concatenation of a user embedding and an item embedding.

    Each time step looks up one user id and one item id, concatenates the two
    embedding vectors and feeds the result to a single-layer, batch-first GRU.
    The GRU hidden state of every step is returned after a log-softmax over
    its last dimension.

    Args:
        n_users: number of distinct user ids (rows of the user embedding).
        n_items: number of distinct item ids (rows of the item embedding).
        emb_size: width of both embeddings; defaults to ``hidden_units``.
        hidden_units: size of the GRU hidden state (and of the output).
        dropout: dropout value forwarded unchanged to ``nn.GRU``.
    """

    def __init__(self, n_users, n_items, emb_size=None, hidden_units=1000, dropout=0):
        # Name the class explicitly: super(self.__class__, self) recurses
        # forever as soon as this class is subclassed.
        super(LinearGRU, self).__init__()
        self.n_users = n_users
        self.n_items = n_items
        if emb_size is None:
            emb_size = hidden_units
        self.emb_size = emb_size

        # Embeddings map integer ids to dense vectors of width emb_size.
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)

        # The GRU consumes [user_embedding ; item_embedding], so its input
        # width is 2 * emb_size (not n_users + n_items, which only matched
        # when emb_size == n_users == n_items).
        self.gru = nn.GRU(input_size=2 * emb_size,
                          hidden_size=hidden_units,
                          dropout=dropout,
                          batch_first=True)

    def forward(self, user_vectors, item_vectors):
        """Run the model on aligned id sequences.

        Args:
            user_vectors: integer (long) tensor of user ids, shape (batch, seq).
            item_vectors: integer (long) tensor of item ids, shape (batch, seq).

        Returns:
            Tensor of shape (batch, seq, hidden_units) holding log-softmax
            values computed over the last dimension of the GRU output.
        """
        # Embedding lookup keeps the leading (batch, seq) dimensions and
        # appends emb_size, so no reshaping is needed.
        users = self.user_emb(user_vectors)
        items = self.item_emb(item_vectors)

        gru_output, _ = self.gru(torch.cat([users, items], dim=-1))
        return F.log_softmax(gru_output, dim=-1)
    


In [38]:
# Tiny smoke-test model: 3 users, 3 items, embedding width 3, hidden size 3.
network = LinearGRU(3, 3, 3, 3)
# One batch of four time steps. The dtype is pinned to int64 because the ids
# are later turned into torch tensors for embedding lookup, and numpy's
# default integer width is platform dependent.
users = np.array([[1, 1, 1, 1]], dtype=np.int64)
items = np.array([[0, 1, 2, 1]], dtype=np.int64)
print(users.shape)

(1, 4)


In [39]:
# Forward pass on the demo batch: the numpy id arrays are converted to tensors
# and wrapped in autograd Variables before being handed to the model.
network(
    user_vectors=Variable(torch.from_numpy(users)),
    item_vectors=Variable(torch.from_numpy(items)),
)

Variable containing:
(0 ,.,.) = 
 -1.2552 -1.0975 -0.9642
 -1.4716 -0.9283 -0.9803
 -1.5249 -1.3298 -0.6581
 -1.6313 -1.0054 -0.8246
[torch.FloatTensor of size 1x4x3]

In [49]:
# Libraries and provided functions
import pandas as pd
import zipfile
import wget
from io import StringIO 
import numpy as np
import scipy as sp
from scipy import sparse
import scipy.sparse.linalg
from tqdm import tqdm # Very useful library to see progress bar during range iterations: just type `for i in tqdm(range(10)):`
from matplotlib import pyplot as plt
%matplotlib inline

from collections import namedtuple
import sys

def get_movielens_data(local_file=None):
    '''Loads the MovieLens 10M ratings into a sparse user x movie matrix.

    If `local_file` is falsy the archive is downloaded with `wget`; otherwise
    `local_file` is handed to `zipfile.ZipFile` as-is. User, movie and
    timestamp values are re-indexed to consecutive integers starting at 0, in
    order of first appearance (the re-indexed timestamps are only printed; they
    are not part of the returned matrix).

    Returns a scipy CSR matrix of dtype float64 whose rows are users, whose
    columns are movies and whose values are ratings.
    '''
    if not local_file:
        print('Downloading data...')
        zip_file_url = 'http://files.grouplens.org/datasets/movielens/ml-10m.zip'
        zip_contents = wget.download(zip_file_url)
        print('Done.')
    else:
        zip_contents = local_file

    print('Loading data into memory...')
    with zipfile.ZipFile(zip_contents) as zfile:
        zdata = zfile.read('ml-10M100K/ratings.dat').decode()

    delimiter = ';'
    zdata = zdata.replace('::', delimiter) # makes data compatible with pandas c-engine
    ml_data = pd.read_csv(StringIO(zdata), sep=delimiter, header=None, engine='c',
                          names=['userid', 'movieid', 'rating', 'timestamp'],
                          usecols=['userid', 'movieid', 'rating', 'timestamp'])
    print(ml_data.head())

    # normalize indices to avoid gaps; pd.factorize assigns codes in order of
    # first appearance, which is what the private
    # groupby(sort=False).grouper.group_info[0] call used to provide.
    for column in ('movieid', 'userid', 'timestamp'):
        ml_data[column] = pd.factorize(ml_data[column])[0]
    print(ml_data.head())

    # build sparse user-movie matrix; the shape is passed as a plain tuple of
    # ints rather than a pandas Series.
    data_shape = (int(ml_data['userid'].max()) + 1,
                  int(ml_data['movieid'].max()) + 1)
    data_matrix = sp.sparse.csr_matrix((ml_data['rating'].values,
                                       (ml_data['userid'].values, ml_data['movieid'].values)),
                                        shape=data_shape, dtype=np.float64)

    print('Done.')
    return data_matrix

def split_data(data, test_ratio=0.2, seed=None):
    '''Randomly splits the rows (users) of data into training and testing datasets.

    A fraction `test_ratio` of the rows (rounded down) is drawn without
    replacement for the test set; all remaining rows form the training set.
    Default ratio is 80%/20%.

    Parameters:
        data: 2-D array or sparse matrix supporting boolean row indexing.
        test_ratio: fraction of rows assigned to the test set, within [0, 1].
        seed: optional seed for a reproducible split. When None, numpy's
            global random state is used (so np.random.seed still applies).

    Returns datasets in namedtuple format for convenience. Usage:

    train_data, test_data = split_data(data_matrix)

    or

    movielens_data = split_data(data_matrix)

    and later in code:

    do smth with movielens_data.train
    do smth with movielens_data.test

    Raises ValueError if test_ratio is outside [0, 1].
    '''
    if not 0 <= test_ratio <= 1:
        raise ValueError('test_ratio must be within [0, 1], got %r' % (test_ratio,))

    num_users = data.shape[0]
    # A dedicated RandomState keeps seeded splits independent of global state.
    rng = np.random if seed is None else np.random.RandomState(seed)
    test_rows = rng.choice(num_users, int(test_ratio * num_users), replace=False)

    # Boolean row mask: True marks rows that go to the test set.
    is_test = np.zeros((num_users,), dtype=bool)
    is_test[test_rows] = True

    Movielens_data = namedtuple('MovieLens10M', ['train', 'test'])
    return Movielens_data(train=data[~is_test, :], test=data[is_test, :])

# Build the ratings matrix from the archive named "ml-10m.zip" instead of
# downloading it.
Data = get_movielens_data(local_file="ml-10m.zip")

Loading data into memory...
   userid  movieid  rating  timestamp
0       1      122     5.0  838985046
1       1      185     5.0  838983525
2       1      231     5.0  838983392
3       1      292     5.0  838983421
4       1      316     5.0  838983392
   userid  movieid  rating  timestamp
0       0        0     5.0          0
1       0        1     5.0          1
2       0        2     5.0          2
3       0        3     5.0          3
4       0        4     5.0          2
Done.


In [48]:
# NOTE(review): todense() materializes every entry of the sparse matrix,
# zeros included. For the full ratings matrix this is presumably very large;
# consider inspecting a small slice instead — TODO confirm memory footprint.
Data.todense()

matrix([[5., 5., 5., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 3., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])