In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.utils import shuffle

In [2]:
# Data is from https://grouplens.org/datasets/movielens/ (the ml-20m.zip archive).
# -nc (no-clobber) makes wget skip the download when ml-20m.zip is already present.
!wget -nc https://files.grouplens.org/datasets/movielens/ml-20m.zip

File ‘ml-20m.zip’ already there; not retrieving.



In [3]:
# Extract the archive; -n tells unzip never to overwrite files that already exist.
!unzip -n ml-20m.zip

Archive:  ml-20m.zip


In [4]:
# List the working directory to check that the ml-20m folder was extracted.
!ls

ml-20m	ml-20m.zip  sample_data


In [5]:
# Load the ratings table from the extracted archive and preview its first five rows
df = pd.read_csv(filepath_or_buffer='ml-20m/ratings.csv')
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [6]:
# Re-encode the raw user IDs as a categorical column, then store each row's
# integer category code as the index used for the user embedding lookup
df['userId'] = pd.Categorical(df['userId'])
df['new_user_id'] = df['userId'].cat.codes

In [7]:
# Same treatment for the movie IDs: categorical column first, then its integer
# category codes as the index used for the movie embedding lookup
df['movieId'] = pd.Categorical(df['movieId'])
df['new_movie_id'] = df['movieId'].cat.codes

In [8]:
# Pull the encoded user/movie indices and the ratings out of the frame as
# three separate NumPy arrays
user_ids = df['new_user_id'].to_numpy()
movie_ids = df['new_movie_id'].to_numpy()
# Every rating is offset by a constant 2.5 before being used as the target
ratings = df['rating'].to_numpy() - 2.5

In [9]:
# Get number of users and movies.
# The IDs are integer codes used to index the embedding tables, so each table
# needs (largest code + 1) rows. A vectorized max gives the same count as
# len(set(...)) for contiguous codes, without hashing every rating row as a
# Python object, and it stays a valid table size even if some codes are absent.
N = int(user_ids.max()) + 1
M = int(movie_ids.max()) + 1

# Set embedding dimension
D = 10

In [10]:
# neural network
class Model(nn.Module):
    """Rating predictor built from user/item embeddings and one hidden layer.

    A user embedding and an item embedding are looked up, concatenated along
    dimension 1, passed through a ReLU-activated hidden layer and projected to
    a single output value per row.

    Args:
        n_users: Number of rows in the user embedding table.
        n_items: Number of rows in the item embedding table.
        embed_dim: Width of each embedding vector.
        n_hidden: Width of the hidden layer (default 1024).
    """

    def __init__(self, n_users, n_items, embed_dim, n_hidden=1024):
        super().__init__()
        # Keep the construction sizes available on the instance.
        self.N, self.M, self.D = n_users, n_items, embed_dim

        # Embedding tables: one learned vector per user and per item.
        self.u_emb = nn.Embedding(num_embeddings=n_users, embedding_dim=embed_dim)
        self.m_emb = nn.Embedding(num_embeddings=n_items, embedding_dim=embed_dim)

        # The hidden layer consumes the two embeddings side by side, hence the
        # 2 * embed_dim input width; the head emits one value per row.
        self.fc1 = nn.Linear(in_features=2 * embed_dim, out_features=n_hidden)
        self.fc2 = nn.Linear(in_features=n_hidden, out_features=1)

    def forward(self, u, m):
        """Return one prediction per (user index, item index) pair.

        Args:
            u: Tensor of user indices into the user embedding table.
            m: Tensor of item indices into the item embedding table.

        Returns:
            Output of the final linear layer, with a trailing dimension of 1.
        """
        user_vecs = self.u_emb(u)
        item_vecs = self.m_emb(m)
        joined = torch.cat((user_vecs, item_vecs), dim=1)
        hidden = F.relu(self.fc1(joined))
        return self.fc2(hidden)

In [11]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda:0


In [12]:
# Build the network and move its parameters to the selected device.
# Module.to() returns the module itself, so ending the cell with `model`
# still echoes the layer summary.
model = Model(N, M, D)
model = model.to(device)
model

Model(
  (u_emb): Embedding(138493, 10)
  (m_emb): Embedding(26744, 10)
  (fc1): Linear(in_features=20, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1, bias=True)
)

In [13]:
# Optimizer and loss: Adam with its default settings over every model
# parameter, minimizing mean squared error
optimizer = torch.optim.Adam(params=model.parameters())
criterion = nn.MSELoss()

In [14]:
# Shuffle the three arrays with one shared permutation so every
# (user, movie, rating) triple stays aligned. A fixed random_state makes the
# ordering, and therefore the train/test split taken from it, reproducible
# across kernel restarts.
user_ids, movie_ids, ratings = shuffle(user_ids, movie_ids, ratings, random_state=0)

In [15]:
# Convert to tensors: the index arrays are cast to int64, while the ratings
# tensor keeps the NumPy array's dtype (the training loop casts each batch
# of targets to float32)
user_ids_t = torch.from_numpy(user_ids).to(dtype=torch.int64)
movie_ids_t = torch.from_numpy(movie_ids).to(dtype=torch.int64)
ratings_t = torch.from_numpy(ratings)

In [16]:
# Make dataset: the first 80% of the rows form the training set and the
# remaining rows are held out for testing
Ntrain = int(len(ratings) * 0.8)
train_dataset = torch.utils.data.TensorDataset(
    user_ids_t[:Ntrain],
    movie_ids_t[:Ntrain],
    ratings_t[:Ntrain],
)
test_dataset = torch.utils.data.TensorDataset(
    user_ids_t[Ntrain:],
    movie_ids_t[Ntrain:],
    ratings_t[Ntrain:],
)

In [17]:
# Data loaders: batches of 512 rows; only the training loader shuffles
batch_size = 512
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=batch_size, shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=batch_size, shuffle=False
)

In [18]:
# a function to encapsulate the training loop
def batch_gd(model, criterion, optimizer, train_iter, test_iter, epochs):
    """Train ``model`` with mini-batch gradient descent and record epoch losses.

    Every epoch makes one optimization pass over ``train_iter``, then one
    gradient-free evaluation pass over ``test_iter``, and prints a summary
    line with both losses and the epoch duration.

    Args:
        model: Module invoked as ``model(users, movies)``.
        criterion: Loss invoked as ``criterion(outputs, targets)``.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_iter: Iterable yielding ``(users, movies, targets)`` tensor
            batches used for the parameter updates. It is iterated once per
            epoch, so it must be re-iterable (e.g. a DataLoader or a list).
        test_iter: Iterable with the same batch structure, used for
            evaluation only; also iterated once per epoch.
        epochs: Number of passes over the data.

    Returns:
        Tuple ``(train_losses, test_losses)`` of NumPy arrays of length
        ``epochs``. Each entry is the unweighted mean of that epoch's
        per-batch losses, so a smaller final batch counts as much as a full one.
    """
    # Send batches to the device the model already lives on, rather than
    # depending on a notebook-level `device` variable.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses = np.zeros(epochs)
    test_losses = np.zeros(epochs)

    for it in range(epochs):
        t0 = datetime.now()
        train_loss = []
        # Iterate the loader that was passed in, not a global one.
        for users, movies, targets in train_iter:
            # Column vector of float32 targets to match the model's output
            targets = targets.view(-1, 1).float()

            # move data to the model's device
            users, movies, targets = users.to(device), movies.to(device), targets.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward pass
            outputs = model(users, movies)
            loss = criterion(outputs, targets)

            # backward and optimize
            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        # Get train loss and test loss
        train_loss = np.mean(train_loss)

        test_loss = []
        # No parameter updates happen here, so skip autograd bookkeeping.
        with torch.no_grad():
            for users, movies, targets in test_iter:
                users, movies, targets = users.to(device), movies.to(device), targets.to(device)
                targets = targets.view(-1, 1).float()
                outputs = model(users, movies)
                loss = criterion(outputs, targets)
                test_loss.append(loss.item())
        test_loss = np.mean(test_loss)

        # Save losses
        train_losses[it] = train_loss
        test_losses[it] = test_loss

        dt = datetime.now() - t0
        print(f'Epoch {it+1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Duration: {dt}')
    return train_losses, test_losses

In [19]:
# Profile the full 25-epoch training run with IPython's %prun line magic.
# The trailing backslash continues the magic onto the next line. The profiled
# statement assigns the returned loss arrays to train_losses / test_losses,
# which the plotting cell reads.
%prun train_losses, test_losses = batch_gd( \
    model, criterion, optimizer, train_loader, test_loader, 25)

Epoch 1/25, Train Loss: 0.8176, Test Loss: 0.7538, Duration: 0:05:10.184710
Epoch 2/25, Train Loss: 0.7389, Test Loss: 0.7375, Duration: 0:04:54.270160
Epoch 3/25, Train Loss: 0.7214, Test Loss: 0.7219, Duration: 0:04:55.098102
Epoch 4/25, Train Loss: 0.7046, Test Loss: 0.7093, Duration: 0:04:53.348147
Epoch 5/25, Train Loss: 0.6923, Test Loss: 0.7031, Duration: 0:04:54.939437
Epoch 6/25, Train Loss: 0.6840, Test Loss: 0.6980, Duration: 0:04:53.476527
Epoch 7/25, Train Loss: 0.6779, Test Loss: 0.6950, Duration: 0:04:53.078147
Epoch 8/25, Train Loss: 0.6732, Test Loss: 0.6927, Duration: 0:04:50.716733
Epoch 9/25, Train Loss: 0.6692, Test Loss: 0.6907, Duration: 0:04:51.531793
Epoch 10/25, Train Loss: 0.6659, Test Loss: 0.6901, Duration: 0:04:50.166748
Epoch 11/25, Train Loss: 0.6630, Test Loss: 0.6894, Duration: 0:04:51.036161
Epoch 12/25, Train Loss: 0.6605, Test Loss: 0.6878, Duration: 0:04:50.484786
Epoch 13/25, Train Loss: 0.6583, Test Loss: 0.6873, Duration: 0:04:50.896582
Epoch 14

In [19]:
# Draw the per-epoch train and test losses on one set of axes
fig, ax = plt.subplots()
ax.plot(train_losses, label='train loss')
ax.plot(test_losses, label='test loss')
ax.legend()
plt.show()