In [12]:
# Import the tables of the data set as dataframes.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the data files, relative to the notebook. STUDY_DIR holds the
# "study" subset, which is used here because it contains less data (testing).
DATA_DIR = './../data'
STUDY_DIR = f'{DATA_DIR}/study'

# The user table is read from the top-level data directory.
users = pd.read_csv(f'{DATA_DIR}/users.csv.gz')

# Events and transactions are read from the study subset.
events, transactions = (
    pd.read_csv(f'{STUDY_DIR}/{table}.csv.gz')
    for table in ('events', 'transactions')
)

In [13]:
# --- Clean the transactions ---------------------------------------------------
# Drop rows without a topic and store topic ids as integers. `dropna` and
# `assign` both return new frames, so no column is ever written into a filtered
# view of another frame (which is what triggers SettingWithCopyWarning).
transactions = transactions.dropna(subset=['topic_id'])
transactions = transactions.assign(topic_id=transactions['topic_id'].astype(int))

# only keep users with at least 10 interactions
transactions = transactions.groupby('user_id').filter(lambda x: len(x) >= 10)

# Vocabulary of ids that survive the filtering; the position of an id in these
# lists is the integer index used for it further down in the notebook.
user_ids = list(transactions['user_id'].unique())
topic_ids = list(transactions['topic_id'].unique())

N = len(user_ids)
M = len(topic_ids)


# --- Chronological train / test split -----------------------------------------
transactions = transactions.assign(start_time=pd.to_datetime(transactions['start_time']))

# 80th percentile of the start times. Series.quantile handles datetime data
# natively (linear interpolation, like np.percentile's default), whereas
# np.percentile on datetime values depends on the NumPy version and dtype.
cut = transactions['start_time'].quantile(0.8)

# Earliest ~80% of the transactions train the model, the rest is held out.
# `.copy()` makes both splits independent frames that are safe to modify.
train_transactions = transactions[transactions['start_time'] < cut].copy()
test_transactions = transactions[transactions['start_time'] >= cut].copy()

In [14]:
import torch
from torch.utils.data import Dataset
import random

class StudentsTopicsOneHot(Dataset):
    """Binary (user, topic, label) interaction samples with random negatives.

    Every distinct ``(user_id, topic_id)`` pair found in ``df`` becomes a
    positive sample (label ``1.0``). Negative samples (label ``0.0``) are drawn
    uniformly, without replacement, from the user x topic pairs that do not
    occur in ``df``.

    Despite the class name, items are returned as integer index tensors (the
    position of the id in ``users`` / ``topics``), not as one-hot vectors.

    NOTE(review): negatives are chosen only with respect to ``df``. A pair that
    is absent from ``df`` but present in another split can therefore be sampled
    as a negative here.

    Args:
        df: DataFrame with at least the columns ``user_id`` and ``topic_id``.
        negative_frac: Number of negatives to draw, as a fraction of the number
            of distinct positive pairs. The count is capped at the number of
            available non-interacting pairs.
        users: Ordered collection of known user ids. Defaults to the
            module-level ``user_ids``.
        topics: Ordered collection of known topic ids. Defaults to the
            module-level ``topic_ids``.
        seed: Optional seed for the negative sampling. When ``None`` the global
            ``random`` module state is used.

    Raises:
        ValueError: If ``df`` contains a user or topic id that is not part of
            ``users`` / ``topics``.
    """

    def __init__(self, df, negative_frac=1.0, users=None, topics=None, seed=None):
        users = user_ids if users is None else users
        topics = topic_ids if topics is None else topics

        # id -> position lookup tables, built once so that __getitem__ does not
        # have to scan a list for every sample. `setdefault` keeps the first
        # position of a repeated id, which is what `list.index` would return.
        self._user_index = {}
        for position, user in enumerate(users):
            self._user_index.setdefault(user, position)
        self._topic_index = {}
        for position, topic in enumerate(topics):
            self._topic_index.setdefault(topic, position)

        # Distinct observed pairs, in order of first appearance.
        pairs = df[['user_id', 'topic_id']].dropna().drop_duplicates()
        positives = list(pairs.itertuples(index=False, name=None))
        observed = set(positives)

        # Fail at construction time rather than later inside a DataLoader worker.
        unknown = [
            (user, topic) for user, topic in positives
            if user not in self._user_index or topic not in self._topic_index
        ]
        if unknown:
            raise ValueError(
                f'{len(unknown)} (user, topic) pairs use ids missing from the '
                f'known users/topics, e.g. {unknown[0]}'
            )

        # Pairs without any interaction, enumerated in a deterministic order so
        # that seeding the sampler yields reproducible negatives.
        candidates = [
            (user, topic)
            for user in self._user_index
            for topic in self._topic_index
            if (user, topic) not in observed
        ]

        # Never ask for more negatives than exist (random.sample would raise).
        wanted = int(negative_frac * len(positives))
        rng = random if seed is None else random.Random(seed)
        negatives = rng.sample(candidates, min(wanted, len(candidates)))

        # Positives first, then negatives; each entry is (user_id, topic_id, label).
        self.data = (
            [(user, topic, 1.0) for user, topic in positives]
            + [(user, topic, 0.0) for user, topic in negatives]
        )

    def __len__(self):
        """Return the total number of samples (positives plus negatives)."""
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(user_index, topic_index, label)`` tensors for one sample.

        The two indices are 0-dim integer tensors; the label is a float tensor
        of shape ``(1,)``.
        """
        user, topic, y = self.data[index]

        user = torch.tensor(self._user_index[user])
        topic = torch.tensor(self._topic_index[topic])

        y = torch.tensor([y])
        return user, topic, y


# Mini-batch size shared by the training and the test loader.
BATCH_SIZE = 5

# Training samples are reshuffled every epoch.
train_dataset = StudentsTopicsOneHot(train_transactions)
train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# The held-out samples are only evaluated, never trained on, so they are served
# in a fixed order: shuffling adds nothing and makes evaluation runs harder to
# compare.
test_dataset = StudentsTopicsOneHot(test_transactions)
test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [15]:
import os
import torch
from torch import optim, nn
import lightning.pytorch as pl


# Sizes of the two embedding tables: one row per known student / topic
# (N and M are the numbers of unique user and topic ids computed earlier).
num_students, num_topics = N, M

# Width of the learned embedding vectors; both sides use the same size.
student_embedding_dim = topic_embedding_dim = 32


class NCFNetwork(pl.LightningModule):
    """Neural collaborative filtering model for (student, topic) pairs.

    Each student index and each topic index is mapped to a learned embedding.
    The two embeddings are concatenated and passed through a small MLP that
    ends in a sigmoid, so the output is a score in (0, 1). Training minimises
    binary cross-entropy against the 0/1 interaction label.

    Args:
        num_students: Number of rows in the student embedding table.
        num_topics: Number of rows in the topic embedding table.
        student_embedding_dim: Size of each student embedding vector.
        topic_embedding_dim: Size of each topic embedding vector.
    """

    def __init__(self, num_students, num_topics, student_embedding_dim, topic_embedding_dim):
        super().__init__()
        self.student_embedding_layer = nn.Embedding(num_students, student_embedding_dim)
        self.topic_embedding_layer = nn.Embedding(num_topics, topic_embedding_dim)

        # MLP over the concatenated embeddings, narrowing down to one score.
        self.network = nn.Sequential(
            nn.Linear(student_embedding_dim + topic_embedding_dim, 32),
            nn.ReLU(),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            nn.Sigmoid(),
        )

        # BCELoss expects probabilities, which the final Sigmoid provides.
        self.loss = nn.BCELoss()

        # Record the constructor arguments as hyperparameters of the module.
        self.save_hyperparameters()

    def forward(self, student_x, topic_x):
        """Score (student, topic) index pairs.

        Args:
            student_x: Integer tensor of student indices.
            topic_x: Integer tensor of topic indices, same shape as ``student_x``.

        Returns:
            Tensor of sigmoid scores with a trailing dimension of size 1.
        """
        student_emb = self.student_embedding_layer(student_x)
        topic_emb = self.topic_embedding_layer(topic_x)

        # Join along the feature axis (the last one), whatever the batch shape.
        x = torch.cat((student_emb, topic_emb), dim=-1)
        return self.network(x)

    def training_step(self, batch, batch_idx):
        """Compute and log the binary cross-entropy loss for one batch.

        ``batch`` is unpacked as ``(student_indices, topic_indices, labels)``;
        the labels must be float tensors shaped like the model output.
        """
        student_x, topic_x, y = batch

        y_pred = self(student_x, topic_x)

        loss = self.loss(y_pred, y)
        # Logging to TensorBoard (if installed) by default
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        """Use Adam with a learning rate of 1e-3 over all parameters."""
        optimizer = optim.Adam(self.parameters(), lr=1e-3)
        return optimizer


# Instantiate the model with the sizes configured above.
ncf = NCFNetwork(
    num_students=num_students,
    num_topics=num_topics,
    student_embedding_dim=student_embedding_dim,
    topic_embedding_dim=topic_embedding_dim,
)

In [16]:
# Train for at most ten epochs on the training loader; no validation loader
# is passed to the trainer.
trainer = pl.Trainer(max_epochs=10)
trainer.fit(ncf, train_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name                    | Type       | Params
-------------------------------------------------------
0 | student_embedding_layer | Embedding  | 5.3 K 
1 | topic_embedding_layer   | Embedding  | 1.6 K 
2 | network                 | Sequential | 2.8 K 
3 | loss                    | BCELoss    | 0     
-------------------------------------------------------
9.7 K     Trainable params
0         Non-trainable params
9.7 K     Total params
0.039     Total estimated model params size (MB)
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.
