<a href="https://colab.research.google.com/github/davhofer/recommender/blob/main/experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def keepalive():
  """Spin forever on trivial integer arithmetic; this function never returns.

  NOTE(review): presumably meant to keep the kernel busy so that a hosted
  session (e.g. Colab) is not reclaimed as idle -- confirm the intent.
  """
  x = 1
  while True:
    # One pass maps x: 1 -> 2 -> 4 -> 2 -> 1, so x is back at 1 after every
    # iteration and never grows.
    x += 1
    x *= 2
    x -= x//2
    x -= 1

In [3]:
import gc 


In [None]:
# Google Drive is only available (and only needed) when running on Colab.
# Guard the import so "Restart & Run All" also works on a local kernel,
# where the `google.colab` package does not exist.
try:
  from google.colab import drive
except ImportError:
  drive = None

if drive is not None:
  drive.mount('/content/gdrive')
else:
  print('google.colab not available; skipping Google Drive mount.')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!git clone https://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@github.com/davhofer/recommender.git recommender

import sys
sys.path.insert(0, '/content/recommender')

!pip install --upgrade -r recommender/requirements.txt

fatal: destination path 'recommender' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [1]:
import os

import pandas as pd

# Pick the data location automatically instead of overwriting DATA_DIR by
# hand: use the mounted Google Drive folder when it exists (Colab), otherwise
# fall back to the local checkout layout.
COLAB_DATA_DIR = '/content/gdrive/MyDrive'
LOCAL_DATA_DIR = '../data'
DATA_DIR = COLAB_DATA_DIR if os.path.isdir(COLAB_DATA_DIR) else LOCAL_DATA_DIR

STUDY_DIR = DATA_DIR + '/study'

users = pd.read_csv(f'{DATA_DIR}/users.csv.gz')
topics = pd.read_csv(f'{DATA_DIR}/topics_translated.csv')
documents = pd.read_csv(f'{DATA_DIR}/documents.csv.gz')

# Original note: "use study for less data, testing" -- i.e. point the two
# reads below at STUDY_DIR instead of DATA_DIR to experiment on less data.
events = pd.read_csv(f'{DATA_DIR}/events.csv.gz')
transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv.gz')

In [2]:
# seeding everything for deterministic results

import os
import numpy as np
import torch
import random

SEED = 131

# PYTHONHASHSEED is read when an interpreter starts, so exporting it here
# affects processes spawned from this kernel, not the running kernel itself.
os.environ['PYTHONHASHSEED'] = str(SEED)

# Seed every random number generator the experiment touches with one value.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [3]:
from data import preprocess_events, LeaveOneOutSplitter, create_topic_features, create_user_features


def create_datasplitter(MATH, GERMAN, USE_FEATURES):
  """Build the leave-one-out splitter for the selected events.

  Reads the module-level `events`, `topics`, `users`, `transactions` and
  `documents` DataFrames loaded earlier in the notebook.

  Args:
    MATH: forwarded to `preprocess_events` as its `math` argument.
    GERMAN: forwarded to `preprocess_events` as its `german` argument.
    USE_FEATURES: when truthy, user and topic feature tables are created and
      handed to the splitter; otherwise both are passed as None.

  Returns:
    The configured `LeaveOneOutSplitter` instance.
  """
  TEST_USER_FRAC = 0.85
  VAL_USER_FRAC = 0.15
  TRAIN_NEGATIVE_FRAC = 2.0

  events_preprocessed = preprocess_events(events, topics, math=MATH, german=GERMAN)

  # Feature tables are only built on request; without features the splitter
  # receives None for both.
  user_features = None
  topic_features = None
  if USE_FEATURES:
    user_features = create_user_features(users, transactions)
    topic_features = create_topic_features(topics, documents, events)

  return LeaveOneOutSplitter(
      events_preprocessed,
      device=None,
      use_features=USE_FEATURES,
      user_features=user_features,
      topic_features=topic_features,
      test_user_frac=TEST_USER_FRAC,
      val_user_frac=VAL_USER_FRAC,
      train_negative_frac=TRAIN_NEGATIVE_FRAC,
      test_sample_strat="newest"
  )

In [4]:
# Events preprocessed with only the `german` flag switched on.
events_preprocessed_german = preprocess_events(
    events,
    topics,
    math=False,
    german=True,
)

In [5]:
# Events preprocessed with only the `math` flag switched on.
events_preprocessed_math = preprocess_events(
    events,
    topics,
    math=True,
    german=False,
)

In [6]:
# Distinct topic ids occurring in each of the two preprocessed event tables.
german_ids = [*events_preprocessed_german['topic_id'].unique()]
math_ids = [*events_preprocessed_math['topic_id'].unique()]

In [7]:
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [8]:
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl 
import torch
from torch import optim, nn
import pytorch_lightning as pl
import numpy as np
import pandas as pd 
from evaluation import HitRate_NDCG_MRR, metrics_per_topic, HitRate_NDCG_MRR_from_CSV, getMRR, getHitRatio, getNDCG

from ncf_model import NCFNetwork
from data import LeaveOneOutDS

# Mini-batch size shared by the train/val/test DataLoaders built in run_model.
BATCH_SIZE = 64


In [11]:
from pytorch_lightning.callbacks import EarlyStopping, TQDMProgressBar

def run_model(USE_FEATURES, PREDICTIVE_FACTORS, STUDENT_EMBEDDING_DIM, TOPIC_EMBEDDING_DIM, data_splitter, joint, epochs=10, patience=3):
    """Train an NCFNetwork on the splitter's data, test it, and return its results.

    Uses the module-level `BATCH_SIZE`, `german_ids` and `math_ids`.

    Args:
        USE_FEATURES: passed to the network as `use_features`.
        PREDICTIVE_FACTORS: passed to the network as `predictive_factors`.
        STUDENT_EMBEDDING_DIM: passed to the network as `student_embedding_dim`.
        TOPIC_EMBEDDING_DIM: passed to the network as `topic_embedding_dim`.
        data_splitter: source of the train/val/test data, the user and topic
            ids, and the student/topic/feature counts.
        joint: passed to the network as `joint`.
        epochs: upper bound on training epochs (`max_epochs` of the trainer).
        patience: early-stopping patience on `val_loss`.

    Returns:
        `eval_results` of the trained network after the test run.
    """

    def _as_dataset(frame):
        # Every split is wrapped with the same user/topic id lookups.
        return LeaveOneOutDS(frame, data_splitter.get_user_ids(), data_splitter.get_topic_ids())

    train_set = _as_dataset(data_splitter.get_data())
    val_set = _as_dataset(data_splitter.get_val_data())
    test_set = _as_dataset(data_splitter.get_test_data())

    # Only the training batches are shuffled.
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
    val_loader, test_loader = (
        DataLoader(split, batch_size=BATCH_SIZE, shuffle=False)
        for split in (val_set, test_set)
    )

    network_config = {
        "num_students": data_splitter.num_students,
        "num_topics": data_splitter.num_topics,
        "student_embedding_dim": STUDENT_EMBEDDING_DIM,
        "topic_embedding_dim": TOPIC_EMBEDDING_DIM,
        "predictive_factors": PREDICTIVE_FACTORS,
        "use_features": USE_FEATURES,
        "intermediate_size_divisor": 2,
        "output_MLP_num_layers": 3,
        "num_user_features": data_splitter.num_user_features,
        "num_topic_features": data_splitter.num_topic_features,
        "loss": nn.BCELoss(),
        "joint": joint,
        "topic_ids": data_splitter.get_topic_ids(),
        "german_ids": german_ids,
        "math_ids": math_ids,
    }
    model = NCFNetwork(**network_config)

    # Stop once val_loss has failed to improve for `patience` validation checks.
    stop_on_plateau = EarlyStopping(
        monitor="val_loss",
        min_delta=0.0,
        patience=patience,
        verbose=True,
        mode="min",
    )
    progress_bar = TQDMProgressBar(refresh_rate=10)

    trainer = pl.Trainer(
        accelerator="auto",
        devices=1,
        accumulate_grad_batches=1,
        max_epochs=epochs,
        callbacks=[progress_bar, stop_on_plateau],
    )

    trainer.fit(model=model, train_dataloaders=train_loader, val_dataloaders=val_loader)
    trainer.test(model=model, dataloaders=test_loader)

    return model.eval_results

In [12]:
# NOTE: the three flags below select which dataset is built and trained on.
USE_MATH = True
USE_GERMAN = False
USE_FEATURES = True
#############################################################################

data_splitter = create_datasplitter(
    MATH=USE_MATH,
    GERMAN=USE_GERMAN,
    USE_FEATURES=USE_FEATURES,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df.apply(get_val, axis=1)


Sampled initial validation and test interactions
Completed test dataset
Completed validation dataset
Completed train dataset
Adding features...


In [14]:
# `joint` is truthy only when both subject flags are enabled.
joint = USE_MATH and USE_GERMAN

results = run_model(
    USE_FEATURES=USE_FEATURES,
    PREDICTIVE_FACTORS=16,
    STUDENT_EMBEDDING_DIM=128,
    TOPIC_EMBEDDING_DIM=32,
    data_splitter=data_splitter,
    joint=joint,
    epochs=10,
    patience=4,
)

  rank_zero_warn(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name                    | Type       | Params
-------------------------------------------------------
0 | student_embedding_layer | Embedding  | 785 K 
1 | topic_embedding_layer   | Embedding  | 1.4 K 
2 | user_embed_MLP          | Sequential | 17.6 K
3 | user_feature_MLP        | Sequential | 82    
4 | topic_embed_MLP         | Sequential | 1.1 K 
5 | topic_feature_MLP       | Sequential | 5     
6 | network                 | Sequential | 6.1 K 
7 | loss                    | BCELoss    | 0     
-------------------------------------------------------
811 K     Trainable params
0         Non-trainable params
811 K     Total params
3.246     Total estimated model params size (MB)


intermediate layer size (concatenated): 85
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00, 302.56it/s]HitRate@10 0.5
NDCG@10 0.18004653514196872
MRR@10 0.08571428571428572
                                                                            

  rank_zero_warn(
  for user, topic in user_predict:
  rank_zero_warn(


Epoch 0: 100%|██████████| 1092/1092 [00:07<00:00, 150.44it/s, v_num=30, train_loss_step=0.320]

  for user, topic in user_predict:


HitRate@10 0.5505984766050055
NDCG@10 0.31004013586089185
MRR@10 0.237301155500285
Epoch 0: 100%|██████████| 1092/1092 [00:11<00:00, 97.28it/s, v_num=30, train_loss_step=0.320, val_loss=0.366, train_loss_epoch=0.505]

Metric val_loss improved. New best score: 0.366


Epoch 1: 100%|██████████| 1092/1092 [00:07<00:00, 146.57it/s, v_num=30, train_loss_step=0.454, val_loss=0.366, train_loss_epoch=0.505]

  for user, topic in user_predict:


HitRate@10 0.5212187159956474
NDCG@10 0.3024305921110445
MRR@10 0.23594573121232534
Epoch 1: 100%|██████████| 1092/1092 [00:11<00:00, 97.75it/s, v_num=30, train_loss_step=0.454, val_loss=0.267, train_loss_epoch=0.452] 

Metric val_loss improved by 0.099 >= min_delta = 0.0. New best score: 0.267
`Trainer.fit` stopped: `max_epochs=2` reached.


Epoch 1: 100%|██████████| 1092/1092 [00:11<00:00, 97.50it/s, v_num=30, train_loss_step=0.454, val_loss=0.267, train_loss_epoch=0.452]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(



Testing DataLoader 0: 100%|██████████| 3275/3275 [00:20<00:00, 158.80it/s]

  for user, topic in user_predict:


Testing DataLoader 0: 100%|██████████| 3275/3275 [00:21<00:00, 148.93it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Runningstage.testing metric      DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       HitRate@10           0.5294456167274122
         MRR@10             0.22598707748333685
         NDCG@10            0.29645412718120967
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
