In [1]:
def keepalive():
  x = 1
  while True:
    x += 1
    x *= 2
    x -= x//2
    x -= 1

In [2]:
import gc 


In [3]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
!git clone https://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@github.com/davhofer/recommender.git recommender

import sys

sys.path.insert(0, '/content/recommender')

!pip install --upgrade -r recommender/requirements.txt

fatal: destination path 'recommender' already exists and is not an empty directory.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import pandas as pd

DATA_DIR = '/content/gdrive/MyDrive/data'

STUDY_DIR = DATA_DIR + '/study'

users = pd.read_csv(f'{DATA_DIR}/users.csv.gz')
topics = pd.read_csv(f'{DATA_DIR}/topics_translated.csv')
documents = pd.read_csv(f'{DATA_DIR}/documents.csv.gz')

# use study for less data, testing
events = pd.read_csv(f'{DATA_DIR}/events.csv.gz')
transactions = pd.read_csv(f'{DATA_DIR}/transactions.csv.gz')

In [6]:
# seeding everything for deterministic results

import os
import numpy as np
import torch
import random

SEED = 131

random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [7]:
from data import preprocess_events, LeaveOneOutSplitter, create_topic_features, create_user_features


def create_datasplitter(MATH, GERMAN, USE_FEATURES):
  TEST_USER_FRAC = 0.85
  VAL_USER_FRAC = 0.15
  TRAIN_NEGATIVE_FRAC = 2.0

  events_preprocessed = preprocess_events(events, topics, math=MATH, german=GERMAN)
  user_features = None if not USE_FEATURES else create_user_features(users, transactions)
  topic_features = None if not USE_FEATURES else create_topic_features(topics, documents, events)

  NUM_USER_FEATURES = 0
  NUM_TOPIC_FEATURES = 0
  if USE_FEATURES:
    NUM_USER_FEATURES = user_features.shape[1]
    NUM_TOPIC_FEATURES = topic_features.shape[1]

  data_splitter = LeaveOneOutSplitter(
      events_preprocessed,
      device=None,
      use_features=USE_FEATURES,
      user_features=user_features if USE_FEATURES else None,
      topic_features=topic_features if USE_FEATURES else None,
      test_user_frac=TEST_USER_FRAC,
      val_user_frac=VAL_USER_FRAC,
      train_negative_frac=TRAIN_NEGATIVE_FRAC,
      test_sample_strat="newest"
  )
  return data_splitter

In [8]:
events_preprocessed_german = preprocess_events(events, topics, math=False, german=True)

In [9]:
events_preprocessed_math = preprocess_events(events, topics, math=True, german=False)

In [10]:
german_ids = list(events_preprocessed_german['topic_id'].unique())
math_ids = list(events_preprocessed_math['topic_id'].unique())

In [11]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [12]:
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl 
import torch
from torch import optim, nn
import pytorch_lightning as pl
import numpy as np
import pandas as pd 
from evaluation import HitRate_NDCG_MRR, metrics_per_topic, HitRate_NDCG_MRR_from_CSV, getMRR, getHitRatio, getNDCG

from ncf_model import NCFNetwork
from data import LeaveOneOutDS

BATCH_SIZE = 64


In [29]:
from pytorch_lightning.callbacks import EarlyStopping, TQDMProgressBar

def run_model(USE_FEATURES, PREDICTIVE_FACTORS, STUDENT_EMBEDDING_DIM, TOPIC_EMBEDDING_DIM, data_splitter, joint, epochs=10, patience=3):

    train_ds = LeaveOneOutDS(data_splitter.get_data(), data_splitter.get_user_ids(), data_splitter.get_topic_ids())
    val_ds = LeaveOneOutDS(data_splitter.get_val_data(), data_splitter.get_user_ids(), data_splitter.get_topic_ids())
    test_ds = LeaveOneOutDS(data_splitter.get_test_data(), data_splitter.get_user_ids(), data_splitter.get_topic_ids())

    test_ds_gender_swap = LeaveOneOutDS(data_splitter.get_test_data_gender_swap(), data_splitter.get_user_ids(), data_splitter.get_topic_ids())
    
    train_dataloader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_dataloader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)
    test_dataloader = DataLoader(test_ds, batch_size=BATCH_SIZE, shuffle=False)

    test_dataloader_gender_swap = DataLoader(test_ds_gender_swap, batch_size=BATCH_SIZE, shuffle=False)
    ncf = NCFNetwork(
        num_students=data_splitter.num_students,
        num_topics=data_splitter.num_topics,
        student_embedding_dim=STUDENT_EMBEDDING_DIM,
        topic_embedding_dim=TOPIC_EMBEDDING_DIM,
        predictive_factors=PREDICTIVE_FACTORS,
        use_features=USE_FEATURES,
        intermediate_size_divisor=2,
        output_MLP_num_layers=3,
        num_user_features=data_splitter.num_user_features,
        num_topic_features=data_splitter.num_topic_features,
        loss=nn.BCELoss(),
        joint=joint,
        topic_ids=data_splitter.get_topic_ids(),
        german_ids=german_ids,
        math_ids=math_ids
        )
    

    early_stop_callback = EarlyStopping(
        monitor="val_loss",
        min_delta=0.0,
        patience=patience,
        verbose=True,
        mode="min"
    )

    trainer = pl.Trainer(
            accelerator="auto",
            devices=1,
            accumulate_grad_batches=1,
            max_epochs=epochs,
            callbacks=[TQDMProgressBar(refresh_rate=10), early_stop_callback]
    )

    trainer.fit(model=ncf, train_dataloaders=train_dataloader, val_dataloaders=val_dataloader)
    trainer.test(model=ncf, dataloaders=test_dataloader)
    original_eval = ncf.eval_results

    trainer.test(model=ncf, dataloaders=test_dataloader_gender_swap)
    
    return original_eval, ncf.eval_results

In [14]:
# NOTE: use the flags below to select what dataset to create and to train on!
USE_MATH = True 
USE_GERMAN = False 
USE_FEATURES = True
TEST_GENDER = True
#############################################################################

data_splitter = create_datasplitter(USE_MATH, USE_GERMAN, USE_FEATURES)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col_name] = df.apply(get_val, axis=1)


Sampled initial validation and test interactions
Completed test dataset
Completed validation dataset
Completed train dataset
Adding features...


In [31]:
joint = USE_MATH and USE_GERMAN

results, results_gender_swap = run_model(USE_FEATURES, 16, 128, 32, data_splitter, joint, patience=4, epochs=10)

  rank_zero_warn(
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                    | Type       | Params
-------------------------------------------------------
0 | student_embedding_layer | Embedding  | 785 K 
1 | topic_embedding_layer   | Embedding  | 1.4 K 
2 | user_embed_MLP          | Sequential | 17.6 K
3 | user_feature_MLP        | Sequential | 82    
4 | topic_embed_MLP         | Sequential | 1.1 K 
5 | topic_feature_MLP       | Sequential | 5     
6 | network                 | Sequential | 6.1 K 
7 | loss                    | BCELoss    | 0     
---------------------

intermediate layer size (concatenated): 85


Sanity Checking: 0it [00:00, ?it/s]

  for user, topic in user_predict:


HitRate@10 0.5
NDCG@10 0.23298993730885964
MRR@10 0.1527777777777778


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

  for user, topic in user_predict:
INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved. New best score: 0.347


HitRate@10 0.5571273122959739
NDCG@10 0.3130421541369026
MRR@10 0.23952838661761405


Validation: 0it [00:00, ?it/s]

  for user, topic in user_predict:
INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.017 >= min_delta = 0.0. New best score: 0.330


HitRate@10 0.5484221980413493
NDCG@10 0.31054143067588713
MRR@10 0.23864967096740763


Validation: 0it [00:00, ?it/s]

  for user, topic in user_predict:
INFO:pytorch_lightning.callbacks.early_stopping:Metric val_loss improved by 0.035 >= min_delta = 0.0. New best score: 0.295


HitRate@10 0.5429815016322089
NDCG@10 0.3123206821330281
MRR@10 0.24243527298478335


Validation: 0it [00:00, ?it/s]

  for user, topic in user_predict:


HitRate@10 0.5603917301414582
NDCG@10 0.32127541269293547
MRR@10 0.24865191633417968


Validation: 0it [00:00, ?it/s]

  for user, topic in user_predict:


HitRate@10 0.5854189336235038
NDCG@10 0.32793250422578435
MRR@10 0.2505013213119851


Validation: 0it [00:00, ?it/s]

  for user, topic in user_predict:


HitRate@10 0.5810663764961915
NDCG@10 0.32976981121287813
MRR@10 0.25360165466259044


Validation: 0it [00:00, ?it/s]

  for user, topic in user_predict:
INFO:pytorch_lightning.callbacks.early_stopping:Monitored metric val_loss did not improve in the last 4 records. Best score: 0.295. Signaling Trainer to stop.
INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


HitRate@10 0.5865070729053319
NDCG@10 0.3293859322801955
MRR@10 0.2518800628702696


Testing: 0it [00:00, ?it/s]

  for user, topic in user_predict:


INFO:pytorch_lightning.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

  for user, topic in user_predict:


In [41]:
def decode_gender(a):
  if a=='MALE':
    return 'Male'
  elif a=='FEMALE':
    return 'Female'
  else:
    return 'Other'

In [33]:
df = pd.DataFrame({'user_id': results[0], 'topic_id': results[1], 'was_interaction': results[2].flatten(), 'predict_proba': results[3].flatten()})
df_gender_swap = pd.DataFrame({'user_id': results_gender_swap[0], 'topic_id': results_gender_swap[1], 'was_interaction': results_gender_swap[2].flatten(), 'predict_proba': results_gender_swap[3].flatten()})

In [63]:
print(df['user_id'])

0         5539.0
1         5539.0
2         5539.0
3         5539.0
4         5539.0
           ...  
209561    1218.0
209562    1218.0
209563    1218.0
209564    1218.0
209565    1218.0
Name: user_id, Length: 209566, dtype: float32


In [69]:
user_predict = df.groupby(['user_id'])
gender_swap_dict = {}

for user, topic in user_predict:
    # Get the top N of highest probability and rank them 
    if (topic['was_interaction']==1).any() == False:
        continue
    top5 = [x for _, x in sorted(zip(topic['predict_proba'], topic['topic_id']), reverse=True)][:5]

    if (users['user_id'] == user).any():
      gender = users[users['user_id'] == user]['gender']
      gender_swap_dict[user] = {'Female': None, 'Male': None, 'Other': None}
      gender_swap_dict[user][decode_gender(gender)] = top5
    else:
      print('USER does not exist, check again!')
    
for user, topic in df_gender_swap.groupby(['user_id']):
    # Get the top N of highest probability and rank them 
    if (topic['was_interaction']==1).any() == False:
        continue
    top5 = [x for _, x in sorted(zip(topic['predict_proba'], topic['topic_id']), reverse=True)][:5]
    if (users['user_id'] == user).any():
      gender = users[users['user_id'] == user]['gender']
      gender_swap_dict[user][decode_gender(gender)] = top5
      print(gender_swap_dict[user])

  for user, topic in user_predict:


USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not exist, check again!
USER does not 

KeyboardInterrupt: ignored

In [70]:
import numpy as np

# Save
np.save('gender_swap_dict.npy', gender_swap_dict) 

# Load
read_dictionary = np.load('gender_swap_dict.npy',allow_pickle='TRUE').item()
print(read_dictionary) # displays "world"

{}
