In [1]:
# Import the tables of the data set as dataframes.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Root of the data set and the smaller "study" subset that lives underneath it.
DATA_DIR = './../data'
STUDY_DIR = f'{DATA_DIR}/study'

# The users table is read from the data root.
users = pd.read_csv(f'{DATA_DIR}/users.csv.gz')

# Events and transactions are read from the study subset: less data, handy for testing.
events = pd.read_csv(f'{STUDY_DIR}/events.csv.gz')
transactions = pd.read_csv(f'{STUDY_DIR}/transactions.csv.gz')

In [2]:

import torch
from torch.utils.data import Dataset, DataLoader
import random
import pandas as pd
import numpy as np

class ItemKNNDS(Dataset):
    """Map-style dataset over an indexable collection of (user, topic, y) triples."""

    def __init__(self, data):
        """Keep a reference to `data`; nothing is copied or converted here."""
        self.data = data

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, index):
        """Return sample `index` as three tensors.

        The user and topic entries are wrapped as-is; the target is wrapped in a
        one-element list, so it comes back as a tensor of shape (1,).
        """
        sample_user, sample_topic, target = self.data[index]
        return (
            torch.tensor(sample_user),
            torch.tensor(sample_topic),
            torch.tensor([target]),
        )

class ItemKNNSplitter:
    """Build a mean-centred topic x user interaction matrix and hold out test cells.

    Events without a `topic_id` are dropped, the remaining events are counted per
    (user_id, topic_id) pair, and pairs with fewer than `min_interactions` events
    are discarded. The counts are pivoted into a matrix indexed by `topic_id`
    with one column per `user_id`, and every row has its own mean subtracted.

    A fraction of the users is then sampled; for each sampled user one observed
    topic is picked at random, its (centred) value is recorded as a test sample
    and the corresponding matrix cell is set to NaN.

    NOTE(review): row means are computed *before* the held-out cells are blanked,
    so the centring still uses the held-out values — confirm this is intended.

    Args:
        df: events frame with at least `user_id`, `topic_id` and `event_id` columns.
        test_user_frac: fraction (0..1) of users that contribute one test sample.
        min_interactions: minimum number of events a (user, topic) pair needs to
            be kept in the matrix.
        seed: optional seed for the split. When None the global `random` module
            state is used, so `random.seed(...)` in the caller still applies.

    Raises:
        ValueError: if `test_user_frac` is outside [0, 1].
    """

    def __init__(self,
                 df,
                 test_user_frac=0.5,
                 min_interactions=5,
                 seed=None,
                 ):
        if not 0 <= test_user_frac <= 1:
            raise ValueError(
                f"test_user_frac must be in [0, 1], got {test_user_frac!r}"
            )

        # A private generator keeps a seeded split independent of other users of
        # the global RNG; without a seed fall back to the module-level functions.
        rng = random if seed is None else random.Random(seed)

        events_df = df[df['topic_id'].notna()]

        #events_df = events_df[events_df['action'].isin(['GO_TO_THEORY', 'SUBMIT_ANSWER'])]

        # Number of events per (user, topic) pair, keeping only the frequent pairs.
        interactions = (
            events_df[['user_id', 'topic_id', 'event_id']]
            .groupby(['user_id', 'topic_id'])
            .count()
        )
        interactions = interactions[interactions['event_id'] >= min_interactions]
        interactions = interactions.reset_index().rename(columns={'event_id': 'count'})

        self.matrix = interactions.pivot_table(index='topic_id', columns='user_id', values='count')
        # Centre every topic row on its own mean.
        self.matrix = self.matrix.subtract(self.matrix.mean(axis=1), axis=0)

        # Sorted so that a given seed always samples from the same ordering.
        user_ids = sorted(interactions['user_id'].unique().tolist())
        test_size = int(test_user_frac * len(user_ids))
        test_user_ids = rng.sample(user_ids, test_size)

        self.test_samples = []

        for uid in test_user_ids:
            # Topics this user has an observed value for.
            candidate_topics = self.matrix[uid].dropna().index.tolist()
            tid = rng.choice(candidate_topics)
            val = self.matrix.at[tid, uid]
            # Single-step label assignment: chained `matrix[uid][tid] = ...` may write
            # to a temporary copy and leave the held-out value in the matrix.
            self.matrix.at[tid, uid] = np.nan
            self.test_samples.append((uid, tid, val))

    def get_matrix(self):
        """Return the centred topic x user matrix with the held-out cells set to NaN."""
        return self.matrix

    def get_test_samples(self):
        """Return the held-out samples as a list of (user_id, topic_id, value) tuples."""
        return self.test_samples

    # def get_test_dataset(self):
    #     return ItemKNNDS(self.get_test_samples())


In [3]:
from itemknn_model import ItemKNN

# ItemKNNSplitter draws its test split with the global `random` module, so seed
# it first: otherwise every run holds out different (user, topic) cells and the
# results below cannot be reproduced.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

splitter = ItemKNNSplitter(events)

# NOTE(review): 5 is presumably the number of neighbours (k) — confirm against itemknn_model.
model = ItemKNN(5)
model.train(splitter.get_matrix())



In [4]:
# Feed every held-out (user_id, topic_id, value) sample to the model's test step.
for held_out in splitter.get_test_samples():
    model.test_step(held_out)

In [5]:
# Display the model's `predict_proba` attribute. The printed tensor contains `nan`
# entries. NOTE(review): presumably filled in by the test_step calls — confirm.
model.predict_proba

tensor([ 1.5188e+01,  4.4845e+01, -9.5294e+00, -4.1922e+01,  3.0618e+01,
        -2.2926e+01,  0.0000e+00,  0.0000e+00,  6.0454e+00, -3.6847e+01,
         4.8257e+01,  0.0000e+00,  5.4167e-01,  2.1227e+01, -1.2461e+01,
         1.3955e+01,         nan,  0.0000e+00, -6.7115e+00,  2.2637e+01,
         0.0000e+00,  0.0000e+00, -2.8357e+01, -3.6490e+01,  0.0000e+00,
         1.3991e+01,  2.6231e+01, -8.7333e+01,  0.0000e+00, -4.7397e+01,
         1.8471e+01,  1.9567e+02,  0.0000e+00, -2.2926e+01,  1.8618e+02,
        -1.7963e+01, -3.0377e+02,  1.7519e+02,  0.0000e+00, -3.0709e+01,
         3.0722e+01,  1.3155e+02, -1.3396e+01, -3.8590e+01,  0.0000e+00,
         9.4435e+00, -1.6532e+01,  3.1183e-01,  0.0000e+00,         nan,
         0.0000e+00,  0.0000e+00, -2.4793e+01, -1.1022e+01,  0.0000e+00,
        -2.3514e+01, -1.5504e+01,  1.3366e+01, -1.9124e+00, -1.4757e+01,
        -4.3078e+01,  3.4401e+01,  0.0000e+00,  2.7196e+01,  0.0000e+00,
        -3.9709e+01, -1.1693e+01, -1.2126e+03,  4.3

In [6]:
import csv


def get_predictions(data, probas):
    """Pair each sample with its prediction.

    Args:
        data: iterable of samples; the first three entries of each are used.
        probas: iterable of scalar-like objects exposing `.item()`.

    Returns:
        A list of 4-tuples: the sample's first three entries followed by the
        prediction as a Python scalar. Stops at the shorter of the two inputs.
    """
    rows = []
    for sample, score in zip(data, probas):
        rows.append((sample[0], sample[1], sample[2], score.item()))
    return rows


def write_outputs(data, loss_logs, model_description, output_dir):
    """Write predictions and the loss history to two CSV files in `output_dir`.

    Args:
        data: iterable of rows written under the header
            user_id, topic_id, was_interaction, predict_proba.
        loss_logs: iterable of loss values; each is written with its 1-based position.
        model_description: prefix used for both file names.
        output_dir: existing directory that receives
            `<model_description>_probas.csv` and `<model_description>_loss.csv`.
    """
    probas_output_path = f"{output_dir}/{model_description}_probas.csv"

    # newline='' leaves line endings to the csv module; without it every row is
    # followed by a blank line on platforms that translate '\n' (e.g. Windows).
    with open(probas_output_path, 'w', newline='', encoding='utf-8') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['user_id', 'topic_id', 'was_interaction', 'predict_proba'])
        csv_out.writerows(data)

    loss_outputs_path = f"{output_dir}/{model_description}_loss.csv"

    with open(loss_outputs_path, 'w', newline='', encoding='utf-8') as f:
        csv_out = csv.writer(f)
        csv_out.writerow(['loss_value', 'iteration'])
        # Iterations are numbered from 1.
        for idx, loss_value in enumerate(loss_logs, 1):
            csv_out.writerow((loss_value, idx))

In [8]:
# Pair each held-out sample with its prediction, then write the CSV files into the
# current directory under the 'test' prefix (no loss history is passed).
predictions = get_predictions(splitter.test_samples, model.predict_proba)
write_outputs(predictions, [], 'test', '.')