In [8]:
#!pip install torch
#!pip install torch_geometric
#!pip install matplotlib
#!pip install pandas

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data, HeteroData
import torch_geometric.transforms as T

# For KDD Cup dataset

In [2]:
def get_data_object(user_ids, resource_ids, edge_index, edge_attr, edge_time, edge_y):
    """Assemble a user/resource interaction graph as a ``HeteroData`` object.

    Args:
        user_ids: Array-like of user identifiers, stored as ``data["user"].node_id``.
        resource_ids: Array-like of resource identifiers, stored as
            ``data["resource"].node_id``.
        edge_index: Array-like with one ``(user, resource)`` pair per row. It is
            transposed before being stored, so the stored ``edge_index`` has one
            column per interaction.
        edge_attr: Array-like of per-interaction features, stored as ``edge_attr``.
        edge_time: Array-like of per-interaction timestamps, stored as ``time``.
        edge_y: Array-like of per-interaction labels, stored as ``edge_y``.

    Returns:
        The graph after ``T.ToUndirected()`` has been applied. In the outputs shown
        in this notebook the result also contains a
        ``("resource", "rev_accesses", "user")`` relation with the same attributes.
    """
    graph = HeteroData()

    # Node stores only carry the identifiers; no node features are attached here.
    graph["user"].node_id = torch.tensor(user_ids)
    graph["resource"].node_id = torch.tensor(resource_ids)

    # All per-interaction tensors live on the single user -> resource relation.
    accesses = graph["user", "accesses", "resource"]
    accesses.edge_index = torch.tensor(edge_index).t().contiguous()
    accesses.edge_attr = torch.tensor(edge_attr)
    accesses.time = torch.tensor(edge_time)
    accesses.edge_y = torch.tensor(edge_y)

    return T.ToUndirected()(graph)

In [3]:
path = "data/act-mooc/"

# Load the three tab-separated tables (actions, labels, features) in that order.
# They are combined row by row below, so they are assumed to describe the same
# actions in the same order -- TODO confirm against the dataset description.
actions_df, labels_df, features_df = (
    pd.read_csv(path + tsv_name, sep='\t')
    for tsv_name in ('mooc_actions.tsv', 'mooc_action_labels.tsv', 'mooc_action_features.tsv')
)

# One edge per action: (USERID, TARGETID) with four features, a timestamp and a label.
data = get_data_object(
    user_ids=actions_df['USERID'].unique(),
    resource_ids=actions_df['TARGETID'].unique(),
    edge_index=actions_df[['USERID', 'TARGETID']].values,
    edge_attr=features_df[['FEATURE0', 'FEATURE1', 'FEATURE2', 'FEATURE3']].values,
    edge_time=actions_df['TIMESTAMP'].values,
    edge_y=labels_df['LABEL'].values,
)

data

HeteroData(
  user={ node_id=[7047] },
  resource={ node_id=[97] },
  (user, accesses, resource)={
    edge_index=[2, 411749],
    edge_attr=[411749, 4],
    time=[411749],
    edge_y=[411749],
  },
  (resource, rev_accesses, user)={
    edge_index=[2, 411749],
    edge_attr=[411749, 4],
    time=[411749],
    edge_y=[411749],
  }
)

In [4]:
# Latest timestamp among all MOOC actions (upper end of the observed time range).
actions_df['TIMESTAMP'].max()

2572086.0

In [13]:
# Put each action's timestamp next to its label (the two tables are aligned on
# their row index), then take the column-wise maximum over the rows with
# LABEL == 1, i.e. the latest timestamp of a positively labelled action.
tsl = pd.concat([actions_df['TIMESTAMP'], labels_df['LABEL']], axis=1)
tsl.loc[tsl['LABEL'] == 1].max()

TIMESTAMP    2570620.0
LABEL              1.0
dtype: float64

Sanity check for the MOOC data object

In [96]:
# Expected values are the statistics listed at https://snap.stanford.edu/data/act-mooc.html
# Number of user -> resource interactions (one column of edge_index per interaction).
assert data["user", "accesses", "resource"].edge_index.size(1) == 411749
# Node counts on both sides of the graph.
assert data['resource'].num_nodes == 97
assert data['user'].num_nodes == 7047
# Number of interactions whose label equals 1.
assert int((data["user", "accesses", "resource"].edge_y == 1).sum()) == 4066

In [97]:
# Persist the MOOC graph. When the cells are run top to bottom, `path` is still
# "data/act-mooc/" here, so the file is written next to the raw TSV files.
torch.save(data, path + "graph.pt")

# For Junyi dataset

In [21]:
# NOTE: `path` is rebound here; every later cell that uses it reads from or
# writes to the Junyi directory instead of the MOOC one.
path = "data/junyi/"

# Read the full problem-attempt log (Log_Problem.csv) into memory.
log_problem_df = pd.read_csv(path + "Log_Problem.csv")


In [22]:
# (rows, columns) of the full Junyi problem log.
log_problem_df.shape

(16217311, 14)

In [23]:
# Preview the first five rows to see which columns are available.
log_problem_df.head()

Unnamed: 0,timestamp_TW,uuid,ucid,upid,problem_number,exercise_problem_repeat_session,is_correct,total_sec_taken,total_attempt_cnt,used_hint_cnt,is_hint_used,is_downgrade,is_upgrade,level
0,2019-05-26 21:00:00 UTC,FLy+lviglNR5Y1l0Xiijnl6QHySBcpKHJLCtQ6ogm2Q=,KDOmuTrY/IJzDP4kIgIYCBiGyTymsJ8Iy4cDB35WGYg=,Vbs92l4JmdiWkUEm/iahxnUTaac2oN1IlUtXB7JcfoE=,18,2,True,33,1,0,False,False,True,3
1,2019-05-17 16:30:00 UTC,+Gqj2nalc6M9fusyVECTC0AN7UQdDQTXESIuElkDltU=,COZ39Wo+uIUO2s7c2VGEHjJf6Vx0xifxVAiaeHtaTdk=,Ek+pIeHNNoEo0tGEq91eBcBmGgy3+A5RWhpj95zTyHM=,4,1,True,8,1,0,False,,,0
2,2019-05-15 19:15:00 UTC,6D5QN8j8ng/VR74ES3A0zqAj0bIFFyaKjKEj8ZyXjQ8=,TwyqyV1uJYlDAX8wX/PtTCVZEBo/APIVfTzzleGkNCQ=,1MBa2f5Qog4JBoAuUfJf0fxeJctdEirAqKgfsg246eI=,9,1,True,17,1,0,False,,,0
3,2019-05-05 14:45:00 UTC,GgTZuCqZXObthtK6GAwqvlHrTMm5pKHWeezQxL/pcKc=,tBo6ECyT8IlKAM8UhQHWkqv92PRLcSiwuerfC7vNX+w=,kdMy2nG+QVMjPkuaMEWs0yV/sYZVoG1vm7zM0fCy+qk=,2,1,True,10,1,0,False,,,0
4,2019-05-14 16:45:00 UTC,JMNKWoU0CkMSzgQ8bCnmCYlD8jEzAVge3lHMYLXKM2g=,vVpSKAMQbTMvtdERR0ksOeRmmaFt0R210t4Z//0RpPA=,jjPR8fmkLSFoCQQYB4g6kI8mgdcK3sKtMirKUvfmZIk=,6,1,True,98,1,0,False,,,0


In [24]:
# Number of distinct `problem_number` values in the full log.
len(log_problem_df.problem_number.unique())

1702

In [25]:
# Number of distinct users (`uuid`) in the full log.
len(log_problem_df.uuid.unique())

72758

In [26]:
# Draw a random subset of users to build a smaller graph.
# A dedicated, seeded generator makes the draw reproducible: without a seed the
# subset (and every number derived from it below) changes on each kernel restart.
SAMPLE_SEED = 42
N_SAMPLED_USERS = 7000

unique_ids = log_problem_df.uuid.unique()
sampled_ids = np.random.default_rng(SAMPLE_SEED).choice(
    unique_ids, size=N_SAMPLED_USERS, replace=False
)

# Keep every log row of the sampled users. The explicit copy makes the subset an
# independent frame, so adding columns to it later does not emit
# SettingWithCopyWarning.
log_problem_df_2 = log_problem_df[log_problem_df.uuid.isin(sampled_ids)].copy()

In [27]:
# Number of distinct users in the subsample (should equal the sample size).
len(log_problem_df_2.uuid.unique())

7000

In [28]:
# Number of distinct `problem_number` values that occur in the subsample.
len(log_problem_df_2.problem_number.unique())

283

In [29]:
# (rows, columns) of the subsampled log.
log_problem_df_2.shape

(1568357, 14)

In [31]:
def process_and_save(log_problem_df, file_name):
    """Turn a Junyi problem log into a labelled user/resource graph and save it.

    Every log row becomes one ``("user", "accesses", "resource")`` edge. Users are
    the factorized ``uuid`` values and resources the factorized ``problem_number``
    values. An edge gets ``dropout = 1`` when

    * the same user's next interaction is more than 30 days later, or
    * it carries the user's last timestamp and that timestamp lies more than
      30 days before the latest timestamp in the log.

    The caller's DataFrame is not modified; all derived columns are added to an
    internal copy.

    Args:
        log_problem_df: DataFrame containing at least ``uuid``, ``problem_number``,
            ``timestamp_TW`` (formatted as ``'%Y-%m-%d %H:%M:%S UTC'``) and the
            edge-feature columns ``exercise_problem_repeat_session``,
            ``is_correct``, ``total_sec_taken``, ``total_attempt_cnt``,
            ``used_hint_cnt`` and ``level``.
        file_name: Destination handed to ``torch.save``.

    Returns:
        The graph object that was written to ``file_name``.
    """
    edge_feature_cols = ['exercise_problem_repeat_session', 'is_correct', 'total_sec_taken',
                         'total_attempt_cnt', 'used_hint_cnt', 'level']
    one_month_in_seconds = 30*24*3600

    # Copy only the columns that are actually used. This keeps the caller's frame
    # untouched and avoids SettingWithCopyWarning when a filtered frame is passed in.
    log_df = log_problem_df[['uuid', 'problem_number', 'timestamp_TW'] + edge_feature_cols].copy()

    # map uuid to USERID and problem_number to RESOURCEID (integer codes)
    # NOTE(review): `problem_number` is used as the resource key although the log also
    # has a `upid` column -- confirm that `problem_number` really identifies a resource.
    log_df['USERID'] = pd.factorize(log_df['uuid'])[0]
    log_df['RESOURCEID'] = pd.factorize(log_df['problem_number'])[0]

    # edge time: seconds elapsed since the earliest interaction in this frame
    log_df['timestamp_TW'] = pd.to_datetime(log_df['timestamp_TW'], format='%Y-%m-%d %H:%M:%S UTC')
    earliest_time = log_df['timestamp_TW'].min()
    log_df['seconds_since_start'] = (log_df['timestamp_TW'] - earliest_time).dt.total_seconds()
    latest_time = log_df['seconds_since_start'].max()

    # missing correctness is encoded as -1 so the column can be stored as integers
    log_df['is_correct'] = log_df['is_correct'].fillna(-1).astype(int)

    # Sort chronologically BEFORE anything per-edge is extracted. The stable sort keeps
    # rows with identical timestamps in their original order, so labels are deterministic.
    log_df = log_df.sort_values(by='seconds_since_start', kind='stable')

    # Gap between each interaction and the same user's next one (NaN for the user's last row)
    time_diffs = log_df.groupby('USERID')['seconds_since_start'].diff(-1).abs()

    # Flag interactions followed by a break of over one month
    log_df['dropout'] = time_diffs.gt(one_month_in_seconds).astype(int)

    # Flag the very last interaction of a user if it's more than a month from the end
    # (all rows sharing the user's last timestamp are treated as "last")
    last_time_per_user = log_df.groupby('USERID')['seconds_since_start'].transform('last')
    is_last_interaction = log_df['seconds_since_start'] == last_time_per_user
    far_from_end = (latest_time - log_df['seconds_since_start']) > one_month_in_seconds
    log_df.loc[is_last_interaction & far_from_end, 'dropout'] = 1

    # Edge features are read from the SORTED frame so that row i of edge_attr describes
    # the same interaction as edge_index[:, i], time[i] and edge_y[i].
    edge_attr = log_df[edge_feature_cols].values

    # Create PyTorch Geometric Data
    data = get_data_object(user_ids=log_df['USERID'].unique(),
                           resource_ids=log_df['RESOURCEID'].unique(),
                           edge_index=log_df[['USERID', 'RESOURCEID']].values,
                           edge_attr=edge_attr,
                           edge_time=log_df['seconds_since_start'].values,
                           edge_y=log_df['dropout'].values)
    # NOTE(review): NormalizeFeatures presumably targets the `x` attribute by default and
    # this graph stores no `x`, so `edge_attr` is likely left unnormalized -- confirm intent.
    data = T.NormalizeFeatures()(data)

    torch.save(data, file_name)

    print(data)
    return data

HeteroData(
  user={ node_id=[7000] },
  resource={ node_id=[283] },
  (user, accesses, resource)={
    edge_index=[2, 1568357],
    edge_attr=[1568357, 6],
    time=[1568357],
    edge_y=[1568357],
  },
  (resource, rev_accesses, user)={
    edge_index=[2, 1568357],
    edge_attr=[1568357, 6],
    time=[1568357],
    edge_y=[1568357],
  }
)

In [None]:
# Build and save the graph for the complete Junyi log, then compare it with the
# counts observed earlier in this notebook: one edge per log row, one resource per
# distinct problem_number, one user per distinct uuid.
data = process_and_save(log_problem_df, path + "graph.pt")
assert data["user", "accesses", "resource"].edge_index.size(1) == 16217311
assert data['resource'].num_nodes == 1702
assert data['user'].num_nodes == 72758

In [36]:
# Build and save the graph for the sampled-user subset.
# The subset frame is created under the name `log_problem_df_2`; the spelling
# `log_problem_df2` is never defined in this notebook and raises NameError on a
# fresh kernel.
data = process_and_save(log_problem_df_2, path + "graph_sub.pt")
# The subset must be strictly smaller than the full graph and keep all sampled users.
assert data.edge_index_dict["user", "accesses", "resource"].shape[1] < 5000000
assert data['resource'].num_nodes < 1702
assert data['user'].num_nodes == 7000