In [1]:
import os
from datetime import datetime
import time
import tqdm
import pandas as pd
import random
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch

In [14]:
class CFG:
    """Location of the dataset CSV files read by the exploration cells."""

    # Directory holding the CSV files (relative path, resolved against the
    # current working directory).
    path = "../../input/data/train_dataset"

    # File names inside `path`.
    train = "train_data.csv"
    test = "test_data.csv"

In [15]:
# Build the path of the training CSV from CFG, load it into `df`, and preview
# the first rows. `df` is mutated in place by the cells that follow.
csv_file_path = os.path.join(CFG.path, CFG.train)
df = pd.read_csv(csv_file_path)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225


# class Preprocess

In [16]:
# Preprocess.__preprocessing, label-encoding step:
# replace every categorical column of `df` with integer indices.
#
# NOTE: this cell overwrites the raw values in `df`, so it is NOT idempotent.
# Reload `df` from the CSV before running it a second time.
cate_cols = ['assessmentItemID', 'testId', 'KnowledgeTag']

# Learned class arrays, one per entry of `cate_cols`, in the same order.
le_list = []

is_train = True

for col in cate_cols:
    le = LabelEncoder()
    if is_train:
        # Cast to str once, up front, so fit() and transform() see exactly the
        # same values (KnowledgeTag appears numeric in the preview, while the
        # two ID columns are strings).
        df[col] = df[col].astype(str)
        # Reserve an extra 'unknown' class for categories that exist only in
        # the test data.
        le.fit(df[col].unique().tolist() + ['unknown'])
        le_list.append(le.classes_)
        df[col] = le.transform(df[col])  # transform: category -> index

In [17]:
# Print only the classes learned for the first categorical column
# (nothing is printed when `le_list` is empty).
for classes in le_list[:1]:
    print(classes)

['A010001001' 'A010001002' 'A010001003' ... 'A090074005' 'A090074006'
 'unknown']


In [18]:
df.head()  # confirms the categorical columns now hold their integer index encoding

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,5354,975,1,2020-03-24 00:17:11,618
1,0,5355,975,1,2020-03-24 00:17:14,619
2,0,5356,975,1,2020-03-24 00:17:22,619
3,0,5357,975,1,2020-03-24 00:17:29,619
4,0,5358,975,1,2020-03-24 00:17:36,619


In [7]:
# Load the saved .npy class arrays (instead of re-fitting the encoders).
# Must not be run on a df whose values were already converted to indices above.
# Trying it on a copy of the raw df would presumably give the same result.
# NOTE(review): if this is ever re-enabled, `le.classes` below should probably be
# `le.classes_` (the attribute the `apply` line already reads), and `le` should be
# a fresh LabelEncoder() per column -- confirm before use.

# le_list = []

# for col in cate_cols:
#     label_path = os.path.join("/opt/ml/my_code/asset", col + "_classes.npy")
#     le.classes = np.load(label_path)
#     le_list.append(le.classes)
#     df[col] = df[col].apply(lambda x: x if x in le.classes_ else 'unknown')
#     df[col]= df[col].astype(str)
#     test = le.transform(df[col])  # transform: cat to index
#     df[col] = test

In [8]:
# for l in le_list:
#     print(l)
#     break

In [9]:
# df.head()  # confirms the categorical columns were converted to index encoding

In [19]:
def convert_time(s):
    """Convert a ``'%Y-%m-%d %H:%M:%S'`` string to integer epoch seconds.

    The text is interpreted as UTC, so the result is the same on every
    machine. ``time.mktime`` interprets it in the host's local timezone, which
    makes the values host-dependent and distorts differences across DST
    changes.

    Args:
        s: Timestamp text such as ``'2020-03-24 00:17:11'``.

    Returns:
        int: Seconds since 1970-01-01 00:00:00 UTC.

    Raises:
        ValueError: If ``s`` does not match the expected format.
    """
    import calendar  # local import: not part of the notebook's import cell

    parsed = datetime.strptime(s, '%Y-%m-%d %H:%M:%S')
    return int(calendar.timegm(parsed.timetuple()))

# Replace the timestamp strings with integer epoch seconds, row by row.
# Not idempotent: a second run would pass integers to convert_time.
df['Timestamp'] = df['Timestamp'].apply(convert_time)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,5354,975,1,1585009031,618
1,0,5355,975,1,1585009034,619
2,0,5356,975,1,1585009042,619
3,0,5357,975,1,1585009049,619
4,0,5358,975,1,1585009056,619


In [11]:
# Order each user's interactions chronologically, then collapse every user
# into a single tuple of aligned feature arrays (one Series entry per userID).
df = df.sort_values(by=['userID', 'Timestamp'], axis=0)
columns = ['userID', 'assessmentItemID', 'testId', 'answerCode', 'KnowledgeTag']


def _user_sequences(rows):
    """Return one user's (testId, assessmentItemID, KnowledgeTag, answerCode) arrays."""
    feature_order = ('testId', 'assessmentItemID', 'KnowledgeTag', 'answerCode')
    return tuple(rows[name].values for name in feature_order)


group = df[columns].groupby('userID').apply(_user_sequences)

In [30]:
df[:745]  # the rows of user 0 -> this row count later becomes seq_len

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag
0,0,5354,975,1,1585009031,618
1,0,5355,975,1,1585009034,619
2,0,5356,975,1,1585009042,619
3,0,5357,975,1,1585009049,619
4,0,5358,975,1,1585009056,619
...,...,...,...,...,...,...
740,0,8858,1453,1,1608694554,280
741,0,8859,1453,0,1608694640,281
742,0,8860,1453,1,1608694723,281
743,0,8861,1453,0,1608694814,281


In [35]:
# Lookup by userID label 0: that user's (testId, assessmentItemID,
# KnowledgeTag, answerCode) arrays.
group[0]

(array([ 975,  975,  975,  975,  975,  975,  977,  977,  977,  977,  977,
         977,  977,  979,  979,  979,  979,  979,  979,  979,  981,  981,
         981,  981,  981,  981,  981, 1326, 1326, 1326, 1326, 1326, 1326,
         983,  983,  983,  983,  983,  983,  983,  990,  990,  990,  990,
         990,  990,  990, 1328, 1328, 1328, 1328, 1328, 1328, 1328, 1328,
        1330, 1330, 1330, 1330, 1330, 1330, 1330, 1330,  992,  992,  992,
         992,  992,  992,  992,  994,  994,  994,  994,  994,  994,  994,
        1332, 1332, 1332, 1332, 1332, 1332, 1332, 1332, 1334, 1334, 1334,
        1334, 1334, 1334, 1334, 1334,  996,  996,  996,  996,  996,  996,
         996, 1336, 1336, 1336, 1336, 1336, 1336, 1336, 1336, 1338, 1338,
        1338, 1338, 1338, 1338, 1338, 1338, 1340, 1340, 1340, 1340, 1340,
        1340, 1340, 1340,  998,  998,  998,  998,  998,  998,  998, 1342,
        1342, 1342, 1342, 1342, 1342, 1342, 1342, 1344, 1344, 1344, 1344,
        1344, 1344, 1344, 1344, 1005, 

In [36]:
# Length of the tuple (one array per feature), not the sequence length.
len(group[0])

4

### split_data
- user와 시간순으로 정렬되어있긴 하지만, 임의로 비율로 자르면 data leakage가 아주 소수 있지 않을까 ? (한 user 정도)
- 아니다 .. user별로 group_by 했으므로 user가 겹칠일은 없으니 그냥 만들면 될 듯 하다! 편해졌네

In [75]:
# Split by user, positionally: the first `ratio` share of the entries in
# `group` goes to data_1, the remainder to data_2.
ratio = 0.7

size = int(len(group) * ratio)
data_1, data_2 = group.iloc[:size], group.iloc[size:]

In [76]:
data_1

userID
0       ([975, 975, 975, 975, 975, 975, 977, 977, 977,...
1       ([596, 596, 596, 596, 596, 598, 598, 598, 598,...
2       ([435, 435, 435, 435, 435, 386, 386, 386, 386,...
5       ([1325, 1325, 1325, 1325, 1325, 1325, 1327, 13...
6       ([401, 401, 401, 401, 401, 401, 402, 402, 402,...
                              ...                        
5194    ([8, 8, 8, 8, 8, 6, 6, 6, 6, 6, 19, 19, 19, 19...
5197    ([216, 216, 216, 216, 216, 216, 216, 256, 256,...
5198    ([1477, 1477, 1477, 1477, 1477, 1477, 1477, 14...
5199    ([682, 682, 682, 682, 682, 682, 684, 684, 684,...
5200    ([1471, 1471, 1471, 1471, 1471, 1471, 1471, 14...
Length: 4688, dtype: object

In [77]:
data_2

userID
5202    ([803, 803, 803, 803, 803, 803, 803, 809, 809,...
5203    ([194, 194, 194, 194, 232, 232, 232, 232, 232,...
5204    ([1055, 1055, 1055, 1055, 1055, 1055, 1055, 10...
5205    ([985, 985, 985, 985, 985, 988, 988, 988, 988,...
5206    ([1224, 1224, 1224, 1224, 1224, 1224, 1232, 12...
                              ...                        
7436    ([876, 876, 876, 876, 394, 394, 394, 394, 394,...
7437    ([655, 655, 655, 655, 655, 655, 1038, 1038, 10...
7438    ([1326, 1326, 1326, 1326, 1326, 1326, 774, 774...
7440    ([877, 877, 877, 877, 877, 521, 521, 521, 521,...
7441    ([456, 456, 456, 456, 456, 748, 748, 748, 748]...
Length: 2010, dtype: object

# class DKTDataset
- Preprocess에서 처리한 데이터를 train/val으로 split 한 데이터를 input으로 받는다

In [39]:
# Each entry of `group` is a tuple ordered as:
# (testId, assessmentItemID, KnowledgeTag, answerCode).
# Take user 0 and measure the sequence length from its first array.
row = group[0]
seq_len = len(row[0])
seq_len

745

In [40]:
# Unpack the 4-tuple for this user into its feature arrays.
test, question, tag, correct = row
# NOTE: `cate_cols` is rebound here from the list of column names (used by the
# label-encoding cell) to the list of this user's feature arrays.
cate_cols = [test, question, tag, correct]

In [79]:
len(test), len(question), len(tag), len(correct)

(745, 745, 745, 745)

In [41]:
max_seq_len = 300

# A sequence longer than max_seq_len keeps only its most recent max_seq_len
# steps; a shorter one is left untouched and described by the mask instead.
if seq_len > max_seq_len:
    # Truncate in place so the list object bound to `cate_cols` stays the same.
    cate_cols[:] = [col[-max_seq_len:] for col in cate_cols]
    mask = np.ones(max_seq_len, dtype=np.int16)
else:
    # Zeros first, ones over the last `seq_len` positions.
    # Index from the front: `mask[-seq_len:]` with seq_len == 0 would select
    # the whole array and mark an empty sequence as fully valid.
    mask = np.zeros(max_seq_len, dtype=np.int16)
    mask[max_seq_len - seq_len:] = 1

In [42]:
mask

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], d

In [44]:
# Append the mask as the last element. Here seq_len (745) exceeds max_seq_len
# (300), so the sequence was truncated and the mask contains no 0 entries.
cate_cols.append(mask)

# dataloader

In [56]:
import sys
sys.path.append("/opt/ml/my_code")
from args import parse_args

In [59]:
class Cfg:
    """Hand-written settings object passed as `args` to Preprocess and get_loaders.

    NOTE(review): presumably mirrors the options of `args.parse_args`, which is
    imported above but not called -- confirm against that module.
    """

    # Reproducibility / hardware
    seed = 42
    device = "cpu"

    # Data and artifact locations
    data_dir = "/opt/ml/input/data/train_dataset"
    asset_dir = "asset/"
    file_name = "train_data.csv"
    model_dir = "models/"
    output_dir = "output/"

    # Sequence length and loader settings
    max_seq_len = 20
    num_workers = 1

    # Network size
    hidden_dim = 64
    n_layers = 2
    n_heads = 2
    drop_out = 0.2

    # Training loop
    n_epochs = 20
    batch_size = 64
    lr = 1e-4
    clip_grad = 10
    patience = 5
    log_steps = 50

    # Component selection by name
    model = 'lstm'
    optimizer = 'adam'
    scheduler = 'plateau'

In [60]:
args = Cfg()

In [62]:
from dkt.dataloader import Preprocess

In [64]:
# Run the project's Preprocess helper on the training file named in `args`,
# fetch the processed data, then split it into two parts.
preprocess = Preprocess(args)
preprocess.load_train_data(args.file_name)
train_data = preprocess.get_train_data()
# `train_data` is rebound here: afterwards it is only the first part of the split.
train_data, valid_data = preprocess.split_data(train_data)

In [66]:
# Import through the `dkt` package, the same way `Preprocess` is imported,
# instead of putting the package directory itself on sys.path. Loading
# dataloader.py under a second top-level name (`dataloader`) executes the
# module twice and yields an object distinct from `dkt.dataloader`.
from dkt.dataloader import get_loaders

train_loader, valid_loader = get_loaders(args, train_data, valid_data)

In [70]:
next(iter(train_loader))

(tensor([[ 241.,  241.,  241.,  ...,  350.,  350.,  350.],
         [ 738.,  738.,  738.,  ..., 1457., 1457., 1457.],
         [ 506.,  506.,  506.,  ...,  508.,  508.,  508.],
         ...,
         [ 972.,  972.,  972.,  ...,  913.,  913.,  913.],
         [ 707.,  707.,  709.,  ..., 1437., 1437., 1437.],
         [1137., 1137., 1137.,  ..., 1139., 1139., 1139.]]),
 tensor([[1226., 1227., 1228.,  ..., 1813., 1814., 1815.],
         [3855., 3856., 3857.,  ..., 8896., 8897., 8898.],
         [2636., 2637., 2638.,  ..., 2648., 2649., 2650.],
         ...,
         [5341., 5342., 5343.,  ..., 4955., 4956., 4957.],
         [3701., 3702., 3708.,  ..., 8738., 8739., 8740.],
         [6421., 6422., 6423.,  ..., 6435., 6436., 6437.]]),
 tensor([[711., 711., 711.,  ..., 752., 752., 752.],
         [220., 220., 220.,  ..., 770., 770., 770.],
         [494., 494., 494.,  ..., 498., 498., 498.],
         ...,
         [ 24.,  21.,  24.,  ..., 364., 364., 486.],
         [213., 213., 214.,  ..., 

In [71]:
next(iter(train_loader))[0].shape  # [batch_size, max_len_seq]

torch.Size([64, 20])