In [36]:
import os
import sys

# Make the project checkout importable (presumably where the fe / dkt_dataset /
# utils / trainer packages imported below live -- confirm).
# The location can be overridden with the DKT_PROJECT_ROOT environment variable;
# when it is unset the original hard-coded path is used, so existing setups
# keep working unchanged.
PROJECT_ROOT = os.environ.get(
    "DKT_PROJECT_ROOT",
    "/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/",
)
# Guard the append so that re-running this cell does not pile up duplicates.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from fe.agg import AggFeBase
from fe.seq import SeqFeBase

from fe.agg import (
    MakeCorrectCount, 
    MakeCorrectPercent, 
    MakeQuestionCount, 
    MakeTopNCorrectPercent
)

from fe.seq import (
    SplitAssessmentItemID,
    MakeFirstClass,
    MakeSecondClass,
    MakeTimeDiff,
    MakeYMD,
    ConvertTime
)

import easydict
from fe.feature import FEPipeline
from dkt_dataset import Preprocess
from utils import get_args, get_root_dir
from IPython.display import clear_output

In [101]:
def _attach_group_mean(frame, stats_source, keys, suffix):
    """Inner-join the per-group mean of ``answerCode`` onto ``frame``.

    Args:
        frame: DataFrame that receives the new column. It must already carry
            an ``answerCode`` column, which is what makes the joined column
            come out as ``"answerCode" + suffix``.
        stats_source: DataFrame the mean is computed from.
        keys: column name (or list of names) used both for the groupby on
            ``stats_source`` and as the merge key.
        suffix: suffix appended to the joined ``answerCode`` column.

    Returns:
        A new DataFrame. Because the join is ``how="inner"``, rows of
        ``frame`` whose key does not occur in ``stats_source`` are dropped.
    """
    group_mean = stats_source.groupby(keys).answerCode.mean()
    return pd.merge(frame, group_mean, on=keys, how="inner", suffixes=["", suffix])


def make_df_pipeline(train_data, test_data):
    """Add mean-``answerCode`` features to the train and test frames.

    Columns added to both returned frames:
        userPer: mean per (userID, firstClass), computed from each split's
            own rows.
        assPer:  mean per assessmentItemID, computed from the train rows.
        firPer:  mean per firstClass, computed from the train rows.
        testPer: mean per testId, computed from the train rows.
        secPer:  mean per (userID, KnowledgeTag), computed from the train rows.

    ``firstClass`` is first produced by ``MakeFirstClass.transform`` and later
    overwritten with the integer parsed from the third character of ``testId``.

    Args:
        train_data: training interactions; needs userID, assessmentItemID,
            testId, answerCode and KnowledgeTag columns.
        test_data: test interactions with the same columns.

    Returns:
        Tuple ``(feature_data, t_data)``: the enriched train and test frames.

    NOTE(review): every join is an inner join, so test rows whose
    assessmentItemID, firstClass, testId or (userID, KnowledgeTag) pair never
    occurs in the train frame are silently dropped -- confirm this is intended.
    NOTE(review): the test-side userPer is averaged over the test frame's own
    answerCode values; if that frame contains placeholder labels for the rows
    to predict they are included in the mean -- verify against the data.
    """
    train_data = MakeFirstClass.transform(train_data, key="train")
    test_data = MakeFirstClass.transform(test_data, key="test")

    # Per-user accuracy inside each first-level class; each split uses its own rows.
    user_class_keys = ["userID", "firstClass"]
    feature_data = _attach_group_mean(train_data, train_data, user_class_keys, "_userper")
    t_data = _attach_group_mean(test_data, test_data, user_class_keys, "_userper")

    # Per-item accuracy; both splits take the statistic from the train rows.
    feature_data = _attach_group_mean(feature_data, train_data, "assessmentItemID", "_assper")
    t_data = _attach_group_mean(t_data, train_data, "assessmentItemID", "_assper")

    # Replace firstClass with the integer at position 2 of testId
    # (e.g. "A050000023" -> 5) before it is used as a key again.
    feature_data["firstClass"] = feature_data["testId"].apply(lambda x: int(x[2:3]))
    t_data["firstClass"] = t_data["testId"].apply(lambda x: int(x[2:3]))

    # Remaining statistics all come from the (already enriched) train frame.
    # The train frame is updated first, then the test frame reads from it,
    # which is the same order the statements were originally written in.
    train_derived_stats = (
        ("firstClass", "_firper"),
        ("testId", "_testper"),
        (["userID", "KnowledgeTag"], "_knowper"),
    )
    for keys, suffix in train_derived_stats:
        feature_data = _attach_group_mean(feature_data, feature_data, keys, suffix)
        t_data = _attach_group_mean(t_data, feature_data, keys, suffix)

    # One rename map shared by both splits.
    final_names = {
        "answerCode_firper": "firPer",
        "answerCode_userper": "userPer",
        "answerCode_assper": "assPer",
        "answerCode_testper": "testPer",
        "answerCode_knowper": "secPer",
    }
    feature_data = feature_data.rename(columns=final_names)
    t_data = t_data.rename(columns=final_names)

    return feature_data, t_data

In [102]:
# Build the run configuration with the project's get_args helper, then point it
# at the dataset directory (read further down for train_data.csv / test_data.csv)
# and at the root directory for this experiment.
args = get_args()
args.data_dir = "../../input/data/train_dataset/"
# NOTE(review): get_root_dir is project-local; presumably it resolves (and maybe
# creates) the directory for this run -- confirm before relying on it.
args.root_dir = get_root_dir("../last_lstm/")

In [103]:
import os.path as p
import pandas as pd

# Read the raw train and test CSV files from the configured data directory.
train_data, test_data = (
    pd.read_csv(p.join(args.data_dir, csv_name))
    for csv_name in ("train_data.csv", "test_data.csv")
)

In [104]:
import torch
from trainer import DKTTrainer
from models.lstm.model import LSTM

class FeatureTestTrainer(DKTTrainer):
    """DKTTrainer variant whose batch preparation also casts extra feature tensors.

    Numeric features are the keys listed in ``args.n_linears``; categorical
    features are the keys of ``args.n_embeddings``.
    """

    def _process_batch(self, batch):
        """Cast, shift and move every tensor of ``batch`` before it is used.

        Args:
            batch: mapping of tensors. It must contain ``"mask"`` and
                ``"answerCode"`` plus every key named in ``args.n_linears``
                and ``args.n_embeddings``.

        Returns:
            The same mapping, updated in place: ``"interaction"`` is added,
            ``"mask"`` is replaced by its one-step-shifted version, and every
            entry is moved to ``args.device``.
        """
        mask = batch["mask"].type(torch.FloatTensor)
        answers = batch["answerCode"].type(torch.FloatTensor)
        batch["answerCode"] = answers

        # Shift the mask one step to the right along dim 1 and blank the first
        # position, which has no predecessor.
        # NOTE(review): the shifted mask replaces batch["mask"] itself, so any
        # later consumer of "mask" sees the shifted version -- confirm intended.
        shifted_mask = mask.roll(shifts=1, dims=1)
        shifted_mask[:, 0] = 0
        batch["mask"] = shifted_mask

        # Previous-step answer, offset by one so that 0 is left for masked
        # positions after the multiplication.
        previous_answers = (answers + 1).roll(shifts=1, dims=1)
        batch["interaction"] = (previous_answers * shifted_mask).to(torch.int64)

        # Numeric features
        for name in self.args.n_linears:
            batch[name] = batch[name].type(torch.FloatTensor)

        # Categorical features
        for name in self.args.n_embeddings.keys():
            batch[name] = batch[name].to(torch.int64)

        device = self.args.device
        for name in list(batch.keys()):
            batch[name] = batch[name].to(device)

        return batch

In [105]:
# Store each row's current index label in an "org_index" column; it is used
# further down to sort the merged frames back into this order.
train_data["org_index"], test_data["org_index"] = (
    train_data.index,
    test_data.index,
)

In [106]:
# Attach the mean-answerCode feature columns (userPer, assPer, firPer, testPer,
# secPer) to both splits.
train_set, test_set = make_df_pipeline(train_data, test_data)

load features /home/j-gunmo/features/train_make_first_class.pkl to dataframe ... 
load features /home/j-gunmo/features/test_make_first_class.pkl to dataframe ... 


In [107]:
# Sort both frames by the saved org_index and renumber their rows from zero.
train_set, test_set = (
    merged.sort_values(by="org_index").reset_index(drop=True)
    for merged in (train_set, test_set)
)

In [108]:
# Show the first two rows to check that the feature columns were added.
test_set.head(2)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,org_index,firstClass,userPer,assPer,firPer,testPer,secPer
0,3,A050023001,A050000023,1,2020-01-09 10:56:31,2626,0,5,0.653132,0.646789,0.658208,0.560944,0.641379
1,3,A050023002,A050000023,1,2020-01-09 10:56:57,2626,1,5,0.653132,0.62844,0.658208,0.560944,0.641379


In [109]:
# Feature-engineering steps handed to FEPipeline, listed in the order in which
# the pipeline log reports them.
fe_list = [
    SplitAssessmentItemID, ConvertTime,
    MakeFirstClass, MakeSecondClass,
    MakeCorrectCount, MakeQuestionCount, MakeCorrectPercent,
]
fe_pipeline = FEPipeline(args, fe_list)

# Columns passed to Preprocess.
columns = ["userID", "answerCode", "testPaper", "firstClass", "secondClass", "timeSec"]

# Mean-answerCode columns produced by make_df_pipeline.
add_columns = ["userPer", "assPer", "firPer", "testPer", "secPer"]

# Column groups keyed by encoder name.
# NOTE(review): presumably label encoding, min-max scaling and standardisation;
# the consumer is not visible in this part of the notebook -- confirm.
pre_encodes = dict(
    label=["testPaper", "firstClass", "secondClass"],
    min_max=list(add_columns),
    std=["timeSec"],
)

preprocess = Preprocess(args, fe_pipeline, columns)

In [110]:
# Run the configured feature-engineering pipeline.
# NOTE(review): the captured output below repeats every log line five times;
# presumably duplicate log handlers accumulated across re-runs -- verify in the
# project's logging setup.
preprocess.feature_engineering()

Feature Engineering Start ... 
Feature Engineering Start ... 
Feature Engineering Start ... 
Feature Engineering Start ... 
Feature Engineering Start ... 


load features /home/j-gunmo/features/train_split_assessmentitem_id.pkl to dataframe ... 



Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id



testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 060001
INDEX 0000: 060001
INDEX 0000: 060001
INDEX 0000: 060001
INDEX 0000: 060001


INDEX 1000: 080012
INDEX 1000: 080012
INDEX 1000: 080012
INDEX 1000: 080012
INDEX 1000: 080012


INDEX 2000: 060041
INDEX 2000: 060041
INDEX 2000: 060041
INDEX 2000: 060041
INDEX 2000: 060041


INDEX 3000: 080041
INDEX 3000: 080041
INDEX 3000: 080041
INDEX 3000: 080041
INDEX 3000: 080041


INDEX 4000: 060091
INDEX 4000: 060091
INDEX 4000: 060091
INDEX 4000: 060091
INDEX 4000: 060091


INDEX 5000: 060101
INDEX 5000: 060101
INDEX 5000: 060101
INDEX 5000: 060101
INDEX 5000: 060101


INDEX 6000: 080100
INDEX 6000: 080100
INDEX 6000: 080100
INDEX 6000: 080100
INDEX 6000: 080100


INDEX 7000: 080112
INDEX 7000: 080112
INDEX 7000: 080112
INDEX 7000: 080112
INDEX 7000: 080112


INDEX 8000: 040049
INDEX 8000: 040049
INDEX 8000: 040049
INDEX 8000: 040049
INDEX 8000: 040049


INDEX 9000: 090011
INDEX 9000: 090011
INDEX 9000: 090011
INDEX 9000: 090011
INDEX 9000: 090011



testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 001
INDEX 0000: 001
INDEX 0000: 001
INDEX 0000: 001
INDEX 0000: 001


INDEX 1000: 001
INDEX 1000: 001
INDEX 1000: 001
INDEX 1000: 001
INDEX 1000: 001


INDEX 2000: 007
INDEX 2000: 007
INDEX 2000: 007
INDEX 2000: 007
INDEX 2000: 007


INDEX 3000: 005
INDEX 3000: 005
INDEX 3000: 005
INDEX 3000: 005
INDEX 3000: 005


INDEX 4000: 003
INDEX 4000: 003
INDEX 4000: 003
INDEX 4000: 003
INDEX 4000: 003


INDEX 5000: 003
INDEX 5000: 003
INDEX 5000: 003
INDEX 5000: 003
INDEX 5000: 003


INDEX 6000: 004
INDEX 6000: 004
INDEX 6000: 004
INDEX 6000: 004
INDEX 6000: 004


INDEX 7000: 002
INDEX 7000: 002
INDEX 7000: 002
INDEX 7000: 002
INDEX 7000: 002


INDEX 8000: 003
INDEX 8000: 003
INDEX 8000: 003
INDEX 8000: 003
INDEX 8000: 003


INDEX 9000: 007
INDEX 9000: 007
INDEX 9000: 007
INDEX 9000: 007
INDEX 9000: 007


load features /home/j-gunmo/features/train_convert_time.pkl to dataframe ... 



Feature Engineering Name: convert_time

Feature Engineering Name: convert_time

Feature Engineering Name: convert_time

Feature Engineering Name: convert_time

Feature Engineering Name: convert_time



timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.


dtype: int64
dtype: int64
dtype: int64
dtype: int64
dtype: int64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 1584976631
INDEX 0000: 1584976631
INDEX 0000: 1584976631
INDEX 0000: 1584976631
INDEX 0000: 1584976631


INDEX 1000: 1588175753
INDEX 1000: 1588175753
INDEX 1000: 1588175753
INDEX 1000: 1588175753
INDEX 1000: 1588175753


INDEX 2000: 1590428625
INDEX 2000: 1590428625
INDEX 2000: 1590428625
INDEX 2000: 1590428625
INDEX 2000: 1590428625


INDEX 3000: 1592332726
INDEX 3000: 1592332726
INDEX 3000: 1592332726
INDEX 3000: 1592332726
INDEX 3000: 1592332726


INDEX 4000: 1594832366
INDEX 4000: 1594832366
INDEX 4000: 1594832366
INDEX 4000: 1594832366
INDEX 4000: 1594832366


INDEX 5000: 1599493405
INDEX 5000: 1599493405
INDEX 5000: 1599493405
INDEX 5000: 1599493405
INDEX 5000: 1599493405


INDEX 6000: 1602576550
INDEX 6000: 1602576550
INDEX 6000: 1602576550
INDEX 6000: 1602576550
INDEX 6000: 1602576550


INDEX 7000: 1604429286
INDEX 7000: 1604429286
INDEX 7000: 1604429286
INDEX 7000: 1604429286
INDEX 7000: 1604429286


INDEX 8000: 1581193055
INDEX 8000: 1581193055
INDEX 8000: 1581193055
INDEX 8000: 1581193055
INDEX 8000: 1581193055


INDEX 9000: 1584613367
INDEX 9000: 1584613367
INDEX 9000: 1584613367
INDEX 9000: 1584613367
INDEX 9000: 1584613367


load features /home/j-gunmo/features/train_make_first_class.pkl to dataframe ... 



Feature Engineering Name: make_first_class

Feature Engineering Name: make_first_class

Feature Engineering Name: make_first_class

Feature Engineering Name: make_first_class

Feature Engineering Name: make_first_class



firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 6
INDEX 0000: 6
INDEX 0000: 6
INDEX 0000: 6
INDEX 0000: 6


INDEX 1000: 8
INDEX 1000: 8
INDEX 1000: 8
INDEX 1000: 8
INDEX 1000: 8


INDEX 2000: 6
INDEX 2000: 6
INDEX 2000: 6
INDEX 2000: 6
INDEX 2000: 6


INDEX 3000: 8
INDEX 3000: 8
INDEX 3000: 8
INDEX 3000: 8
INDEX 3000: 8


INDEX 4000: 6
INDEX 4000: 6
INDEX 4000: 6
INDEX 4000: 6
INDEX 4000: 6


INDEX 5000: 6
INDEX 5000: 6
INDEX 5000: 6
INDEX 5000: 6
INDEX 5000: 6


INDEX 6000: 8
INDEX 6000: 8
INDEX 6000: 8
INDEX 6000: 8
INDEX 6000: 8


INDEX 7000: 8
INDEX 7000: 8
INDEX 7000: 8
INDEX 7000: 8
INDEX 7000: 8


INDEX 8000: 4
INDEX 8000: 4
INDEX 8000: 4
INDEX 8000: 4
INDEX 8000: 4


INDEX 9000: 9
INDEX 9000: 9
INDEX 9000: 9
INDEX 9000: 9
INDEX 9000: 9


load features /home/j-gunmo/features/train_make_second_class.pkl to dataframe ... 



Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class



secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 7224
INDEX 0000: 7224
INDEX 0000: 7224
INDEX 0000: 7224
INDEX 0000: 7224


INDEX 1000: 4659
INDEX 1000: 4659
INDEX 1000: 4659
INDEX 1000: 4659
INDEX 1000: 4659


INDEX 2000: 602
INDEX 2000: 602
INDEX 2000: 602
INDEX 2000: 602
INDEX 2000: 602


INDEX 3000: 4795
INDEX 3000: 4795
INDEX 3000: 4795
INDEX 3000: 4795
INDEX 3000: 4795


INDEX 4000: 628
INDEX 4000: 628
INDEX 4000: 628
INDEX 4000: 628
INDEX 4000: 628


INDEX 5000: 706
INDEX 5000: 706
INDEX 5000: 706
INDEX 5000: 706
INDEX 5000: 706


INDEX 6000: 7171
INDEX 6000: 7171
INDEX 6000: 7171
INDEX 6000: 7171
INDEX 6000: 7171


INDEX 7000: 2711
INDEX 7000: 2711
INDEX 7000: 2711
INDEX 7000: 2711
INDEX 7000: 2711


INDEX 8000: 2071
INDEX 8000: 2071
INDEX 8000: 2071
INDEX 8000: 2071
INDEX 8000: 2071


INDEX 9000: 5261
INDEX 9000: 5261
INDEX 9000: 5261
INDEX 9000: 5261
INDEX 9000: 5261


load features /home/j-gunmo/features/train_make_correct_count.pkl to dataframe ... 



Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count



correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.


dtype: int64
dtype: int64
dtype: int64
dtype: int64
dtype: int64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 470
INDEX 0000: 470
INDEX 0000: 470
INDEX 0000: 470
INDEX 0000: 470


INDEX 1000: 470
INDEX 1000: 470
INDEX 1000: 470
INDEX 1000: 470
INDEX 1000: 470


INDEX 2000: 470
INDEX 2000: 470
INDEX 2000: 470
INDEX 2000: 470
INDEX 2000: 470


INDEX 3000: 470
INDEX 3000: 470
INDEX 3000: 470
INDEX 3000: 470
INDEX 3000: 470


INDEX 4000: 470
INDEX 4000: 470
INDEX 4000: 470
INDEX 4000: 470
INDEX 4000: 470


INDEX 5000: 470
INDEX 5000: 470
INDEX 5000: 470
INDEX 5000: 470
INDEX 5000: 470


INDEX 6000: 470
INDEX 6000: 470
INDEX 6000: 470
INDEX 6000: 470
INDEX 6000: 470


INDEX 7000: 470
INDEX 7000: 470
INDEX 7000: 470
INDEX 7000: 470
INDEX 7000: 470


INDEX 8000: 796
INDEX 8000: 796
INDEX 8000: 796
INDEX 8000: 796
INDEX 8000: 796


INDEX 9000: 796
INDEX 9000: 796
INDEX 9000: 796
INDEX 9000: 796
INDEX 9000: 796


load features /home/j-gunmo/features/train_make_question_count.pkl to dataframe ... 



Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count



quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.


dtype: int64
dtype: int64
dtype: int64
dtype: int64
dtype: int64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 745
INDEX 0000: 745
INDEX 0000: 745
INDEX 0000: 745
INDEX 0000: 745


INDEX 1000: 745
INDEX 1000: 745
INDEX 1000: 745
INDEX 1000: 745
INDEX 1000: 745


INDEX 2000: 745
INDEX 2000: 745
INDEX 2000: 745
INDEX 2000: 745
INDEX 2000: 745


INDEX 3000: 745
INDEX 3000: 745
INDEX 3000: 745
INDEX 3000: 745
INDEX 3000: 745


INDEX 4000: 745
INDEX 4000: 745
INDEX 4000: 745
INDEX 4000: 745
INDEX 4000: 745


INDEX 5000: 745
INDEX 5000: 745
INDEX 5000: 745
INDEX 5000: 745
INDEX 5000: 745


INDEX 6000: 745
INDEX 6000: 745
INDEX 6000: 745
INDEX 6000: 745
INDEX 6000: 745


INDEX 7000: 745
INDEX 7000: 745
INDEX 7000: 745
INDEX 7000: 745
INDEX 7000: 745


INDEX 8000: 933
INDEX 8000: 933
INDEX 8000: 933
INDEX 8000: 933
INDEX 8000: 933


INDEX 9000: 933
INDEX 9000: 933
INDEX 9000: 933
INDEX 9000: 933
INDEX 9000: 933


load features /home/j-gunmo/features/train_make_correct_percent.pkl to dataframe ... 



Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent



correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.


dtype: float64
dtype: float64
dtype: float64
dtype: float64
dtype: float64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 0.6308724832214765
INDEX 0000: 0.6308724832214765
INDEX 0000: 0.6308724832214765
INDEX 0000: 0.6308724832214765
INDEX 0000: 0.6308724832214765


INDEX 1000: 0.6308724832214765
INDEX 1000: 0.6308724832214765
INDEX 1000: 0.6308724832214765
INDEX 1000: 0.6308724832214765
INDEX 1000: 0.6308724832214765


INDEX 2000: 0.6308724832214765
INDEX 2000: 0.6308724832214765
INDEX 2000: 0.6308724832214765
INDEX 2000: 0.6308724832214765
INDEX 2000: 0.6308724832214765


INDEX 3000: 0.6308724832214765
INDEX 3000: 0.6308724832214765
INDEX 3000: 0.6308724832214765
INDEX 3000: 0.6308724832214765
INDEX 3000: 0.6308724832214765


INDEX 4000: 0.6308724832214765
INDEX 4000: 0.6308724832214765
INDEX 4000: 0.6308724832214765
INDEX 4000: 0.6308724832214765
INDEX 4000: 0.6308724832214765


INDEX 5000: 0.6308724832214765
INDEX 5000: 0.6308724832214765
INDEX 5000: 0.6308724832214765
INDEX 5000: 0.6308724832214765
INDEX 5000: 0.6308724832214765


INDEX 6000: 0.6308724832214765
INDEX 6000: 0.6308724832214765
INDEX 6000: 0.6308724832214765
INDEX 6000: 0.6308724832214765
INDEX 6000: 0.6308724832214765


INDEX 7000: 0.6308724832214765
INDEX 7000: 0.6308724832214765
INDEX 7000: 0.6308724832214765
INDEX 7000: 0.6308724832214765
INDEX 7000: 0.6308724832214765


INDEX 8000: 0.8531618435155413
INDEX 8000: 0.8531618435155413
INDEX 8000: 0.8531618435155413
INDEX 8000: 0.8531618435155413
INDEX 8000: 0.8531618435155413


INDEX 9000: 0.8531618435155413
INDEX 9000: 0.8531618435155413
INDEX 9000: 0.8531618435155413
INDEX 9000: 0.8531618435155413
INDEX 9000: 0.8531618435155413


Feature Engineering End ... 
Feature Engineering End ... 
Feature Engineering End ... 
Feature Engineering End ... 
Feature Engineering End ... 


Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')


Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       'secondClass', 'correctCnt', 'quesCnt', 'correctPer'],
      dtype='object')
Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       'secondClass', 'correctCnt', 'quesCnt', 'correctPer'],
      dtype='object')
Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       'secondClass', 'correctCnt', 'quesCnt', 'correctPer'],
      dtype='object')
Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       '

Feature Engineering Start ... 
Feature Engineering Start ... 
Feature Engineering Start ... 
Feature Engineering Start ... 
Feature Engineering Start ... 



Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id

Feature Engineering Name: split_assessmentitem_id


load features /home/j-gunmo/features/test_split_assessmentitem_id.pkl to dataframe ... 



testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.

testPaper       : 시험지 번호입니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 050023
INDEX 0000: 050023
INDEX 0000: 050023
INDEX 0000: 050023
INDEX 0000: 050023


INDEX 1000: 020035
INDEX 1000: 020035
INDEX 1000: 020035
INDEX 1000: 020035
INDEX 1000: 020035


INDEX 2000: 050006
INDEX 2000: 050006
INDEX 2000: 050006
INDEX 2000: 050006
INDEX 2000: 050006


INDEX 3000: 020037
INDEX 3000: 020037
INDEX 3000: 020037
INDEX 3000: 020037
INDEX 3000: 020037


INDEX 4000: 050009
INDEX 4000: 050009
INDEX 4000: 050009
INDEX 4000: 050009
INDEX 4000: 050009


INDEX 5000: 050045
INDEX 5000: 050045
INDEX 5000: 050045
INDEX 5000: 050045
INDEX 5000: 050045


INDEX 6000: 050072
INDEX 6000: 050072
INDEX 6000: 050072
INDEX 6000: 050072
INDEX 6000: 050072


INDEX 7000: 050089
INDEX 7000: 050089
INDEX 7000: 050089
INDEX 7000: 050089
INDEX 7000: 050089


INDEX 8000: 050089
INDEX 8000: 050089
INDEX 8000: 050089
INDEX 8000: 050089
INDEX 8000: 050089


INDEX 9000: 050105
INDEX 9000: 050105
INDEX 9000: 050105
INDEX 9000: 050105
INDEX 9000: 050105



testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.

testPaperCnt    : 시험지의 문항 번호입니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 001
INDEX 0000: 001
INDEX 0000: 001
INDEX 0000: 001
INDEX 0000: 001


INDEX 1000: 003
INDEX 1000: 003
INDEX 1000: 003
INDEX 1000: 003
INDEX 1000: 003


INDEX 2000: 001
INDEX 2000: 001
INDEX 2000: 001
INDEX 2000: 001
INDEX 2000: 001


INDEX 3000: 003
INDEX 3000: 003
INDEX 3000: 003
INDEX 3000: 003
INDEX 3000: 003


INDEX 4000: 003
INDEX 4000: 003
INDEX 4000: 003
INDEX 4000: 003
INDEX 4000: 003


INDEX 5000: 001
INDEX 5000: 001
INDEX 5000: 001
INDEX 5000: 001
INDEX 5000: 001


INDEX 6000: 001
INDEX 6000: 001
INDEX 6000: 001
INDEX 6000: 001
INDEX 6000: 001


INDEX 7000: 003
INDEX 7000: 003
INDEX 7000: 003
INDEX 7000: 003
INDEX 7000: 003


INDEX 8000: 006
INDEX 8000: 006
INDEX 8000: 006
INDEX 8000: 006
INDEX 8000: 006


INDEX 9000: 004
INDEX 9000: 004
INDEX 9000: 004
INDEX 9000: 004
INDEX 9000: 004


load features /home/j-gunmo/features/test_convert_time.pkl to dataframe ... 



Feature Engineering Name: convert_time

Feature Engineering Name: convert_time

Feature Engineering Name: convert_time

Feature Engineering Name: convert_time

Feature Engineering Name: convert_time



timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.

timeSec         : 사용자가 문항을 푼 타임스태프 정보입니다.


dtype: int64
dtype: int64
dtype: int64
dtype: int64
dtype: int64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 1578534991
INDEX 0000: 1578534991
INDEX 0000: 1578534991
INDEX 0000: 1578534991
INDEX 0000: 1578534991


INDEX 1000: 1582080021
INDEX 1000: 1582080021
INDEX 1000: 1582080021
INDEX 1000: 1582080021
INDEX 1000: 1582080021


INDEX 2000: 1583876850
INDEX 2000: 1583876850
INDEX 2000: 1583876850
INDEX 2000: 1583876850
INDEX 2000: 1583876850


INDEX 3000: 1586022801
INDEX 3000: 1586022801
INDEX 3000: 1586022801
INDEX 3000: 1586022801
INDEX 3000: 1586022801


INDEX 4000: 1588743149
INDEX 4000: 1588743149
INDEX 4000: 1588743149
INDEX 4000: 1588743149
INDEX 4000: 1588743149


INDEX 5000: 1591764737
INDEX 5000: 1591764737
INDEX 5000: 1591764737
INDEX 5000: 1591764737
INDEX 5000: 1591764737


INDEX 6000: 1594353821
INDEX 6000: 1594353821
INDEX 6000: 1594353821
INDEX 6000: 1594353821
INDEX 6000: 1594353821


INDEX 7000: 1595904396
INDEX 7000: 1595904396
INDEX 7000: 1595904396
INDEX 7000: 1595904396
INDEX 7000: 1595904396


INDEX 8000: 1597130998
INDEX 8000: 1597130998
INDEX 8000: 1597130998
INDEX 8000: 1597130998
INDEX 8000: 1597130998


INDEX 9000: 1599087082
INDEX 9000: 1599087082
INDEX 9000: 1599087082
INDEX 9000: 1599087082
INDEX 9000: 1599087082



Feature Engineering Name: make_first_class

Feature Engineering Name: make_first_class

Feature Engineering Name: make_first_class

Feature Engineering Name: make_first_class


load features /home/j-gunmo/features/test_make_first_class.pkl to dataframe ... 



Feature Engineering Name: make_first_class



firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.

firstClass      : 대분류에 해당합니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 5
INDEX 0000: 5
INDEX 0000: 5
INDEX 0000: 5
INDEX 0000: 5


INDEX 1000: 2
INDEX 1000: 2
INDEX 1000: 2
INDEX 1000: 2
INDEX 1000: 2


INDEX 2000: 5
INDEX 2000: 5
INDEX 2000: 5
INDEX 2000: 5
INDEX 2000: 5


INDEX 3000: 2
INDEX 3000: 2
INDEX 3000: 2
INDEX 3000: 2
INDEX 3000: 2


INDEX 4000: 5
INDEX 4000: 5
INDEX 4000: 5
INDEX 4000: 5
INDEX 4000: 5


INDEX 5000: 5
INDEX 5000: 5
INDEX 5000: 5
INDEX 5000: 5
INDEX 5000: 5


INDEX 6000: 5
INDEX 6000: 5
INDEX 6000: 5
INDEX 6000: 5
INDEX 6000: 5


INDEX 7000: 5
INDEX 7000: 5
INDEX 7000: 5
INDEX 7000: 5
INDEX 7000: 5


INDEX 8000: 5
INDEX 8000: 5
INDEX 8000: 5
INDEX 8000: 5
INDEX 8000: 5


INDEX 9000: 5
INDEX 9000: 5
INDEX 9000: 5
INDEX 9000: 5
INDEX 9000: 5


load features /home/j-gunmo/features/test_make_second_class.pkl to dataframe ... 



Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class

Feature Engineering Name: make_second_class



secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.

secondClass     : 중분류에 해당합니다.


dtype: object
dtype: object
dtype: object
dtype: object
dtype: object


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 2626
INDEX 0000: 2626
INDEX 0000: 2626
INDEX 0000: 2626
INDEX 0000: 2626


INDEX 1000: 7693
INDEX 1000: 7693
INDEX 1000: 7693
INDEX 1000: 7693
INDEX 1000: 7693


INDEX 2000: 2617
INDEX 2000: 2617
INDEX 2000: 2617
INDEX 2000: 2617
INDEX 2000: 2617


INDEX 3000: 7924
INDEX 3000: 7924
INDEX 3000: 7924
INDEX 3000: 7924
INDEX 3000: 7924


INDEX 4000: 2618
INDEX 4000: 2618
INDEX 4000: 2618
INDEX 4000: 2618
INDEX 4000: 2618


INDEX 5000: 3729
INDEX 5000: 3729
INDEX 5000: 3729
INDEX 5000: 3729
INDEX 5000: 3729


INDEX 6000: 3827
INDEX 6000: 3827
INDEX 6000: 3827
INDEX 6000: 3827
INDEX 6000: 3827


INDEX 7000: 395
INDEX 7000: 395
INDEX 7000: 395
INDEX 7000: 395
INDEX 7000: 395


INDEX 8000: 394
INDEX 8000: 394
INDEX 8000: 394
INDEX 8000: 394
INDEX 8000: 394


INDEX 9000: 5269
INDEX 9000: 5269
INDEX 9000: 5269
INDEX 9000: 5269
INDEX 9000: 5269



Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count

Feature Engineering Name: make_correct_count


load features /home/j-gunmo/features/test_make_correct_count.pkl to dataframe ... 



correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.

correctCnt      : 사용자가 맞춘 문항수를 나타냅니다.


dtype: int64
dtype: int64
dtype: int64
dtype: int64
dtype: int64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 716
INDEX 0000: 716
INDEX 0000: 716
INDEX 0000: 716
INDEX 0000: 716


INDEX 1000: 716
INDEX 1000: 716
INDEX 1000: 716
INDEX 1000: 716
INDEX 1000: 716


INDEX 2000: 716
INDEX 2000: 716
INDEX 2000: 716
INDEX 2000: 716
INDEX 2000: 716


INDEX 3000: 716
INDEX 3000: 716
INDEX 3000: 716
INDEX 3000: 716
INDEX 3000: 716


INDEX 4000: 716
INDEX 4000: 716
INDEX 4000: 716
INDEX 4000: 716
INDEX 4000: 716


INDEX 5000: 716
INDEX 5000: 716
INDEX 5000: 716
INDEX 5000: 716
INDEX 5000: 716


INDEX 6000: 716
INDEX 6000: 716
INDEX 6000: 716
INDEX 6000: 716
INDEX 6000: 716


INDEX 7000: 716
INDEX 7000: 716
INDEX 7000: 716
INDEX 7000: 716
INDEX 7000: 716


INDEX 8000: 716
INDEX 8000: 716
INDEX 8000: 716
INDEX 8000: 716
INDEX 8000: 716


INDEX 9000: 716
INDEX 9000: 716
INDEX 9000: 716
INDEX 9000: 716
INDEX 9000: 716



Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count

Feature Engineering Name: make_question_count


load features /home/j-gunmo/features/test_make_question_count.pkl to dataframe ... 



quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.

quesCnt         : 사용자가 푼 문항수를 나타냅니다.


dtype: int64
dtype: int64
dtype: int64
dtype: int64
dtype: int64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 1036
INDEX 0000: 1036
INDEX 0000: 1036
INDEX 0000: 1036
INDEX 0000: 1036


INDEX 1000: 1036
INDEX 1000: 1036
INDEX 1000: 1036
INDEX 1000: 1036
INDEX 1000: 1036


INDEX 2000: 1036
INDEX 2000: 1036
INDEX 2000: 1036
INDEX 2000: 1036
INDEX 2000: 1036


INDEX 3000: 1036
INDEX 3000: 1036
INDEX 3000: 1036
INDEX 3000: 1036
INDEX 3000: 1036


INDEX 4000: 1036
INDEX 4000: 1036
INDEX 4000: 1036
INDEX 4000: 1036
INDEX 4000: 1036


INDEX 5000: 1036
INDEX 5000: 1036
INDEX 5000: 1036
INDEX 5000: 1036
INDEX 5000: 1036


INDEX 6000: 1036
INDEX 6000: 1036
INDEX 6000: 1036
INDEX 6000: 1036
INDEX 6000: 1036


INDEX 7000: 1036
INDEX 7000: 1036
INDEX 7000: 1036
INDEX 7000: 1036
INDEX 7000: 1036


INDEX 8000: 1036
INDEX 8000: 1036
INDEX 8000: 1036
INDEX 8000: 1036
INDEX 8000: 1036


INDEX 9000: 1036
INDEX 9000: 1036
INDEX 9000: 1036
INDEX 9000: 1036
INDEX 9000: 1036



Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent

Feature Engineering Name: make_correct_percent


load features /home/j-gunmo/features/test_make_correct_percent.pkl to dataframe ... 



correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.

correctPer      : 사용자가 푼 전체 문항에 대한 정답률입니다.


dtype: float64
dtype: float64
dtype: float64
dtype: float64
dtype: float64


[Examples]
[Examples]
[Examples]
[Examples]
[Examples]


INDEX 0000: 0.6911196911196911
INDEX 0000: 0.6911196911196911
INDEX 0000: 0.6911196911196911
INDEX 0000: 0.6911196911196911
INDEX 0000: 0.6911196911196911


INDEX 1000: 0.6911196911196911
INDEX 1000: 0.6911196911196911
INDEX 1000: 0.6911196911196911
INDEX 1000: 0.6911196911196911
INDEX 1000: 0.6911196911196911


INDEX 2000: 0.6911196911196911
INDEX 2000: 0.6911196911196911
INDEX 2000: 0.6911196911196911
INDEX 2000: 0.6911196911196911
INDEX 2000: 0.6911196911196911


INDEX 3000: 0.6911196911196911
INDEX 3000: 0.6911196911196911
INDEX 3000: 0.6911196911196911
INDEX 3000: 0.6911196911196911
INDEX 3000: 0.6911196911196911


INDEX 4000: 0.6911196911196911
INDEX 4000: 0.6911196911196911
INDEX 4000: 0.6911196911196911
INDEX 4000: 0.6911196911196911
INDEX 4000: 0.6911196911196911


INDEX 5000: 0.6911196911196911
INDEX 5000: 0.6911196911196911
INDEX 5000: 0.6911196911196911
INDEX 5000: 0.6911196911196911
INDEX 5000: 0.6911196911196911


INDEX 6000: 0.6911196911196911
INDEX 6000: 0.6911196911196911
INDEX 6000: 0.6911196911196911
INDEX 6000: 0.6911196911196911
INDEX 6000: 0.6911196911196911


INDEX 7000: 0.6911196911196911
INDEX 7000: 0.6911196911196911
INDEX 7000: 0.6911196911196911
INDEX 7000: 0.6911196911196911
INDEX 7000: 0.6911196911196911


INDEX 8000: 0.6911196911196911
INDEX 8000: 0.6911196911196911
INDEX 8000: 0.6911196911196911
INDEX 8000: 0.6911196911196911
INDEX 8000: 0.6911196911196911


INDEX 9000: 0.6911196911196911
INDEX 9000: 0.6911196911196911
INDEX 9000: 0.6911196911196911
INDEX 9000: 0.6911196911196911
INDEX 9000: 0.6911196911196911


Feature Engineering End ... 
Feature Engineering End ... 
Feature Engineering End ... 
Feature Engineering End ... 
Feature Engineering End ... 


Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')
Original DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag'],
      dtype='object')


Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       'secondClass', 'correctCnt', 'quesCnt', 'correctPer'],
      dtype='object')
Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       'secondClass', 'correctCnt', 'quesCnt', 'correctPer'],
      dtype='object')
Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       'secondClass', 'correctCnt', 'quesCnt', 'correctPer'],
      dtype='object')
Feature Added DataFrame Keywords: Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'testPaper', 'testPaperCnt', 'timeSec', 'firstClass',
       '

In [111]:
# Attach the engineered feature columns to the train/test frames column-wise.
# Copies of those columns left behind by an earlier run of this cell are dropped
# first, so re-running the cell does not create duplicated column labels.
# NOTE(review): pd.concat(axis=1) aligns rows on the index -- confirm that
# train_set / test_set carry the same index as the frames they are joined to.
for split_name, feature_frame in (("train", train_set), ("test", test_set)):
    base_frame = preprocess.datas[split_name].drop(columns=add_columns, errors="ignore")
    preprocess.datas[split_name] = pd.concat([base_frame, feature_frame[add_columns]], axis=1)

In [112]:
# Register the engineered columns with the preprocessor. Only names that are not
# registered yet are appended (still in place), so re-running this cell does not
# add the same column names twice.
preprocess.columns += [name for name in add_columns if name not in preprocess.columns]

In [113]:
# Show the preprocessor's column list after the engineered features were registered.
print(preprocess.columns)

['userID', 'answerCode', 'testPaper', 'firstClass', 'secondClass', 'timeSec', 'userPer', 'assPer', 'firPer', 'testPer', 'secPer']


In [114]:
# Split the training data into train and validation subsets; the log below
# reports a user-based split together with the resulting row counts.
preprocess.split_data()

Split based on User
Split based on User
Split based on User
Split based on User
Split based on User


Original Train Dataset: 2266586
Original Train Dataset: 2266586
Original Train Dataset: 2266586
Original Train Dataset: 2266586
Original Train Dataset: 2266586


Split Train Dataset: 1582219
Split Train Dataset: 1582219
Split Train Dataset: 1582219
Split Train Dataset: 1582219
Split Train Dataset: 1582219


Split Valid Dataset: 684367
Split Valid Dataset: 684367
Split Valid Dataset: 684367
Split Valid Dataset: 684367
Split Valid Dataset: 684367


In [118]:
# Preview the first three rows of the training frame.
# NOTE(review): this cell's execution count (118) is later than the scaling cell
# shown after it (116), so the preview already contains encoded/scaled values.
preprocess.datas['train'].head(3)

Unnamed: 0,userID,answerCode,testPaper,firstClass,secondClass,timeSec,userPer,assPer,firPer,testPer,secPer
0,0,1,975,5,618,-1.435761,0.791908,0.984954,0.738852,0.987599,0.971153
1,0,1,975,5,619,-1.43576,0.791908,0.96601,0.738852,0.987599,0.918119
2,0,1,975,5,619,-1.435759,0.791908,0.90918,0.738852,0.987599,0.918119


In [116]:
# Encode and scale the feature columns. Per the log below this label-encodes
# testPaper / firstClass / secondClass, min-max scales the *Per ratio columns and
# standardises timeSec.
# NOTE(review): `pre_encodes` is defined in an earlier cell; presumably it maps
# each transform to its columns -- confirm there.
preprocess.scaling(pre_encodes)

Preprocessing Labels .. 
Preprocessing Labels .. 
Preprocessing Labels .. 
Preprocessing Labels .. 
Preprocessing Labels .. 


Label Columns: ['testPaper', 'firstClass', 'secondClass']
Label Columns: ['testPaper', 'firstClass', 'secondClass']
Label Columns: ['testPaper', 'firstClass', 'secondClass']
Label Columns: ['testPaper', 'firstClass', 'secondClass']
Label Columns: ['testPaper', 'firstClass', 'secondClass']



Length of testPaper            : 1538

Length of testPaper            : 1538

Length of testPaper            : 1538

Length of testPaper            : 1538

Length of testPaper            : 1538


Before : 0    060001
1    060001
2    060001
3    060001
4    060001
5    060001
6    060003
7    060003
8    060003
9    060003
Name: testPaper, dtype: object
Before : 0    060001
1    060001
2    060001
3    060001
4    060001
5    060001
6    060003
7    060003
8    060003
9    060003
Name: testPaper, dtype: object
Before : 0    060001
1    060001
2    060001
3    060001
4    060001
5    060001
6    060003
7    060003
8    060003
9    060003
Name: testPaper, dtype: object
Before : 0    060001
1    060001
2    060001
3    060001
4    060001
5    060001
6    060003
7    060003
8    060003
9    060003
Name: testPaper, dtype: object
Before : 0    060001
1    060001
2    060001
3    060001
4    060001
5    060001
6    060003
7    060003
8    060003
9    060003
Name: testPaper, dtype: object


After : 0    975
1    975
2    975
3    975
4    975
5    975
6    977
7    977
8    977
9    977
Name: testPaper, dtype: int64
After : 0    975
1    975
2    975
3    975
4    975
5    975
6    977
7    977
8    977
9    977
Name: testPaper, dtype: int64
After : 0    975
1    975
2    975
3    975
4    975
5    975
6    977
7    977
8    977
9    977
Name: testPaper, dtype: int64
After : 0    975
1    975
2    975
3    975
4    975
5    975
6    977
7    977
8    977
9    977
Name: testPaper, dtype: int64
After : 0    975
1    975
2    975
3    975
4    975
5    975
6    977
7    977
8    977
9    977
Name: testPaper, dtype: int64



Length of firstClass           : 10

Length of firstClass           : 10

Length of firstClass           : 10

Length of firstClass           : 10

Length of firstClass           : 10


Before : 0    6
1    6
2    6
3    6
4    6
5    6
6    6
7    6
8    6
9    6
Name: firstClass, dtype: object
Before : 0    6
1    6
2    6
3    6
4    6
5    6
6    6
7    6
8    6
9    6
Name: firstClass, dtype: object
Before : 0    6
1    6
2    6
3    6
4    6
5    6
6    6
7    6
8    6
9    6
Name: firstClass, dtype: object
Before : 0    6
1    6
2    6
3    6
4    6
5    6
6    6
7    6
8    6
9    6
Name: firstClass, dtype: object
Before : 0    6
1    6
2    6
3    6
4    6
5    6
6    6
7    6
8    6
9    6
Name: firstClass, dtype: object


After : 0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
Name: firstClass, dtype: int64
After : 0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
Name: firstClass, dtype: int64
After : 0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
Name: firstClass, dtype: int64
After : 0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
Name: firstClass, dtype: int64
After : 0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    5
Name: firstClass, dtype: int64



Length of secondClass          : 913

Length of secondClass          : 913

Length of secondClass          : 913

Length of secondClass          : 913

Length of secondClass          : 913


Before : 0    7224
1    7225
2    7225
3    7225
4    7225
5    7225
6    7226
7    7226
8    7226
9    7226
Name: secondClass, dtype: object
Before : 0    7224
1    7225
2    7225
3    7225
4    7225
5    7225
6    7226
7    7226
8    7226
9    7226
Name: secondClass, dtype: object
Before : 0    7224
1    7225
2    7225
3    7225
4    7225
5    7225
6    7226
7    7226
8    7226
9    7226
Name: secondClass, dtype: object
Before : 0    7224
1    7225
2    7225
3    7225
4    7225
5    7225
6    7226
7    7226
8    7226
9    7226
Name: secondClass, dtype: object
Before : 0    7224
1    7225
2    7225
3    7225
4    7225
5    7225
6    7226
7    7226
8    7226
9    7226
Name: secondClass, dtype: object


After : 0    618
1    619
2    619
3    619
4    619
5    619
6    620
7    620
8    620
9    620
Name: secondClass, dtype: int64
After : 0    618
1    619
2    619
3    619
4    619
5    619
6    620
7    620
8    620
9    620
Name: secondClass, dtype: int64
After : 0    618
1    619
2    619
3    619
4    619
5    619
6    620
7    620
8    620
9    620
Name: secondClass, dtype: int64
After : 0    618
1    619
2    619
3    619
4    619
5    619
6    620
7    620
8    620
9    620
Name: secondClass, dtype: int64
After : 0    618
1    619
2    619
3    619
4    619
5    619
6    620
7    620
8    620
9    620
Name: secondClass, dtype: int64


Preprocessing Min Max .. 
Preprocessing Min Max .. 
Preprocessing Min Max .. 
Preprocessing Min Max .. 
Preprocessing Min Max .. 


Min Max Columns: ['userPer', 'assPer', 'firPer', 'testPer', 'secPer']
Min Max Columns: ['userPer', 'assPer', 'firPer', 'testPer', 'secPer']
Min Max Columns: ['userPer', 'assPer', 'firPer', 'testPer', 'secPer']
Min Max Columns: ['userPer', 'assPer', 'firPer', 'testPer', 'secPer']
Min Max Columns: ['userPer', 'assPer', 'firPer', 'testPer', 'secPer']


MAX: [1.         0.99630996 0.80087621 0.95547445 0.97777778] MIN: [0.         0.04942966 0.44994765 0.32718579 0.18894009]
MAX: [1.         0.99630996 0.80087621 0.95547445 0.97777778] MIN: [0.         0.04942966 0.44994765 0.32718579 0.18894009]
MAX: [1.         0.99630996 0.80087621 0.95547445 0.97777778] MIN: [0.         0.04942966 0.44994765 0.32718579 0.18894009]
MAX: [1.         0.99630996 0.80087621 0.95547445 0.97777778] MIN: [0.         0.04942966 0.44994765 0.32718579 0.18894009]
MAX: [1.         0.99630996 0.80087621 0.95547445 0.97777778] MIN: [0.         0.04942966 0.44994765 0.32718579 0.18894009]


Preprocessing Min Max .. 
Preprocessing Min Max .. 
Preprocessing Min Max .. 
Preprocessing Min Max .. 
Preprocessing Min Max .. 


Standard Columns: ['timeSec']
Standard Columns: ['timeSec']
Standard Columns: ['timeSec']
Standard Columns: ['timeSec']
Standard Columns: ['timeSec']


MEAN: [1.59503981e+09] VAR: [4.91254683e+13]
MEAN: [1.59503981e+09] VAR: [4.91254683e+13]
MEAN: [1.59503981e+09] VAR: [4.91254683e+13]
MEAN: [1.59503981e+09] VAR: [4.91254683e+13]
MEAN: [1.59503981e+09] VAR: [4.91254683e+13]


In [117]:
# Run the augmentation step with options 1 and 3. The meaning of the option ids
# lives in Preprocess.data_augmentation (not shown here); the log below reports
# that the test data is used for augmentation and that rows are grouped by
# (userID, firstClass).
preprocess.data_augmentation(choices=[1, 3])

Use the test datast for data augmentation
Use the test datast for data augmentation
Use the test datast for data augmentation
Use the test datast for data augmentation
Use the test datast for data augmentation


Before Length: 1582219
Before Length: 1582219
Before Length: 1582219
Before Length: 1582219
Before Length: 1582219


After Length: 1841589
After Length: 1841589
After Length: 1841589
After Length: 1841589
After Length: 1841589


Group By (userID, firstClass)
Group By (userID, firstClass)
Group By (userID, firstClass)
Group By (userID, firstClass)
Group By (userID, firstClass)


Group By (userID, firstClass) Length: 14429
Group By (userID, firstClass) Length: 14429
Group By (userID, firstClass) Length: 14429
Group By (userID, firstClass) Length: 14429
Group By (userID, firstClass) Length: 14429


Group By (userID, firstClass) Length: 5310
Group By (userID, firstClass) Length: 5310
Group By (userID, firstClass) Length: 5310
Group By (userID, firstClass) Length: 5310
Group By (userID, firstClass) Length: 5310


Group By (userID, firstClass) Length: 1987
Group By (userID, firstClass) Length: 1987
Group By (userID, firstClass) Length: 1987
Group By (userID, firstClass) Length: 1987
Group By (userID, firstClass) Length: 1987


In [120]:
# Hand every entry of `columns` except the first one to the model configuration.
# NOTE(review): presumably the skipped first entry is the userID key -- confirm
# against the cell that defines `columns`.
args.columns = columns[1:]

In [121]:
# Fetch the grouped train / validation / test sets stored on the preprocessor
# under the '*_grouped' keys.
train_dataset = preprocess.datas['train_grouped']
valid_dataset = preprocess.datas['valid_grouped']
test_dataset = preprocess.datas['test_grouped']

In [122]:
# Hyperparameters for the runs below: model hidden width and mini-batch size.
args.hidden_dim = 1024
args.batch_size = 256

In [123]:
# Build a trainer that trains the LSTM model class with the current args.
# NOTE(review): this cell (In [123]) was executed before the LSTM definition cell
# further down (In [178]), so it used whichever LSTM was in scope at that time.
trainer = FeatureTestTrainer(args, LSTM)

In [124]:
# Run the trainer's debug entry point on the three datasets before the full
# cross-validation. NOTE(review): what debug() checks is defined in the trainer
# class, which is not shown here.
trainer.debug(train_dataset, valid_dataset, test_dataset)

In [78]:
# 5-fold cross-validation with the listed seeds. The call is the cell's last
# expression, so its return value -- an (auc, acc) pair -- is displayed after the
# training log below.
trainer.run_cv(train_dataset, valid_dataset, test_dataset, folds=5, seeds=[0, 1, 2, 3, 4])

[34m[1mwandb[0m: Currently logged in as: [33mggm1207[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training steps: 0 Loss: 0.6900123357772827
Training steps: 50 Loss: 0.5907658338546753
Training steps: 100 Loss: 0.6147000789642334
VALID AUC : 0.7775235145970394 ACC : 0.709726443768997



Training steps: 0 Loss: 0.6053119897842407
Training steps: 50 Loss: 0.5378944873809814
Training steps: 100 Loss: 0.6572806239128113
VALID AUC : 0.7812949732730264 ACC : 0.7092198581560284



Training steps: 0 Loss: 0.5924673676490784
Training steps: 50 Loss: 0.5560609698295593
Training steps: 100 Loss: 0.5315614938735962
VALID AUC : 0.7831839792351974 ACC : 0.7201114488348531



Training steps: 0 Loss: 0.48299431800842285
Training steps: 50 Loss: 0.5084896087646484
Training steps: 100 Loss: 0.6187272071838379
VALID AUC : 0.783579615542763 ACC : 0.7160587639311043



Training steps: 0 Loss: 0.49531498551368713
Training steps: 50 Loss: 0.6231862902641296
Training steps: 100 Loss: 0.5287169218063354
VALID AUC : 0.7848938630756579 ACC : 0.7178318135764944



Training steps: 0 Loss: 0.5620738863945007
Training steps: 50 Loss: 0.5381721258163452
Training steps: 100 Loss: 0.5804488658905029
VALID AUC : 0.78572998046875 ACC : 0.7152988855116514



Training steps: 0 Loss: 0.5542099475860596
Training steps: 50 Loss: 0.5552836060523987
Training steps: 100 Loss: 0.6334229111671448
VALID AUC : 0.7867742598684211 ACC : 0.7190982776089159



Training steps: 0 Loss: 0.5800855755805969
Training steps: 50 Loss: 0.632712721824646
Training steps: 100 Loss: 0.5088061094284058
VALID AUC : 0.7864337479440789 ACC : 0.71580547112462



Training steps: 0 Loss: 0.5841782689094543
Training steps: 50 Loss: 0.5900067090988159
Training steps: 100 Loss: 0.5625967979431152
VALID AUC : 0.7866415244654605 ACC : 0.7211246200607903



Training steps: 0 Loss: 0.6123999953269958
Training steps: 50 Loss: 0.6168683767318726
Training steps: 100 Loss: 0.5065632462501526
VALID AUC : 0.7869325657894737 ACC : 0.7223910840932117



Training steps: 0 Loss: 0.574791669845581
Training steps: 50 Loss: 0.6042125225067139
Training steps: 100 Loss: 0.5882450342178345
VALID AUC : 0.7857474557976973 ACC : 0.7221377912867275



Training steps: 0 Loss: 0.5638935565948486
Training steps: 50 Loss: 0.5049084424972534
Training steps: 100 Loss: 0.6082775592803955
VALID AUC : 0.7867584549753289 ACC : 0.7188449848024316



Training steps: 0 Loss: 0.5412283539772034
Training steps: 50 Loss: 0.5785769820213318
Training steps: 100 Loss: 0.580246090888977
VALID AUC : 0.7869091796875001 ACC : 0.7147922998986829



Training steps: 0 Loss: 0.5693046450614929
Training steps: 50 Loss: 0.5571205615997314
Training steps: 100 Loss: 0.5613222122192383
VALID AUC : 0.7862507709703948 ACC : 0.7173252279635258



Training steps: 0 Loss: 0.5333999395370483
Training steps: 50 Loss: 0.5525428056716919
Training steps: 100 Loss: 0.55007404088974
VALID AUC : 0.7856954152960527 ACC : 0.7145390070921985



VBox(children=(Label(value=' 5.91MB of 5.91MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
train_loss,0.56444
train_auc,0.78046
train_acc,0.71503
valid_auc,0.7857
valid_acc,0.71454
_runtime,108.0
_timestamp,1623643244.0
_step,14.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▃▃▂▂▂▁▁▁▁▁▁▁▁▁
train_auc,▁▆▆▇▇▇▇▇███████
train_acc,▁▅▆▆▆█▇█▇▇▇████
valid_auc,▁▄▅▆▆▇████▇██▇▇
valid_acc,▁▁▇▅▆▄▆▅▇██▆▄▅▄
_runtime,▁▁▂▂▃▄▄▅▅▆▆▇▇██
_timestamp,▁▁▂▂▃▄▄▅▅▆▆▇▇██
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training steps: 0 Loss: 0.6956632137298584
Training steps: 50 Loss: 0.6047564744949341
Training steps: 100 Loss: 0.5139285326004028
VALID AUC : 0.7722368522368522 ACC : 0.7120060790273556



Training steps: 0 Loss: 0.49331462383270264
Training steps: 50 Loss: 0.529936671257019
Training steps: 100 Loss: 0.5918154716491699
VALID AUC : 0.7733615666949001 ACC : 0.7099797365754813



Training steps: 0 Loss: 0.6010558605194092
Training steps: 50 Loss: 0.5656651258468628
Training steps: 100 Loss: 0.526378333568573
VALID AUC : 0.7747424347424348 ACC : 0.7142857142857143



Training steps: 0 Loss: 0.4948004484176636
Training steps: 50 Loss: 0.5347343683242798
Training steps: 100 Loss: 0.5812312364578247
VALID AUC : 0.7777392777392776 ACC : 0.7021276595744681



Training steps: 0 Loss: 0.5414777994155884
Training steps: 50 Loss: 0.5836319923400879
Training steps: 100 Loss: 0.529004693031311
VALID AUC : 0.7801699135032468 ACC : 0.7180851063829787



Training steps: 0 Loss: 0.6129568815231323
Training steps: 50 Loss: 0.5401780009269714
Training steps: 100 Loss: 0.5128384828567505
VALID AUC : 0.7808023408023408 ACC : 0.7152988855116514



Training steps: 0 Loss: 0.5902056694030762
Training steps: 50 Loss: 0.567690372467041
Training steps: 100 Loss: 0.5701600313186646
VALID AUC : 0.7813852313852314 ACC : 0.7145390070921985



Training steps: 0 Loss: 0.5171765089035034
Training steps: 50 Loss: 0.5627180337905884
Training steps: 100 Loss: 0.5194752216339111
VALID AUC : 0.7803426503426505 ACC : 0.71580547112462



Training steps: 0 Loss: 0.547912061214447
Training steps: 50 Loss: 0.5491472482681274
Training steps: 100 Loss: 0.5594881772994995
VALID AUC : 0.7800145016811683 ACC : 0.7178318135764944



Training steps: 0 Loss: 0.5520672798156738
Training steps: 50 Loss: 0.5654823780059814
Training steps: 100 Loss: 0.5199532508850098
VALID AUC : 0.7803834603834605 ACC : 0.7175785207700102



Training steps: 0 Loss: 0.5919545292854309
Training steps: 50 Loss: 0.5420883893966675
Training steps: 100 Loss: 0.6372974514961243
VALID AUC : 0.7811976078642745 ACC : 0.7196048632218845



Training steps: 0 Loss: 0.5964817404747009
Training steps: 50 Loss: 0.5308734178543091
Training steps: 100 Loss: 0.5982387661933899
VALID AUC : 0.7800872667539335 ACC : 0.7102330293819655



VBox(children=(Label(value=' 5.91MB of 5.91MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,11.0
train_loss,0.56357
train_auc,0.78024
train_acc,0.71294
valid_auc,0.78009
valid_acc,0.71023
_runtime,87.0
_timestamp,1623643335.0
_step,11.0


0,1
epoch,▁▂▂▃▄▄▅▅▆▇▇█
train_loss,█▃▂▂▂▂▂▂▁▁▁▁
train_auc,▁▅▆▇▇▇▇█████
train_acc,▁▄▅▇▆▇▇█▇▇██
valid_auc,▁▂▃▅▇██▇▇▇█▇
valid_acc,▅▄▆▁▇▆▆▆▇▇█▄
_runtime,▁▂▂▃▄▄▅▅▆▇▇█
_timestamp,▁▂▂▃▄▄▅▅▆▇▇█
_step,▁▂▂▃▄▄▅▅▆▇▇█


[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training steps: 0 Loss: 0.6913742423057556
Training steps: 50 Loss: 0.5355721116065979
Training steps: 100 Loss: 0.5349569320678711
VALID AUC : 0.7700127097441714 ACC : 0.702887537993921



Training steps: 0 Loss: 0.5768215656280518
Training steps: 50 Loss: 0.614223301410675
Training steps: 100 Loss: 0.5630612373352051
VALID AUC : 0.7752861747368919 ACC : 0.7069402228976697



Training steps: 0 Loss: 0.5325376987457275
Training steps: 50 Loss: 0.5577712059020996
Training steps: 100 Loss: 0.5947613716125488
VALID AUC : 0.7783801292345692 ACC : 0.6957953394123607



Training steps: 0 Loss: 0.5837271213531494
Training steps: 50 Loss: 0.5761606693267822
Training steps: 100 Loss: 0.5762360692024231
VALID AUC : 0.7796893767293523 ACC : 0.7099797365754813



Training steps: 0 Loss: 0.47796210646629333
Training steps: 50 Loss: 0.4875192642211914
Training steps: 100 Loss: 0.5315066576004028
VALID AUC : 0.779184891330146 ACC : 0.7125126646403243



Training steps: 0 Loss: 0.6465626358985901
Training steps: 50 Loss: 0.5946018099784851
Training steps: 100 Loss: 0.6741921305656433
VALID AUC : 0.7819092152323768 ACC : 0.7130192502532928



Training steps: 0 Loss: 0.567429780960083
Training steps: 50 Loss: 0.6065412759780884
Training steps: 100 Loss: 0.638810396194458
VALID AUC : 0.7801021608620022 ACC : 0.711499493414387



Training steps: 0 Loss: 0.5399481654167175
Training steps: 50 Loss: 0.5218640565872192
Training steps: 100 Loss: 0.4904070198535919
VALID AUC : 0.7813191933576431 ACC : 0.7125126646403243



Training steps: 0 Loss: 0.5853046178817749
Training steps: 50 Loss: 0.5416632890701294
Training steps: 100 Loss: 0.5536460876464844
VALID AUC : 0.7817139968497918 ACC : 0.711499493414387



Training steps: 0 Loss: 0.577796220779419
Training steps: 50 Loss: 0.6202760338783264
Training steps: 100 Loss: 0.5366042256355286
VALID AUC : 0.7821347438375208 ACC : 0.7117527862208713



Training steps: 0 Loss: 0.560056746006012
Training steps: 50 Loss: 0.5554572343826294
Training steps: 100 Loss: 0.5694162249565125
VALID AUC : 0.7825752695298012 ACC : 0.7112462006079028



Training steps: 0 Loss: 0.5295521020889282
Training steps: 50 Loss: 0.5969763994216919
Training steps: 100 Loss: 0.575781524181366
VALID AUC : 0.782078746985674 ACC : 0.7104863221884499



Training steps: 0 Loss: 0.5148870944976807
Training steps: 50 Loss: 0.5265598893165588
Training steps: 100 Loss: 0.582970917224884
VALID AUC : 0.781872611785642 ACC : 0.709726443768997



Training steps: 0 Loss: 0.5209767818450928
Training steps: 50 Loss: 0.6247624754905701
Training steps: 100 Loss: 0.5499745607376099
VALID AUC : 0.7815698948594889 ACC : 0.7102330293819655



Training steps: 0 Loss: 0.5333714485168457
Training steps: 50 Loss: 0.6032456159591675
Training steps: 100 Loss: 0.6091723442077637
VALID AUC : 0.7821745581129165 ACC : 0.7087132725430598



Training steps: 0 Loss: 0.5378310680389404
Training steps: 50 Loss: 0.5848501920700073
Training steps: 100 Loss: 0.5238747000694275
VALID AUC : 0.7826245878580331 ACC : 0.7092198581560284



Training steps: 0 Loss: 0.5902220606803894
Training steps: 50 Loss: 0.514001727104187
Training steps: 100 Loss: 0.5256128311157227
VALID AUC : 0.7818614381019019 ACC : 0.7142857142857143



Training steps: 0 Loss: 0.5726213455200195
Training steps: 50 Loss: 0.6058125495910645
Training steps: 100 Loss: 0.6490216255187988
VALID AUC : 0.7819914124460967 ACC : 0.7107396149949341



Training steps: 0 Loss: 0.5488406419754028
Training steps: 50 Loss: 0.49625492095947266
Training steps: 100 Loss: 0.5783048272132874
VALID AUC : 0.7819795965966243 ACC : 0.7142857142857143



Training steps: 0 Loss: 0.5436710715293884
Training steps: 50 Loss: 0.5600281953811646
Training steps: 100 Loss: 0.501496434211731
VALID AUC : 0.7814843583839617 ACC : 0.711499493414387



VBox(children=(Label(value=' 5.91MB of 5.91MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,19.0
train_loss,0.5626
train_auc,0.78269
train_acc,0.7168
valid_auc,0.78148
valid_acc,0.7115
_runtime,146.0
_timestamp,1623643485.0
_step,19.0


0,1
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
train_loss,█▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁
train_auc,▁▅▆▇▇▇▇▇▇███████████
train_acc,▁▅▆▇▆▆▇▇▇▇▇▇█▇███▇▇█
valid_auc,▁▄▆▆▆█▇▇▇████▇█████▇
valid_acc,▄▅▁▆▇█▇▇▇▇▇▇▆▆▆▆█▇█▇
_runtime,▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇██
_timestamp,▁▁▂▂▃▃▃▄▄▄▅▅▅▆▆▇▇▇██
_step,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██


[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training steps: 0 Loss: 0.6926342248916626
Training steps: 50 Loss: 0.5687990188598633
Training steps: 100 Loss: 0.5877044200897217
VALID AUC : 0.7770054937651348 ACC : 0.71403242147923



Training steps: 0 Loss: 0.598274290561676
Training steps: 50 Loss: 0.5735397934913635
Training steps: 100 Loss: 0.5986031889915466
VALID AUC : 0.7817929360010368 ACC : 0.7165653495440729



Training steps: 0 Loss: 0.6041520237922668
Training steps: 50 Loss: 0.6107185482978821
Training steps: 100 Loss: 0.6402684450149536
VALID AUC : 0.7856438330850167 ACC : 0.7203647416413373



Training steps: 0 Loss: 0.508784830570221
Training steps: 50 Loss: 0.5759673118591309
Training steps: 100 Loss: 0.6266924142837524
VALID AUC : 0.7854798286037675 ACC : 0.7211246200607903



Training steps: 0 Loss: 0.5312784910202026
Training steps: 50 Loss: 0.6370711326599121
Training steps: 100 Loss: 0.5892148017883301
VALID AUC : 0.786339889662713 ACC : 0.7160587639311043



Training steps: 0 Loss: 0.6155085563659668
Training steps: 50 Loss: 0.5747513771057129
Training steps: 100 Loss: 0.5691822171211243
VALID AUC : 0.7852531995631681 ACC : 0.7180851063829787



Training steps: 0 Loss: 0.565617024898529
Training steps: 50 Loss: 0.5301616787910461
Training steps: 100 Loss: 0.5788697600364685
VALID AUC : 0.7878913874548121 ACC : 0.7226443768996961



Training steps: 0 Loss: 0.597571611404419
Training steps: 50 Loss: 0.5937321782112122
Training steps: 100 Loss: 0.5536482334136963
VALID AUC : 0.7863534925383094 ACC : 0.7218844984802432



Training steps: 0 Loss: 0.5318087339401245
Training steps: 50 Loss: 0.5684150457382202
Training steps: 100 Loss: 0.542328417301178
VALID AUC : 0.7851341102371905 ACC : 0.7178318135764944



Training steps: 0 Loss: 0.506584882736206
Training steps: 50 Loss: 0.539682924747467
Training steps: 100 Loss: 0.5789807438850403
VALID AUC : 0.7857867916077956 ACC : 0.7163120567375887



Training steps: 0 Loss: 0.5878555178642273
Training steps: 50 Loss: 0.6004154086112976
Training steps: 100 Loss: 0.5787967443466187
VALID AUC : 0.7862872747665373 ACC : 0.7180851063829787



Training steps: 0 Loss: 0.5501285791397095
Training steps: 50 Loss: 0.5474671125411987
Training steps: 100 Loss: 0.6256847381591797
VALID AUC : 0.7858953579545382 ACC : 0.7221377912867275



VBox(children=(Label(value=' 5.91MB of 5.91MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,11.0
train_loss,0.56325
train_auc,0.78143
train_acc,0.71376
valid_auc,0.7859
valid_acc,0.72214
_runtime,89.0
_timestamp,1623643578.0
_step,11.0


0,1
epoch,▁▂▂▃▄▄▅▅▆▇▇█
train_loss,█▄▃▂▂▂▂▁▁▁▁▁
train_auc,▁▅▆▇▇█▇█████
train_acc,▁▅▅▇▇▇▇▇█▇██
valid_auc,▁▄▇▆▇▆█▇▆▇▇▇
valid_acc,▁▃▆▇▃▄█▇▄▃▄█
_runtime,▁▂▂▃▃▄▅▅▆▇▇█
_timestamp,▁▂▂▃▃▄▅▅▆▇▇█
_step,▁▂▂▃▄▄▅▅▆▇▇█


[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training steps: 0 Loss: 0.6933257579803467
Training steps: 50 Loss: 0.5805238485336304
Training steps: 100 Loss: 0.56990647315979
VALID AUC : 0.772485226313397 ACC : 0.7107396149949341



Training steps: 0 Loss: 0.549719512462616
Training steps: 50 Loss: 0.5594302415847778
Training steps: 100 Loss: 0.5625337362289429
VALID AUC : 0.7746486284336004 ACC : 0.7125126646403243



Training steps: 0 Loss: 0.5817434787750244
Training steps: 50 Loss: 0.5453137755393982
Training steps: 100 Loss: 0.5870969891548157
VALID AUC : 0.7771609829394498 ACC : 0.7120060790273556



Training steps: 0 Loss: 0.5543053150177002
Training steps: 50 Loss: 0.6055698394775391
Training steps: 100 Loss: 0.6192243099212646
VALID AUC : 0.7779723389460893 ACC : 0.7026342451874367



Training steps: 0 Loss: 0.5653420090675354
Training steps: 50 Loss: 0.5917598009109497
Training steps: 100 Loss: 0.5065524578094482
VALID AUC : 0.7788619814386593 ACC : 0.7122593718338399



Training steps: 0 Loss: 0.5467380285263062
Training steps: 50 Loss: 0.5646530389785767
Training steps: 100 Loss: 0.5690449476242065
VALID AUC : 0.77985775987199 ACC : 0.7152988855116514



Training steps: 0 Loss: 0.6232166290283203
Training steps: 50 Loss: 0.5867307186126709
Training steps: 100 Loss: 0.5117599368095398
VALID AUC : 0.7764942644240284 ACC : 0.711499493414387



Training steps: 0 Loss: 0.6147621870040894
Training steps: 50 Loss: 0.5941517353057861
Training steps: 100 Loss: 0.48723554611206055
VALID AUC : 0.778465543807449 ACC : 0.7120060790273556



Training steps: 0 Loss: 0.5422154664993286
Training steps: 50 Loss: 0.5661067962646484
Training steps: 100 Loss: 0.6025415658950806
VALID AUC : 0.7786846176295519 ACC : 0.711499493414387



Training steps: 0 Loss: 0.5817445516586304
Training steps: 50 Loss: 0.5666242837905884
Training steps: 100 Loss: 0.6370506882667542
VALID AUC : 0.7784343575515786 ACC : 0.7092198581560284



Training steps: 0 Loss: 0.5707943439483643
Training steps: 50 Loss: 0.5356687903404236
Training steps: 100 Loss: 0.5926520824432373
VALID AUC : 0.7784059947427415 ACC : 0.7112462006079028



(0.7837383064719081, 0.7168186423505573)

In [126]:
# Run the 5-fold cross-validation and keep the returned (auc, acc) pair.
auc, acc = trainer.run_cv(train_dataset, valid_dataset, test_dataset, folds=5, seeds=[0, 1, 2, 3, 4])
# Discard the verbose per-step training log so only the summary lines remain.
clear_output()
print(f"auc : {auc} acc : {acc}")
print(f"logging path: {trainer.prefix_save_path}")

auc : 0.8026846231477597 acc : 0.7294326241134752
logging path: ../last_lstm/LOG_[06.14_13:48]


In [178]:
import torch.nn as nn
from models.lstm.model import EmbeddingLayer, LinearLayer

class HiddenLayer(nn.Module):
    """Project per-sequence summary features into an initial LSTM state.

    The value at the last time step of each feature in ``features`` is fed
    through a single linear layer whose output is split into one
    ``hidden_dim``-wide vector per LSTM layer.

    Args:
        args: configuration object exposing ``hidden_dim`` and ``n_layers``.
        features: keys of the batch dict to read. Defaults to the original
            hard-coded pair ``("userPer", "testPer")``.
    """

    def __init__(self, args, features=("userPer", "testPer")):
        super(HiddenLayer, self).__init__()

        self.args = args
        self.hidden_dim = self.args.hidden_dim
        self.features = tuple(features)
        # One input unit per feature; one hidden_dim-sized slice per LSTM layer.
        self.hidden_layer = nn.Linear(len(self.features), self.hidden_dim * args.n_layers)

    def forward(self, batch):
        """Return ``(h_0, c_0)``, each shaped ``(n_layers, batch, hidden_dim)``.

        Both tensors hold the same values in separate storage, and gradients
        flow back to ``hidden_layer`` through either of them.
        """
        # Keep only the final time step of every feature (the slice keeps dim 1).
        # NOTE(review): assumes each batch[name] is 2-D (batch, seq_len) -- confirm.
        batch_hid = torch.cat([batch[name][:, -1:] for name in self.features], 1)
        output = self.hidden_layer(batch_hid)
        # (batch, n_layers * hidden) -> (n_layers, batch, hidden)
        output = output.reshape(-1, self.args.n_layers, self.hidden_dim).permute(1, 0, 2).contiguous()
        # clone() is the direct, differentiable way to get an independent copy.
        return output, output.clone()
        

class LSTM(nn.Module):
    """LSTM sequence model whose initial state is produced by ``HiddenLayer``.

    Features from the embedding branch and the linear branch (each built with
    ``hidden_dim // 2``) are concatenated, projected, and run through an LSTM.
    A sigmoid head then yields one prediction per time step.

    Args:
        args: configuration object exposing ``device``, ``hidden_dim`` and
            ``n_layers``, plus whatever ``EmbeddingLayer`` / ``LinearLayer``
            read from it. It is not modified by this constructor.
    """

    # Continuous columns routed through the ``LinearLayer`` branch of this model.
    LINEAR_FEATURES = ('secPer', 'timeSec', 'assPer')

    def __init__(self, args):
        import copy  # function-scope import: this notebook cell does not import it

        super(LSTM, self).__init__()
        self.args = args
        self.device = args.device

        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers

        self.hid_layer = HiddenLayer(args)
        self.emb_layer = EmbeddingLayer(args, self.hidden_dim // 2)
        # Override n_linears on a shallow copy so the shared ``args`` object (also
        # used by the trainer and by other models) is not changed as a side effect
        # of building this model.
        linear_args = copy.copy(args)
        linear_args.n_linears = list(self.LINEAR_FEATURES)
        self.nli_layer = LinearLayer(linear_args, self.hidden_dim // 2)

        self.comb_proj = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.lstm = nn.LSTM(self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(self.hidden_dim, 1)
        self.activation = nn.Sigmoid()

    def forward(self, batch):
        """Return per-step probabilities reshaped to ``(batch_size, -1)``.

        ``batch`` is a dict; ``batch["interaction"]`` supplies the batch size and
        the remaining keys are consumed by the sub-layers.
        """
        batch_size = batch["interaction"].size(0)

        embed = self.emb_layer(batch)
        nnbed = self.nli_layer(batch)

        # Join both feature branches along the last (feature) axis.
        embed = torch.cat([embed, nnbed], 2)
        X = self.comb_proj(embed)

        # (h_0, c_0) derived from the batch instead of the default zero state.
        hidden = self.hid_layer(batch)
        out, hidden = self.lstm(X, hidden)
        out = out.contiguous().view(batch_size, -1, self.hidden_dim)

        out = self.fc(out)
        preds = self.activation(out).view(batch_size, -1)

        return preds

In [176]:
# Inspect the list of continuous feature columns currently stored on args.
args.n_linears

['userPer', 'assPer', 'firPer', 'testPer', 'secPer', 'timeSec']

In [173]:
# Rebuild the trainer for the LSTM model class with the current args.
trainer = FeatureTestTrainer(args, LSTM)

In [175]:
# Run the 5-fold cross-validation and keep the returned (auc, acc) pair.
auc, acc = trainer.run_cv(train_dataset, valid_dataset, test_dataset, folds=5, seeds=[0, 1, 2, 3, 4])
# Discard the verbose per-step training log so only the summary lines remain.
clear_output()
print(f"auc : {auc} acc : {acc}")
print(f"logging path: {trainer.prefix_save_path}")

auc : 0.8022710933055196 acc : 0.7281661600810537
logging path: ../last_lstm/LOG_[06.14_14:32]


In [177]:
# Restrict the continuous feature list on args to three columns.
# NOTE(review): the list displayed under this cell cannot come from an assignment
# statement; it looks like stale output from an earlier version of the cell.
args.n_linears = ['secPer', 'timeSec', 'assPer']

['userPer', 'assPer', 'firPer', 'testPer', 'secPer', 'timeSec']

In [179]:
# Rebuild the trainer so the next run uses the LSTM class defined above (In [178]).
trainer = FeatureTestTrainer(args, LSTM)

In [182]:
# Check the layer count and hidden width the next run will use.
args.n_layers, args.hidden_dim

(2, 1024)

In [183]:
# Run the 5-fold cross-validation and keep the returned (auc, acc) pair.
auc, acc = trainer.run_cv(train_dataset, valid_dataset, test_dataset, folds=5, seeds=[0, 1, 2, 3, 4])
# Discard the verbose per-step training log so only the summary lines remain.
clear_output()
print(f"auc : {auc} acc : {acc}")
print(f"logging path: {trainer.prefix_save_path}")

auc : 0.782386306241773 acc : 0.7152988855116514
logging path: ../last_lstm/LOG_[06.14_14:50]


In [188]:
from transformers import BertConfig, BertModel

In [189]:
class Bert(nn.Module):
    """Per-position predictor built around a Hugging Face ``BertModel``.

    ``EmbeddingLayer`` and ``LinearLayer`` each produce half of the hidden
    width; their outputs are concatenated, projected, encoded by BERT and
    mapped to a single sigmoid value per sequence position.
    """

    def __init__(self, args):
        """Create all sub-layers from ``args``.

        Reads ``device``, ``hidden_dim``, ``n_layers``, ``n_heads`` and
        ``max_seq_len`` from ``args``; ``args`` is also forwarded to
        ``EmbeddingLayer`` and ``LinearLayer``.
        """
        super().__init__()
        self.args = args
        self.device = args.device

        # Sizes cached on the module.
        self.hidden_dim = args.hidden_dim
        self.n_layers = args.n_layers
        half_dim = self.hidden_dim // 2

        # Each input branch fills half of the hidden width.
        self.emb_layer = EmbeddingLayer(args, half_dim)
        self.nli_layer = LinearLayer(args, half_dim)

        # Projection applied to the concatenated branches.
        self.comb_proj = nn.Linear(self.hidden_dim, self.hidden_dim)

        # Encoder configuration. The first positional argument (vocabulary
        # size) is a placeholder: forward() feeds `inputs_embeds`, not token ids.
        encoder_options = dict(
            hidden_size=self.hidden_dim,
            num_hidden_layers=args.n_layers,
            num_attention_heads=args.n_heads,
            max_position_embeddings=args.max_seq_len,
        )
        self.config = BertConfig(3, **encoder_options)
        self.encoder = BertModel(self.config)

        # Output head: one value per position followed by a sigmoid.
        self.fc = nn.Linear(args.hidden_dim, 1)
        self.activation = nn.Sigmoid()

    def forward(self, batch):
        """Return sigmoid outputs reshaped to ``(batch_size, -1)``.

        Args:
            batch: mapping passed unchanged to ``self.emb_layer`` and
                ``self.nli_layer``; ``batch["interaction"]`` supplies the batch
                size (dim 0) and ``batch["mask"]`` is given to the encoder as
                its attention mask.
        """
        n_seqs = batch["interaction"].size(0)

        # Concatenate both branches on the last axis and project them.
        features = torch.cat([self.emb_layer(batch), self.nli_layer(batch)], 2)
        projected = self.comb_proj(features)

        # Only the first element of the encoder output is used.
        encoder_outputs = self.encoder(inputs_embeds=projected, attention_mask=batch["mask"])
        hidden_states = encoder_outputs[0].contiguous().view(n_seqs, -1, self.hidden_dim)

        logits = self.fc(hidden_states)
        return self.activation(logits).view(n_seqs, -1)

In [193]:
# Attention-head count; the Bert model passes it to BertConfig as
# num_attention_heads. transformers' BERT attention requires hidden_size to be
# divisible by this value.
args.n_heads = 4

In [196]:
# Create a trainer from the shared `args` config and the Bert model class.
trainer = FeatureTestTrainer(args, Bert)

In [None]:
# Cross-validate with folds=5 and seeds 0-4; run_cv returns the AUC and accuracy.
# NOTE(review): this cell has no execution count and its saved output still holds
# raw training logs, so it presumably had not reached clear_output() when the
# notebook was saved -- re-run to confirm the summary.
auc, acc = trainer.run_cv(train_dataset, valid_dataset, test_dataset, folds=5, seeds=[0, 1, 2, 3, 4])
# Clear the cell output produced so far so that only the summary below stays visible.
clear_output()
print(f"auc : {auc} acc : {acc}")
# Where this run's logs were written, according to the trainer.
print(f"logging path: {trainer.prefix_save_path}")

VBox(children=(Label(value=' 5.91MB of 5.91MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,14.0
train_loss,0.56348
train_auc,0.7806
train_acc,0.71072
valid_auc,0.7776
valid_acc,0.71327
_runtime,100.0
_timestamp,1623650488.0
_step,14.0


0,1
epoch,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█
train_loss,█▃▂▂▂▁▂▁▁▁▁▁▁▁▁
train_auc,▁▆▆▇▇▇▇█████▇██
train_acc,▁▆▆▆▇▇▇▇█▇█████
valid_auc,▁▄▆▆▇▇▆▇▇██▇█▇█
valid_acc,▃▁▅▆▇█▆▆▁▅██▅▇▆
_runtime,▁▂▂▃▃▄▄▅▅▆▆▇▇██
_timestamp,▁▂▂▃▃▄▄▅▅▆▆▇▇██
_step,▁▁▂▃▃▃▄▅▅▅▆▇▇▇█


[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training steps: 0 Loss: 0.736411452293396
Training steps: 50 Loss: 0.6395953893661499
VALID AUC : 0.8274023437500001 ACC : 0.7454407294832827



Training steps: 0 Loss: 0.4838271141052246
Training steps: 50 Loss: 0.5337360501289368
VALID AUC : 0.8348422080592106 ACC : 0.7462006079027356



Training steps: 0 Loss: 0.5237232446670532
Training steps: 50 Loss: 0.5032291412353516
VALID AUC : 0.8371335320723685 ACC : 0.7616514690982776



Training steps: 0 Loss: 0.472443163394928
Training steps: 50 Loss: 0.5020096898078918
VALID AUC : 0.835569490131579 ACC : 0.745694022289767



Training steps: 0 Loss: 0.4279080033302307
Training steps: 50 Loss: 0.5043120384216309
VALID AUC : 0.8350704152960525 ACC : 0.7467071935157041



Training steps: 0 Loss: 0.48992013931274414
Training steps: 50 Loss: 0.43310001492500305
VALID AUC : 0.8350585937499999 ACC : 0.7469604863221885



Training steps: 0 Loss: 0.4284961223602295
Training steps: 50 Loss: 0.46940889954566956
VALID AUC : 0.833740234375 ACC : 0.7502532928064843



Training steps: 0 Loss: 0.44940146803855896
Training steps: 50 Loss: 0.5265388488769531
VALID AUC : 0.8355103824013158 ACC : 0.7540526849037488



VBox(children=(Label(value=' 5.91MB of 5.91MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
epoch,7.0
train_loss,0.50012
train_auc,0.83509
train_acc,0.75182
valid_auc,0.83551
valid_acc,0.75405
_runtime,66.0
_timestamp,1623650645.0
_step,7.0


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▃▂▂▁▂▁▁
train_auc,▁▆▇▇████
train_acc,▁▆▇▇█▇██
valid_auc,▁▆█▇▇▇▆▇
valid_acc,▁▁█▁▂▂▃▅
_runtime,▁▂▃▄▅▆▇█
_timestamp,▁▂▃▄▅▆▇█
_step,▁▂▃▄▅▆▇█


[34m[1mwandb[0m: wandb version 0.10.32 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Training steps: 0 Loss: 0.7974072694778442
Training steps: 50 Loss: 0.49199768900871277
VALID AUC : 0.8197433330766665 ACC : 0.7282168186423505



Training steps: 0 Loss: 0.5327182412147522
Training steps: 50 Loss: 0.5293370485305786
VALID AUC : 0.8213051513051512 ACC : 0.7368287740628167



Training steps: 0 Loss: 0.5345301032066345
Training steps: 50 Loss: 0.5069355964660645
VALID AUC : 0.8242185775519111 ACC : 0.7340425531914894



Training steps: 0 Loss: 0.5496634244918823
Training steps: 50 Loss: 0.458016961812973
VALID AUC : 0.8243825877159211 ACC : 0.7413880445795339



Training steps: 0 Loss: 0.511398196220398
Training steps: 50 Loss: 0.5222494602203369
VALID AUC : 0.8324362824362824 ACC : 0.7322695035460993



Training steps: 0 Loss: 0.4823376536369324
Training steps: 50 Loss: 0.4921101927757263
VALID AUC : 0.8326385359718692 ACC : 0.7489868287740629



Training steps: 0 Loss: 0.4637526273727417
Training steps: 50 Loss: 0.5603840947151184
VALID AUC : 0.831415774749108 ACC : 0.7515197568389058



Training steps: 0 Loss: 0.48088812828063965
Training steps: 50 Loss: 0.5347450375556946
VALID AUC : 0.8344077410744076 ACC : 0.747467071935157



Training steps: 0 Loss: 0.4528772234916687
Training steps: 50 Loss: 0.5596789717674255
VALID AUC : 0.8324837658170992 ACC : 0.7411347517730497



Training steps: 0 Loss: 0.5551417469978333
Training steps: 50 Loss: 0.48741453886032104
VALID AUC : 0.8327412027412027 ACC : 0.754305977710233



Training steps: 0 Loss: 0.5082675814628601
Training steps: 50 Loss: 0.5236749649047852
VALID AUC : 0.832568209234876 ACC : 0.7459473150962512



Training steps: 0 Loss: 0.45531702041625977


In [198]:
# Display the current auc / acc values (presumably from the Bert
# cross-validation run above -- verify against execution order).
auc, acc

(0.8328989350284008, 0.747872340425532)