In [1]:
import sys
sys.path.append("/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/")

from fe.agg import (
    MakeCorrectCount, 
    MakeCorrectPercent, 
    MakeQuestionCount, 
    MakeTopNCorrectPercent
)

from fe.seq import (
    SplitAssessmentItemID,
    MakeFirstClass,
    MakeSecondClass,
    MakeYMD,
    ConvertTime
)

from dkt_dataset import Preprocess
from utils import get_args, get_root_dir
from fe.feature import FEPipeline
import easydict

In [2]:
# Load the project's default arguments and point root_dir at the experiment folder.
# NOTE(review): the path below is an absolute, machine-specific location; the notebook
# will not run elsewhere without editing it.
args = get_args()
args.root_dir = get_root_dir(
    '/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test'
)

In [3]:
# Training data location, given relative to the notebook's working directory.
args.data_dir = "../../input/data/train_dataset/"

In [4]:
# Build the feature-engineering pipeline from the listed feature classes.
# Presumably the features are applied in list order -- TODO confirm in FEPipeline.
fe_pipeline = FEPipeline(args, [
    SplitAssessmentItemID,
    ConvertTime,
    MakeFirstClass,
    MakeSecondClass,
    MakeCorrectCount,
    MakeQuestionCount,
    MakeCorrectPercent,
    MakeTopNCorrectPercent
])
# Run the pipeline's own debug routine (behaviour defined in fe.feature).
fe_pipeline.debug()

In [5]:
fe_pipeline.description()

[Feature Descriptions]

feature name : base_feature
feature type : seq
 - userID               : 사용자의 고유 번호입니다. 총 7,442명의 학생이 있습니다
 - assessmentItemID     : 사용자가 푼 문항의 일련 번호입니다.
 - testID               : 사용자가 푼 문항이 포함된 시험지의 일련 번호입니다.
 - answerCode           : 사용자가 푼 문항의 정답 여부를 담고 있는 이진 (0/1) 데이터입니다.
 - Timestamp            : 사용자가 문항을 푼 시간 정보입니다.
 - KnowledgeTag         : 사용자가 푼 문항의 고유 태그가 담겨져 있습니다.

feature name : split_assessmentitem_id
feature type : seq
 - testPaper            : 시험지 번호입니다.
 - testPaperCnt         : 시험지의 문항 번호입니다.

feature name : convert_time
feature type : seq
 - timeSec              : 사용자가 문항을 푼 타임스태프 정보입니다.

feature name : make_first_class
feature type : seq
 - firstClass           : 대분류에 해당합니다.

feature name : make_second_class
feature type : seq
 - secondClass          : 중분류에 해당합니다.

feature name : make_correct_count
feature type : agg
 - correctCnt           : 사용자가 맞춘 문항수를 나타냅니다.

feature name : make_question_count
feature type : agg
 - quesCnt              : 사

In [6]:
# Columns handed to Preprocess; 'userID' comes first and is dropped again later
# when args.columns is set to columns[1:].
columns = ['userID', 'answerCode', 
           'testPaper', 'timeSec', 'firstClass', 'secondClass', 
           'correctPer', 'top10CorrectPer']
# Column groups per encoder key. Presumably 'label' = label encoding,
# 'min_max' = min-max scaling, 'std' = standardisation -- TODO confirm in Preprocess.
pre_encoders = {
    'label': ['testPaper', 'firstClass', 'secondClass'],
    'min_max': ['top10CorrectPer', 'correctPer'],
    'std': ['timeSec']
}

preprocess = Preprocess(args, fe_pipeline, columns)

In [None]:
# Run the preprocessing stages in order: feature engineering, data split,
# encoding/scaling with the groups defined above, then augmentation.
# NOTE(review): the meaning of choices=[1, 3] is defined in Preprocess.data_augmentation
# and is not visible from this notebook -- confirm before changing.
preprocess.feature_engineering()
preprocess.split_data()
preprocess.preprocessing(pre_encoders)
preprocess.data_augmentation(choices=[1, 3])

In [8]:
# Fetch the three grouped splits produced by the preprocessing steps above.
train_dataset = preprocess.get_data('train_grouped')
valid_dataset = preprocess.get_data('valid_grouped')
test_dataset = preprocess.get_data('test_grouped')

In [9]:
import os
import json
import logging
import os.path as p
from datetime import datetime

import wandb
import torch
import numpy as np
import pandas as pd
from ray import tune
from torchinfo import summary
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

from logger import get_logger
from dkt_dataset import DKTDataset
from utils import get_args, get_criterion, get_optimizer, get_scheduler, set_seeds


class CustomStopper(tune.Stopper):
    """Ray Tune stopper driven by an epoch budget and a target validation AUC.

    A trial stops once it has reported ``args.n_epochs`` iterations. As soon as
    any trial reports a ``valid_auc`` above ``auc_threshold`` the stopper latches
    and every subsequent check (per-trial and experiment-wide) returns True.

    Args:
        args: Configuration object; only ``args.n_epochs`` is read here.
        auc_threshold: Validation AUC that triggers the experiment-wide stop.
            Defaults to 0.83, the value that used to be hard-coded.
    """

    def __init__(self, args, auc_threshold=0.83):
        self.should_stop = False
        self.args = args
        self.auc_threshold = auc_threshold

    def __call__(self, trial_id, result):
        """Return True if the trial that produced ``result`` should stop."""
        # Latch the flag the first time any trial beats the target AUC.
        if not self.should_stop and result["valid_auc"] > self.auc_threshold:
            self.should_stop = True

        return self.should_stop or result["training_iteration"] >= self.args.n_epochs

    def stop_all(self):
        """Return True once the target AUC has been reached by any trial."""
        return self.should_stop


class DKTTrainer:
    def __init__(self, args, Model):
        """Merge ``args`` over the project defaults and prepare the log directory.

        Args:
            args: Mapping of overrides applied on top of ``get_args()``.
            Model: Callable that builds a model from the merged args.
        """
        merged_args = get_args()
        merged_args.update(**args)

        self.args = merged_args
        self.create_model = Model

        # Creates the LOG_* output directory for this trainer instance.
        self._helper_init()

    def _helper_init(self):
        self.prefix_save_path = datetime.now().strftime("[%m.%d_%H:%M]")
        self.prefix_save_path = p.join(self.args.root_dir, f"LOG_{self.prefix_save_path}")

        os.mkdir(self.prefix_save_path)

    def _save_config(self, args, filename="run_config.json"):
        save_path = p.join(self.prefix_save_path, filename)

        with open(save_path, "w") as writer:
            writer.write(json.dumps(args, indent=4, ensure_ascii=False) + "\n")

    def _get_model(self):
        model = self.create_model(self.args).to(self.args.device)
        return model

    def _update_params(self, loss, model, optimizer):
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), self.args.clip_grad)
        optimizer.step()
        optimizer.zero_grad()

    def _collate_fn(self, batches):
        """ key값으로 batch 형성 """
        new_batches = {k: [] for k in batches[0].keys()}

        max_seq_len = 20

        # batch의 값들을 각 column끼리 그룹화
        for k in batches[0].keys():
            for batch in batches:
                pre_padded = torch.zeros(max_seq_len)
                pre_padded[-len(batch[k]) :] = batch[k]
                new_batches[k].append(pre_padded)

        for k in batches[0].keys():
            new_batches[k] = torch.stack(new_batches[k])

        return new_batches

    def _get_loaders(self, train_data, valid_data):
        """Wrap both splits in ``DKTDataset`` and return ``(train_loader, valid_loader)``.

        The two loaders share every setting except shuffling: the training
        loader shuffles, the validation loader does not.
        """
        trainset = DKTDataset(train_data, self.args, self.args.columns)
        validset = DKTDataset(valid_data, self.args, self.args.columns)

        shared_options = dict(
            num_workers=self.args.num_workers,
            batch_size=self.args.batch_size,
            pin_memory=True,
            collate_fn=self._collate_fn,
        )

        train_loader = torch.utils.data.DataLoader(trainset, shuffle=True, **shared_options)
        valid_loader = torch.utils.data.DataLoader(validset, shuffle=False, **shared_options)

        return train_loader, valid_loader

    def _to_numpy(self, preds):
        if self.args.device == "cuda":
            preds = preds.to("cpu").detach().numpy()
        else:  # cpu
            preds = preds.detach().numpy()
        return preds

    def _save_model(self, model, prefix=None):
        save_path = p.join(self.args.root_dir, self.prefix_save_path)
        assert p.exists(save_path), f"{save_path} does not exist"

        # get original model if use torch.nn.DataParallel
        model = model.module if hasattr(model, "module") else model
        save_path = f"{save_path}/{prefix}_model.pth" if prefix else f"{save_path}/model.pth"
        torch.save(model.state_dict(), save_path)

    def _load_model(self, prefix=None):
        load_path = p.join(self.args.root_dir, self.prefix_save_path)
        load_path = f"{load_path}/{prefix}_model.pth" if prefix else f"{load_path}/model.pth"
        assert p.exists(load_path), f"{load_path} does not exist"

        model = self._get_model()
        # strict=False, 일치하지 않는 키들을 무시
        model.load_state_dict(torch.load(load_path), strict=False)
        return model

    def _get_metric(self, targets, preds):
        """Return ``(auc, acc)`` for probability predictions against binary targets.

        Accuracy is computed after thresholding the probabilities at 0.5.
        """
        auc = roc_auc_score(targets, preds)

        hard_preds = np.where(preds >= 0.5, 1, 0)
        acc = accuracy_score(targets, hard_preds)

        return auc, acc

    def _compute_loss(self, preds, targets):
        """Return the mean loss over the batch, using only the last sequence position."""
        elementwise = get_criterion(preds, targets)

        # Only the final position of each sequence contributes to the loss.
        last_step = elementwise[:, -1]
        return torch.mean(last_step)

    def _process_batch(self, batch):
        """Hook for subclasses: convert a collated batch into model-ready tensors.

        The base class provides no implementation and always raises
        ``NotImplementedError``.
        """
        raise NotImplementedError

    def _hyper(self, checkpoint_dir):
        """Training function handed to ``tune.run``: train, report, checkpoint, forever.

        Loops until Tune stops the trial. Each iteration trains one epoch,
        validates, reports the metrics, and writes a checkpoint holding the
        model, optimizer, and scheduler state plus the step counter.

        Args:
            checkpoint_dir: Directory of a checkpoint to resume from, or None
                to start from scratch.

        NOTE(review): Ray's function-trainable API normally calls the function
        as ``fn(config, checkpoint_dir=None)``. Confirm that the value arriving
        in this single positional parameter really is a checkpoint directory.
        """
        step = 0

        model = self._get_model()
        optimizer = get_optimizer(model, self.args)
        scheduler = get_scheduler(optimizer, self.args)

        if checkpoint_dir is not None:
            # Build the path only when resuming; p.join(None, ...) raises TypeError.
            checkpoint = torch.load(p.join(checkpoint_dir, "checkpoint"))
            model.load_state_dict(checkpoint["model"])
            optimizer.load_state_dict(checkpoint["optimizer"])
            scheduler.load_state_dict(checkpoint["scheduler"])
            step = checkpoint["step"]

        while True:
            train_auc, train_acc, train_loss = self._train(model, self.train_loader, optimizer)
            valid_auc, valid_acc, _, _ = self._validate(model, self.valid_loader)

            tune.report(
                valid_auc=valid_auc,
                valid_acc=valid_acc,
                train_auc=train_auc,
                train_acc=train_acc,
                train_loss=train_loss,
            )

            step += 1

            with tune.checkpoint_dir(step=step) as save_dir:
                # Write into the directory Tune just handed out, not the one the
                # trial was restored from.
                torch.save(
                    {
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                        "scheduler": scheduler.state_dict(),
                        "step": step,
                    },
                    p.join(save_dir, "checkpoint"),
                )

    def hyper(self, args, tune_args, train_data, valid_data):
        """Run a Population Based Training search with Ray Tune and return its analysis.

        Args:
            args: Not read by this method.
            tune_args: Keyword arguments forwarded to ``PopulationBasedTraining``.
            train_data: Training split used to build the shared train loader.
            valid_data: Validation split used to build the shared valid loader.

        Returns:
            The object returned by ``tune.run``.
        """
        loaders = self._get_loaders(train_data, valid_data)
        self.train_loader, self.valid_loader = loaders

        pbt_scheduler = tune.schedulers.PopulationBasedTraining(
            time_attr="training_iteration", **tune_args
        )
        stopper = CustomStopper(self.args)

        run_options = dict(
            name="pbt_lstm",
            stop=stopper,
            max_failures=3,
            num_samples=4,
            metric="",
            scheduler=pbt_scheduler,
            keep_checkpoints_num=2,
            local_dir="~/ray_results",
            checkpoint_score_attr="max-valid_auc",
            resources_per_trial={"cpu": 3, "gpu": 1},
            config=self.args,  # custom search algorithm may ignore this
        )

        return tune.run(self._hyper, **run_options)

    def _train(self, model, train_loader, optimizer):
        """Train for one epoch and return ``(auc, acc, mean_loss)``.

        Metrics use only the last position of every sequence. The mean loss is
        returned as a plain Python float. Earlier versions accumulated the loss
        tensors themselves, which kept them alive for the whole epoch and handed
        a tensor to the logging and reporting callers.

        Args:
            model: Module to train; switched to train mode here.
            train_loader: Iterable of collated batches.
            optimizer: Optimizer stepped once per batch.
        """
        model.train()

        total_preds, total_targets = [], []
        losses = []

        for step, batch in enumerate(train_loader):
            batch = self._process_batch(batch)
            preds = model(batch)
            targets = batch["answerCode"]  # correct

            loss = self._compute_loss(preds, targets)
            self._update_params(loss, model, optimizer)

            # Convert once to a float so neither logging nor the running
            # average holds on to the tensor.
            loss_value = loss.item()

            if step % self.args.log_steps == 0:
                print(f"Training steps: {step} Loss: {str(loss_value)}")
                wandb.log({"step_train_loss": loss_value})

            # .cpu() is a no-op on CPU tensors, so this works for any device spec.
            total_preds.append(preds[:, -1].detach().cpu().numpy())
            total_targets.append(targets[:, -1].detach().cpu().numpy())
            losses.append(loss_value)

        total_preds = np.concatenate(total_preds)
        total_targets = np.concatenate(total_targets)

        # Train AUC / ACC
        auc, acc = self._get_metric(total_targets, total_preds)
        loss_avg = sum(losses) / len(losses)

        return auc, acc, loss_avg

    def _validate(self, model, valid_loader):
        """Evaluate ``model`` on ``valid_loader``.

        Runs under ``torch.no_grad()`` so no autograd graph is built during
        evaluation. Only the last position of each sequence is scored.

        Returns:
            Tuple ``(auc, acc, total_preds, total_targets)`` where the last two
            are concatenated NumPy arrays of last-step predictions and targets.
        """
        model.eval()

        total_preds = []
        total_targets = []

        with torch.no_grad():
            for batch in valid_loader:
                batch = self._process_batch(batch)

                preds = model(batch)
                targets = batch["answerCode"]  # correct

                # Keep only the prediction/target at the final position.
                # .cpu() is a no-op on CPU tensors, so this works for any device spec.
                total_preds.append(preds[:, -1].detach().cpu().numpy())
                total_targets.append(targets[:, -1].detach().cpu().numpy())

        total_preds = np.concatenate(total_preds)
        total_targets = np.concatenate(total_targets)

        # Valid AUC / ACC
        auc, acc = self._get_metric(total_targets, total_preds)
        print(f"VALID AUC : {auc} ACC : {acc}\n")

        return auc, acc, total_preds, total_targets

    def _inference(self, test_data, prefix=None):
        """Predict the rows to be scored and write ``{prefix}_test_results.csv``.

        Loads the checkpoint saved under ``prefix``, runs it over ``test_data``
        without gradient tracking, and keeps only samples whose last
        ``answerCode`` equals -1. The CSV has the columns ``id,prediction``,
        with ids assigned in iteration order.
        """
        model = self._load_model(prefix)  # best checkpoint saved under this prefix
        model.eval()

        # Only the non-shuffled loader is used for inference.
        _, test_loader = self._get_loaders(test_data, test_data)

        total_proba_preds = []

        with torch.no_grad():
            for batch in test_loader:
                batch = self._process_batch(batch)

                # Rows whose final answer is -1 are the ones to predict.
                fancy_index = torch.where(batch["answerCode"][:, -1] == -1)
                if fancy_index[0].size(0) == 0:
                    continue

                for k in batch.keys():
                    batch[k] = batch[k][fancy_index]

                preds = model(batch)
                preds = preds[:, -1]

                total_proba_preds.extend(self._to_numpy(preds))

        write_path = os.path.join(self.prefix_save_path, f"{prefix}_test_results.csv")

        with open(write_path, "w", encoding="utf8") as w:
            w.write("id,prediction\n")
            for idx, proba in enumerate(total_proba_preds):
                w.write(f"{idx},{proba}\n")

    def debug(self, train_data, valid_data, test_data):
        """Smoke-test the inputs and outputs, logging to ``debug.log`` in the log directory.

        1. Logs the model summary.
        2. Checks that a single example is produced correctly for each split.
        3. Checks that batches are produced correctly.
        4. Checks the forward pass.
        5. Checks loss computation and prediction metrics.
        """
        debug_file_handler = logging.FileHandler(f"{self.prefix_save_path}/debug.log")
        logger = get_logger("debug")
        logger.setLevel(logging.INFO)
        # NOTE(review): the handler is never removed, so calling debug() more than
        # once presumably duplicates log lines -- confirm against get_logger.
        logger.addHandler(debug_file_handler)

        model = self._get_model()
        logger.info("MODEl SUMMARY\n")
        logger.info(summary(model))

        logger.info("\nCHECK DATASET")

        # Log the first 10 values of every column of the first example per split.
        for dataset, name in zip([train_data, valid_data, test_data], ["TRAIN", "VALID", "TEST"]):
            logger.info(f"\n{name} EXAMPLES")
            for column, data in zip(self.args.columns, dataset[0]):
                logger.info(f"{column} : {data[:10]}")

        train_loader, valid_loader = self._get_loaders(train_data, valid_data)
        # Reuse the non-shuffled loader slot for the test split.
        _, test_loader = self._get_loaders(test_data, test_data)

        logger.info("\nCHECK BATCH SHAPE")
        for data_loader, name in zip([train_loader, test_loader, valid_loader], ["TRAIN", "TEST", "VALID"]):
            batch = next(iter(data_loader))
            logger.info(f"\n{name} BATCH TYPE : {type(batch)}")
            logger.info(f"\n{name} BATCH LEN : {len(batch)}")
            logger.info(f"\n{name} BATCH DICT VALUE SHAPE : {batch['answerCode'].shape}")

        logger.info("\nCHECK MODEL FORWARD")

        # `batch` is whatever the loop above left behind, i.e. the first VALID batch.
        batch = self._process_batch(batch)
        preds = model(batch)

        logger.info(f"\nPREDS SHAPE: {preds.shape}")
        logger.info(f"\nPREDS EXAMPLES: {preds[0]}")

        logger.info("\nCHECK METRICS")

        gt = batch["answerCode"]
        loss = self._compute_loss(preds, gt)

        logger.info(f"\nLOSS : {loss.item()}")

        # Metrics are computed on the last position of every sequence only.
        auc, acc = self._get_metric(self._to_numpy(gt[:, -1]), self._to_numpy(preds[:, -1]))
        logger.info(f"AUC: {auc} ACC: {acc}")

    def run(self, train_data, valid_data, test_data, prefix="run"):
        """Train with early stopping, save the best checkpoint, then run inference.

        Args:
            train_data: Training split.
            valid_data: Validation split used for model selection.
            test_data: Data passed to ``_inference`` after training.
            prefix: Tag used for the log file, the checkpoint, and the
                prediction CSV.

        Returns:
            Tuple ``(best_auc, best_acc)`` of the best validation epoch.
        """
        self._save_config(self.args)
        set_seeds(self.args.seed)

        run_file_handler = logging.FileHandler(f"{self.prefix_save_path}/{prefix}.log")
        logger = get_logger("run")
        logger.setLevel(logging.DEBUG)
        logger.addHandler(run_file_handler)

        try:
            model = self._get_model()
            wandb.init(project="p-stage-4", reinit=True)
            wandb.config.update(self.args)
            wandb.watch(model)
            wandb.run.name = f"{self.prefix_save_path}_{prefix}"

            train_loader, valid_loader = self._get_loaders(train_data, valid_data)

            self.args.total_steps = int(len(train_loader.dataset) / self.args.batch_size) * (self.args.n_epochs)
            self.args.warmup_steps = self.args.total_steps // 10

            if self.args.scheduler == "linear_warmup":
                self.args.scheduler_hp = {
                    "num_training_steps": self.args.total_steps,
                    "num_warmup_steps": self.args.warmup_steps,
                }

            optimizer = get_optimizer(model, self.args)
            scheduler = get_scheduler(optimizer, self.args)

            best_auc, best_acc = -1, -1
            early_stopping_counter = 0

            for epoch in range(self.args.n_epochs):
                logger.info(f"Start Training: Epoch {epoch + 1}")

                train_auc, train_acc, train_loss = self._train(model, train_loader, optimizer)
                valid_auc, valid_acc, _, _ = self._validate(model, valid_loader)

                wandb.log(
                    {
                        "epoch": epoch,
                        "train_loss": train_loss,
                        "train_auc": train_auc,
                        "train_acc": train_acc,
                        "valid_auc": valid_auc,
                        "valid_acc": valid_acc,
                    }
                )

                logger.info(f"TRAIN_LOSS: {train_loss}")
                logger.info(f"TRAIN AUC: {train_auc} TRAIN ACC: {train_acc}")
                logger.info(f"VALID AUC: {valid_auc} VALID ACC: {valid_acc}\n")

                if valid_auc > best_auc:
                    # New best epoch: checkpoint it and reset the patience counter.
                    best_auc, best_acc = valid_auc, valid_acc
                    self._save_model(model, prefix)
                    early_stopping_counter = 0
                else:
                    early_stopping_counter += 1
                    logger.info(f"EarlyStopping counter: {early_stopping_counter}")
                    if early_stopping_counter >= self.args.patience:
                        logger.info(f"EarlyStopping counter: {early_stopping_counter} out of {self.args.patience}")
                        break

                if self.args.scheduler == "plateau":
                    scheduler.step(best_auc)
                else:
                    scheduler.step()
        finally:
            # Detach this run's file handler. Otherwise every later run (e.g. the
            # next CV fold) would also write into this run's log file, and the
            # file handle would stay open.
            logger.removeHandler(run_file_handler)
            run_file_handler.close()

        self._inference(test_data, prefix)
        return best_auc, best_acc

    def run_cv(self, train_data, valid_data, test_data, test_size:float, folds: int, seeds: list):
        """Repeat ``run`` over several random train/valid splits and average the predictions.

        Train and valid data are pooled, then re-split once per seed with
        ``train_test_split``. Per-fold validation scores are written to
        ``valid_cv_results.json`` and the mean of the per-fold test predictions
        to ``cv_ensemble_test_results.csv``.

        Args:
            train_data: Training split (pooled with ``valid_data``).
            valid_data: Validation split (pooled with ``train_data``).
            test_data: Data predicted after every fold.
            test_size: Fraction of the pooled data held out for validation per fold.
            folds: Number of folds; must equal ``len(seeds)``.
            seeds: One random seed per fold.
        """
        assert folds == len(seeds), "fold와 len(seeds)는 같은 수여야 합니다."

        total_data = np.concatenate([train_data, valid_data])
        self.args.seeds = seeds

        valid_results = {}

        for n_fold, seed in enumerate(seeds):
            self.args.seed = seed
            # TODO: if per-user patterns are learned, this random split can leak data.
            fold_train, fold_valid = train_test_split(total_data, test_size=test_size, random_state=seed)
            prefix = f"cv_{n_fold}"

            best_auc, best_acc = self.run(fold_train, fold_valid, test_data, prefix=prefix)
            valid_results[prefix] = f"best_auc:{best_auc},best_acc:{best_acc}"

        self._save_config(valid_results, "valid_cv_results.json")

        fold_frames = [
            pd.read_csv(p.join(self.prefix_save_path, f"cv_{idx}_test_results.csv"))
            for idx in range(folds)
        ]

        new_df = pd.DataFrame(
            {
                "id": fold_frames[0]["id"],
                "prediction": sum(frame["prediction"] for frame in fold_frames) / folds,
            }
        )
        # index=False keeps the file to the two columns id,prediction; writing the
        # index added a stray unnamed first column.
        new_df.to_csv(p.join(self.prefix_save_path, "cv_ensemble_test_results.csv"), index=False)

In [10]:
import torch
import torch.nn as nn

class EmbeddingLayer(nn.Module):
    """Embed the interaction feature and every categorical column, then concatenate.

    ``hidden_dim`` is divided evenly among the categorical columns listed in
    ``args.n_embeddings`` plus the interaction feature; the interaction
    embedding also takes the remainder, so the concatenated width is exactly
    ``hidden_dim``.
    """

    def __init__(self, args, hidden_dim):
        super().__init__()

        self.args = args
        self.device = args.device
        self.hidden_dim = hidden_dim

        n_labels = len(self.args.n_embeddings)
        labels_dim = self.hidden_dim // (n_labels + 1)
        interaction_dim = self.hidden_dim - labels_dim * n_labels

        # The interaction feature takes three distinct index values.
        self.embedding_interaction = nn.Embedding(3, interaction_dim)

        label_embeddings = {}
        for name, n_classes in self.args.n_embeddings.items():
            # plus 1 for padding
            label_embeddings[name] = nn.Embedding(n_classes + 1, labels_dim)
        self.embeddings = nn.ModuleDict(label_embeddings)

    def forward(self, batch):
        """Return the embeddings of ``batch`` concatenated along dimension 2."""
        pieces = [self.embedding_interaction(batch["interaction"])]
        for name in self.args.n_embeddings.keys():
            pieces.append(self.embeddings[name](batch[name]))
        return torch.cat(pieces, 2)


class LinearLayer(nn.Module):
    """Project the continuous columns named in ``args.n_linears`` to ``hidden_dim``."""

    def __init__(self, args, hidden_dim):
        super().__init__()

        self.args = args
        self.device = args.device
        self.hidden_dim = hidden_dim

        # One input feature per continuous column.
        n_continuous = len(self.args.n_linears)
        self.fc_layer = nn.Linear(n_continuous, self.hidden_dim)

    def forward(self, batch):
        """Stack the continuous columns as the last axis and apply the linear layer."""
        columns = [batch[name] for name in self.args.n_linears]
        # Stacking puts the column axis first; permute moves it to the end.
        stacked = torch.stack(columns).permute(1, 2, 0)
        return self.fc_layer(stacked)


class LSTM(nn.Module):
    """LSTM sequence model that outputs one sigmoid probability per position.

    Categorical and continuous features are each encoded to
    ``hidden_dim // 2`` channels, concatenated, projected by a linear layer,
    and fed to a batch-first LSTM followed by a single-unit linear head.
    """

    def __init__(self, args):
        super().__init__()
        self.args = args
        self.device = args.device

        self.hidden_dim = self.args.hidden_dim
        self.n_layers = self.args.n_layers

        half_dim = self.hidden_dim // 2
        self.emb_layer = EmbeddingLayer(args, half_dim)
        self.nli_layer = LinearLayer(args, half_dim)

        self.comb_proj = nn.Linear(self.hidden_dim, self.hidden_dim)

        self.lstm = nn.LSTM(self.hidden_dim, self.hidden_dim, self.n_layers, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(self.hidden_dim, 1)
        self.activation = nn.Sigmoid()

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(h, c)`` states on the model's device."""
        state_shape = (self.n_layers, batch_size, self.hidden_dim)

        h = torch.zeros(*state_shape).to(self.device)
        c = torch.zeros(*state_shape).to(self.device)

        return (h, c)

    def forward(self, batch):
        """Return predictions reshaped to ``(batch_size, -1)``."""
        batch_size = batch["interaction"].size(0)

        categorical = self.emb_layer(batch)
        continuous = self.nli_layer(batch)

        combined = torch.cat([categorical, continuous], 2)
        X = self.comb_proj(combined)

        initial_state = self.init_hidden(batch_size)
        out, _ = self.lstm(X, initial_state)
        out = out.contiguous().view(batch_size, -1, self.hidden_dim)

        logits = self.fc(out)
        return self.activation(logits).view(batch_size, -1)

    

class LSTMTrainer(DKTTrainer):
    """Trainer whose batch preparation matches the inputs of the ``LSTM`` model."""

    def _process_batch(self, batch):
        """Cast columns, derive the ``interaction`` feature, and move tensors to the device.

        ``interaction`` is ``answerCode + 1`` shifted one step to the right and
        multiplied by the equally shifted mask, so position ``t`` carries the
        answer given at ``t - 1`` and position 0 is always 0.
        """
        for name in ('mask', 'answerCode', 'correctPer', 'timeSec'):
            batch[name] = batch[name].type(torch.FloatTensor)

        shifted_answers = (batch['answerCode'] + 1).roll(shifts=1, dims=1)
        shifted_mask = batch['mask'].roll(shifts=1, dims=1)
        # The value rolled in from the end of the sequence is not a real predecessor.
        shifted_mask[:, 0] = 0

        batch['mask'] = shifted_mask
        batch['interaction'] = (shifted_answers * shifted_mask).to(torch.int64)

        for name in ('testPaper', 'firstClass', 'secondClass'):
            batch[name] = batch[name].to(torch.int64)

        for name in batch.keys():
            batch[name] = batch[name].to(self.args.device)

        return batch

In [11]:
# Model input columns: everything in `columns` except the leading 'userID'.
args.columns = columns[1:]
# Hyperparameters for this experiment.
args.hidden_dim = 512
args.n_epochs = 20
args.lr = 0.000144
args.batch_size = 60
args.n_layers = 2
args.weight_decay = 0.00096

In [12]:
trainer = LSTMTrainer(args, LSTM)

In [None]:
# Five random re-splits (one seed per fold), each holding out half of the pooled
# train+valid data for validation.
trainer.run_cv(train_dataset, valid_dataset, test_dataset, 
               test_size=0.5,
               folds=5,
               seeds=[0, 1, 2, 3, 4]
              )

In [14]:
def metric_result(prefix_path, fold):
    """Print each fold's stored result and return the summed best AUC divided by ``fold``.

    Reads ``valid_cv_results.json`` from ``prefix_path``. Every value in that
    file is a string of the form ``"best_auc:<float>,best_acc:<float>"``.

    Args:
        prefix_path: Log directory that contains ``valid_cv_results.json``.
        fold: Divisor applied to the AUC sum.
    """
    print(prefix_path)

    with open(f"{prefix_path}/valid_cv_results.json", "r") as f:
        fold_results = json.load(f)

    auc_sum = 0
    for summary_text in fold_results.values():
        print(summary_text)
        # First comma-separated field is "best_auc:<value>".
        auc_field = summary_text.split(",")[0]
        auc = float(auc_field.split(":")[1])
        print(auc)
        auc_sum += auc

    return auc_sum / fold

**(folds=5, test_size=0.5)**

In [16]:
metric_result(trainer.prefix_save_path, 5)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.08_06:02]
best_auc:0.7661859024091989,best_acc:0.7038500506585613
0.7661859024091989
best_auc:0.7697012223980025,best_acc:0.7083080040526849
0.7697012223980025
best_auc:0.7695822663689388,best_acc:0.707193515704154
0.7695822663689388
best_auc:0.766073944882736,best_acc:0.7004052684903749
0.766073944882736
best_auc:0.7703990192142419,best_acc:0.7065856129685917
0.7703990192142419


0.7683884710546237

**(folds=3, test_size=0.1)**

'/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]'

In [47]:
metric_result(trainer.prefix_save_path, 3)

best_auc:0.7787203082239915,best_acc:0.7031408308004052
0.7787203082239915
best_auc:0.7568773386852548,best_acc:0.6930091185410334
0.7568773386852548
best_auc:0.7832042699233832,best_acc:0.7173252279635258
0.7832042699233832


0.7729339722775431

**(folds=3, test_size=0.2)**

In [51]:
metric_result(trainer.prefix_save_path, 3)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7711663795329549,best_acc:0.7021276595744681
0.7711663795329549
best_auc:0.7694992957225927,best_acc:0.708966565349544
0.7694992957225927
best_auc:0.7688831714676145,best_acc:0.705420466058764
0.7688831714676145


0.7698496155743874

**(folds=3, test_size=0.3)**

In [53]:
metric_result(trainer.prefix_save_path, 3)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7716193510849351,best_acc:0.706855791962175
0.7716193510849351
best_auc:0.7674919575633128,best_acc:0.7007767646065518
0.7674919575633128
best_auc:0.7670384501490044,best_acc:0.707531239446133
0.7670384501490044


0.7687165862657507

**(folds=5, test_size=0.1)**

In [55]:
metric_result(trainer.prefix_save_path, 5)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7787203082239915,best_acc:0.7031408308004052
0.7787203082239915
best_auc:0.7568773386852548,best_acc:0.6930091185410334
0.7568773386852548
best_auc:0.7832042699233832,best_acc:0.7173252279635258
0.7832042699233832
best_auc:0.763088549745266,best_acc:0.6990881458966566
0.763088549745266
best_auc:0.7603983572895276,best_acc:0.7001013171225937
0.7603983572895276


0.7684577647734846

**(folds=5, test_size=0.2)**

In [58]:
metric_result(trainer.prefix_save_path, 5)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7711662510614954,best_acc:0.7021276595744681
0.7711662510614954
best_auc:0.7694992957225927,best_acc:0.708966565349544
0.7694992957225927
best_auc:0.7688831714676148,best_acc:0.705420466058764
0.7688831714676148
best_auc:0.7611421154394541,best_acc:0.7008611955420466
0.7611421154394541
best_auc:0.7683923069322051,best_acc:0.7031408308004052
0.7683923069322051


0.7678166281246723

**(folds=5, test_size=0.3)**

In [60]:
metric_result(trainer.prefix_save_path, 5)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7716193510849351,best_acc:0.706855791962175
0.7716193510849351
best_auc:0.7674919575633128,best_acc:0.7007767646065518
0.7674919575633128
best_auc:0.7670384501490044,best_acc:0.707531239446133
0.7670384501490044
best_auc:0.761875755153294,best_acc:0.698919284025667
0.761875755153294
best_auc:0.7712231208940146,best_acc:0.7060114826072272
0.7712231208940146


0.7678497269689121

**(folds=10, test_size=0.1)**

In [62]:
metric_result(trainer.prefix_save_path, 10)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7787203082239915,best_acc:0.7031408308004052
0.7787203082239915
best_auc:0.7568773386852548,best_acc:0.6930091185410334
0.7568773386852548
best_auc:0.7832042699233832,best_acc:0.7173252279635258
0.7832042699233832
best_auc:0.763088549745266,best_acc:0.6990881458966566
0.763088549745266
best_auc:0.7603983572895276,best_acc:0.7001013171225937
0.7603983572895276
best_auc:0.7687280830111568,best_acc:0.7102330293819655
0.7687280830111568
best_auc:0.7894953798767967,best_acc:0.7264437689969605
0.7894953798767967
best_auc:0.7855520736077601,best_acc:0.7137791286727457
0.7855520736077601
best_auc:0.7867072490505839,best_acc:0.7137791286727457
0.7867072490505839
best_auc:0.76207450000411,best_acc:0.7036474164133738
0.76207450000411


0.773484610941783

**(folds=10, test_size=0.2)**

In [64]:
metric_result(trainer.prefix_save_path, 10)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7711662510614954,best_acc:0.7021276595744681
0.7711662510614954
best_auc:0.7694992957225927,best_acc:0.708966565349544
0.7694992957225927
best_auc:0.7688831714676145,best_acc:0.705420466058764
0.7688831714676145
best_auc:0.761142115439454,best_acc:0.7008611955420466
0.761142115439454
best_auc:0.7683923069322051,best_acc:0.7031408308004052
0.7683923069322051
best_auc:0.772597449538329,best_acc:0.7102330293819655
0.772597449538329
best_auc:0.7781345440173286,best_acc:0.7193515704154002
0.7781345440173286
best_auc:0.7746736038092428,best_acc:0.7074468085106383
0.7746736038092428
best_auc:0.7780290602025899,best_acc:0.7079533941236069
0.7780290602025899
best_auc:0.7647608510883205,best_acc:0.7084599797365755
0.7647608510883205


0.7707278649279171

**(folds=10, test_size=0.3)**

In [66]:
metric_result(trainer.prefix_save_path, 10)

/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]
best_auc:0.7716193510849351,best_acc:0.706855791962175
0.7716193510849351
best_auc:0.7674919575633128,best_acc:0.7007767646065518
0.7674919575633128
best_auc:0.7670384501490044,best_acc:0.707531239446133
0.7670384501490044
best_auc:0.761875755153294,best_acc:0.698919284025667
0.761875755153294
best_auc:0.7712231208940146,best_acc:0.7060114826072272
0.7712231208940146
best_auc:0.7697955602890236,best_acc:0.7012833502195205
0.7697955602890236
best_auc:0.7769459937007558,best_acc:0.7102330293819655
0.7769459937007558
best_auc:0.7756501346153429,best_acc:0.7051671732522796
0.7756501346153429
best_auc:0.7749429001964473,best_acc:0.704322863897332
0.7749429001964473
best_auc:0.7714736603336819,best_acc:0.7066869300911854
0.7714736603336819


0.7708056883979812

In [71]:
# Load the ensemble predictions of an earlier run. That file carries a stray
# 'Unnamed: 0' column (a saved DataFrame index), which is used as the index here
# so that only id/prediction remain as data columns.
df = pd.read_csv(
    "../models/lstm/hyper_test/LOG_[06.07_16:15]/cv_ensemble_test_results.csv",
    index_col=['Unnamed: 0']
)

In [72]:
df.head()

Unnamed: 0,id,prediction
0,0,0.589862
1,1,0.594878
2,2,0.262907
3,3,0.788662
4,4,0.433151


In [75]:
trainer.prefix_save_path

'/home/j-gunmo/desktop/00.my-project/17.P-Stage-T1003/4-STAGE/models/lstm/hyper_test/LOG_[06.07_16:15]'

In [74]:
# Write the cleaned predictions without the index, under a new file name
# ("cv_ensembles_...") so the original file is left untouched.
df.to_csv(
    "../models/lstm/hyper_test/LOG_[06.07_16:15]/cv_ensembles_test_results.csv",
    index=False
)

In [73]:
df.to_csv??