## Import

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

import os
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import random

import warnings
warnings.filterwarnings(action='ignore') 

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


## Hyperparameter setting

In [None]:
# Global training configuration shared by the teacher and the student runs.
CFG = {
    'EPOCHS': 30,  # number of epochs looped over by train() and student_train()
    'LEARNING_RATE':1e-2,  # initial learning rate handed to AdamW
    'BATCH_SIZE':256,  # mini-batch size used by every DataLoader in this notebook
    'SEED':41  # seed used by seed_everything() and train_test_split()
}

## Fixed RandomSeed

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to its deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one; a no-op without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune (and potentially change) kernels
    # between runs, which defeats the deterministic flag above.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED'])

## Data Load

In [None]:
# Load the raw CSV files from the working directory.
# NOTE(review): the name `train` is later rebound to the training function
# defined further down; re-running cells out of order will therefore break.
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

## Data Preprocessing
#### 1. 결측치 처리
#### 2. Train / Validation 분할
#### 3. Data label-encoding, scaling

In [None]:
# One-hot encode COMPONENT_ARBITRARY and prepend the indicator columns.
# The public ``pd.get_dummies`` is used instead of importing from the private
# ``pandas.core.reshape`` package, whose layout changes between pandas versions.
temp = pd.get_dummies(train['COMPONENT_ARBITRARY'])
dummy_columns = temp.columns
train = pd.concat([temp, train], axis=1)
train = train.drop('COMPONENT_ARBITRARY', axis=1)

# Align the test indicators to the columns seen in train so both frames expose
# the same one-hot columns even if a category is missing (or new) in test.
temp = pd.get_dummies(test['COMPONENT_ARBITRARY']).reindex(columns=dummy_columns, fill_value=0)
test = pd.concat([temp, test], axis=1)
test = test.drop('COMPONENT_ARBITRARY', axis=1)

In [None]:
train

Unnamed: 0,COMPONENT1,COMPONENT2,COMPONENT3,COMPONENT4,ID,ANONYMOUS_1,YEAR,SAMPLE_TRANSFER_DAY,ANONYMOUS_2,AG,...,U25,U20,U14,U6,U4,V,V100,V40,ZN,Y_LABEL
0,0,0,1,0,TRAIN_00000,1486,2011,7,200,0,...,,,,,,0,,154.0,75,0
1,0,1,0,0,TRAIN_00001,1350,2021,51,375,0,...,2.0,4.0,6.0,216.0,1454.0,0,,44.0,652,0
2,0,1,0,0,TRAIN_00002,2415,2015,2,200,0,...,0.0,3.0,39.0,11261.0,41081.0,0,,72.6,412,1
3,0,0,1,0,TRAIN_00003,7389,2010,2,200,0,...,,,,,,0,,133.3,7,0
4,0,0,1,0,TRAIN_00004,3954,2015,4,200,0,...,,,,,,0,,133.1,128,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14090,0,0,1,0,TRAIN_14090,1616,2014,8,200,0,...,,,,,,0,,135.4,16,0
14091,1,0,0,0,TRAIN_14091,2784,2013,2,200,0,...,,,,,,0,14.5,117.5,1408,0
14092,0,0,1,0,TRAIN_14092,1788,2008,9,550,0,...,,,,,,0,,54.0,1301,0
14093,0,1,0,0,TRAIN_14093,2498,2009,19,550,0,...,7.0,8.0,100.0,1625.0,18890.0,0,,44.3,652,0


In [None]:
# Columns handled as categorical: they are label-encoded instead of being
# standard-scaled in the preprocessing loops below.
# NOTE(review): 'COMPONENT_ARBITRARY' is dropped from train/test by the
# one-hot cell earlier in this notebook — confirm which encoding is intended.
categorical_features = ['COMPONENT_ARBITRARY', 'YEAR']


# Columns used at inference time (the real diagnostic environment).
test_stage_features = ['COMPONENT_ARBITRARY', 'ANONYMOUS_1', 'YEAR' , 'ANONYMOUS_2', 'AG', 'CO', 'CR', 'CU', 'FE', 'H2O', 'MN', 'MO', 'NI', 'PQINDEX', 'TI', 'V', 'V40', 'ZN']




In [None]:
# Mean imputation (disabled):
#train = train.fillna(train.mean())
#test = test.fillna(test.mean())

# Missing values are replaced with 0 in both frames.
train = train.fillna(0)
test = test.fillna(0)

In [None]:
# Separate features from the target; 'ID' is an identifier, not a feature.
all_X = train.drop(['ID', 'Y_LABEL'], axis = 1)
all_y = train['Y_LABEL']

test = test.drop(['ID'], axis = 1)

# 80/20 train/validation split, stratified on the label and seeded from CFG.
train_X, val_X, train_y, val_y = train_test_split(all_X, all_y, test_size=0.2, random_state=CFG['SEED'], stratify=all_y)

In [None]:
def get_values(value):
    """Return the underlying values of ``value`` as a single-column 2-D array."""
    raw = value.values
    return np.reshape(raw, (-1, 1))

# Standard-scale every non-categorical column. The scaler is fitted on the
# training split only and then applied to validation (and test, when the
# column exists there).
for col in train_X.columns:
    if col in categorical_features:
        continue
    scaler = StandardScaler()
    train_X[col] = scaler.fit_transform(get_values(train_X[col]))
    val_X[col] = scaler.transform(get_values(val_X[col]))
    if col in test.columns:
        test[col] = scaler.transform(get_values(test[col]))

# Label-encode the categorical columns, fitting on the training split only.
le = LabelEncoder()
for col in categorical_features:
    # A categorical column may already have been removed (e.g. one-hot encoded
    # and dropped earlier in the notebook); indexing it would raise KeyError.
    if col not in train_X.columns:
        continue
    train_X[col] = le.fit_transform(train_X[col])
    val_X[col] = le.transform(val_X[col])
    if col in test.columns:
        test[col] = le.transform(test[col])
        


## CustomDataset

In [None]:


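# __len__
# The __len__ method returns the number of samples in the dataset.
"""
The __getitem__ method loads and returns the sample at the given index idx.
(The original note was copied from the PyTorch image-dataset tutorial; in this notebook a sample
is one row of the tabular feature DataFrame, not an image.) Given the index, it converts that row
into a tensor, fetches the matching label from data_y when labels are available, and returns
them together as a tuple (not as a dict).
"""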

class CustomDataset(Dataset):
    """Tabular dataset serving teacher-only, distillation or unlabeled samples.

    The feature frame is converted to float32 NumPy arrays once, at
    construction time, so ``__getitem__`` is a cheap array lookup instead of
    re-slicing the DataFrame for every sample. Later mutations of ``data_X``
    are therefore not reflected in the samples.

    Args:
        data_X: pandas DataFrame with one row per sample (all teacher features).
        data_y: pandas Series of labels, or ``None`` for unlabeled (test) data.
        distillation: When True, each item also carries the student's feature
            subset: ``(teacher_X, student_X, y)``.
        student_features: Column names fed to the student. Defaults to the
            module-level ``test_stage_features`` list, read at construction.
    """

    def __init__(self, data_X, data_y, distillation=False, student_features=None):
        super().__init__()
        self.data_X = data_X
        self.data_y = data_y
        self.distillation = distillation

        # Cached array views of the data; float32 matches torch's default dtype.
        self._features = data_X.to_numpy(dtype=np.float32)
        self._labels = None if data_y is None else data_y.values
        self._student = None
        if distillation:
            if student_features is None:
                student_features = test_stage_features
            self._student = data_X[list(student_features)].to_numpy(dtype=np.float32)

    def __len__(self):
        """Return the number of samples (rows of ``data_X``)."""
        return len(self.data_X)

    def __getitem__(self, index):
        """Return the sample at ``index``.

        Returns:
            ``(teacher_X, student_X, y)`` in distillation mode,
            ``teacher_X`` alone when no labels were given,
            otherwise ``(teacher_X, y)``.
        """
        # torch.tensor copies, so callers cannot mutate the cached arrays.
        teacher_X = torch.tensor(self._features[index])
        if self.distillation:
            # Knowledge-distillation training: teacher and student inputs.
            student_X = torch.tensor(self._student[index])
            return teacher_X, student_X, self._labels[index]
        if self._labels is None:
            return teacher_X
        return teacher_X, self._labels[index]

In [None]:
# Teacher-only datasets (distillation=False): each item is (features, label).
train_dataset = CustomDataset(train_X, train_y, False)
val_dataset = CustomDataset(val_X, val_y, False)
# Create the dataset objects

# DataLoader
DataLoader로 학습용 데이터 준비하기
Dataset 은 데이터셋의 특징(feature)을 가져오고 하나의 샘플에 정답(label)을 지정하는 일을 한 번에 합니다.
모델을 학습할 때, 일반적으로 샘플들을 “미니배치(minibatch)”로 전달하고, 매 에폭(epoch)마다 데이터를 다시 섞어서 과적합(overfit)을 막고,
Python의 multiprocessing 을 사용하여 데이터 검색 속도를 높이려고 합니다.



In [None]:
# Training batches are reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

## Define Teacher Model

In [None]:
class Teacher(nn.Module):
    """MLP teacher that maps the full feature vector to a probability.

    The network ends in a sigmoid, so its output is a probability of shape
    ``(batch, 1)`` suitable for ``nn.BCELoss``.

    Args:
        in_features: Width of the input feature vector (default 52, the value
            the original notebook hard-coded).
    """

    def __init__(self, in_features=52):
        super().__init__()
        # nn.BatchNorm1d(n) is used on purpose: the first positional argument
        # of nn.LazyBatchNorm1d is ``eps``, so LazyBatchNorm1d(256) silently
        # configured eps=256 (visible in the printed model repr) instead of
        # declaring 256 features.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=in_features, out_features=256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Linear(in_features=256, out_features=512),
            nn.BatchNorm1d(512),
            nn.SiLU(),
            nn.Linear(in_features=512, out_features=256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Linear(in_features=256, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the predicted probability for each row of ``x``."""
        return self.classifier(x)

## Teacher Train / Validation

In [None]:
def train(model, optimizer, train_loader, val_loader, scheduler, device):
    """Train ``model`` with BCE loss and return it with its best weights loaded.

    After every epoch the model is scored with ``validation_teacher``; the
    weights of the epoch with the highest validation F1 are snapshotted and
    restored before returning.

    NOTE(review): this function is named ``train``, the same name the notebook
    uses for the training DataFrame, so defining it rebinds that name.

    Args:
        model: Network whose output is a probability of shape (batch, 1).
        optimizer: Optimizer over ``model``'s parameters.
        train_loader: Loader yielding ``(X, y)`` batches for training.
        val_loader: Loader yielding ``(X, y)`` batches for validation.
        scheduler: Optional scheduler stepped with the validation F1, or None.
        device: Device the model and batches are moved to.

    Returns:
        ``model`` with the best-scoring weights loaded, or ``None`` if no epoch
        achieved a validation F1 above 0.
    """
    import copy  # local import keeps this cell self-contained

    model.to(device)
    criterion = nn.BCELoss().to(device)
    best_score = 0
    best_state = None

    for epoch in range(CFG["EPOCHS"]):
        train_loss = []

        model.train()
        for X, y in tqdm(train_loader):
            X = X.float().to(device)
            y = y.float().to(device)

            optimizer.zero_grad()

            y_pred = model(X)

            # Labels arrive as (batch,); the model emits (batch, 1).
            loss = criterion(y_pred, y.reshape(-1, 1))
            loss.backward()

            optimizer.step()

            train_loss.append(loss.item())

        val_loss, val_score = validation_teacher(model, val_loader, criterion, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)

        if best_score < val_score:
            best_score = val_score
            # Snapshot the weights: keeping only a reference to ``model`` would
            # make the "best" model silently follow all later epochs.
            best_state = copy.deepcopy(model.state_dict())
        print(best_score,'best_f1')

    if best_state is None:
        return None
    model.load_state_dict(best_state)
    return model




In [None]:
def competition_metric(true, pred):
    """Return the macro-averaged F1 score of ``pred`` against ``true``."""
    macro_f1 = f1_score(y_true=true, y_pred=pred, average="macro")
    return macro_f1

def validation_teacher(model, val_loader, criterion, device, threshold=0.35):
    """Evaluate ``model`` on ``val_loader``.

    Args:
        model: Network whose output is a probability of shape (batch, 1).
        val_loader: Loader yielding ``(X, y)`` batches.
        criterion: Loss called as ``criterion(prediction, y.reshape(-1, 1))``.
        device: Device the batches are moved to.
        threshold: Probability above which a sample is labelled 1
            (default 0.35, the value previously hard-coded here).

    Returns:
        ``(val_loss, val_f1)``: the list of per-batch loss values and the
        macro F1 score from ``competition_metric`` over the whole loader.
    """
    model.eval()

    val_loss = []
    pred_probs = []
    true_labels = []
    with torch.no_grad():
        for X, y in tqdm(val_loader):
            X = X.float().to(device)
            y = y.float().to(device)

            model_pred = model(X)

            loss = criterion(model_pred, y.reshape(-1, 1))
            val_loss.append(loss.item())

            pred_probs += model_pred.squeeze(1).to('cpu').tolist()
            true_labels += y.tolist()

    # Binarize the collected probabilities once, after the loop.
    pred_labels = np.where(np.array(pred_probs) > threshold, 1, 0)
    val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1

## Run (Teacher Model)

In [None]:
# Build the teacher together with its optimizer and LR scheduler.
model = Teacher()
model.eval()  # train() switches the model back to training mode each epoch
optimizer = torch.optim.AdamW(model.parameters(), lr=CFG['LEARNING_RATE'])
# mode='max' because the scheduler is stepped with the validation F1 score;
# the LR is halved after more than `patience` epochs without improvement.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)
model  # notebook display of the architecture

Teacher(
  (classifier): Sequential(
    (0): Linear(in_features=52, out_features=256, bias=True)
    (1): LazyBatchNorm1d(0, eps=256, momentum=0.1, affine=True, track_running_stats=True)
    (2): SiLU()
    (3): Linear(in_features=256, out_features=512, bias=True)
    (4): LazyBatchNorm1d(0, eps=512, momentum=0.1, affine=True, track_running_stats=True)
    (5): SiLU()
    (6): Linear(in_features=512, out_features=256, bias=True)
    (7): LazyBatchNorm1d(0, eps=256, momentum=0.1, affine=True, track_running_stats=True)
    (8): SiLU()
    (9): Linear(in_features=256, out_features=1, bias=True)
    (10): Sigmoid()
  )
)

In [None]:
teacher_model = train(model, optimizer, train_loader, val_loader, scheduler, device)



  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [0], Train Loss : [0.33819] Val Loss : [0.35475] Val F1 Score : [0.62677]
0.6267679127725857 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.22122] Val Loss : [0.36874] Val F1 Score : [0.75852]
0.7585221532054718 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.19132] Val Loss : [0.31893] Val F1 Score : [0.80136]
0.8013625742634347 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.19556] Val Loss : [0.27318] Val F1 Score : [0.78329]
0.8013625742634347 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.18795] Val Loss : [0.33137] Val F1 Score : [0.75020]
Epoch 00005: reducing learning rate of group 0 to 5.0000e-03.
0.8013625742634347 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.17089] Val Loss : [0.33070] Val F1 Score : [0.80522]
0.8052213101229272 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.17033] Val Loss : [0.31374] Val F1 Score : [0.79971]
0.8052213101229272 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.16999] Val Loss : [0.33517] Val F1 Score : [0.80334]
Epoch 00008: reducing learning rate of group 0 to 2.5000e-03.
0.8052213101229272 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.16368] Val Loss : [0.31141] Val F1 Score : [0.80905]
0.8090519336664355 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.16212] Val Loss : [0.27113] Val F1 Score : [0.80718]
0.8090519336664355 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [10], Train Loss : [0.16088] Val Loss : [0.26437] Val F1 Score : [0.80351]
Epoch 00011: reducing learning rate of group 0 to 1.2500e-03.
0.8090519336664355 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [11], Train Loss : [0.18439] Val Loss : [0.28025] Val F1 Score : [0.80526]
0.8090519336664355 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [12], Train Loss : [0.16119] Val Loss : [0.24739] Val F1 Score : [0.80248]
Epoch 00013: reducing learning rate of group 0 to 6.2500e-04.
0.8090519336664355 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [13], Train Loss : [0.15717] Val Loss : [0.25995] Val F1 Score : [0.80244]
0.8090519336664355 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [14], Train Loss : [0.15984] Val Loss : [0.25862] Val F1 Score : [0.80530]
Epoch 00015: reducing learning rate of group 0 to 3.1250e-04.
0.8090519336664355 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [15], Train Loss : [0.15696] Val Loss : [0.25444] Val F1 Score : [0.81089]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [16], Train Loss : [0.15624] Val Loss : [0.26361] Val F1 Score : [0.80244]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [17], Train Loss : [0.15942] Val Loss : [0.26108] Val F1 Score : [0.80624]
Epoch 00018: reducing learning rate of group 0 to 1.5625e-04.
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [18], Train Loss : [0.16254] Val Loss : [0.26425] Val F1 Score : [0.80624]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [19], Train Loss : [0.15589] Val Loss : [0.26700] Val F1 Score : [0.80244]
Epoch 00020: reducing learning rate of group 0 to 7.8125e-05.
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [20], Train Loss : [0.15660] Val Loss : [0.25884] Val F1 Score : [0.80902]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [21], Train Loss : [0.16609] Val Loss : [0.26169] Val F1 Score : [0.80997]
Epoch 00022: reducing learning rate of group 0 to 3.9063e-05.
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [22], Train Loss : [0.15696] Val Loss : [0.26109] Val F1 Score : [0.80997]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [23], Train Loss : [0.15665] Val Loss : [0.26120] Val F1 Score : [0.80997]
Epoch 00024: reducing learning rate of group 0 to 1.9531e-05.
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [24], Train Loss : [0.16005] Val Loss : [0.26108] Val F1 Score : [0.80997]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [25], Train Loss : [0.16124] Val Loss : [0.26321] Val F1 Score : [0.80624]
Epoch 00026: reducing learning rate of group 0 to 9.7656e-06.
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [26], Train Loss : [0.15733] Val Loss : [0.26056] Val F1 Score : [0.80904]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [27], Train Loss : [0.15632] Val Loss : [0.26424] Val F1 Score : [0.80718]
Epoch 00028: reducing learning rate of group 0 to 4.8828e-06.
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [28], Train Loss : [0.16017] Val Loss : [0.26208] Val F1 Score : [0.80997]
0.8108867483001788 best_f1


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [29], Train Loss : [0.16211] Val Loss : [0.26194] Val F1 Score : [0.80997]
Epoch 00030: reducing learning rate of group 0 to 2.4414e-06.
0.8108867483001788 best_f1


In [None]:
#0.80535011117898 best_f1.  lr_scheduler=max , gru= num_layers =1
#0.8062359237412331 best_f1  lr_scheduler=min , gru= num_layers =2
#0.8071693142869618 best_f1. lr_scheduler=max , gru= num_layers =2


### layer 
#0.8080850716191927 best_f1  lr_scheduler=max , gru= num_layers =2 265 512 증가하는 폭을 조금 수정
# 0.8071796488389799 best_f1 linear layer 하나 더 추가 nn.Linear(in_features=128, out_features=256)
#0.4776727811747267 best_f1 그냥 젤큰 레이어 추가하는건 별로 안좋음
#0.8117531050504077 1)best_f1 최고 nn.Linear(in_features=128, out_features=256) 피처 범위 512 로 바꿈
#                   2) 
#0.8071332686913297   //  최고범위 256 


## Define Student Model

In [None]:
class Student(nn.Module):
    """GRU + MLP student that sees only the 18 inference-time features.

    The output is a sigmoid probability of shape ``(batch, 1)``.
    """

    def __init__(self):
        super().__init__()
        # The attribute keeps its historical name ``lstm`` (it is a GRU) so
        # existing state_dict keys stay valid.
        self.lstm = nn.GRU(input_size=18, hidden_size=18, num_layers=7, bias=True, batch_first=True)
        # nn.BatchNorm1d(n) is used on purpose: the first positional argument
        # of nn.LazyBatchNorm1d is ``eps``, so LazyBatchNorm1d(128) configured
        # eps=128 instead of declaring 128 features.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=18, out_features=128),
            nn.BatchNorm1d(128),
            nn.SiLU(),
            nn.Linear(in_features=128, out_features=512),
            nn.BatchNorm1d(512),
            nn.SiLU(),
            nn.Linear(in_features=512, out_features=128),
            nn.BatchNorm1d(128),
            nn.SiLU(),
            nn.Linear(in_features=128, out_features=1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the predicted probability for each sample in ``x``.

        Args:
            x: Tensor of shape ``(batch, 18)`` or ``(batch, seq_len, 18)``.
        """
        # nn.GRU interprets a 2-D tensor as ONE unbatched sequence, which would
        # turn the samples of a batch into consecutive time steps and leak
        # information between them. Give each sample its own length-1 sequence.
        if x.dim() == 2:
            x = x.unsqueeze(1)
        hidden, _ = self.lstm(x)
        # Classify from the last time step of every sequence.
        return self.classifier(hidden[:, -1, :])

## Define Knowledge distillation Loss

In [None]:
def distillation(student_logits, labels, teacher_logits, alpha):
    """Blend the hard-label loss with the teacher-matching loss.

    Despite the ``*_logits`` names, both model outputs are passed to binary
    cross-entropy as probabilities (the models in this notebook end in a
    sigmoid).

    Args:
        student_logits: Student probabilities, shape (batch, 1).
        labels: Ground-truth labels; reshaped to (batch, 1) here.
        teacher_logits: Teacher probabilities used as soft targets.
        alpha: Weight of the hard-label term; ``1 - alpha`` weights the
            teacher term.

    Returns:
        Scalar tensor ``alpha * BCE(student, labels) + (1 - alpha) * BCE(student, teacher)``.
    """
    hard_targets = labels.reshape(-1, 1)
    soft_term = F.binary_cross_entropy(student_logits, teacher_logits)
    hard_term = F.binary_cross_entropy(student_logits, hard_targets)
    return (1 - alpha) * soft_term + alpha * hard_term

In [None]:
def distill_loss(output, target, teacher_output, loss_fn=None, opt=None, alpha=0.1):
    """Compute the distillation loss for one batch and optionally take a step.

    Args:
        output: Student predictions for the batch.
        target: Ground-truth labels for the batch.
        teacher_output: Teacher predictions for the same batch.
        loss_fn: Callable ``loss_fn(output, target, teacher_output, alpha=...)``
            returning a scalar tensor. Defaults to the module-level
            ``distillation`` function, resolved at call time.
        opt: Optimizer to step. ``None`` (the default) only evaluates the loss.
            The previous default captured whatever global ``optimizer`` existed
            when this function was defined (the teacher's), so omitting the
            argument would have stepped the wrong model.
        alpha: Weight forwarded to ``loss_fn`` (default 0.1, the value
            previously hard-coded here).

    Returns:
        The batch loss as a Python float.
    """
    if loss_fn is None:
        loss_fn = distillation
    loss_b = loss_fn(output, target, teacher_output, alpha=alpha)

    if opt is not None:
        opt.zero_grad()
        loss_b.backward()
        opt.step()

    return loss_b.item()

## Student Train / Validation

In [None]:
def student_train(s_model, t_model, optimizer, train_loader, val_loader, scheduler, device):
    """Train the student by distillation and return it with its best weights.

    The teacher is kept in eval mode and queried without gradients; the
    student is updated through ``distill_loss``. After every epoch the student
    is scored with ``validation_student`` and the weights of the best-scoring
    epoch are snapshotted and restored before returning.

    Args:
        s_model: Student network to train.
        t_model: Already-trained teacher network.
        optimizer: Optimizer over the student's parameters.
        train_loader: Loader yielding ``(X_teacher, X_student, y)`` batches.
        val_loader: Loader yielding ``(X_teacher, X_student, y)`` batches.
        scheduler: Optional scheduler stepped with the validation F1, or None.
        device: Device the models and batches are moved to.

    Returns:
        ``s_model`` with the best-scoring weights loaded, or ``None`` if no
        epoch achieved a validation F1 above 0.
    """
    import copy  # local import keeps this cell self-contained

    s_model.to(device)
    t_model.to(device)

    best_score = 0
    best_state = None

    for epoch in range(CFG["EPOCHS"]):
        train_loss = []
        s_model.train()
        t_model.eval()

        for X_t, X_s, y in tqdm(train_loader):
            X_t = X_t.float().to(device)
            X_s = X_s.float().to(device)
            y = y.float().to(device)

            optimizer.zero_grad()

            output = s_model(X_s)
            # The teacher only provides targets; no gradients are needed.
            with torch.no_grad():
                teacher_output = t_model(X_t)

            # distill_loss runs backward() and optimizer.step() itself.
            loss_b = distill_loss(output, y, teacher_output, loss_fn=distillation, opt=optimizer)

            train_loss.append(loss_b)

        val_loss, val_score = validation_student(s_model, t_model, val_loader, distill_loss, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')

        if scheduler is not None:
            scheduler.step(val_score)

        if best_score < val_score:
            best_score = val_score
            # Snapshot the weights: keeping only a reference to ``s_model``
            # would make the "best" model silently follow all later epochs.
            best_state = copy.deepcopy(s_model.state_dict())

    if best_state is None:
        return None
    s_model.load_state_dict(best_state)
    return s_model
"""
def student_layer(s_model, t_model, optimizer, train_loader, val_loader, scheduler, device):
    s_model.to(device)
    t_model.to(device)
    
    best_score = 0
    best_model = None

    for epoch in range(1):
        train_loss = []
        s_model.train()
        t_model.eval()
        
        for X_t, X_s, y in tqdm(train_loader):
            X_t = X_t.float().to(device)
            X_s = X_s.float().to(device)
            y = y.float().to(device)
            
            optimizer.zero_grad()

            output = s_model(X_s)
            with torch.no_grad():
                teacher_output = t_model(X_t)
                
            loss_b = distill_loss(output, y, teacher_output, loss_fn=distillation, opt=optimizer)

            train_loss.append(loss_b)

        val_loss, val_score = validation_student(s_model, t_model, val_loader, distill_loss, device)
        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.mean(val_loss) :.5f}] Val F1 Score : [{val_score:.5f}]')
        
        if scheduler is not None:
            scheduler.step(val_score)
            
        if best_score < val_score:
            best_model = s_model
            best_score = val_score
        
    return best_model"""

"\ndef student_layer(s_model, t_model, optimizer, train_loader, val_loader, scheduler, device):\n    s_model.to(device)\n    t_model.to(device)\n    \n    best_score = 0\n    best_model = None\n\n    for epoch in range(1):\n        train_loss = []\n        s_model.train()\n        t_model.eval()\n        \n        for X_t, X_s, y in tqdm(train_loader):\n            X_t = X_t.float().to(device)\n            X_s = X_s.float().to(device)\n            y = y.float().to(device)\n            \n            optimizer.zero_grad()\n\n            output = s_model(X_s)\n            with torch.no_grad():\n                teacher_output = t_model(X_t)\n                \n            loss_b = distill_loss(output, y, teacher_output, loss_fn=distillation, opt=optimizer)\n\n            train_loss.append(loss_b)\n\n        val_loss, val_score = validation_student(s_model, t_model, val_loader, distill_loss, device)\n        print(f'Epoch [{epoch}], Train Loss : [{np.mean(train_loss) :.5f}] Val Loss : [{np.m

In [None]:
def validation_student(s_model, t_model, val_loader, criterion, device, threshold=0.35):
    """Evaluate the student on ``val_loader`` using the distillation loss.

    Args:
        s_model: Student network being evaluated.
        t_model: Teacher network providing the soft targets.
        val_loader: Loader yielding ``(X_teacher, X_student, y)`` batches.
        criterion: Loss callable with the ``distill_loss`` signature; it is
            invoked with ``opt=None`` so no parameters are updated. (It was
            previously accepted but ignored in favour of the global
            ``distill_loss``.)
        device: Device the batches are moved to.
        threshold: Probability above which a sample is labelled 1
            (default 0.35, the value previously hard-coded here).

    Returns:
        ``(val_loss, val_f1)``: the list of per-batch loss values and the
        macro F1 score from ``competition_metric`` over the whole loader.
    """
    s_model.eval()
    t_model.eval()

    val_loss = []
    pred_probs = []
    true_labels = []

    with torch.no_grad():
        for X_t, X_s, y in tqdm(val_loader):
            X_t = X_t.float().to(device)
            X_s = X_s.float().to(device)
            y = y.float().to(device)

            model_pred = s_model(X_s)
            teacher_output = t_model(X_t)

            loss_b = criterion(model_pred, y, teacher_output, loss_fn=distillation, opt=None)
            val_loss.append(loss_b)

            pred_probs += model_pred.squeeze(1).to('cpu').tolist()
            true_labels += y.tolist()

    # Binarize the collected probabilities once, after the loop.
    pred_labels = np.where(np.array(pred_probs) > threshold, 1, 0)
    val_f1 = competition_metric(true_labels, pred_labels)
    return val_loss, val_f1

## Run (Student Model)

In [None]:
# Rebuild the datasets in distillation mode: each item is now
# (teacher features, student features, label).
train_dataset = CustomDataset(train_X, train_y, True)
val_dataset = CustomDataset(val_X, val_y, True)

# These loaders replace the teacher-only loaders created earlier.
train_loader = DataLoader(train_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [None]:
# Build the student with its own optimizer and LR scheduler.
student_model = Student()
student_model.eval()  # student_train() switches it back to training mode each epoch
# NOTE: this rebinds the global `optimizer` / `scheduler` names used for the teacher.
optimizer = torch.optim.AdamW(student_model.parameters(), lr=CFG['LEARNING_RATE'])
# mode='max' because the scheduler is stepped with the validation F1 score.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=1, threshold_mode='abs',min_lr=1e-8, verbose=True)

# Distill the trained teacher into the student.
best_student_model = student_train(student_model, teacher_model, optimizer, train_loader, val_loader, scheduler, device)

  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [0], Train Loss : [0.36123] Val Loss : [0.28062] Val F1 Score : [0.47767]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [1], Train Loss : [0.27848] Val Loss : [0.27760] Val F1 Score : [0.47767]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [2], Train Loss : [0.27862] Val Loss : [0.27774] Val F1 Score : [0.47767]
Epoch 00003: reducing learning rate of group 0 to 5.0000e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [3], Train Loss : [0.28023] Val Loss : [0.27740] Val F1 Score : [0.47767]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [4], Train Loss : [0.27977] Val Loss : [0.27795] Val F1 Score : [0.47767]
Epoch 00005: reducing learning rate of group 0 to 2.5000e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [5], Train Loss : [0.27863] Val Loss : [0.27737] Val F1 Score : [0.47767]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [6], Train Loss : [0.28278] Val Loss : [0.27752] Val F1 Score : [0.47767]
Epoch 00007: reducing learning rate of group 0 to 1.2500e-03.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [7], Train Loss : [0.27932] Val Loss : [0.27738] Val F1 Score : [0.47767]


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [8], Train Loss : [0.27932] Val Loss : [0.27743] Val F1 Score : [0.47767]
Epoch 00009: reducing learning rate of group 0 to 6.2500e-04.


  0%|          | 0/45 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

Epoch [9], Train Loss : [0.27868] Val Loss : [0.27737] Val F1 Score : [0.47767]


  0%|          | 0/45 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

## Choose Inference Threshold

In [None]:

def choose_threshold(model, val_loader, device):
    """Pick the decision threshold that maximizes the validation macro F1.

    The loader must yield ``(teacher_X, student_X, y)`` batches; only the
    student features and the labels are used.

    Args:
        model: Network whose output is a probability of shape (batch, 1).
        val_loader: Validation loader in distillation mode.
        device: Device the model and batches are moved to.

    Returns:
        ``(best_thr, best_score)``; ``best_thr`` is ``None`` (and the score 0)
        when no candidate threshold scores above 0.
    """
    candidates = [0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]

    model.to(device)
    model.eval()

    probabilities = []
    targets = []
    with torch.no_grad():
        for _, x_s, y in tqdm(iter(val_loader)):
            batch_pred = model(x_s.float().to(device))
            targets.extend(y.float().to(device).tolist())
            probabilities.extend(batch_pred.squeeze(1).to('cpu').tolist())

        # Score every candidate on the full set of collected probabilities.
        scored = [
            (competition_metric(targets, np.where(np.array(probabilities) > candidate, 1, 0)), candidate)
            for candidate in candidates
        ]

    best_thr = None
    best_score = 0
    for score, candidate in scored:
        # Strict comparison: the first threshold reaching the maximum wins.
        if score > best_score:
            best_score, best_thr = score, candidate
    return best_thr, best_score

In [None]:
# Tune the decision threshold of the distilled student on the validation set.
# best_threshold is None if no candidate threshold scored above 0.
best_threshold, best_score = choose_threshold(best_student_model, val_loader, device)

print(f'Best Threshold : [{best_threshold}], Score : [{best_score:.5f}]')

## Inference

In [None]:
# Unlabeled test data: with data_y=None each item is the feature tensor only.
test_datasets = CustomDataset(test, None, False)
# Order must be preserved so predictions line up with the submission rows.
test_loaders = DataLoader(test_datasets, batch_size = CFG['BATCH_SIZE'], shuffle=False)

In [None]:
def inference(model, test_loader, threshold, device):
    """Predict binary labels for every sample served by ``test_loader``.

    Args:
        model: Network whose output is a probability of shape (batch, 1).
        test_loader: Loader yielding feature batches only (no labels).
        threshold: Probability above which a sample is labelled 1.
        device: Device the model and batches are moved to.

    Returns:
        1-D NumPy integer array of 0/1 predictions, in loader order.
    """
    model.to(device)
    model.eval()

    test_predict = []
    with torch.no_grad():
        for x in tqdm(test_loader):
            x = x.float().to(device)
            model_pred = model(x)

            # Collect plain Python floats (as the validation helpers do);
            # ``list += tensor`` would store one 0-d tensor per sample.
            test_predict += model_pred.squeeze(1).to('cpu').tolist()

    test_predict = np.where(np.array(test_predict) > threshold, 1, 0)
    print('Done.')
    return test_predict

In [None]:
preds = inference(best_student_model, test_loaders, best_threshold, device)

  0%|          | 0/24 [00:00<?, ?it/s]

Done.


## Submit

In [None]:
# Fill the sample submission with the predicted labels.
submit = pd.read_csv('./sample_submission.csv')
submit['Y_LABEL'] = preds
submit.head()  # notebook display of the first rows

Unnamed: 0,ID,Y_LABEL
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [None]:
submit.to_csv('./submit.csv', index=False)

In [None]:
input = torch.randn(5, 3, 10)
h0 = torch.randn(2, 3, 20)

In [None]:
h0