In [9]:
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns
from sklearn.metrics import mean_squared_error

import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader


sns.set(style="whitegrid")
device=torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [2]:
from tqdm import tqdm
from tqdm import trange
import time
from tqdm import tqdm, tqdm_pandas

tqdm.pandas()

### 1. Hyperparameter

In [3]:
EPOCHS=400
LR=1e-2
BS=16384
SEED=41

### 2. 시드 고정 작업

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    ## 위에서 random.seed 지정해줬는데 왜 np.random 지정해야하는지 모르겠음!!
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(200)

### 3. 데이터 불러오기

In [5]:
train_df = pd.read_csv('./input/train.csv')
train_df = train_df.drop(columns=['ID'])
print(train_df.head())

val_df = pd.read_csv('./input/val.csv')
val_df = val_df.drop(columns=['ID'])
print(val_df.head())

         V1        V2        V3        V4        V5        V6        V7  \
0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
2 -0.425966  0.960523  1.141109 -0.168252  0.420987 -0.029728  0.476201   
3 -0.644269  1.417964  1.074380 -0.492199  0.948934  0.428118  1.120631   
4 -0.894286  0.286157 -0.113192 -0.271526  2.669599  3.721818  0.370145   

         V8        V9       V10    ...          V21       V22       V23  \
0  0.247676 -1.514654  0.207643    ...     0.247998  0.771679  0.909412   
1  0.377436 -1.387024 -0.054952    ...    -0.108300  0.005274 -0.190321   
2  0.260314 -0.568671 -0.371407    ...    -0.208254 -0.559825 -0.026398   
3 -3.807864  0.615375  1.249376    ...     1.943465 -1.015455  0.057504   
4  0.851084 -0.392048 -0.410430    ...    -0.073425 -0.268092 -0.204233   

        V24       V25       V26       V27       V28       V29       V30  
0 -0.689281 -0.327642 -0

In [6]:
#데이터 차원 확인
print(train_df.shape)
print(val_df.shape)

(113842, 30)
(28462, 31)


### 4. 데이터셋 생성

In [7]:
class MyDataset(Dataset):
    def __init__(self, df, validation):
        self.df = df
        self.validation = validation
        if self.validation:
            self.labels = self.df['Class'].values
            self.df = self.df.drop(columns=['Class']).values
        else:
            self.df = self.df.values
        
    def __getitem__(self, index):
        if self.validation:
            self.x = self.df[index]
            self.y = self.labels[index]
            return torch.Tensor(self.x), self.y
        else:
            self.x = self.df[index]
            return torch.Tensor(self.x)
        
    def __len__(self):
        return len(self.df)
    #eval_mode -> validation 변경
    

In [8]:
train_dataset = MyDataset(df=train_df, validation=False)
train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=6)

val_dataset = MyDataset(df = val_df, validation=True)
val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=6)

### 5. 1D AutoEncoder

In [10]:
class AutoEncoder(nn.Module):
    def __init__(self):
        super(AutoEncoder, self).__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(30,64),
            nn.BatchNorm1d(64),
            nn.Tanh(),
            nn.Linear(64,128),
            nn.BatchNorm1d(128),
            nn.Tanh(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(128,64),
            nn.BatchNorm1d(64),
            nn.LeakyReLU(),
            nn.Linear(64,30),
        )
        
    def forward(self, x):
        x = self.Encoder(x)
        x = self.Decoder(x)
        return x
    
    #leakyrelu -> relu or tanh

### 6. training

In [11]:
class Trainer():
    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        
    def fit(self, ):
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in iter(self.train_loader):
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                loss = mean_squared_error(x, _x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print('Epoch : [{}] Train loss : [{}] Val Score : [{}])'.format(epoch, np.mean(train_loss), score))

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                torch.save(model.module.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
    
    def validation(self, eval_model, thr):
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in iter(self.val_loader):
                x = x.float().to(self.device)

                _x = self.model(x)
                diff = mean_squared_error(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                # 여기서 thr을 조정해주고 싶은데 뭘 바꿔야할지 모르겠음!!!
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')
    
    # loss function, cos -> MAE, MSE, ,RMSE, MAPE, MPE
    # loss backward -> loss forward
    

### 7. 모델 학습

In [12]:
model = nn.DataParallel(AutoEncoder())
model.eval()
optimizer = torch.optim.Adam(params = model.parameters(), lr = LR)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-2, verbose=True)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.

### 8. 추론

In [None]:
model = AutoEncoder()
model.load_state_dict(torch.load('./best_model.pth'))
model = nn.DataParallel(model)
model.eval()

In [None]:
test_df = pd.read_csv('./input/test.csv')
test_df = test_df.drop(columns=['ID'])

test_dataset = MyDataset(test_df, False)
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False, num_workers=6)

def prediction(model, thr, test_loader, device):
    model.to(device)
    model.eval()
    cos = nn.CosineSimilarity(dim=1, eps=1e-6)
    pred = []
    with torch.no_grad():
        for x in iter(test_loader):
            x = x.float().to(device)
            
            _x = model(x)
            
            diff = cos(x, _x).cpu().tolist()
            batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
            pred += batch_pred
    return pred

preds = prediction(model, 0.95, test_loader, device)

submit = pd.read_csv('./sample_submission.csv')
submit['Class'] = preds
submit.to_csv('./submit_autoencoder.csv', index=False)