In [1]:
from keras.layers import Input, Dense
from keras.models import Model, Sequential
from keras import regularizers
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.manifold import TSNE
from sklearn import preprocessing 
import matplotlib.pyplot as plt
import pandas as pd 
import numpy as np
import seaborn as sns

import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

In [2]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [14]:
# Ask PyTorch's caching allocator to release GPU memory it holds but is not using.
# NOTE(review): this cell has execution count 14 although it sits between cells 2 and 3,
# i.e. it was run out of order (presumably while chasing the CUDA out-of-memory error) — confirm
# whether it is still needed for a clean top-to-bottom run.
torch.cuda.empty_cache()

In [3]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU generator and all CUDA devices), exports ``PYTHONHASHSEED`` and
    configures cuDNN for reproducible kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (silently ignored without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The auto-tuner enabled by benchmark=True may pick different algorithms from run
    # to run, which defeats the deterministic setting above; keep it off.
    torch.backends.cudnn.benchmark = False


seed_everything(200)

In [4]:
# Read the training and validation splits; the 'ID' column is discarded right away.
train_df = pd.read_csv('./input/train.csv').drop(columns=['ID'])
val_df = pd.read_csv('./input/val.csv').drop(columns=['ID'])

# Keep the fraudulent validation rows (Class == 1) in their own frame.
# (Original note: this subset is intended for later visualisation.)
val_fraud = val_df.loc[val_df['Class'] == 1]

# Validation features with the 'Class' label column removed.
# (Original note: predictions made on this set are compared with the true
# labels to obtain the F1 score of the chosen method.)
val_re = val_df.drop(columns=['Class'])

In [5]:
# v: number of columns in val_re (the validation frame without 'Class').
v = len(val_re.columns)
# s: number of fraud rows in the validation set; later cells use it as a loop bound.
s = len(val_fraud.index)

In [6]:
# Standardise the values column by column; `apply` with axis=0 visits each column in turn.
def mean_norm(df_input):
    """Return a new DataFrame in which every column is z-scored.

    Each column has its own mean subtracted and is divided by its own
    standard deviation (pandas' ``Series.std``). The input is not modified.

    Args:
        df_input: DataFrame whose columns are to be standardised.

    Returns:
        DataFrame with the same index and columns as ``df_input``.
    """
    def _standardise(column):
        centred = column - column.mean()
        return centred / column.std()

    return df_input.apply(_standardise, axis=0)

# Standardise the whole validation frame (every column of val_df is passed to mean_norm),
# then pick out the rows that belong to fraud transactions.
# (Original note: the standardised fraud values were meant to be shown as per-column histograms.)
val_df_ss = mean_norm(val_df)
index = val_fraud.index.tolist()
val_df_ss_fraud = val_df_ss.loc[index, :]

In [7]:
class MyDataset(Dataset):
    """Dataset that serves the rows of a DataFrame as float32 tensors.

    Args:
        df: DataFrame of features. When ``eval_mode`` is true it must also
            contain a ``'Class'`` column, which is split off as the labels.
        eval_mode: If true, items are ``(features, label)`` pairs; otherwise
            each item is the feature tensor only.

    Attributes:
        df: NumPy array of feature values (``'Class'`` removed in eval mode).
        labels: NumPy array of ``'Class'`` values; only set in eval mode.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        """Return the sample at ``index`` (with its label in eval mode)."""
        # Build the sample from locals only: writing self.x / self.y on every
        # access made item retrieval mutate the dataset for no benefit.
        features = torch.tensor(self.df[index], dtype=torch.float32)
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.df)

In [8]:
class AutoEncoder(nn.Module):
    """Fully connected autoencoder with layer widths dim -> 4*dim -> 8*dim -> 4*dim -> dim.

    Every hidden linear layer is followed by ``BatchNorm1d`` and ``LeakyReLU``;
    the final linear layer that maps back to ``dim`` has no normalisation or
    activation.

    Args:
        dim: Number of input (and output) features.
    """

    def __init__(self, dim):
        super().__init__()
        hidden = dim * 4
        widest = dim * 8

        encoder_layers = [
            nn.Linear(dim, hidden),
            nn.BatchNorm1d(hidden),
            nn.LeakyReLU(),
            nn.Linear(hidden, widest),
            nn.BatchNorm1d(widest),
            nn.LeakyReLU(),
        ]
        self.Encoder = nn.Sequential(*encoder_layers)

        decoder_layers = [
            nn.Linear(widest, hidden),
            nn.BatchNorm1d(hidden),
            nn.LeakyReLU(),
            nn.Linear(hidden, dim),
        ]
        self.Decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        return self.Decoder(self.Encoder(x))


In [9]:
class Trainer():
    """Train a reconstruction model and score it on a labelled validation set.

    Args:
        model: Network to train; it is called as ``model(x)`` and its output
            is compared with ``x``.
        optimizer: Optimizer that updates ``model``'s parameters.
        train_loader: Iterable yielding feature batches.
        val_loader: Iterable yielding ``(features, labels)`` batches.
        scheduler: Optional scheduler; stepped once per epoch with the
            validation score. May be ``None``.
        device: Device the model and batches are moved to.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss Function
        self.criterion = nn.L1Loss().to(self.device)

    @staticmethod
    def _unwrap(model):
        """Return the underlying network of a (Distributed)DataParallel wrapper, else ``model``."""
        if isinstance(model, (nn.DataParallel, nn.parallel.DistributedDataParallel)):
            return model.module
        return model

    def fit(self, epochs=None):
        """Run the training loop and checkpoint the best-scoring weights.

        After every epoch the model is scored with ``validation`` at a
        threshold of 0.95. Whenever the score strictly exceeds the best one
        seen so far (starting from 0), the unwrapped model's ``state_dict``
        is written to ``./best_model.pth``.

        Args:
            epochs: Number of epochs. Defaults to the module-level ``EPOCHS``.

        Returns:
            The best validation score reached (0 if no epoch improved on it).
        """
        if epochs is None:
            epochs = EPOCHS
        self.model.to(self.device)
        best_score = 0
        for epoch in range(epochs):
            self.model.train()
            train_loss = []
            for x in self.train_loader:
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                # (prediction, target) order; the L1 value is symmetric in its arguments.
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            if epoch%5==0: print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])') 

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                # Save the trainer's own model (not a same-named global) and only
                # strip `.module` when the model really is wrapped.
                torch.save(self._unwrap(self.model).state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)
        return best_score

    def validation(self, eval_model, thr):
        """Score ``eval_model`` on the validation loader.

        A sample is predicted as class 1 when the cosine similarity between
        its features and their reconstruction is below ``thr``, else class 0.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            thr: Cosine-similarity threshold.

        Returns:
            Macro-averaged F1 score of the predictions against the labels.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in self.val_loader:
                x = x.float().to(self.device)

                # Evaluate the model that was passed in, not unconditionally self.model.
                _x = eval_model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff)<thr, 1,0).tolist()
                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [10]:
# The larger the sum of squared standardised values, the farther the fraud rows lie from the mean.
# Compute that sum per column and rank the columns by it; a smaller value is taken to mean a
# less important column. (The original note said "ascending", but the sort below is descending.)
val_df_ss_frauds = val_df_ss_fraud.pow(2)
val_df_ss_fraudss = val_df_ss_frauds.sum()
val_in = (
    val_df_ss_fraudss
    .to_frame(name="ss")
    .sort_values('ss', ascending=False)
    .reset_index()
)

print(val_in)

    index            ss
0   Class  28431.001054
1     V17   4887.567869
2      V7   3031.531722
3     V14   2715.521810
4     V12   2449.949704
5     V10   2440.117488
6      V3   2318.483142
7     V16   1623.472952
8      V5   1020.893449
9      V1    837.261995
10    V11    817.454015
11    V27    811.313837
12    V18    720.152848
13     V4    627.950347
14     V8    610.439513
15     V2    486.825038
16     V9    445.115537
17    V21    427.996096
18    V19    158.207126
19    V28    124.140565
20     V6     96.866704
21    V20     85.070018
22    V22     69.728692
23    V25     62.262633
24    V23     56.806406
25    V15     51.495540
26    V26     41.461606
27    V30     40.175201
28    V13     37.993958
29    V24     24.073152
30    V29      9.806658


In [11]:
# Preview the nested feature subsets (top-1, top-2, ...) that will be used for training.
# The subsets are driven by the number of ranked features rather than by `s` (the count of
# fraud rows), and 'Class' is excluded by name instead of relying on it being ranked first.
ranked_features = [col for col in val_in["index"].tolist() if col != 'Class']
for i in range(1, len(ranked_features) + 1):
    ind = ranked_features[:i]
    print(ind)
    train_ma = train_df[ind]
    val_ma = val_df[ind + ['Class']]
    print(val_ma.shape)


['V17']
(28462, 2)
['V17', 'V7']
(28462, 3)
['V17', 'V7', 'V14']
(28462, 4)
['V17', 'V7', 'V14', 'V12']
(28462, 5)
['V17', 'V7', 'V14', 'V12', 'V10']
(28462, 6)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3']
(28462, 7)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16']
(28462, 8)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5']
(28462, 9)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5', 'V1']
(28462, 10)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5', 'V1', 'V11']
(28462, 11)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5', 'V1', 'V11', 'V27']
(28462, 12)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5', 'V1', 'V11', 'V27', 'V18']
(28462, 13)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5', 'V1', 'V11', 'V27', 'V18', 'V4']
(28462, 14)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5', 'V1', 'V11', 'V27', 'V18', 'V4', 'V8']
(28462, 15)
['V17', 'V7', 'V14', 'V12', 'V10', 'V3', 'V16', 'V5', 'V1', 'V11', 'V27', 'V18', 'V4', 'V8', 'V2']
(28462, 16)
['V17', 'V7', 'V14

In [12]:
# Hyperparameters
EPOCHS=400
LR=1e-2
# The autoencoder contains BatchNorm1d layers, which raise an error in training mode when a
# batch holds a single sample, so the batch size must be greater than 1.
BS=2**14
# NOTE(review): SEED is not referenced by any visible cell; seed_everything(200) is what is called.
SEED=41

In [15]:
# Add columns one at a time in order of importance and train/evaluate an autoencoder per subset.
# The loop is bounded by the number of ranked features (not by `s`, the count of fraud rows),
# and 'Class' is excluded by name instead of relying on it being ranked first.
# NOTE: Trainer.fit always writes to './best_model.pth' and restarts its best score at 0, so the
# file ends up holding the checkpoint of the last subset that saved.
ranked_features = [col for col in val_in["index"].tolist() if col != 'Class']
for i in range(1, len(ranked_features) + 1):
    ind = ranked_features[:i]
    print(f'{ind} 행만 선택하여 계산한다.')
    train_ma = train_df[ind]
    val_ma = val_df[ind + ['Class']]

    DIM = train_ma.shape[1]

    train_dataset = MyDataset(df=train_ma, eval_mode=False)
    train_loader = DataLoader(train_dataset, batch_size=BS, shuffle=True, num_workers=0)

    val_dataset = MyDataset(df=val_ma, eval_mode=True)
    val_loader = DataLoader(val_dataset, batch_size=BS, shuffle=False, num_workers=0)

    # No model.eval() here: Trainer.fit switches to train mode at the start of every epoch.
    model = nn.DataParallel(AutoEncoder(DIM))
    optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=10, threshold_mode='abs', min_lr=1e-8, verbose=True)

    trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
    trainer.fit()

['V17'] 행만 선택하여 계산한다.


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

- 확인 결과 17번째 조합, 즉 중요도가 낮은 ["V28","V6","V20","V22","V25","V23","V15","V26","V30","V13","V24","V29"] 열을 제외한 나머지 열만 선택했을 때 좋은 결과가 나오면서 column 수도 적절한 것을 확인할 수 있다.
- 따라서 test set도 이 열만 골라서 autoencoder 모델을 시행하였다.

In [None]:
# Rebuild the network from the checkpoint. The input width is read from the first encoder
# layer's weight (shape: out_features x in_features) instead of relying on whatever value the
# global DIM was left with, and the tensors are mapped to the CPU so loading also works on a
# machine without the GPU the checkpoint was saved from.
state_dict = torch.load('./best_model.pth', map_location='cpu')
input_dim = state_dict['Encoder.0.weight'].shape[1]
model = AutoEncoder(input_dim)
model.load_state_dict(state_dict)

model = nn.DataParallel(model)
model.eval()

In [None]:
# Run the test set through the same model to find fraud transactions.
test_df = pd.read_csv('./input/test.csv')
test_df = test_df.drop(columns=["ID","V28","V6","V20","V22","V25","V23","V15","V26","V30","V13","V24","V29"])

# The training loop feeds the network columns in importance-ranked order (the order of
# val_in["index"]); the test columns must follow that same order, otherwise each input
# position would carry a different feature than during training.
ranked_kept = [col for col in val_in["index"].tolist() if col in test_df.columns]
unranked = [col for col in test_df.columns if col not in ranked_kept]
assert not unranked, f"test columns missing from the ranking: {unranked}"
test_df = test_df[ranked_kept]

In [None]:
# Wrap the test features (eval_mode is off, so items are feature tensors only) and iterate
# over them in their original order.
test_dataset = MyDataset(df=test_df, eval_mode=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BS, shuffle=False, num_workers=0)

In [None]:
def prediction(model, thr, test_loader, device):
    """Flag samples whose reconstruction is dissimilar to the input.

    The model is moved to ``device`` and put in eval mode. For every batch the
    cosine similarity between the input rows and their reconstructions is
    computed; rows with a similarity below ``thr`` are labelled 1, others 0.

    Args:
        model: Network called as ``model(batch)`` to reconstruct a batch.
        thr: Cosine-similarity threshold.
        test_loader: Iterable yielding feature batches.
        device: Device used for inference.

    Returns:
        List of 0/1 integers, one per sample, in loader order.
    """
    similarity = nn.CosineSimilarity(dim=1, eps=1e-6)
    model.to(device)
    model.eval()

    flags = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.float().to(device)
            reconstruction = model(batch)

            scores = np.array(similarity(batch, reconstruction).cpu().tolist())
            flags.extend(np.where(scores < thr, 1, 0).tolist())
    return flags

In [None]:
# Label every test sample using a cosine-similarity threshold of 0.95.
preds = prediction(model=model, thr=0.95, test_loader=test_loader, device=device)

In [None]:
# Sanity-check the predictions by looking at the share of fraud, as a histogram and as a number.
plt.hist(preds)
# `plt.show` without parentheses only references the function; it has to be called.
plt.show()

n_fraud = preds.count(1)
n_normal = preds.count(0)
# Ratio of predicted-fraud to predicted-normal samples; avoid dividing by zero when
# nothing was predicted as normal.
print(n_fraud / n_normal if n_normal else float('inf'))

In [None]:
# Put the predictions into the 'Class' column of the sample submission and save it for upload.
submit = pd.read_csv('./input/sample_submission.csv').assign(Class=preds)
submit.to_csv('./submit_autoencoder.csv', index=False)