## Unsupervised Learning with Autoencoder

- Using PyTorch (macOS)
- device : MPS

In [28]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import f1_score


In [2]:
# Device setting: prefer CUDA, then Apple-Silicon MPS, and finally fall back
# to the CPU so the notebook still runs on machines without an accelerator.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
device

device(type='mps')

In [19]:
# Hyperparameters
epochs = 100       # number of training epochs (read by Trainer.fit)
lr = 1e-2          # initial learning rate handed to the Adam optimizer
batch_size = 4096  # mini-batch size used by both the train and val loaders

In [20]:
# Seed setting

def def_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch,
    and stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeders are independent of each other, so order is irrelevant.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

def_seed(seed=123)

In [21]:
# CSV
# Load the training and validation splits from the local `dataset/` folder.
# index_col=0 makes pandas use the first CSV column as the row index, so it
# does not end up among the feature columns.

df_train = pd.read_csv('dataset/train.csv', index_col=0)
df_val = pd.read_csv('dataset/val.csv', index_col=0)

In [22]:
class MyDataset(Dataset):
    """Map-style dataset over the rows of a pandas DataFrame.

    Args:
        df: Frame whose columns are the numeric features. When ``if_val`` is
            true it must also contain a ``Class`` column holding the labels.
        if_val: ``True`` for labelled data, in which case items are
            ``(features, label)`` pairs; ``False`` for unlabelled data, in
            which case items are the feature tensor alone.
    """

    def __init__(self, df, if_val):
        self.if_val = if_val
        if self.if_val:
            # Split the label column off so it is never part of the features.
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        """Return row ``index`` as a float32 tensor (plus its label if labelled)."""
        # Build the tensor in a local variable instead of stashing it on the
        # instance: item access must not mutate shared dataset state.
        # torch.tensor always copies, so the backing array stays untouched.
        features = torch.tensor(self.df[index], dtype=torch.float32)
        if self.if_val:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.df)

In [23]:
train = MyDataset(df_train, if_val=False)
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True)

val = MyDataset(df_val, if_val=True)
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False)

In [24]:
# AutoEncoder

class AutoEncoder(nn.Module):
    """Fully connected autoencoder that reconstructs its input vector.

    The encoder maps ``in_features -> hidden_dim -> latent_dim`` and the
    decoder maps ``latent_dim -> hidden_dim -> in_features``. Every linear
    layer except the final output layer is followed by ``BatchNorm1d`` and
    ``LeakyReLU``.

    Args:
        in_features: Width of the input (and reconstructed) vector.
            Default: 30
        hidden_dim: Width of the intermediate layer on both sides.
            Default: 64
        latent_dim: Width of the encoder output. Default: 128
    """

    def __init__(self, in_features=30, hidden_dim=64, latent_dim=128):
        super().__init__()
        # Attribute names and layer order are kept as-is so that state_dict
        # keys (e.g. "Encoder.0.weight") stay compatible with saved weights.
        self.Encoder = nn.Sequential(
            nn.Linear(in_features, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.BatchNorm1d(latent_dim),
            nn.LeakyReLU(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(latent_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.LeakyReLU(),
            nn.Linear(hidden_dim, in_features),
        )

    def forward(self, x):
        """Encode ``x`` and decode it back; the output has the same shape as ``x``."""
        return self.Decoder(self.Encoder(x))

In [31]:
# EarlyStopper

class EarlyStopping:
    """Stop training early when the validation loss stops improving.

    Call the instance once per epoch with the current validation loss and
    the model. Each improvement saves a checkpoint and resets the counter;
    once ``patience`` consecutive calls bring no improvement, ``early_stop``
    is set to ``True``.
    """

    def __init__(self, patience=7, verbose=False, delta=0, path='checkpoint.pt'):
        """
        Args:
            patience (int): Number of non-improving calls to tolerate after
                            the last improvement.
                            Default: 7
            verbose (bool): If True, print a message each time the
                            validation loss improves.
                            Default: False
            delta (float): Minimum change in the monitored quantity that
                            qualifies as an improvement.
                            Default: 0
            path (str): Where the checkpoint is saved.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
        self.val_loss_min = np.inf
        self.delta = delta
        self.path = path

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state.

        Args:
            val_loss: Validation loss of the current epoch (lower is better).
            model: Module whose ``state_dict()`` is checkpointed on improvement.
        """
        # Negate the loss so that "higher score" always means "better".
        score = -val_loss

        if self.best_score is None:
            # First call: whatever we got is the best so far.
            self.best_score = score
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score + self.delta:
            # Not better than the best score by at least `delta`.
            self.counter += 1
            print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        """Save the model's state dict to ``path`` and remember ``val_loss``."""
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.path)
        self.val_loss_min = val_loss

In [25]:
class Trainer():
    """Train a reconstruction model and score it with a similarity threshold.

    Training minimises the L1 reconstruction error on ``train_loader``, whose
    batches are feature tensors only. After each epoch the model is scored on
    ``val_loader``, whose batches are ``(features, label)`` pairs: a sample is
    predicted as class 1 when the cosine similarity between the input and its
    reconstruction is below the threshold, and the macro F1 against the
    labels is reported.

    Args:
        model: Module to train.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_loader: Iterable of feature batches.
        val_loader: Iterable of ``(features, label)`` batches.
        scheduler: Optional LR scheduler; ``step(score)`` is called with the
            validation F1 after every epoch. Pass ``None`` to disable.
        device: Device the model and batches are moved to.
        earlystopper: Optional callable invoked as
            ``earlystopper(-score, model)`` after every epoch; training stops
            once its ``early_stop`` attribute is true. The ``EarlyStopping``
            class in this file fits this contract. The score is negated
            because that class treats lower values as better.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device,
                 earlystopper=None):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        self.criterion = nn.L1Loss().to(self.device)

        self.earlystopper = earlystopper
        # Best validation F1 seen by the most recent fit() call.
        self.best_score = 0

    def fit(self, num_epochs=None):
        """Run the training loop.

        Args:
            num_epochs: Number of epochs to train. ``None`` (the default)
                uses the module-level ``epochs`` hyperparameter.
        """
        if num_epochs is None:
            num_epochs = epochs
        self.model.to(self.device)
        self.best_score = 0
        for epoch in range(num_epochs):
            self.model.train()
            train_loss = []
            for x in self.train_loader:  # one optimisation step per batch
                x = x.float().to(self.device)
                self.optimizer.zero_grad()
                _x = self.model(x)
                # (prediction, target) order; the L1 value itself is symmetric.
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f"EPOCH {epoch}/{num_epochs} : Train loss : [{np.mean(train_loss)}] Val_Score : [{score}]")

            if self.scheduler is not None:
                self.scheduler.step(score)

            if self.best_score < score:
                self.best_score = score

            if self.earlystopper is not None:
                self.earlystopper(-score, self.model)
                if self.earlystopper.early_stop:
                    break

    def validation(self, eval_model, threshold):
        """Return the macro F1 of ``eval_model`` on the validation loader.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            threshold: Samples whose input/reconstruction cosine similarity
                is below this value are predicted as class 1, others as 0.
        """
        cossim = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():  # inference only, no gradients needed
            for x, y in self.val_loader:
                x = x.float().to(self.device)
                # Evaluate the model that was passed in, not self.model.
                _x = eval_model(x)
                # Compare in float64 on the CPU so the threshold test is done
                # at the same precision as a comparison of Python floats.
                similarity = cossim(x, _x).cpu().double().numpy()
                pred.extend((similarity < threshold).astype(int).tolist())
                true.extend(y.tolist())

        return f1_score(true, pred, average='macro')

In [26]:
# Build the model, optimizer and LR scheduler, then launch training.
model = nn.DataParallel(AutoEncoder())
model.eval()  # Trainer.fit() switches back to train mode at the start of each epoch
optimizer = optim.Adam(params=model.parameters(), lr=lr)
# mode='max' because the scheduler is stepped with the validation F1 score.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=10,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()

EPOCH 0/100 : Train loss : [0.3695001160459859] Val_Score : [0.37491767509813806]
EPOCH 1/100 : Train loss : [0.17676134567175592] Val_Score : [0.48422320350681336]
EPOCH 2/100 : Train loss : [0.13922166026064328] Val_Score : [0.5039546987889401]
EPOCH 3/100 : Train loss : [0.12077264807053975] Val_Score : [0.5122168149853599]
EPOCH 4/100 : Train loss : [0.11022524056690079] Val_Score : [0.5194839457365952]
EPOCH 5/100 : Train loss : [0.10464145429432392] Val_Score : [0.5233955987980123]
EPOCH 6/100 : Train loss : [0.09557799835290227] Val_Score : [0.524761510660486]
EPOCH 7/100 : Train loss : [0.09192544354924134] Val_Score : [0.5259035289777105]
EPOCH 8/100 : Train loss : [0.08874996272580964] Val_Score : [0.5297544391655133]
EPOCH 9/100 : Train loss : [0.08474058258746352] Val_Score : [0.5304799497789369]
EPOCH 10/100 : Train loss : [0.08091525813298565] Val_Score : [0.5338913307706434]
EPOCH 11/100 : Train loss : [0.08006864972412586] Val_Score : [0.5382341593027656]
EPOCH 12/100 :