In [105]:
import pandas as pd
import numpy as np
import pickle
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import os
import multiprocessing
import itertools
import argparse

from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error as MAE
from scipy import stats
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# Remove the row limit so displayed DataFrames are shown in full.
pd.set_option('display.max_rows', None)

###  
### Device Configuration

In [71]:
# Index of the GPU that training should run on.
devicenum = 1
# Only select "cuda:<devicenum>" when that device really exists. Checking
# `is_available()` alone is not enough: on a machine with a single GPU,
# "cuda:1" would be an invalid device and fail later at `.to(device)`.
if torch.cuda.is_available() and devicenum < torch.cuda.device_count():
    device = f"cuda:{devicenum}"
else:
    device = "cpu"

###  
### Data Loading

In [122]:
# Read the train features, test features and sample submission (in that order).
finaltrain, finaltest, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ('finaltrain.csv', 'finaltest.csv', 'sample_submission.csv')
)

# Re-wrap the mean/std table so its three columns are named 'var', 'mean', 'std'.
meanstd = pd.DataFrame(
    np.array(pd.read_csv('mean_std.csv')),
    columns=['var', 'mean', 'std'],
)

###  
### Dataset, Model, and Training Definitions

In [168]:
class ParkingDataset(Dataset):
    """Training dataset yielding ``(features, target)`` float32 tensor pairs.

    Features are every column except '단지코드' and '등록차량수', in the sorted
    order produced by ``Index.difference``; the target is the '등록차량수'
    column wrapped in a length-1 tensor.

    The feature matrix and target vector are extracted once at construction
    time. Doing it inside ``__getitem__`` copied the whole feature frame for
    every single sample, which made one pass over the data quadratic.

    Args:
        finaltrain: DataFrame holding the feature columns plus '등록차량수'.
            It is also kept as ``self.finaldata``.
    """

    def __init__(self, finaltrain):
        self.finaldata = finaltrain

        # `difference` returns the remaining labels sorted, which fixes the
        # feature order used by the model.
        feature_columns = finaltrain.columns.difference(['단지코드', '등록차량수'])
        self._features = finaltrain[feature_columns].to_numpy(dtype=np.float32)
        # Shape (N, 1) so that indexing one row yields a length-1 vector.
        self._targets = finaltrain['등록차량수'].to_numpy(dtype=np.float32).reshape(-1, 1)

    def __len__(self):
        return len(self.finaldata)

    def __getitem__(self, idx):
        # torch.tensor copies, so callers cannot mutate the cached arrays.
        sample = torch.tensor(self._features[idx])
        target = torch.tensor(self._targets[idx])

        return sample, target

    
class ParkingTestset(Dataset):
    """Inference dataset yielding ``(code, features)`` pairs.

    ``code`` is the row's '단지코드' value and ``features`` is a float32 tensor
    of every other column, in the sorted order produced by
    ``Index.difference``.

    The feature matrix and code list are extracted once at construction time.
    Doing it inside ``__getitem__`` copied the whole feature frame for every
    single sample.

    Args:
        finaltrain: DataFrame holding '단지코드' plus the feature columns
            (the parameter name is kept for backward compatibility). It is
            also kept as ``self.finaldata``.
    """

    def __init__(self, finaltrain):
        self.finaldata = finaltrain

        # `difference` returns the remaining labels sorted, which fixes the
        # feature order fed to the model.
        feature_columns = finaltrain.columns.difference(['단지코드'])
        self._features = finaltrain[feature_columns].to_numpy(dtype=np.float32)
        self._codes = finaltrain['단지코드'].tolist()

    def __len__(self):
        return len(self.finaldata)

    def __getitem__(self, idx):
        # torch.tensor copies, so callers cannot mutate the cached array.
        sample = torch.tensor(self._features[idx])

        return self._codes[idx], sample
    
    
class SimpleDNN(nn.Module):
    """Fully connected regressor: input -> hidden layers (ReLU) -> 1 output.

    Args:
        input_size: Number of input features.
        structure: Sequence of hidden-layer widths. ``structure[0]`` is the
            width after the input layer, and each consecutive pair
            ``(structure[i], structure[i+1])`` defines hidden layer
            ``layers{i}``.

    Sub-modules are registered as ``layers_input``, ``layers0`` ...
    ``layers{len(structure)-2}`` and ``layers_output``, so state-dict keys are
    the same as before.

    NOTE: the previous ``forward`` ran the hidden layers through
    ``exec('x = ...')``. Inside a function, ``exec`` cannot rebind a local
    variable, so the hidden layers' outputs were thrown away and only the
    input and output layers took part in the computation (and the model
    crashed whenever the hidden widths differed). Checkpoints trained with
    that version therefore contain untrained hidden layers and must be
    retrained.
    """

    def __init__(self, input_size, structure):
        super().__init__()

        self.layers_input = nn.Linear(input_size, structure[0])
        self.num_layers = len(structure)

        # setattr on an nn.Module registers the layer as a sub-module, exactly
        # like a literal `self.layersN = ...` assignment would.
        for num in range(self.num_layers - 1):
            setattr(self, f'layers{num}', nn.Linear(structure[num], structure[num + 1]))

        self.layers_output = nn.Linear(structure[-1], 1)

    def forward(self, x):
        """Return the model output for ``x``; the last dimension has size 1."""
        x = F.relu(self.layers_input(x))
        for num in range(self.num_layers - 1):
            hidden_layer = getattr(self, f'layers{num}')
            x = F.relu(hidden_layer(x))

        x = self.layers_output(x)

        return x

    
def train(num_epochs, model, data_loader, val_loader, patience,
          criterion, optimizer, saved_dir, device):
    """Train ``model`` with early stopping and evaluate its best checkpoint.

    After every epoch the model is validated on ``val_loader``; the
    ``EarlyStopping`` helper writes a checkpoint to ``saved_dir`` whenever the
    validation loss does not get worse and signals a stop after ``patience``
    consecutive worse epochs.

    When training ends (early stop or ``num_epochs`` exhausted) the best
    checkpoint is loaded back into ``model`` and validated once more, and that
    result is what gets returned. Previously the result of this final
    validation was discarded and the predictions of the last (worse) epoch
    were returned instead, and the checkpoint was not reloaded at all when the
    epoch budget ran out.

    Args:
        num_epochs: Maximum number of epochs; must be at least 1.
        model: Network to optimise (updated in place).
        data_loader: Iterable of ``(features, target)`` training batches.
        val_loader: Iterable of ``(features, target)`` validation batches.
        patience: Number of consecutive non-improving epochs tolerated.
        criterion: Loss function called as ``criterion(outputs, target)``.
        optimizer: Optimizer bound to ``model``'s parameters.
        saved_dir: File path of the checkpoint written by early stopping.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(best_score, mae)``: the best validation loss as reported by
        ``EarlyStopping`` and the list of ``(outputs, target)`` pairs that
        ``validation`` produced for the best checkpoint.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1.
    """
    if num_epochs < 1:
        raise ValueError('num_epochs must be at least 1')

    # A single stopper carries the best score and patience counter across epochs.
    early_stopping = EarlyStopping(patience=patience, path=saved_dir, verbose=False)
    best_score = None

    model.train()
    for epoch in range(num_epochs):
        for sequence, target in data_loader:
            sequence = sequence.type(torch.float32).to(device)
            target = target.type(torch.float32).to(device)

            outputs = model(sequence)
            loss = criterion(outputs, target)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        avrg_loss, _ = validation(model, val_loader, criterion, device)
        best_score, _, finish = early_stopping(avrg_loss, model)

        if finish:
            break

    # The first epoch always writes a checkpoint (any loss beats the initial
    # infinite best score), so the file exists here. map_location keeps the
    # load working even if the checkpoint tensors came from another device.
    model.load_state_dict(torch.load(saved_dir, map_location=device))
    _, mae = validation(model, val_loader, criterion, device)

    return best_score, mae


def validation(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without tracking gradients.

    The model is put in eval mode for the pass and switched back to train
    mode before returning.

    Args:
        model: Network to evaluate.
        data_loader: Iterable of ``(features, target)`` batches.
        criterion: Loss function called as ``criterion(outputs, target)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(avrg_loss, pairs)``: the unweighted mean of the per-batch
        losses and the list of ``(outputs, target)`` tensors, one per batch.
    """
    pairs = []
    batch_losses = []

    model.eval()
    with torch.no_grad():
        for features, labels in data_loader:
            features = features.type(torch.float32).to(device)
            labels = labels.type(torch.float32).to(device)

            predictions = model(features)
            batch_losses.append(criterion(predictions, labels))
            pairs.append((predictions, labels))

        # Plain mean over batches (each batch counts equally).
        avrg_loss = sum(batch_losses) / len(batch_losses)

    model.train()
    return avrg_loss, pairs


class EarlyStopping:
    """Stop training early once the validation loss has not improved for `patience` checks."""

    def __init__(self, patience=7, best_score=np.inf, counter=0, delta=0,
                 path=None, verbose=False):
        # Stopping criteria.
        self.patience = patience
        self.delta = delta
        # Running state; may be seeded by the caller to resume a previous run.
        self.best_score = best_score
        self.counter = counter
        self.early_stop = False
        # Checkpoint destination and logging switch.
        self.path = path
        self.verbose = verbose

    def __call__(self, val_loss, model):
        """Record one validation result.

        A loss strictly greater than ``best_score + delta`` counts as a miss;
        anything else stores a checkpoint, becomes the new best score and
        resets the miss counter.

        Returns:
            Tuple ``(best_score, counter, early_stop)``.
        """
        got_worse = val_loss > self.best_score + self.delta

        if not got_worse:
            self.save_checkpoint(val_loss, model)
            self.best_score = val_loss
            self.counter = 0
            return self.best_score, self.counter, self.early_stop

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True

        return self.best_score, self.counter, self.early_stop

    def save_checkpoint(self, val_loss, model):
        """Save the model's state dict to ``self.path``, replacing any existing file."""
        stale_checkpoint = os.path.isfile(self.path)
        if stale_checkpoint:
            os.remove(self.path)

        if self.verbose:
            print(f'Validation loss decreased ({self.best_score:.6f} --> {val_loss:.6f}).  Saving model ...')

        torch.save(model.state_dict(), self.path)
        
        
def MAE_func(a, b):
    """Return the mean absolute error between two column tensors.

    Both tensors are copied to the CPU, converted to NumPy arrays and have
    their axis 1 squeezed away before being handed to ``MAE``.
    """
    first = a.cpu().numpy().squeeze(1)
    second = b.cpu().numpy().squeeze(1)

    return MAE(first, second)


def r3(value):
    """Round ``value`` to three decimal places and return it as a string."""
    rounded = round(value, 3)
    return str(rounded)


def trainsave(vars):
    """Run 5-fold cross-validation for one hyper-parameter combination.

    For each fold a fresh ``SimpleDNN`` is trained with ``train`` and its best
    validation loss and mean MAE are collected. The per-fold MAE values are
    written, one per line, to
    ``vars/ckpt_batch_<b>_lr_<lr>_structure_<s> CV_Loss <x> MAE <y>.txt``.
    Every fold writes its checkpoint to the same ``saved/ckpt_...pt`` path, so
    the file left on disk belongs to the last fold.

    Args:
        vars: Tuple ``(batch_size, learning_rate, structure)`` where
            ``structure`` is the sequence of hidden-layer widths.

    Returns:
        ``False`` when a result file for this combination already exists
        (nothing is trained); ``None`` after a completed run.
    """
    batch_size, learning_rate, structure = vars

    structure_string = '_'.join(str(node) for node in structure)
    run_tag = f'ckpt_batch_{batch_size}_lr_{learning_rate}_structure_{structure_string}'

    # Both output directories are required below; create them on first use.
    os.makedirs('vars', exist_ok=True)
    os.makedirs('saved', exist_ok=True)

    # Skip combinations that already have a result file. The tag is followed
    # by a space in result file names; matching on that boundary keeps
    # structure '400_400' from being mistaken for '400_400_400', which a
    # plain substring test would do.
    if any(name.startswith(run_tag + ' ') for name in os.listdir('vars/')):
        return False

    # Base Parameters
    kf = KFold(n_splits=5, shuffle=True, random_state=777)
    patience = 2
    num_epochs = 500
    saved_dir = f'saved/{run_tag}.pt'

    bs_box = []  # best validation loss per fold
    mae_box = []  # mean MAE per fold
    for trainid, valid in kf.split(range(finaltrain.shape[0])):
        # Build the data loaders for this fold (5-fold validation).
        parking_train = ParkingDataset(finaltrain.iloc[trainid])
        parking_val = ParkingDataset(finaltrain.iloc[valid])
        train_loader = DataLoader(parking_train, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(parking_val, batch_size=batch_size, shuffle=False)

        # Training; the seed is reset so every fold starts from the same initialisation.
        torch.manual_seed(7777)
        model = SimpleDNN(input_size=finaltrain.shape[1]-2, structure=structure)
        model = model.to(device)
        criterion = nn.L1Loss()
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        best_score, mae = train(num_epochs, model, train_loader, val_loader, patience,
                                criterion, optimizer, saved_dir, device)

        bs_box.append(best_score)
        # `mae` is a list of (outputs, target) batches; average the per-batch MAE.
        mae_box.append(np.nanmean([MAE_func(r[0], r[1]) for r in mae]))

    cvloss = np.mean([score.cpu().item() for score in bs_box])
    # Named cv_mae so the imported `MAE` metric is not shadowed.
    cv_mae = np.mean(mae_box)
    print(f'ckpt path: {saved_dir}\nBest CV_Loss: {r3(cvloss)}\nBest MAE: {r3(cv_mae)}')

    # The context manager closes the file even if a write fails.
    with open(f'vars/{run_tag} CV_Loss {r3(cvloss)} MAE {r3(cv_mae)}.txt', 'w') as f:
        for fold_mae in mae_box:
            f.write(str(fold_mae) + "\n")


###  
### Test Results

In [187]:
k = ['batch', 'lr', 'structure', 'MSE', 'RSQ']


def parse_result_filename(res):
    """Split a result file name into ``[batch, lr, structure, metric1, metric2]``.

    Expected layout (labels are ignored, so both the current
    ``CV_Loss``/``MAE`` names and older ``MSE``/``RSQ`` names are accepted)::

        ckpt_batch_<b>_lr_<lr>_structure_<s> <label1> <v1> <label2> <v2>.txt

    All five fields are returned as strings. ``None`` is returned when the
    name does not follow this layout.
    """
    stem = res[:-len('.txt')] if res.endswith('.txt') else res
    parts = stem.split(' ')
    if len(parts) != 5:
        return None
    tag, _, first_value, _, second_value = parts

    head, found, structure = tag.partition('_structure_')
    if not found:
        return None
    head, found, lr = head.partition('_lr_')
    if not found:
        return None
    _, found, batch = head.partition('batch_')
    if not found:
        return None

    return [batch, lr, structure, first_value, second_value]


vs = []
# A missing directory is treated as "no results yet".
for res in (os.listdir('vars/') if os.path.isdir('vars/') else []):
    if 'ipynb' in res:
        continue
    row = parse_result_filename(res)
    if row is not None:
        vs.append(row)

# Column names are kept as before: 'MSE' holds the first metric in the file
# name (the CV loss) and 'RSQ' the second one (the MAE).
result = pd.DataFrame(vs, columns=k)
# Convert the metrics to numbers so sorting is numeric rather than lexicographic.
for metric in ('MSE', 'RSQ'):
    result[metric] = pd.to_numeric(result[metric], errors='coerce')

result.sort_values(by=['MSE'])

Unnamed: 0,batch,lr,structure,MSE,RSQ
158,16,0.0003,400_400_400_400_400,0.274,0.308
52,16,0.001,400_400_400_400_400,0.276,0.33
178,16,0.001,400_400_400_400_400_400_400,0.283,0.33
0,16,0.003,400_400_400_400_400,0.286,0.408
194,32,0.001,400_400_400_400_400,0.286,0.327
130,16,0.003,300_300_300_300_300,0.287,0.357
152,8,0.0003,400_400_400_400_400,0.288,0.331
50,16,0.001,500_500_500_500_500,0.288,0.338
53,8,0.001,400_400_400_400_400_400_400,0.29,0.343
62,8,0.001,400_400_400_400_400,0.29,0.359


In [202]:
# Hyper-parameters of the checkpoint used for the test predictions.
batch_size, learning_rate, structure = 16, 0.001, [400] * 5

structure_string = '_'.join(str(node) for node in structure)

model = SimpleDNN(finaltrain.shape[1]-2, structure)

ckpt_path = f'ckpt_batch_{batch_size}_lr_{learning_rate}_structure_{structure_string}.pt'
# The model stays on the CPU in this cell. map_location='cpu' lets a
# checkpoint that was saved from a GPU load on any machine, including one
# without that GPU.
model.load_state_dict(torch.load(f'saved/{ckpt_path}', map_location='cpu'))

parking_val = ParkingTestset(finaltest)
# Default DataLoader settings: one sample per batch, original order.
test_loader = DataLoader(parking_val)

# Not used by the prediction loop below; kept so the names stay defined.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

model.eval()
codeset = []
outputset = []
# Inference only: no_grad avoids building autograd graphs for every sample.
with torch.no_grad():
    for code, sequence in test_loader:
        sequence = sequence.type(torch.float32)
        outputs = model(sequence)
        # Each batch holds one sample, so take its single code and prediction.
        codeset.append(code[0])
        outputset.append(outputs.numpy()[0][0])

# Rows of the mean/std table for the target variable '등록차량수'.
mean = meanstd[meanstd['var']=='등록차량수']['mean']
std = meanstd[meanstd['var']=='등록차량수']['std']

In [203]:
# `mean` / `std` may be single-element Series. Calling float() on such a
# Series is deprecated in recent pandas, so extract the scalars explicitly
# (np.ravel also accepts plain numbers).
target_mean = float(np.ravel(mean)[0])
target_std = float(np.ravel(std)[0])

final = pd.DataFrame(codeset, columns=['code'])
# Undo the standardisation: prediction * std + mean.
final['num'] = np.asarray(outputset, dtype=np.float64) * target_std + target_mean
# Both frames have a 'num' column, so the merge suffixes them; 'num_y' is the
# prediction coming from `final`.
final1 = pd.merge(sample_submission, final, how='outer', on='code')
final1 = pd.DataFrame(np.array(final1[['code', 'num_y']]), columns=['code', 'num'])

In [206]:
# Write the submission table to 'mlp1.csv' without the DataFrame index column.
final1.to_csv('mlp1.csv', index=False)