In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
import random
import os

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error

from statsmodels.stats.outliers_influence import variance_inflation_factor


In [None]:
# Use the Malgun Gothic typeface for every matplotlib text element so that
# Hangul labels render; the font must be installed on the host machine.
plt.rc('font', family='Malgun Gothic')

In [None]:
# Read both CSV files from the working directory: `df` is later wrapped as the
# training dataset (last column = target), `test` as the feature-only dataset.
df, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test_.csv"))

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices)
    and puts cuDNN into its deterministic configuration.

    Parameters
    ----------
    seed : int
        Value used for every generator.
    """
    random.seed(seed)
    # Exported for child processes; the hash seed of the already-running
    # interpreter is fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode lets cuDNN pick algorithms by timing them, which can
    # differ between runs; it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False

In [None]:
# Fix the RNG seeds before any model weights are initialised.
seed_everything(0)
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
class Custom_Dataset(Dataset):
    """Row-wise tensor view over a pandas DataFrame.

    With ``trainset=True`` the last column is treated as the target and all
    preceding columns as features; otherwise every column is a feature and
    there is no target.
    """

    def __init__(self, Dataset, trainset=True):
        # NOTE: the parameter name shadows the torch ``Dataset`` base class
        # inside this method only; it is kept for keyword-call compatibility.
        self.Dataset = Dataset
        self.trainset = trainset

        if not trainset:
            # Unlabelled data: keep every column as an input feature.
            self.target = None
            self.data = pd.DataFrame(Dataset)
            return

        feature_cols = Dataset.columns[:-1]
        label_col = Dataset.columns[-1]
        # Rebuilding from ``.values`` drops the original index/column labels.
        self.target = pd.DataFrame(Dataset[label_col].values)
        self.data = pd.DataFrame(Dataset[feature_cols].values)

    def __len__(self):
        """Number of rows (samples)."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(features, target)`` float tensors, or just ``features``
        when the dataset has no target."""
        features = torch.FloatTensor(self.data.iloc[idx].values)

        if self.target is None:
            return features

        label = torch.FloatTensor(self.target.iloc[idx].values)
        return features, label

In [None]:
# Wrap the raw frames: the training frame carries its target in the last
# column, the test frame is features only.
train_dataset = Custom_Dataset(df)
test_dataset = Custom_Dataset(test, trainset=False)

# Input width = length of the first sample's feature tensor.
num_features_train = train_dataset[0][0].shape[0]

# Baseline: a single linear layer trained with plain SGD on an MSE objective.
model = nn.Linear(num_features_train, 1).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0015)

# Sequential (unshuffled) mini-batches of two rows.
dataloader = DataLoader(train_dataset, batch_size=2)

In [None]:
class Regressor(nn.Module):
    """Four-layer fully connected network (in -> 50 -> 30 -> 15 -> 1).

    Every layer, including the output layer, is followed by a sigmoid, and
    dropout (p=0.5) is applied after the second layer.

    Parameters
    ----------
    in_features : int, optional
        Width of the input vectors. Defaults to the notebook-level
        ``num_features_train`` so that ``Regressor()`` keeps working.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Fall back to the width measured from the training dataset.
            in_features = num_features_train
        # Not used in forward(); kept so the module's attributes are unchanged.
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(in_features, 50)
        self.fc2 = nn.Linear(50, 30)
        self.fc3 = nn.Linear(30, 15)
        self.fc4 = nn.Linear(15, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Map a ``(batch, in_features)`` tensor to ``(batch, 1)`` outputs."""
        x = torch.sigmoid(self.fc1(x))
        x = self.dropout(torch.sigmoid(self.fc2(x)))
        x = torch.sigmoid(self.fc3(x))
        # NOTE(review): the output sigmoid limits predictions to (0, 1); this
        # only suits a target scaled into that range - confirm the target
        # scaling, otherwise return self.fc4(x) directly.
        x = torch.sigmoid(self.fc4(x))
        return x

In [None]:
# Replace the linear baseline with the multi-layer regressor.
model = Regressor().to(device)

criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.00015)
# Keep a handle on the scheduler: it only lowers the learning rate when
# `scheduler.step(<monitored loss>)` is called, typically once per epoch.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10, threshold=0.0001)

# `train_dataset2` is not defined anywhere in this notebook, so the loader is
# built from `train_dataset`.
# NOTE(review): if a reduced subset was intended, create it explicitly first.
train_dataloader = DataLoader(train_dataset, batch_size=2)

In [None]:
# Train for 25 epochs and record the mean batch loss of every epoch in `loss_`.
loss_ = []
# The file was too large, so iteration was switched from `dataloader` to
# `train_dataloader`; the batch count and the loop below use the same loader.
n = len(train_dataloader)
for epoch in range(25):
    # Make sure dropout is active even if an earlier cell left the model in
    # eval mode.
    model.train()
    running_loss = 0.0
    for inputs, values in train_dataloader:
        inputs = inputs.to(device)
        values = values.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, values)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    loss_.append(running_loss / n)

    print(loss_[-1])

    if epoch % 5 == 0:
        print(epoch)

In [None]:
# Training curve: mean batch loss (MSE) recorded for each epoch. The series is
# passed by keyword so it is unambiguous across seaborn versions.
ax = sns.lineplot(data=loss_)
ax.set(xlabel='epoch', ylabel='mean training loss (MSE)');

In [None]:

def evaluation(dataloader, net=None):
    """Return the root-mean-squared error of a model over ``dataloader``.

    Parameters
    ----------
    dataloader : iterable
        Yields either ``(inputs, targets)`` pairs (a list or tuple, as a
        ``DataLoader`` over a labelled dataset produces) or bare input
        tensors when the dataset has no targets.
    net : torch.nn.Module, optional
        Model to score. Defaults to the notebook-level ``model``.

    Returns
    -------
    float
        RMSE between predictions and targets, or ``nan`` when the loader
        yields no targets (or no batches), because no error can be computed.
    """
    net = model if net is None else net

    # Run on whatever device the model's weights live on.
    first_param = next(net.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    predicted, expected = [], []
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            for batch in dataloader:
                # Test the container type, not len(): an unlabelled batch of
                # two rows is a tensor whose length is also 2.
                if isinstance(batch, (list, tuple)):
                    inps, vals = batch
                    expected.append(vals.detach().cpu())
                else:
                    inps = batch
                predicted.append(net(inps.to(run_device)).cpu())
    finally:
        # Do not leave the model in eval mode if it was being trained.
        net.train(was_training)

    if not predicted or not expected:
        return float('nan')

    predicts = torch.cat(predicted, 0).numpy().astype(np.float64)
    # Align shapes explicitly so (n, 1) vs (n,) can never broadcast to (n, n).
    actual = torch.cat(expected, 0).numpy().astype(np.float64).reshape(predicts.shape)
    return float(np.sqrt(np.mean((predicts - actual) ** 2)))

# Score the model on the training batches.
train_mse = evaluation(dataloader)

# `test_dataloader` is not defined anywhere earlier in this notebook, so it is
# built here from `test_dataset`.
# NOTE(review): `test_dataset` was created with trainset=False and therefore
# has no target column, so there is no ground truth to score against; confirm
# what `test_mse` is supposed to report (a labelled hold-out split may have
# been intended).
test_dataloader = DataLoader(test_dataset, batch_size=2)
test_mse = evaluation(test_dataloader)

print('train:', train_mse)
print('test:', test_mse)