In [None]:
# Install wandb
!pip install wandb

In [None]:
# Mount Google Drive at /content/drive; the data directory used below lives on it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Access data
%cd drive/MyDrive/scaled_data/

/content/drive/MyDrive/scaled_data


In [None]:
# Log in to Weights & Biases from the shell; when a login already exists this
# only reports the current account.
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mdeep-overflow[0m ([33mdeepintodeep[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Import
import torch
import torch.nn as nn # Neural Network
import torch.optim as optim # Optimizer
import torch.optim.lr_scheduler as lr_scheduler # Scheduler
from torch.utils.data import Dataset, DataLoader # Data

import wandb
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
from glob import glob
from sklearn.model_selection import train_test_split
import random
import os

# Suppress warnings
warnings.filterwarnings(action='ignore')

# Seed torch, Python's `random` and NumPy with the same value
SEED = 2020320120
for _seed_fn in (torch.random.manual_seed, random.seed, np.random.seed):
    _seed_fn(SEED)

In [None]:
# Paths of all training CSV files
data_list = glob('.//train/*.csv')

# Columns dropped from the training frames (not used for training)
tr_del_list = ['단가(원)', '거래량', '거래대금(원)', '경매건수', '도매시장코드', '도매법인코드', '산지코드 ']
# Columns dropped from the test frames: the same columns plus the target column
ts_del_list = [*tr_del_list, '해당일자_전체평균가격(원)']

# Indicator columns ('일자구분_*' and '월구분_*') that every test frame must contain
check_col = [
    *(f'일자구분_{part}' for part in ('중순', '초순', '하순')),
    *(f'월구분_{month}월' for month in (10, 11, 12, 1, 2, 3, 4, 5, 6, 7, 8, 9)),
]

In [None]:
# Cut a frame/series into overlapping windows of consecutive rows
def time_window(df, t, t_sep):
    """Slice ``df`` into overlapping windows of ``t + t_sep`` consecutive rows.

    Args:
        df: pandas DataFrame or Series with one time step per row.
        t: first part of the window length.
        t_sep: second part of the window length; each window spans
            ``t + t_sep`` rows in total.

    Returns:
        A float64 ``np.ndarray`` of shape ``(n_windows, t + t_sep, n_columns)``
        where ``n_columns`` is 1 for a Series and
        ``n_windows = len(df) - (t + t_sep)``. When no window fits, an empty
        1-D array is returned.
    """
    sequence_length = t + t_sep

    windows = []
    # NOTE(review): the range stops one start position before the last full
    # window; kept as-is because make_dataset aligns x and y on these counts.
    for start in range(len(df) - sequence_length):
        # .iloc makes the slice positional regardless of the index type.
        window = df.iloc[start: start + sequence_length].values
        # np.vstack turns a 1-D Series slice into an (L, 1) column and leaves a
        # 2-D frame slice as (L, n_columns).
        # `np.float` was removed in NumPy 1.24; np.float64 is the dtype it aliased.
        windows.append(np.vstack(window).astype(np.float64))

    return np.array(windows)

# Build the (input windows, target windows) pair for one training CSV
def make_dataset(i):
    """Read one training CSV and turn it into aligned input/target windows.

    Args:
        i: path of the training CSV file.

    Returns:
        Tuple ``(xdata, ydata)`` of ndarrays produced by ``time_window``:
        ``xdata`` holds 14-row windows of the feature columns and ``ydata``
        holds 28-row windows of the target column starting 14 rows later.
        ``xdata`` is truncated to the number of target windows.
    """
    target_col = '해당일자_전체평균가격(원)'

    # Cells holding a single space are treated as missing values.
    df = pd.read_csv(i).replace(' ', np.nan)

    # Keep only the columns used for training, index by date, fill missing with 0.
    df = (
        df.drop(columns=tr_del_list)
          .set_index('datadate', drop=True)
          .fillna(0)
    )

    # Separate features and target.
    feature_cols = [col for col in df.columns if col != target_col]
    x, y = df[feature_cols], df[target_col]

    # Two weeks of input predict the following four weeks, so the first
    # 14 days of the target are skipped.
    y = y.iloc[14:]

    # Build the time-series windows: 13 + 1 = 14 rows of x, 27 + 1 = 28 rows of y.
    data_x = time_window(x, 13, 1)
    data_y = time_window(y, 27, 1)

    # There are fewer target windows than input windows; trim x to match.
    xdata = data_x[:len(data_y)]
    ydata = data_y

    return xdata, ydata

In [None]:
class Flag:
    """Attribute-style view of a (possibly nested) dict.

    Every key becomes an instance attribute; values that are themselves dicts
    are wrapped recursively in ``Flag`` so they can be reached with dotted
    access (e.g. ``flags.model.hid_dim``).
    """

    def __init__(self, flags):
        attributes = vars(self)
        for name, item in flags.items():
            attributes[name] = Flag(item) if isinstance(item, dict) else item

In [None]:
def make_Tensor(array):
    """Wrap a NumPy array as a torch tensor using ``torch.from_numpy``."""
    tensor = torch.from_numpy(array)
    return tensor


def astype_data(data):
    """Cast ``data`` to float32 and convert the result with ``make_Tensor``."""
    return make_Tensor(data.astype(np.float32))


class testDataset(Dataset):
    """Dataset wrapping one test CSV as a single float32 sample.

    The cleaned frame is stored as one tensor of shape
    ``(1, n_rows, n_columns)``, so the dataset always has length 1 and
    ``dataset[0]`` is the ``(n_rows, n_columns)`` input.
    """

    def __init__(self, data):
        """Load and clean the test CSV.

        Args:
            data: path of the test CSV file.
        """
        df = pd.read_csv(data)

        if len(df) == 0:
            # Empty file: synthesise 14 all-zero rows so a sample still exists.
            print('no data in Dataset!!')
            # Assigning a 14-element column to a frame without rows extends its
            # index to 14 rows (other columns become NaN, filled with 0 below).
            df['zero_non'] = [0] * 14
            df = df.fillna(0)
            df = df.drop(columns=['zero_non', 'Unnamed: 0'])

        # Keep only the columns used as input and index by date.
        df = df.drop(columns=ts_del_list).set_index('datadate', drop=True)

        # Match the training input layout: add indicator columns this file lacks.
        # NOTE(review): added columns are appended at the end, so column ORDER
        # may differ from the training frames -- confirm against the train data.
        for col in check_col:
            if col not in df.columns:
                df[col] = 0

        # Cells holding a single space are missing values; fill them with 0.
        df = df.replace(' ', np.nan).fillna(0)

        # Build x_test with a leading axis of size 1.
        self.df_test = astype_data(df.values.reshape(1, df.values.shape[0], df.values.shape[1]))

    def __len__(self):
        """Return the size of the leading axis of the stored tensor (1)."""
        return len(self.df_test)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` from the stored tensor."""
        return self.df_test[idx]

In [None]:
def train_and_test(flags):
    """Train one MLP per item and save its forecasts for every test set.

    For each processed item index a fresh fully connected network is trained
    on ``./train/train_{item_idx}.csv`` and then applied to
    ``./test/set_{set_num}/test_{item_idx}.csv`` for ``set_num`` in 0..9.
    The predictions are written to ``./result_{item_idx}.csv``.

    Args:
        flags: nested dict of hyper-parameters. Keys read here: ``epochs``,
            ``lr``, ``batch_size``, ``dropout`` and ``model`` ->
            ``nlayers`` / ``hid_dim``. The whole dict is also stored as the
            wandb run config.

    Side effects:
        Starts a wandb run, logs loss and learning rate once per epoch,
        prints progress and writes one CSV per item.
    """
    run_name = 'Test_001'

    wandb.init(
        project="Nong_Linear",
        entity='deep-overflow',
        config=flags,
        name=run_name
    )

    flags = Flag(flags)

    def lr_schedule_fn(epoch):
        # Multiplicative factor applied to the base lr by LambdaLR:
        # 1.5 for epochs 0-14 and 250-349, 1.0 otherwise.
        if epoch < 15 or 250 <= epoch < 350:
            return 1.5
        return 1.0

    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    for item_idx in range(37):  # original note: "change to 37"
        # NOTE(review): items 0-29 are skipped; looks like a leftover from a
        # resumed run -- confirm before relying on a full set of results.
        if item_idx < 30:
            continue
        data_path = f'./train/train_{item_idx}.csv'
        # NOTE(review): windowDataset is not defined in the cells shown here;
        # it must be defined before this function is called.
        train_dataset = windowDataset(data_path)

        train_dataloader = DataLoader(
            dataset=train_dataset,
            batch_size=flags.batch_size,
            shuffle=True,
        )

        # MLP: flatten the 14 x 49 input, (nlayers - 1) hidden Linear blocks,
        # then a 28-unit output passed through ReLU.
        net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(14 * 49, flags.model.hid_dim),
            nn.LeakyReLU(),
            nn.Dropout(flags.dropout),
        )

        for _ in range(flags.model.nlayers - 2):
            net.append(nn.Linear(flags.model.hid_dim, flags.model.hid_dim))
            net.append(nn.LeakyReLU())
            net.append(nn.Dropout(flags.dropout))

        net.append(nn.Linear(flags.model.hid_dim, 28))
        net.append(nn.ReLU())

        # Xavier-uniform initialisation for every weight; biases keep their default.
        for param_name, param in net.named_parameters():
            if param_name.split('.')[-1] == 'bias':
                continue
            nn.init.xavier_uniform_(param)

        criterion = nn.L1Loss()

        optimizer = optim.Adam(
            params=net.parameters(),
            lr=flags.lr,
            betas=(0.9, 0.999),
        )

        scheduler = lr_scheduler.LambdaLR(
            optimizer=optimizer,
            lr_lambda=lr_schedule_fn
        )

        net = net.to(device)

        # Train
        for epoch in range(flags.epochs):
            epoch_train_loss = 0.0

            for inputs, labels in train_dataloader:
                inputs, labels = inputs.float().to(device), labels.to(device)

                with torch.set_grad_enabled(True):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                epoch_train_loss += loss.item()

            scheduler.step()

            mean_train_loss = epoch_train_loss / len(train_dataloader)
            wandb.log({
                'loss': mean_train_loss,
                'lr': optimizer.param_groups[0]['lr']
            })

            if (epoch + 1) % 10 == 0:
                print(f'[epoch : {epoch + 1} / {flags.epochs}] Train Loss : {mean_train_loss}')

        # Test
        # Switch to eval mode so Dropout is disabled; without this the
        # predictions are computed with random units dropped.
        net.eval()

        # NOTE(review): the leading all-zero row is kept from the original so the
        # saved CSV keeps its shape (11 x 28); confirm whether consumers expect it.
        predictions = [np.zeros((1, 28), dtype=np.float32)]
        for set_num in range(10):  # original note: "change to 10"
            data_path = f'./test/set_{set_num}/test_{item_idx}.csv'
            test_dataset = testDataset(data_path)

            inputs = test_dataset[0].reshape(1, 14, 49).to(device)

            with torch.no_grad():
                outputs = net(inputs)

            predictions.append(outputs.cpu().detach().numpy())

        result_np = np.concatenate(predictions, axis=0)

        result_pd = pd.DataFrame(result_np)
        result_pd.to_csv(f'./result_{item_idx}.csv')
        print(f'Save Result {item_idx}')

# Hyper-parameters for the train-and-test run; the dict is also logged as the
# wandb run config.
flags = dict(
    epochs=500,
    lr=1e-4,
    batch_size=64,
    data_num=0,
    model=dict(
        nlayers=10,
        hid_dim=1024,
        weight_init='xavier_uniform',
    ),
    optim='Adam',
    criterion='L1Loss',
    dropout=0.5,
    lr_scheduler=dict(
        method='Lambda',
    ),
)

train_and_test(flags)

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

[34m[1mwandb[0m: [32m[41mERROR[0m Control-C detected -- Run data was not synced


Problem at: <ipython-input-16-9ebc410e6223> 8 train_and_test


KeyboardInterrupt: ignored

In [None]:
def train(flags, idx):
    """Train one MLP on a single item's training CSV and log the run to wandb.

    Args:
        flags: nested dict of hyper-parameters. Keys read here: ``data_num``,
            ``epochs``, ``lr``, ``batch_size``, ``dropout`` and ``model`` ->
            ``nlayers`` / ``hid_dim``. The caller's dict is not modified.
        idx: run index, used only as a suffix of the wandb run name.

    Side effects:
        Starts a wandb run, logs loss and learning rate once per epoch and
        prints progress.
    """
    run_name = f'Exp_X_U_batch_128_{flags["data_num"]:03d}_{idx}'

    # Work on a shallow copy so the caller's dict is not mutated.
    flags = {**flags, 'data_path': f'./train/train_{flags["data_num"]}.csv'}

    wandb.init(
        project="Nong_Linear",
        entity="deep-overflow",
        config=flags,
        name=run_name
    )

    # Flag (TODO from original: try various regularisation methods)
    flags = Flag(flags)

    # Dataset
    # NOTE(review): windowDataset is not defined in the cells shown here; it
    # must be defined before this function is called.
    train_dataset = windowDataset(flags.data_path)
    # The original also called `testDataset()` here with no arguments; that
    # raises TypeError (`data` is required) and the result was never used.

    # DataLoader
    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=flags.batch_size,
        shuffle=True
    )

    # Net: flatten the 14 x 49 input, (nlayers - 1) hidden Linear blocks with
    # LeakyReLU + Dropout, then a 28-unit output passed through ReLU.
    # (BatchNorm1d after each Linear was tried and left disabled.)
    net = nn.Sequential(
        nn.Flatten(),
        nn.Linear(14 * 49, flags.model.hid_dim),
        nn.LeakyReLU(),
        nn.Dropout(flags.dropout),
    )

    for _ in range(flags.model.nlayers - 2):
        net.append(nn.Linear(flags.model.hid_dim, flags.model.hid_dim))
        net.append(nn.LeakyReLU())
        net.append(nn.Dropout(flags.dropout))

    net.append(nn.Linear(flags.model.hid_dim, 28))
    net.append(nn.ReLU())

    # Weight initialisation: Xavier uniform on every weight; biases keep their
    # default (Kaiming uniform was the alternative considered).
    for param_name, param in net.named_parameters():
        if param_name.split('.')[-1] == 'bias':
            continue
        print(f'Init {param_name}')
        nn.init.xavier_uniform_(param)

    # Criterion (TODO from original: try various losses)
    criterion = nn.L1Loss()

    # Optimizer (TODO from original: try various optimizers)
    optimizer = optim.Adam(
        params=net.parameters(),
        lr=flags.lr,
        betas=(0.9, 0.999),
    )

    def lr_schedule_fn(epoch):
        # Multiplicative factor applied to the base lr by LambdaLR:
        # 1.5 for epochs 0-14 and 250-349, 1.0 otherwise.
        if epoch < 15 or 250 <= epoch < 350:
            return 1.5
        return 1.0

    # Schedules tried, as "epoch range: factor"; (O)/(X) are the original
    # author's marks:
    # a: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 399: 1.5 // 400 - 499: 1 (O)
    # b: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 349: 2 // 350 - 499: 1
    # c: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 349: 10 // 350 - 499: 1 (X)
    # d: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 349: 3 // 350 - 499: 1
    # Other schedulers left disabled in the original: ExponentialLR(gamma=0.9),
    # StepLR(step_size, gamma from flags.lr_scheduler), CosineAnnealingLR(T_max=100).
    scheduler = lr_scheduler.LambdaLR(
        optimizer=optimizer,
        lr_lambda=lr_schedule_fn
    )

    # Device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net = net.to(device)

    # Train
    for epoch in range(flags.epochs):
        epoch_train_loss = 0.0

        for inputs, labels in train_dataloader:
            inputs, labels = inputs.float().to(device), labels.to(device)

            with torch.set_grad_enabled(True):
                outputs = net(inputs)
                loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            epoch_train_loss += loss.item()

        scheduler.step()

        mean_train_loss = epoch_train_loss / len(train_dataloader)
        wandb.log({
            'loss': mean_train_loss,
            'lr': optimizer.param_groups[0]['lr']
        })
        if (epoch + 1) % 10 == 0:
            print(f'[epoch : {epoch + 1} / {flags.epochs}] Train Loss : {mean_train_loss}')

# Launch the training experiment(s); a fresh hyper-parameter dict is built for
# every run and `i` is passed as the run index.
for i in range(1):
    flags = dict(
        epochs=500,
        lr=1e-4,
        batch_size=64,
        data_num=0,
        model=dict(
            nlayers=10,
            hid_dim=1024,
            weight_init='xavier_uniform',
        ),
        optim='Adam',
        criterion='L1Loss',
        dropout=0.5,
        lr_scheduler=dict(
            method='Lambda',
        ),
    )

    train(flags, i)