In [1]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting wandb
  Downloading wandb-0.13.2-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 8.9 MB/s 
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.9.8-py2.py3-none-any.whl (158 kB)
[K     |████████████████████████████████| 158 kB 62.0 MB/s 
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting setproctitle
  Downloading setproctitle-1.3.2-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 53.2 MB/s 
Collecting gitdb<5,>=4.0.1
  Downloading gitdb-4.0.9-p

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [4]:
%cd drive/MyDrive/data/aT/scaled_data/

/content/drive/MyDrive/data/aT/scaled_data


In [5]:
%ls

[0m[01;34maT_test_raw[0m/  [01;34mresult_001[0m/  sutmit_005.csv  [01;34mtest[0m/  [01;34mtrain[0m/  [01;34mwandb[0m/


In [6]:
!wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
from torch.utils.data import Dataset, DataLoader

import time
import wandb
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
from glob import glob
from sklearn.model_selection import train_test_split
import random
import os

# 경고 끄기
warnings.filterwarnings(action='ignore')

# 시드고정
torch.random.manual_seed(42)
random.seed(42)
np.random.seed(42)

In [9]:
data_list = glob('.//train/*.csv')
epoch = 100
batch = 15
tr_del_list = ['단가(원)', '거래량', '거래대금(원)', '경매건수', '도매시장코드', '도매법인코드', '산지코드 '] # train 에서 사용하지 않는 열
ts_del_list = ['단가(원)', '거래량', '거래대금(원)', '경매건수', '도매시장코드', '도매법인코드', '산지코드 ', '해당일자_전체평균가격(원)'] # test 에서 사용하지 않는 열
check_col = ['일자구분_중순', '일자구분_초순', '일자구분_하순','월구분_10월', '월구분_11월', '월구분_12월', '월구분_1월', '월구분_2월', '월구분_3월', 
             '월구분_4월','월구분_5월', '월구분_6월', '월구분_7월', '월구분_8월', '월구분_9월'] # 열 개수 맞추기

In [10]:
def time_window(df, t, t_sep):
    seq_len = t
    seqence_length = seq_len + t_sep

    result = []
    for index in range(len(df) - seqence_length):
        tmp = df[index: index + seqence_length].values
        tmp = np.vstack(tmp).astype(np.float)
        tmp = torch.from_numpy(tmp)
        result.append(tmp)

    return np.array(result)


def make_dataset(i):
    df_number = i.split("_")[-1].split(".")[0]
    df = pd.read_csv(i)

    for j in df.columns:
        df[j] = df[j].replace({' ': np.nan})

    # 사용할 열 선택 및 index 설정
    df.drop(tr_del_list, axis=1, inplace=True)
    df.set_index('datadate', drop=True, inplace=True)

    # nan 처리
    df = df.fillna(0)

    # 변수와 타겟 분리
    x, y = df[[i for i in df.columns if i != '해당일자_전체평균가격(원)']], df['해당일자_전체평균가격(원)']

    # 2주 입력을 통한 이후 4주 예측을 위해 y의 첫 14일을 제외
    y = y[14:]

    # time series window 생성
    data_x = time_window(x, 13, 1)
    data_y = time_window(y, 27, 1)

    # y의 길이와 같은 길이로 설정
    xdata = data_x[:len(data_y)]
    ydata = data_y

    return xdata, ydata


class windowDataset(Dataset):
    def __init__(self, data):
        self.xdata, self.ydata = make_dataset(data)

    def __len__(self):
        return len(self.xdata)

    def __getitem__(self, idx):
        return self.xdata[idx], self.ydata[idx].reshape(-1)

In [11]:
class Flag:
    def __init__(self, flags):
        for key, value in flags.items():
            if isinstance(value, dict):
                self.__dict__[key] = Flag(value)
            else:
                self.__dict__[key] = value

In [12]:
def make_Tensor(array):
    return torch.from_numpy(array)


def astype_data(data):
    df = data.astype(np.float32)
    return make_Tensor(df)


class testDataset(Dataset):
    def __init__(self, data):
        zero_csv = [0 for i in range(14)]
        df = pd.read_csv(data)

        if len(df) == 0:
            print('no data in Dataset!!')
            print(df)
            df['zero_non'] = zero_csv
            print(df)
            df = df.fillna(0)
            print(df)
            df.drop('zero_non', axis=1, inplace=True)
            df.drop('Unnamed: 0', axis=1, inplace=True)
            print(df)

        file_number = data.split('test_')[1].split('.')[0]

        # 사용할 열 선택, index 설정
        df.drop(ts_del_list, axis=1, inplace=True)
        df.set_index('datadate', drop=True, inplace=True)

        # train input 과 형상 맞추기
        add_col = [i for i in check_col if i not in df.columns]

        for a in add_col:
            df[a] = 0

        # ' ' -> nan 으로 변경
        for a in df.columns:
            df[a] = df[a].replace({' ': np.nan})

        # nan 처리
        df = df.fillna(0)

        # x_test  생성
        self.df_test = astype_data(df.values.reshape(1, df.values.shape[0], df.values.shape[1]))

    def __len__(self):
        return len(self.df_test)

    def __getitem__(self, idx):
        return self.df_test[idx]

# sampletestset = testDataset('/content/drive/MyDrive/Colab Notebooks/농산물/preprocess/test/set_0/test_0.csv')
# sampletestloader = DataLoader(sampletestset, batch_size = 1, shuffle=False)

# for (i, o) in sampletestloader:
#     print(i.shape, o.shape)
#     print(i)
#     print(o)
#     break

In [None]:
def train(flags, idx):
    name = f'Exp_X_U_batch_128_{flags["data_num"]:03d}_{idx}'
    
    flags['data_path'] = f'./train/train_{flags["data_num"]}.csv'

    wandb.init(
        project="Nong_Linear", 
        entity="deep-overflow", 
        config=flags,
        name=name
    )

    # Flag # ====================
    # 다양한 Regularization 시도해보기
    # ===========================
    flags = Flag(flags)

    # Dataset # ====================
    train_dataset = windowDataset(flags.data_path)
    test_dataset = testDataset()

    # DataLoader # ====================
    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=flags.batch_size,
        shuffle=True
    )

    # Net # ====================
    # Dropout
    # BatchNorm
    # ==========================
    net = nn.Sequential(
        nn.Flatten(),
        nn.Linear(14 * 49, flags.model.hid_dim),
        # nn.BatchNorm1d(flags.model.hid_dim),
        nn.LeakyReLU(),
        nn.Dropout(flags.dropout),
    )

    for _ in range(flags.model.nlayers - 2):
        net.append(nn.Linear(flags.model.hid_dim, flags.model.hid_dim))
        # net.append(nn.BatchNorm1d(flags.model.hid_dim))
        net.append(nn.LeakyReLU())
        net.append(nn.Dropout(flags.dropout))
    
    net.append(nn.Linear(flags.model.hid_dim, 28))
    net.append(nn.ReLU())

    # Weight Initialization # ====================
    # Xavier
    # Kaiming
    # ============================================
    for name, param in net.named_parameters():
        if name.split('.')[-1] == 'bias':
            continue
        print(f'Init {name}')
        nn.init.xavier_uniform_(param)
        # nn.init.kaiming_uniform_(param, nonlinearity='leaky_relu')

    # Criterion # ====================
    # 다양한 로스 시도해보기
    # ================================
    criterion = nn.L1Loss()

    # Optimizer # ====================
    # 다양한 옵티마이저 시도해보기
    # ================================
    optimizer = optim.Adam(
        params=net.parameters(), 
        lr=flags.lr,
        betas=(0.9, 0.999),
    )

    # scheduler = lr_scheduler.ExponentialLR(
    #     optimizer=optimizer,
    #     gamma=0.9
    # )

    # scheduler = lr_scheduler.StepLR(
    #     optimizer=optimizer,
    #     step_size=flags.lr_scheduler.step_size,
    #     gamma=flags.lr_scheduler.gamma
    # )

    def lr_schedule_fn(epoch):
        if epoch < 15:
            return 1.5 # 1e-4 * 1.5
        elif epoch >= 15 and epoch < 250:
            return 1.0
        elif epoch >= 250 and epoch < 350:
            return 1.5
        else:
            return 1.0


    # =====
    # a: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 399: 1.5 // 400 - 499: 1 (O)
    # b: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 349: 2 // 350 - 499: 1
    # c: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 349: 10 // 350 - 499: 1 (X)
    # d: 0 - 14: 1.5 // 15 - 249: 1 // 250 - 349: 3 // 350 - 499: 1
    
    scheduler = lr_scheduler.LambdaLR(
        optimizer=optimizer,
        lr_lambda=lr_schedule_fn
    )

    # scheduler = lr_scheduler.CosineAnnealingLR(
    #     optimizer=optimizer,
    #     T_max=100
    # )

    # Device
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    net = net.to(device)

    # Train
    for epoch in range(flags.epochs):
            
        epoch_train_loss = 0.0

        for inputs, labels in train_dataloader:
            inputs, labels = inputs.float().to(device), labels.to(device)

            with torch.set_grad_enabled(True):
                outputs = net(inputs)
                loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
            epoch_train_loss += loss.item()
        
        scheduler.step()
        
        wandb.log({
            'loss': epoch_train_loss / len(train_dataloader),
            'lr': optimizer.param_groups[0]['lr']
        })
        if (epoch + 1) % 10 == 0:
            print(f'[epoch : {epoch + 1} / {flags.epochs}] Train Loss : {epoch_train_loss / len(train_dataloader)}')

for i in range(1):
    flags = {
        'epochs': 500,
        'lr': 1e-4,
        'batch_size': 64,
        'data_num': 0,
        'model': {
            'nlayers': 10,
            'hid_dim': 1024,
            'weight_init': 'xavier_uniform'
        },
        'optim': 'Adam',
        'criterion': 'L1Loss',
        'dropout': 0.5,
        'lr_scheduler': {
            'method': 'Lambda'
        }
    }

    train(flags, i)

In [13]:
def train_and_test(flags):
    name = 'Test_001'

    wandb.init(
        project="Nong_Linear",
        entity='deep-overflow',
        config=flags,
        name=name
    )

    flags = Flag(flags)

    for item_idx in range(37): # 37로 수정하기
        
        data_path = f'./train/train_{item_idx}.csv'
        train_dataset = windowDataset(data_path)

        train_dataloader = DataLoader(
            dataset=train_dataset,
            batch_size=flags.batch_size,
            shuffle=True,
        )

        net = nn.Sequential(
            nn.Flatten(),
            nn.Linear(14 * 49, flags.model.hid_dim),
            # nn.BatchNorm1d(flags.model.hid_dim),
            nn.LeakyReLU(),
            nn.Dropout(flags.dropout),
        )

        for _ in range(flags.model.nlayers - 2):
            net.append(nn.Linear(flags.model.hid_dim, flags.model.hid_dim))
            # net.append(nn.BatchNorm1d(flags.model.hid_dim))
            net.append(nn.LeakyReLU())
            net.append(nn.Dropout(flags.dropout))
        
        net.append(nn.Linear(flags.model.hid_dim, 28))
        net.append(nn.ReLU())

        for name, param in net.named_parameters():
            if name.split('.')[-1] == 'bias':
                continue
            nn.init.xavier_uniform_(param)
            # nn.init.kaiming_uniform_(param, nonlinearity='leaky_relu')

        criterion = nn.L1Loss()

        optimizer = optim.Adam(
            params=net.parameters(), 
            lr=flags.lr,
            betas=(0.9, 0.999),
        )

        def lr_schedule_fn(epoch):
            if epoch < 15:
                return 1.5 # 1e-4 * 1.5
            elif epoch >= 15 and epoch < 250:
                return 1.0
            elif epoch >= 250 and epoch < 350:
                return 1.5
            else:
                return 1.0

        scheduler = lr_scheduler.LambdaLR(
            optimizer=optimizer,
            lr_lambda=lr_schedule_fn
        )

        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        net = net.to(device)

        #
        start_time = time.time()
        
        for epoch in range(flags.epochs):
            epoch_train_loss = 0.0

            for inputs, labels in train_dataloader:
                inputs, labels = inputs.float().to(device), labels.to(device)

                with torch.set_grad_enabled(True):
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            
                epoch_train_loss += loss.item()
            
            scheduler.step()
            
            wandb.log({
                'loss': epoch_train_loss / len(train_dataloader),
                'lr': optimizer.param_groups[0]['lr']
            })

            if (epoch + 1) % 10 == 0:
                print(f'[epoch : {epoch + 1} / {flags.epochs}] Train Loss : {epoch_train_loss / len(train_dataloader)}')
        
        torch.save(net.state_dict(), f'weights_{item_idx}.pth')

        end_time = time.time()

        print(f'Train Time: {end_time - start_time}')

        # Test
        result_np = np.zeros((1, 28), dtype=np.float32)
        for set_num in range(10): # 10으로 수정하기
            data_path = f'./test/set_{set_num}/test_{item_idx}.csv'
            test_dataset = testDataset(data_path)

            inputs = test_dataset[0].reshape(1, 14, 49).to(device)

            with torch.no_grad():
                outputs = net(inputs)

            output_np = outputs.cpu().detach().numpy()

            result_np = np.concatenate([result_np, output_np], axis=0)

            save_df = pd.DataFrame(result_np).T
            save_df.to_csv(f'./set_{set_num}/predict_{item_idx}.csv', index=False)
            print(f'Save Result set: {set_num}, item: {item_idx}')

flags = {
    'epochs': 200,
    'lr': 1e-4,
    'batch_size': 64,
    'data_num': 0,
    'model': {
        'nlayers': 10,
        'hid_dim': 1024,
        'weight_init': 'xavier_uniform'
    },
    'optim': 'Adam',
    'criterion': 'L1Loss',
    'dropout': 0.5,
    'lr_scheduler': {
        'method': 'Lambda'
    }
}

train_and_test(flags)

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdeep-overflow[0m. Use [1m`wandb login --relogin`[0m to force relogin


[epoch : 10 / 200] Train Loss : 1241.9858716881793
[epoch : 20 / 200] Train Loss : 1211.7211064877717
[epoch : 30 / 200] Train Loss : 1200.4775974439538
[epoch : 40 / 200] Train Loss : 1179.3368981402853
[epoch : 50 / 200] Train Loss : 1174.688333262568
[epoch : 60 / 200] Train Loss : 1168.7629076086957
[epoch : 70 / 200] Train Loss : 1166.357278575068
[epoch : 80 / 200] Train Loss : 1150.6319527004075
[epoch : 90 / 200] Train Loss : 1161.1629585597825
[epoch : 100 / 200] Train Loss : 1151.306736158288
[epoch : 110 / 200] Train Loss : 1150.4428976307745
[epoch : 120 / 200] Train Loss : 1146.6574813179348
[epoch : 130 / 200] Train Loss : 1141.5489183508832
[epoch : 140 / 200] Train Loss : 1136.6665782099185
[epoch : 150 / 200] Train Loss : 1144.7351923403533
[epoch : 160 / 200] Train Loss : 1138.4159466287365
[epoch : 170 / 200] Train Loss : 1138.2648023522418
[epoch : 180 / 200] Train Loss : 1137.1538988196332
[epoch : 190 / 200] Train Loss : 1124.4561608355978
[epoch : 200 / 200] Trai

Test Output

In [14]:
for k in tqdm(range(10)):

  globals()[f'set_df_{k}'] = pd.DataFrame()
  answer_df_list = glob(f'./set_{k}/*.csv') # 예측한 결과 불러오기
  pum_list = glob(f'./aT_test_raw/sep_{k}/*.csv') # 기존 test input 불러오기
  pummok = [a for a in pum_list if 'pummok' in a.split('/')[-1]]

  for i in answer_df_list:
    df = pd.read_csv(i)
    number = i.split('_')[-1].split('.')[0]

    base_number = 0
    for p in pummok:
      if number == p.split('_')[-1].split('.')[0]:
        pum_df = pd.read_csv(p)

        if len(pum_df) != 0:
           base_number = pum_df.iloc[len(pum_df)-1]['해당일자_전체평균가격(원)']  # 기존 각 sep 마다 test input의 마지막 target 값 가져오기 (변동률 계산을 위해)
        else:
          base_number = np.nan

    globals()[f'set_df_{k}'][f'품목{number}']  = [base_number] + list(df[df.columns[-1]].values) # 각 품목당 순서를 t, t+1 ... t+28 로 변경

  globals()[f'set_df_{k}'] = globals()[f'set_df_{k}'][[f'품목{col}' for col in range(37)]] # 열 순서를 품목0 ~ 품목36 으로 변경

100%|██████████| 10/10 [00:42<00:00,  4.22s/it]


In [15]:
set_df_0

Unnamed: 0,품목0,품목1,품목2,품목3,품목4,품목5,품목6,품목7,품목8,품목9,...,품목27,품목28,품목29,품목30,품목31,품목32,품목33,품목34,품목35,품목36
0,3871.125,1362.117613,2909.783785,3400.075583,3947.809169,9253.947514,2717.28,3361.030923,4911.899864,1173.018633,...,8640.811309,602.005658,1105.412623,1566.274239,3633.464557,5454.710444,5619.188362,5230.620027,2905.100888,2087.675036
1,2533.7441,2062.2368,2267.7173,1972.1344,1802.6433,3755.937,4498.77,1931.1205,4251.7407,1396.9463,...,6249.0977,1107.8527,1113.2965,859.7086,5976.3193,2835.676,3069.1453,3845.4934,2003.704,2405.182
2,2542.6628,2138.3774,2117.5432,1771.5428,2023.6758,3726.412,5145.9062,1962.6526,3732.8386,1434.5187,...,6072.5425,1110.1315,1038.4185,848.37103,6355.076,3044.8516,2789.64,4049.5881,2012.2743,2336.066
3,2519.9717,2002.0144,2195.209,1974.0044,2016.0502,3596.0034,5042.349,1877.8102,4187.588,1497.6428,...,6035.966,1107.7032,1064.3381,907.72327,5810.2417,2898.436,2797.16,4075.8433,2036.6324,2271.5088
4,2500.0532,2112.5527,2271.9988,1954.2081,1959.9198,3685.1228,5428.897,1995.6625,3915.8433,1486.9482,...,5891.401,1065.2474,1042.4783,920.60754,6165.758,2903.3774,2986.417,3839.3306,1976.3535,2344.6892
5,2566.9224,2182.5576,2182.943,1925.0095,1849.0072,3693.5398,4877.8354,1843.0558,4126.392,1494.0939,...,5775.318,1065.5354,1012.2719,853.1615,6058.119,2965.8276,2885.3132,4086.4397,1899.5911,2264.1519
6,2679.724,2099.6985,2127.1372,2057.4612,2018.698,3947.1604,4724.384,2025.3234,3858.4292,1451.6201,...,6013.942,1105.0894,1043.656,925.7166,5900.5127,2965.4443,2993.7678,3916.82,2045.004,2406.7673
7,2483.4197,2175.9,2060.6348,1894.8894,1947.5812,3908.2178,4863.4917,1994.662,4122.034,1401.6484,...,5925.289,1091.1233,1042.5397,919.82825,6455.265,2859.6118,2799.4602,4047.2192,2059.7434,2353.4263
8,2704.093,2079.9685,2160.8381,1938.2329,1900.1094,3699.3953,4840.43,2016.9584,3846.4043,1469.2426,...,5867.8604,1092.424,1058.1378,844.9608,6483.0645,3133.4512,2849.1294,4029.5203,2112.4727,2427.927
9,2649.7031,2218.6934,2025.2893,1964.7078,1833.2101,3767.8948,5226.598,1927.2921,4043.2769,1384.4662,...,5927.474,1138.4799,1062.1014,884.1202,6437.127,3011.6838,2899.2102,4085.4475,2228.401,2269.5806


In [16]:
date = [f'd+{i}' for i in range(1,15)] + ['d+22 ~ 28 평균']


for k in range(10):
  globals()[f'answer_df_{k}'] = pd.DataFrame()
  for c in globals()[f'set_df_{k}'].columns:
    base_d = globals()[f'set_df_{k}'][c][0] # 변동률 기준 t 값

    ans_1_14 = []
    for i in range(14):
      ans_1_14.append((globals()[f'set_df_{k}'][c].iloc[i+1]- base_d)/base_d)  # t+1 ~ t+14 까지는 (t+n - t)/t 로 계산

    ans_22_28 = (globals()[f'set_df_{k}'][c][22:29].mean() - base_d)/base_d # t+22 ~ t+28은 np.mean(t+22 ~ t+28) - t / t

    globals()[f'answer_df_{k}'][f'{c} 변동률'] = ans_1_14 + [ans_22_28]
  
  globals()[f'answer_df_{k}']['Set'] = k # set 번호 설정
  globals()[f'answer_df_{k}']['일자'] = date # 일자 설정

In [17]:
# 위에서 계산된 변동률 들을 합쳐주는 과정

all_df =pd.DataFrame()
for i in range(10):
  if i== 0 :
    all_df = pd.concat([all_df, globals()[f'answer_df_{i}']],axis=1)
  else:
    all_df = pd.concat([all_df, globals()[f'answer_df_{i}']])


all_df = all_df[['Set','일자'] + list(all_df.columns[:-2])]
all_df.reset_index(drop=True, inplace=True)

In [19]:
# set, 일자 기억하기위해 따로 저장

re_set = list(all_df['Set'])
re_date = list(all_df['일자'])


# 정답 양식 불러오기
out_ans = pd.read_csv('./answer_example.csv')

# 두 dataframe 합치기 (nan + 숫자 = nan 이용)
submit_df = all_df + out_ans

submit_df['Set'] = re_set
submit_df['일자'] = re_date


# 최종 저장
submit_df.to_csv('./submit.csv',index=False)