### Init

In [1]:
%ls

[0m[01;34mdrive[0m/  [01;34msample_data[0m/


In [2]:
%cd drive/MyDrive/data/aT/scaled_data/

%ls

/content/drive/MyDrive/data/aT/scaled_data
answer_example.csv  [0m[01;34mLinear_weight[0m/  submit.csv  [01;34mtrain[0m/
[01;34maT_test_raw[0m/        [01;34mprediction[0m/     [01;34mtest[0m/


In [38]:
import xgboost as xgb

from torch.utils.data import Dataset
import time
import pandas as pd
import numpy as np
from glob import glob
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
from sklearn.multioutput import MultiOutputRegressor
from tqdm import tqdm
import random
import os

# 시드고정
random.seed(42)
np.random.seed(42)

# 학습에 사용할 데이터 파일 (csv)
data_list = glob('.//train/*.csv')

tr_del_list = ['단가(원)', '거래량', '거래대금(원)', '경매건수', '도매시장코드', '도매법인코드', '산지코드 '] # train 에서 사용하지 않는 열
ts_del_list = ['단가(원)', '거래량', '거래대금(원)', '경매건수', '도매시장코드', '도매법인코드', '산지코드 ', '해당일자_전체평균가격(원)'] # test 에서 사용하지 않는 열
check_col = ['일자구분_중순', '일자구분_초순', '일자구분_하순','월구분_10월', '월구분_11월', '월구분_12월', '월구분_1월', '월구분_2월', '월구분_3월', 
             '월구분_4월','월구분_5월', '월구분_6월', '월구분_7월', '월구분_8월', '월구분_9월'] # 열 개수 맞추기

def time_window(df, t, t_sep):
    seq_len = t
    seqence_length = seq_len + t_sep

    result = []
    for index in range(len(df) - seqence_length):
        tmp = df[index: index + seqence_length].values
        tmp = np.vstack(tmp).astype(np.float32)
        result.append(tmp)

    return np.array(result)

def right_value(a, i):
    for j in range(i + 1, len(a)):
        if a[j] != 0:
            return a[j]
    return None

def left_value(a ,i):
    for j in range(0, i):
        if a[i - j - 1] != 0:
            return a[i - j - 1]
    return None

def nearest_value(a, i):
    size = len(a)
    rvalue = right_value(a, i)
    lvalue = left_value(a, i)

    if rvalue == None:
        if lvalue == None:
            return None
        else:
            return lvalue
    else:
        if lvalue == None:
            return rvalue
        else:
            return rvalue

### Data class

In [37]:
data = Data()

y_t: 1419
y_np: 1419


In [36]:
import torch

class Data:
    def __init__(self, data_path='./train/', item_idx=0, split_ratio=0.2):
        # 파일 경로 ex) item_idx = 0인 경우, file_path='./train/train_0.csv'이다.
        file_path = os.path.join(data_path, f'train_{item_idx}.csv')

        # csv 파일 읽기
        data_pd = pd.read_csv(file_path)

        # 데이터 전처리
        for column in data_pd.columns:
            data_pd[column] = data_pd[column].replace({' ': np.nan})
        
        data_pd.drop(tr_del_list, axis=1, inplace=True)
        data_pd.set_index('datadate', drop=True, inplace=True)

        data_pd = data_pd.fillna(0)

        x_pd, y_pd = data_pd[[i for i in data_pd.columns if i != '해당일자_전체평균가격(원)']], data_pd['해당일자_전체평균가격(원)']

        y_t = np.array(y_pd)
        y_t = y_t[13:-29]
        print(f'y_t: {len(y_t)}')

        y_pd = y_pd[14:]

        x_np = time_window(x_pd, 13, 1)
        y_np = time_window(y_pd, 27, 1)

        x_np = x_np[:len(y_np)]
        y_np = y_np.reshape(-1, 28)
        print(f'y_np: {len(y_np)}')

        # 변동률 예측
        for i in range(len(y_np)):
            for j in range(28):
                if y_t[i] == 0:
                    value_t = nearest_value(y_t, i)
                else:
                    value_t = y_t[i]

                y_np[i][j] = y_np[i][j] - value_t
                y_np[i][j] = y_np[i][j] / value_t

        self.data = {
            'inputs': x_np,
            'labels': y_np,
        }

        self.split_ratio = split_ratio

        if self.split_ratio > 0:
            self.train_val_split()
    
    def train_val_split(self):
        x_train, x_val, y_train, y_val = train_test_split(
            self.data['inputs'],
            self.data['labels'],
            test_size=self.split_ratio,
            random_state=42
        )

        self.train_data = (x_train, y_train)
        self.val_data = (x_val, y_val)

    def get_train(self, use_tensor=False):
        if use_tensor:
            return torch.tensor(self.train_data[0], dtype=torch.float32), torch.tensor(self.train_data[1], dtype=torch.float32)
        else:
            return self.train_data
    
    def get_val(self, use_tensor=False):
        if use_tensor:
            return torch.tensor(self.val_data[0], dtype=torch.float32), torch.tensor(self.val_data[1], dtype=torch.float32)
        else:
            return self.val_data

### testData class

In [15]:
class testData:
    def __init__(self, file_path):
        zero_csv = [0 for i in range(14)]
        data = pd.read_csv(file_path)

        if len(data) == 0:
            print('no data in Dataset!!')
            print(data)
            data['zero_non'] = zero_csv
            print(data)
            data = data.fillna(0)
            print(data)
            data.drop('zero_non', axis=1, inplace=True)
            data.drop('Unnamed: 0', axis=1, inplace=True)
            print(data)

        # 사용할 열 선택, index 설정
        data.drop(ts_del_list, axis=1, inplace=True)
        data.set_index('datadate', drop=True, inplace=True)

        # train input 과 형상 맞추기
        add_col = [i for i in check_col if i not in data.columns]

        for a in add_col:
            data[a] = 0

        # ' ' -> nan 으로 변경
        for a in data.columns:
            data[a] = data[a].replace({' ': np.nan})

        # nan 처리
        data = data.fillna(0)

        # x_test  생성
        self.data = data.to_numpy()

### Train and Test

In [42]:
for item_idx in range(37):
    data = Data(item_idx=item_idx)
    x_train, y_train = data.get_train()
    x_val, y_val = data.get_val()

    x_train = x_train.reshape(-1, 14 * 49)
    x_val = x_val.reshape(-1, 14 * 49)

    estimator = xgb.XGBRegressor(
        max_depth=3,
        learning_rate=0.1,
        n_estimators=100,
        objective='reg:squarederror',
        booster='dart',
        gamma=0,
        min_child_weight=1,
        max_delta_step=0,
        subsample=1,
        colsample_bytree=1,
        colsample_bylevel=1,
        colsample_bynode=1,
        reg_alpha=0,
        reg_lambda=1,
        scale_pos_weight=1,
        base_score=0.5,
        random_state=0,
        seed=42,
        importance_type='gain',
        missing=0,
        sample_type='uniform',
        normalize_type='tree',
        rate_drop=0.1,
        skip_drop=0.5,
    )

    model = MultiOutputRegressor(
        estimator=estimator,
        n_jobs=-1
    ).fit(x_train, y_train, eval_set=[(x_val, y_val)], verbose=True)

    # 테스트
    for set_num in range(10):
        x_test = testData(f'./test/set_{set_num}/test_{item_idx}.csv')
        xdata = x_test.data.reshape(1, -1)
        pred = model.predict(xdata)
        save_df = pd.DataFrame(pred).T
        save_df.to_csv(f'./prediction/set_{set_num}/predict_{item_idx}.csv', index=False)
        print(f'Save Result set: {set_num}, item: {item_idx}')

y_t: 1419
y_np: 1419
Save Result set: 0, item: 0
Save Result set: 1, item: 0
Save Result set: 2, item: 0
Save Result set: 3, item: 0
Save Result set: 4, item: 0
Save Result set: 5, item: 0
Save Result set: 6, item: 0
Save Result set: 7, item: 0
Save Result set: 8, item: 0
Save Result set: 9, item: 0
y_t: 1419
y_np: 1419
Save Result set: 0, item: 1
Save Result set: 1, item: 1
Save Result set: 2, item: 1
Save Result set: 3, item: 1
Save Result set: 4, item: 1
Save Result set: 5, item: 1
Save Result set: 6, item: 1
Save Result set: 7, item: 1
Save Result set: 8, item: 1
Save Result set: 9, item: 1
y_t: 1419
y_np: 1419
Save Result set: 0, item: 2
Save Result set: 1, item: 2
Save Result set: 2, item: 2
Save Result set: 3, item: 2
Save Result set: 4, item: 2
Save Result set: 5, item: 2
Save Result set: 6, item: 2
Save Result set: 7, item: 2
Save Result set: 8, item: 2
Save Result set: 9, item: 2
y_t: 1419
y_np: 1419
Save Result set: 0, item: 3
Save Result set: 1, item: 3
Save Result set: 2, 

### Save Test Result

In [43]:
%ls

answer_example.csv  [0m[01;34mLinear_weight[0m/  submit.csv  [01;34mtrain[0m/
[01;34maT_test_raw[0m/        [01;34mprediction[0m/     [01;34mtest[0m/


In [45]:
for set_idx in tqdm(range(10)):
    globals()[f'set_df_{set_idx}'] = pd.DataFrame()
    # 예측한 결과 파일 리스트
    ans_df_list = glob(f'./prediction/set_{set_idx}/*.csv')
    pum_list = glob(f'./aT_test_raw/sep_{set_idx}/*.csv')
    pummok = [pum for pum in pum_list if 'pummok' in pum.split('/')[-1]]
    print(f'ans_df_list: {len(ans_df_list)}')
    for answer_file_path in ans_df_list:
        # set_idx, item_idx에 대응하는 예측 결과
        answer_df = pd.read_csv(answer_file_path)
        # item idx
        item_idx = answer_file_path.split('_')[-1].split('.')[0]
        
        globals()[f'set_df_{set_idx}'][f'품목{item_idx}'] = list(answer_df[answer_df.columns[-1]].values)
    globals()[f'set_df_{set_idx}'] = globals()[f'set_df_{set_idx}'][[f'품목{col}' for col in range(37)]] # 열 순서를 품목0 ~ 품목36 으로 변경

date = [f'd+{i}' for i in range(1, 15)] + ['d+22 ~ 28 평균']

for set_idx in range(10):
    globals()[f'answer_df_{set_idx}'] = pd.DataFrame()

    for c in globals()[f'set_df_{set_idx}'].columns:

        ans_1_14 = []
        for i in range(14):
            ans_1_14.append(globals()[f'set_df_{set_idx}'][c].iloc[i])
        
        ans_22_28 = globals()[f'set_df_{set_idx}'][c][21:28].mean()
        globals()[f'answer_df_{set_idx}'][f'{c} 변동률'] = ans_1_14 + [ans_22_28]

    globals()[f'answer_df_{set_idx}']['Set'] = set_idx
    globals()[f'answer_df_{set_idx}']['일자'] = date

all_df = pd.DataFrame()
for set_idx in range(10):
    if set_idx == 0:
        all_df = pd.concat([all_df, globals()[f'answer_df_{set_idx}']], axis=1)
    else:
        all_df = pd.concat([all_df, globals()[f'answer_df_{set_idx}']])
    
all_df = all_df[['Set', '일자'] + list(all_df.columns[:-2])]
all_df.reset_index(drop=True, inplace=True)

# set, 일자 기억하기위해 따로 저장

re_set = list(all_df['Set'])
re_date = list(all_df['일자'])


# 정답 양식 불러오기
out_ans = pd.read_csv('./answer_example.csv')

# 두 dataframe 합치기 (nan + 숫자 = nan 이용)
submit_df = all_df + out_ans

submit_df['Set'] = re_set
submit_df['일자'] = re_date


# 최종 저장
submit_df.to_csv('./submit_001.csv',index=False)

 10%|█         | 1/10 [00:00<00:01,  6.09it/s]

ans_df_list: 37
ans_df_list: 37


 30%|███       | 3/10 [00:00<00:01,  6.12it/s]

ans_df_list: 37
ans_df_list: 37


 50%|█████     | 5/10 [00:00<00:00,  6.20it/s]

ans_df_list: 37
ans_df_list: 37


 70%|███████   | 7/10 [00:01<00:00,  6.48it/s]

ans_df_list: 37
ans_df_list: 37


 90%|█████████ | 9/10 [00:01<00:00,  6.59it/s]

ans_df_list: 37
ans_df_list: 37


100%|██████████| 10/10 [00:01<00:00,  6.38it/s]
