In [None]:
# Mount Google Drive at /content/drive; the data, checkpoint and model paths
# used later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: ignored

In [None]:
!pip install pandasql

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandasql
  Downloading pandasql-0.7.3.tar.gz (26 kB)
Building wheels for collected packages: pandasql
  Building wheel for pandasql (setup.py) ... [?25l[?25hdone
  Created wheel for pandasql: filename=pandasql-0.7.3-py3-none-any.whl size=26784 sha256=271efaaa99d52f7622b3a2bd01f69e09761f42d1d0b78759c2b9c0b3abb73542
  Stored in directory: /root/.cache/pip/wheels/5c/4b/ec/41f4e116c8053c3654e2c2a47c62b4fca34cc67ef7b55deb7f
Successfully built pandasql
Installing collected packages: pandasql
Successfully installed pandasql-0.7.3


In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
from glob import glob
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from pandasql import sqldf
import os

import matplotlib.pyplot as plt


# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings(action='ignore')

## 입력 shape 및 형태 정의 함수

In [None]:
def make_Tensor(array):
    """Return ``array`` converted to a TensorFlow tensor of dtype ``tf.float32``."""
    tensor = tf.convert_to_tensor(value=array, dtype=tf.float32)
    return tensor

def astype_data(data):
    """Cast ``data`` to ``np.float32`` and hand it to ``make_Tensor``.

    ``data`` must provide an ``astype`` method (the callers in this notebook
    pass NumPy arrays).
    """
    return make_Tensor(data.astype(np.float32))

## Transformer 정의

- encoder

In [None]:
def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0):
    """Apply one Transformer encoder block to ``inputs`` and return the result.

    The block consists of two residual sub-blocks:
    layer-norm -> multi-head self-attention -> dropout -> add input, followed by
    layer-norm -> 1x1 Conv1D (ReLU) -> dropout -> 1x1 Conv1D -> add.

    Args:
        inputs: Keras tensor fed to the block.
        head_size: ``key_dim`` of the attention layer.
        num_heads: Number of attention heads.
        ff_dim: Number of filters in the first (hidden) 1x1 convolution.
        dropout: Dropout rate used inside attention and after each sub-layer.

    Returns:
        A Keras tensor whose last dimension equals ``inputs.shape[-1]``.
    """
    # Self-attention sub-block: query and value are the same normalised tensor.
    normed_inputs = layers.LayerNormalization(epsilon=1e-6)(inputs)
    self_attention = layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=head_size, dropout=dropout
    )
    attended = self_attention(normed_inputs, normed_inputs)
    attended = layers.Dropout(dropout)(attended)
    attention_out = attended + inputs

    # Feed-forward sub-block: expand to ff_dim, then project back to the
    # input's feature width so the residual addition is shape-compatible.
    feature_dim = inputs.shape[-1]
    hidden = layers.LayerNormalization(epsilon=1e-6)(attention_out)
    hidden = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(hidden)
    hidden = layers.Dropout(dropout)(hidden)
    projected = layers.Conv1D(filters=feature_dim, kernel_size=1)(hidden)
    return projected + attention_out

- build

In [None]:
def build_model(input_shape, head_size, num_heads, ff_dim, num_transformer_blocks, mlp_units,
                dropout=0, mlp_dropout=0, output_dim=28):
    """Assemble the Transformer regression model.

    Args:
        input_shape: Shape of one sample (without the batch dimension).
        head_size: ``key_dim`` for every attention layer.
        num_heads: Number of attention heads per encoder block.
        ff_dim: Hidden filter count of each encoder block's feed-forward part.
        num_transformer_blocks: How many encoder blocks to stack.
        mlp_units: Iterable of hidden sizes for the dense head.
        dropout: Dropout rate inside the encoder blocks.
        mlp_dropout: Dropout rate after each dense layer of the head.
        output_dim: Number of values predicted per sample. Defaults to 28,
            i.e. a 4-week daily forecast.

    Returns:
        An uncompiled ``keras.Model``.
    """
    inputs = keras.Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout)

    # NOTE(review): "channels_first" makes the pooling average over the LAST
    # axis of x, so the pooled vector has one entry per position of axis 1.
    # Confirm this is the intended reduction axis; changing it would alter the
    # dense head's input width and invalidate existing checkpoints.
    x = layers.GlobalAveragePooling1D(data_format="channels_first")(x)
    for dim in mlp_units:
        x = layers.Dense(dim, activation="relu")(x)
        x = layers.Dropout(mlp_dropout)(x)
    outputs = layers.Dense(output_dim)(x)
    return keras.Model(inputs, outputs)

## keras early stopping, checkpoint 정의

In [None]:
def call_back_set(name, epoch, batch_size,
                  checkpoint_dir='/content/drive/MyDrive/농산물예측/aT_data/check2'):
    """Create the early-stopping and best-weights checkpoint callbacks.

    Args:
        name: Prefix of the checkpoint file name.
        epoch: Epoch count; only used as part of the checkpoint file name.
        batch_size: Batch size; only used as part of the checkpoint file name.
        checkpoint_dir: Directory that receives the ``.h5`` weight file. It is
            created (including missing parents) if it does not exist.

    Returns:
        ``[EarlyStopping, ModelCheckpoint]``, both monitoring ``val_loss``.
    """
    # makedirs(exist_ok=True) also creates missing parents and does not fail
    # when the directory already exists.
    os.makedirs(checkpoint_dir, exist_ok=True)
    filename = os.path.join(checkpoint_dir, f'{name}-{epoch}-{batch_size}.h5')

    # Stop once val_loss has not improved for 100 consecutive epochs.
    early_stopping = EarlyStopping(monitor='val_loss', patience=100)

    # Only the weights of the best-so-far epoch are written to `filename`.
    checkpoint = ModelCheckpoint(filename,
                                 monitor='val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=True,
                                 mode='auto'
                                 )
    return [early_stopping, checkpoint]

## Model 훈련 함수

In [None]:
def train(x_train, y_train, x_val, y_val, name, epoch, batch_size, learning_rate = 0.001, verbose = 1):
    """Build, compile and fit the Transformer model, then return it.

    Args:
        x_train, y_train: Training inputs and targets.
        x_val, y_val: Validation inputs and targets.
        name: Checkpoint file-name prefix, forwarded to ``call_back_set``.
        epoch: Maximum number of epochs.
        batch_size: Mini-batch size.
        learning_rate: Adam learning rate.
        verbose: Keras ``fit`` verbosity level.

    Returns:
        The fitted ``keras.Model`` as it stands when ``fit`` returns. The
        callbacks from ``call_back_set`` handle early stopping and
        checkpointing.
    """
    model = build_model(
        x_train.shape[1:],
        head_size=256,
        num_heads=4,
        ff_dim=4,
        num_transformer_blocks=4,
        mlp_units=[128],
        mlp_dropout=0.4,
        dropout=0.25,
    )

    model.compile(
        loss="mean_squared_error",
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate)
    )

    # Train the model.
    # steps_per_epoch / validation_steps are intentionally left unset: for
    # in-memory data Keras derives the number of batches from batch_size, and
    # both arguments are documented as integer-or-None (len / batch_size is a
    # float).
    with tf.device('/device:GPU:0'):
        model.fit(
            x_train, y_train,
            epochs=epoch,
            batch_size=batch_size,
            validation_data=(x_val, y_val),
            shuffle=False,  # keep the samples in their original order
            callbacks=call_back_set(name, epoch, batch_size),
            verbose=verbose)

    return model

## 시점 윈도우 생성 함수

In [None]:
def time_window(df, t, t_sep):
    """Return every contiguous window of ``t + t_sep`` rows of ``df``.

    Args:
        df: DataFrame, Series or array-like; windows slide along axis 0.
        t: Number of rows in the first part of each window.
        t_sep: Number of additional rows appended to each window.

    Returns:
        ``np.ndarray`` of shape ``(n_windows, t + t_sep, *row_shape)`` with
        ``n_windows = len(df) - (t + t_sep) + 1``. When the input is shorter
        than one window the result has ``n_windows == 0`` but keeps the
        trailing dimensions.
    """
    window_length = t + t_sep

    # Take the underlying array once: slicing NumPy is strictly positional and
    # cheap enough that no progress bar is needed.
    values = np.asarray(df)

    # "+ 1" so the window that ends on the final row is included.
    n_windows = len(values) - window_length + 1
    if n_windows <= 0:
        return np.empty((0, window_length) + values.shape[1:], dtype=values.dtype)

    return np.array([values[start:start + window_length] for start in range(n_windows)])

## 데이터 불러오기 및 parameter 설정

In [None]:

# Training CSVs, one per item. glob() returns files in arbitrary order, so the
# list is sorted to make the processing order reproducible between runs.
data_list = sorted(glob('/content/drive/MyDrive/농산물예측/aT_data/data/train/*.csv'))

# Training hyper-parameters (also embedded in checkpoint / model file names).
epoch = 1000
batch = 15

# Columns not used for training.
# NOTE(review): '산지코드 ' ends with a space — presumably matching the CSV header; verify.
tr_del_list = ['단가(원)', '거래량', '거래대금(원)', '경매건수', '도매시장코드', '도매법인코드', '산지코드 ']

# Columns not used at test time: the training drop list plus the target column.
ts_del_list = tr_del_list + ['해당일자_전체평균가격(원)']

# Columns that are added (as 0) to a test frame when missing, so that the test
# input has the same number of columns as the training input.
check_col = ['일자구분_중순', '일자구분_초순', '일자구분_하순','월구분_10월', '월구분_11월', '월구분_12월', '월구분_1월', '월구분_2월', '월구분_3월',
             '월구분_4월','월구분_5월', '월구분_6월', '월구분_7월', '월구분_8월', '월구분_9월']

In [None]:
# Show the training CSV paths collected by glob().
data_list

['/content/drive/MyDrive/농산물예측/aT_data/data/train/train_0.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_1.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_5.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_28.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_34.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_16.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_36.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_18.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_13.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_21.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_11.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_33.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_15.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_25.csv',
 '/content/drive/MyDrive/농산물예측/aT_data/data/train/train_7.csv',
 '/content/drive/MyDrive/농산물예

'27'

## Train 과정

In [None]:
weather = pd.read_csv('/content/drive/MyDrive/농산물예측/final_weather_with_pummok.csv')

# (suffix in the training CSV, suffix in the weather CSV) for every weather
# feature that is copied over, per producing region 0..2.
weather_col_pairs = [
    ('초기온도(℃)', '초기온도'),
    ('최대온도(℃)', '최대온도'),
    ('최저온도(℃)', '최저온도'),
    ('평균온도(℃)', '평균온도'),
    ('강수량(ml)', '강수량(ml)'),
]
humidity_cols = ['주산지_0_습도(%)', '주산지_1_습도(%)', '주산지_2_습도(%)']
target_col = '해당일자_전체평균가격(원)'

# Directory for the final saved models; created once, parents included.
os.makedirs('/content/drive/MyDrive/농산물예측/aT_data/model2', exist_ok=True)

for i in tqdm(data_list):
    # ".../train_<N>.csv" -> "<N>"
    df_number = i.split("_")[-1].split(".")[0]
    df = pd.read_csv(i)

    # Overwrite the weather columns with the values from the weather file.
    # reset_index is essential: the filtered slice keeps the row labels of the
    # full weather table, and column assignment aligns on the index, so without
    # it rows whose labels do not match df's 0..n-1 index would become NaN.
    # NOTE(review): this pairs rows by position — assumes the weather rows of an
    # item are in the same date order (and count) as its training CSV; confirm.
    weather_df = weather[weather['품목'] == int(df_number)].reset_index(drop=True)
    for region in range(3):
        for df_suffix, weather_suffix in weather_col_pairs:
            df[f'주산지_{region}_{df_suffix}'] = weather_df[f'주산지{region}_{weather_suffix}']

    # Humidity is not used as a feature.
    df = df.drop(columns=humidity_cols)

    # Blank cells (' ') mark missing values.
    df = df.replace(' ', np.nan)

    # Keep only the columns used for training and index the rows by date.
    df = df.drop(columns=tr_del_list).set_index('datadate')

    # Missing values are filled with 0.
    df = df.fillna(0)

    # Split features and target.
    x = df.drop(columns=target_col)
    y = df[target_col]

    # 14 input days are used to predict the following 28 days, so the target
    # series starts after the first 14 days.
    y = y.iloc[14:]

    # Sliding windows: 14-row inputs and 28-row targets.
    data_x = time_window(x, 13, 1)
    data_y = time_window(y, 27, 1)

    # There are fewer target windows than input windows; keep matching pairs.
    xdata = data_x[:len(data_y)]
    ydata = data_y

    # Chronological train / validation split (8 : 2).
    x_train, x_val, y_train, y_val = train_test_split(xdata, ydata, test_size=0.2, shuffle=False, random_state=119)

    # Train the Transformer, then restore the best (lowest val_loss) weights
    # written by the checkpoint callback before saving the full model.
    transformer = train(astype_data(x_train), y_train, astype_data(x_val), y_val, f'transformer-{df_number}', epoch,
                        batch)
    transformer.load_weights(f'/content/drive/MyDrive/농산물예측/aT_data/check2/transformer-{df_number}-{epoch}-{batch}.h5')

    # Save the model.
    transformer.save(f'/content/drive/MyDrive/농산물예측/aT_data/model2/transformer-{df_number}-{epoch}-{batch}.h5')


Epoch 55: val_loss did not improve from 1886137.50000
Epoch 56/1000
Epoch 56: val_loss did not improve from 1886137.50000
Epoch 57/1000
Epoch 57: val_loss did not improve from 1886137.50000
Epoch 58/1000
Epoch 58: val_loss did not improve from 1886137.50000
Epoch 59/1000

## Test 과정

In [None]:
zero_csv = [0] * 14  # 14 placeholder rows used when a test file has no data

# Training drops these columns, so the test input must drop them too or the
# feature count will not match the saved model's input shape.
humidity_cols = ['주산지_0_습도(%)', '주산지_1_습도(%)', '주산지_2_습도(%)']

# Directory the training cell saves its models to.
model_dir = '/content/drive/MyDrive/농산물예측/aT_data/model2'

for i in tqdm(range(10)):
    # A separate name keeps the training file list in `data_list` intact.
    test_files = glob(f'/content/drive/MyDrive/농산물예측/aT_data/data/test/set_{i}/*.csv')

    output_dir = f'./model_output/set_{i}'
    os.makedirs(output_dir, exist_ok=True)

    for j in test_files:
        df = pd.read_csv(j)

        # An empty file becomes 14 all-zero rows so it still forms one input window.
        if len(df) == 0:
            df['zero_non'] = zero_csv
            df = df.fillna(0)
            df.drop('zero_non', axis=1, inplace=True)

        # ".../test_<N>.csv" -> "<N>"
        file_number = j.split('test_')[1].split('.')[0]

        # Keep only the columns used as features and index the rows by date.
        df.drop(ts_del_list, axis=1, inplace=True)
        df = df.drop(columns=humidity_cols, errors='ignore')
        df.set_index('datadate', drop=True, inplace=True)
        # NOTE(review): training overwrites the weather columns with values from
        # final_weather_with_pummok.csv; no equivalent source for the test sets
        # is visible here, so the raw weather columns are used — confirm.

        # Match the training input: add any missing date-dummy columns as 0.
        # NOTE(review): added columns are appended at the end; verify the final
        # column order matches the order used in training.
        add_col = [col for col in check_col if col not in df.columns]
        for a in add_col:
            df[a] = 0

        # Blank cells (' ') mark missing values; fill them with 0.
        df = df.replace(' ', np.nan)
        df = df.fillna(0)

        # Build x_test with a leading batch dimension of 1.
        df_test = astype_data(df.values.reshape(1, df.values.shape[0], df.values.shape[1]))

        # Load the model trained for this item and predict.
        model_test = tf.keras.models.load_model(
            os.path.join(model_dir, f'transformer-{file_number}-{epoch}-{batch}.h5'))
        pred = model_test.predict(df_test)

        # Save the prediction as a single column.
        save_df = pd.DataFrame(pred).T
        save_df.to_csv(f'./model_output/set_{i}/predict_{file_number}.csv', index=False)

In [None]:
# Columns that remain after dropping the unused training columns.
# A raw training CSV is re-read here so the cell does not depend on whatever
# state `df` was left in by earlier cells.
raw_train_files = sorted(glob('/content/drive/MyDrive/농산물예측/aT_data/data/train/*.csv'))
df2 = pd.read_csv(raw_train_files[0]).drop(tr_del_list, axis=1)
df2.columns

Index(['datadate', '해당일자_전체평균가격(원)', '해당일자_전체거래물량(kg)', '하위가격 평균가(원)',
       '상위가격 평균가(원)', '하위가격 거래물량(kg)', '상위가격 거래물량(kg)', '일자별_도매가격_최대(원)',
       '일자별_도매가격_평균(원)', '일자별_도매가격_최소(원)', '일자별_소매가격_최대(원)', '일자별_소매가격_평균(원)',
       '일자별_소매가격_최소(원)', '수출중량(kg)', '수출금액(달러)', '수입중량(kg)', '수입금액(달러)',
       '무역수지(달러)', '주산지_0_초기온도(℃)', '주산지_0_최대온도(℃)', '주산지_0_최저온도(℃)',
       '주산지_0_평균온도(℃)', '주산지_0_강수량(ml)', '주산지_0_습도(%)', '주산지_1_초기온도(℃)',
       '주산지_1_최대온도(℃)', '주산지_1_최저온도(℃)', '주산지_1_평균온도(℃)', '주산지_1_강수량(ml)',
       '주산지_1_습도(%)', '주산지_2_초기온도(℃)', '주산지_2_최대온도(℃)', '주산지_2_최저온도(℃)',
       '주산지_2_평균온도(℃)', '주산지_2_강수량(ml)', '주산지_2_습도(%)', '일자구분_중순', '일자구분_초순',
       '일자구분_하순', '월구분_10월', '월구분_11월', '월구분_12월', '월구분_1월', '월구분_2월',
       '월구분_3월', '월구분_4월', '월구분_5월', '월구분_6월', '월구분_7월', '월구분_8월', '월구분_9월'],
      dtype='object')

In [None]:
# Missing-value handling: mean-impute the daily overall price and volume
# columns, then show how many nulls remain per column.
df2_imputed = df2.copy()
for col in ['해당일자_전체평균가격(원)', '해당일자_전체거래물량(kg)']:
    # Coerce non-numeric cells (e.g. ' ') to NaN so the mean uses numbers only,
    # and fill each column from its own mean.
    numeric_col = pd.to_numeric(df2_imputed[col], errors='coerce')
    df2_imputed[col] = numeric_col.fillna(numeric_col.mean())

df2_imputed.isna().sum()

5456.9896707867

In [None]:
# For every file in data_list, plot the distribution (left) and the
# row-by-row series (right) of the daily overall average price.
for csv_path in data_list:
    price = pd.read_csv(csv_path)['해당일자_전체평균가격(원)']

    fig, (ax_hist, ax_line) = plt.subplots(1, 2, figsize=(20, 5))
    price.hist(ax=ax_hist)
    ax_hist.set(title='Distribution', xlabel='Daily average price (KRW)', ylabel='Count')
    price.plot(kind='line', ax=ax_line)
    ax_line.set(title='Series', xlabel='Row', ylabel='Daily average price (KRW)')

    # The file name identifies which item the figure belongs to.
    fig.suptitle(os.path.basename(csv_path))
    fig.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; many are created in this loop

Output hidden; open in https://colab.research.google.com to view.