In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import random as python_random
from tqdm import tqdm

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Activation, LSTM, GRU, SimpleRNN
from keras import backend as K
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard, ReduceLROnPlateau

In [None]:
# Single seed shared by every random number generator seeded in this notebook.
SEED = 42

# NOTE: PYTHONHASHSEED is read when the interpreter starts, so setting it here
# does not change hashing in the already-running kernel; it is only inherited
# by child processes started afterwards.
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

tf.random.set_seed(SEED)
np.random.seed(SEED)
# Was a hard-coded 123; use SEED so one constant controls all generators.
python_random.seed(SEED)

In [None]:
# Folder the notebook changes into and reads the monthly uv_<YYMM>.csv files from.
Current_directory = './'
# Root folder for the per-fold model checkpoint sub-folders.
Model_directory = './model'
# Root folder for the per-fold TensorBoard log sub-folders.
Tensorboard_directory = './tensorboard'

# 2. 데이터 전처리

## 2.1. 데이터 병합

In [None]:
os.chdir(Current_directory)

# File suffixes in YYMM form: every month of 2020 and 2021, plus June 2022.
months = [f'{yy}{mm:02d}' for yy in (20, 21) for mm in range(1, 13)]
months.append('2206')
print(months)

df_all = []

for month in months:
  # Raw columns are named "<YYYYMM>_uv.<field>".
  prefix = f'20{month}_uv.'

  df_curr = pd.read_csv(Current_directory + 'uv_' + month + '.csv', encoding='euc-kr')
  df_curr = df_curr.drop(columns=['Unnamed: 0'])

  # Combine the yyyymmdd and hhnn columns into one timestamp. zfill(4) restores
  # the leading zeros of hhnn (e.g. "30" -> "0030") in a single vectorized call
  # instead of a per-row chained assignment.
  date_str = pd.to_datetime(df_curr[prefix + 'yyyymmdd'], format='%Y%m%d').astype(str)
  time_str = df_curr[prefix + 'hhnn'].astype(str).str.zfill(4)
  df_curr['date_time'] = pd.to_datetime(date_str + '-' + time_str, format='%Y-%m-%d-%H%M')

  # Keep only the part of each column name after the last dot.
  df_curr.columns = [name.rsplit('.')[-1] for name in df_curr.columns]
  df_curr = df_curr.drop(columns=['yyyymmdd', 'hhnn'])

  # sort_index() returns a new frame; the result has to be kept for the sort
  # to take effect (previously the sorted frame was discarded).
  df_curr = df_curr.set_index(['stn', 'date_time']).sort_index()
  df_all.append(df_curr)

# One concat over all monthly frames (DataFrame.append no longer exists in pandas 2.x).
df_tot = pd.concat(df_all)

## 2.2. 이상치 처리(KNN Imputer)

In [None]:
# -999.0 is treated as a missing-value marker: replace it with NaN so it can be imputed.
# replace()/reset_index() return new frames, so df_tot is left untouched and this
# cell can be re-run without failing.
df = df_tot.replace(-999.0, np.nan).reset_index()

df_train = df.loc[(df["date_time"] >= '2020-01-01') & (df["date_time"] < '2022-01-01')]
df_test = df.loc[(df["date_time"] >= '2022-06-01') & (df["date_time"] < '2022-07-01')]
# uv is removed from the test rows before imputation; it comes back as NaN
# when train and test are concatenated below.
df_test = df_test.drop(columns=['uv'])


def _impute_by_station(frame, n_neighbors=4):
    """Fill NaNs with a KNNImputer fitted separately on each station's rows.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``stn`` column and a ``date_time`` column; every other
        column is passed to the imputer.
    n_neighbors : int, default 4
        Forwarded to ``KNNImputer``.

    Returns
    -------
    pandas.DataFrame
        Same columns (and column order) as ``frame``, stations stacked in order
        of first appearance, with a fresh 0..n-1 index.
    """
    pieces = []
    for stn in tqdm(frame["stn"].unique()):
        rows = frame.loc[frame["stn"] == stn]
        # date_time is not numeric, so it is set aside while imputing.
        features = rows.drop(columns=["date_time"])
        imputer = KNNImputer(n_neighbors=n_neighbors)
        filled = pd.DataFrame(imputer.fit_transform(features), columns=features.columns)
        filled["date_time"] = rows["date_time"].to_numpy()
        pieces.append(filled)

    if not pieces:
        return pd.DataFrame(columns=frame.columns)
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(pieces, ignore_index=True)[list(frame.columns)]


# Per-station KNN imputation of the training rows.
df_imputed = _impute_by_station(df_train)

# Per-station KNN imputation of the test rows.
df_test_imputed = _impute_by_station(df_test)

# Stack the imputed train and test rows and pin the column dtypes.
all_impute = pd.concat([df_imputed, df_test_imputed], axis=0)
all_impute = all_impute.astype({'stn':'int64', 'date_time':'datetime64[ns]', 'lon':'float64', 'lat':'float64',
                                'uv':'float64', 'band1':'float64', 'band2':'float64', 'band3':'float64', 
                                'band4':'float64', 'band5':'float64', 'band6':'float64', 'band7':'float64',
                                'band8':'float64', 'band9':'float64', 'band10':'float64', 'band11':'float64',
                                'band12':'float64', 'band13':'float64', 'band14':'float64', 'band15':'float64',
                                'band16':'float64', 'solarza':'float64', 'sateza':'float64', 'esr':'float64',
                                'height':'float64', 'landtype':'int64'})

## 2.3. 변수 변환

In [None]:
# Continue from the imputed train+test frame (this binds the same object, not a copy).
df = all_impute
def encode(data, col, max_val):
    """Append cyclical sine/cosine encodings of ``data[col]`` to ``data``.

    Two columns, ``<col>_sin`` and ``<col>_cos``, are written into ``data``
    itself, using ``max_val`` as the length of one full cycle.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that holds ``col``; it is modified in place.
    col : str
        Name of the column to encode.
    max_val : number
        Value of ``data[col]`` that corresponds to one full turn (2*pi).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for call chaining.
    """
    angle = 2 * np.pi * data[col]/max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = trig(angle)
    return data

# Cyclical encoding of the month, with a period of 12.
df['month'] = df["date_time"].dt.month
df = encode(df, 'month', 12)
# Cyclical encoding of the hour of day.
# NOTE(review): dt.hour takes values 0-23, so a period of 24 looks intended; with 23,
# hour 0 and hour 23 are encoded identically. Left as-is because the weight files
# loaded in the Predict section were presumably trained with this encoding —
# confirm and retrain before changing it.
df['hour'] = df["date_time"].dt.hour
df = encode(df, 'hour', 23)

## 2.4. 파생변수 생성
## 2.7. 데이터 선택 및 추출

In [None]:
# Add a UV column computed from solarza with a cosine formula:
#   uv_calculated = 11 * max(cos(solarza) + delta, 0) ** 2
# solarza is converted from degrees to radians before taking the cosine.
delta = -0.01
solar = df['solarza']
x = np.cos(solar*(2*np.pi/360)) + delta
# (abs(x) + x) / 2 equals x where x is positive and 0 elsewhere.
# assign() builds a new frame, so the frame `df` was bound to (all_impute)
# is not modified and this cell can be re-run.
df = df.assign(uv_calculated=11 * (((abs(x)+x)/2)**2))

# Keep only the columns used from here on. sateza, height, landtype, month and
# hour are left out by this selection, so no separate in-place drop is needed.
df = df[['date_time','stn', 'uv', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos', 'lon', 'lat', 
        'band1', 'band2', 'band3', 'band4', 'band5',
       'band6', 'band7', 'band8', 'band9', 'band10', 'band11', 'band12',
       'band13', 'band14', 'band15', 'band16', 'solarza', 'esr', 'uv_calculated']]

# Training rows: May through July of 2020 and of 2021.
df_train_1 = df.loc[(df["date_time"] >= "2020-05-01") & (df["date_time"] < "2020-08-01")]
df_train_2 = df.loc[(df["date_time"] >= "2021-05-01") & (df["date_time"] < "2021-08-01")]

df_train = pd.concat([df_train_1, df_train_2], axis=0).reset_index(drop=True)

# Test rows: June 2022 (the upper bound previously read "2202-07-01", a typo).
df_test = df.loc[(df["date_time"] >= "2022-06-01") & (df["date_time"] < "2022-07-01")].reset_index(drop=True)

df_train.head()

## 2.5. 정규화

In [None]:
# The first N_UNSCALED columns (date_time, stn, uv and the four cyclical time
# features) are passed through unchanged; every column after them is min-max scaled.
N_UNSCALED = 7

scaler = MinMaxScaler()


def _scale_tail(frame, transform):
    """Scale all columns of ``frame`` after the first ``N_UNSCALED`` ones.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose leading ``N_UNSCALED`` columns are kept as they are.
    transform : callable
        ``scaler.fit_transform`` or ``scaler.transform``; receives the trailing
        columns and returns an array of the same shape.

    Returns
    -------
    pandas.DataFrame
        ``frame`` with the trailing columns replaced by their scaled values.
    """
    kept = frame.iloc[:, :N_UNSCALED]
    to_scale = frame.iloc[:, N_UNSCALED:]
    # Reuse the frame's own index so the column-wise concat lines rows up by
    # construction instead of relying on the index being 0..n-1.
    scaled = pd.DataFrame(transform(to_scale), columns=to_scale.columns, index=frame.index)
    return pd.concat([kept, scaled], axis=1)


# Fit the scaler on the training rows only, then apply the same scaling to the test rows.
df_train = _scale_tail(df_train, scaler.fit_transform)
df_test = _scale_tail(df_test, scaler.transform)

# 3. 분석기법
## 3.2. 데이터셋 구성

In [None]:
def build_dataset(time_series, seq_length):
  """Cut a frame into overlapping fixed-length windows for a sequence model.

  Every column except the last is an input feature; the last column is the
  target. Window ``i`` covers rows ``i .. i + seq_length - 1`` and is labelled
  with the target value of its last row.

  Windows start at rows ``0 .. len(time_series) - seq_length - 1``, so
  ``len(time_series) - seq_length`` windows are returned and the final possible
  window is not produced. That count is kept as-is because the number of
  predictions downstream depends on it.

  Parameters
  ----------
  time_series : pandas.DataFrame
      Rows in sequence order; features first, target in the last column.
  seq_length : int
      Number of consecutive rows per window; must be at least 1.

  Returns
  -------
  (numpy.ndarray, numpy.ndarray)
      Inputs of shape ``(n_windows, seq_length, n_features)`` and targets of
      shape ``(n_windows,)``. With too few rows both arrays have zero windows
      (the input array keeps its three dimensions).

  Raises
  ------
  ValueError
      If ``seq_length`` is smaller than 1.
  """
  if seq_length < 1:
    raise ValueError("seq_length must be a positive integer")

  features = time_series.iloc[:, :-1].to_numpy()
  targets = time_series.iloc[:, -1].to_numpy()

  n_windows = max(len(time_series) - seq_length, 0)

  # Row numbers of every window at once, shape (n_windows, seq_length). One numpy
  # gather replaces a Python loop that sliced the frame with iloc for each window.
  window_rows = np.arange(n_windows)[:, None] + np.arange(seq_length)[None, :]

  dataX = features[window_rows]
  dataY = targets[seq_length - 1:seq_length - 1 + n_windows].copy()

  return dataX, dataY

In [None]:
# Column layout for the windowing step: the two identifier columns first, the
# model inputs next, and the target (uv) last. lon and lat are not selected.
ordered_columns = (
    ['date_time', 'stn', 'month_sin', 'month_cos', 'hour_sin', 'hour_cos']
    + [f'band{k}' for k in range(1, 17)]
    + ['solarza', 'esr', 'uv_calculated', 'uv']
)

df_train, df_test = df_train[ordered_columns], df_test[ordered_columns]

# Everything after date_time and stn goes into the time-series frames.
df_train_timeseries, df_test_timeseries = (
    frame.iloc[:, 2:] for frame in (df_train, df_test)
)

In [None]:
# Build windows of 6 consecutive rows; each window is labelled with the uv of its last row.
# NOTE(review): the frames appear to hold all stations stacked one after another, so
# windows that straddle a station boundary would mix rows of two stations — confirm
# this is acceptable.
train_x, train_y = build_dataset(df_train_timeseries, 6)
test_x, test_y = build_dataset(df_test_timeseries, 6)

## 3.3. LSTM 모델 구축

In [None]:
def rmse(y_true, y_pred):
    """Root-mean-square error of ``y_pred`` against ``y_true``, built from Keras backend ops."""
    squared_error = K.square(y_pred - y_true)
    return K.sqrt(K.mean(squared_error))

- 학습 데이터를 5-Fold(KFold)로 나누어 5개 세트를 각각 학습
- 학습 후 val-loss 기준 상위 3개 모델의 결과값만 합침

In [None]:
# Shuffled 5-fold split of the training windows. Each of the four lists below
# ends up with one entry per fold, in fold order.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)
fold_indices = list(kf.split(train_x))

X_train = [train_x[tr_idx] for tr_idx, _ in fold_indices]
X_valid = [train_x[va_idx] for _, va_idx in fold_indices]
y_train = [train_y[tr_idx] for tr_idx, _ in fold_indices]
y_valid = [train_y[va_idx] for _, va_idx in fold_indices]

In [None]:
def set_lstm():
  """Create and compile the LSTM regressor.

  The network is one 16-unit LSTM layer over inputs of shape (6, 23) followed
  by a single ReLU output unit. It is compiled with the ``rmse`` loss, Adam at
  learning rate 0.001, and MAE/MSE as extra metrics.

  Returns
  -------
  The compiled Sequential model.
  """
  network = Sequential([
      LSTM(16, input_shape=(6, 23)),
      Dense(1, activation='relu'),
  ])

  network.compile(
      loss=rmse,
      optimizer=keras.optimizers.Adam(learning_rate=0.001),
      metrics=['mae', 'mse'],
  )

  return network

## 3.4. 모델 학습

In [None]:
# Train one fresh model per fold. The fold arrays get their own names so the
# full train_x / train_y built earlier are not overwritten, which keeps the
# earlier cells safe to re-run after training.
for fold, (fold_train_x, fold_train_y, fold_valid_x, fold_valid_y) in enumerate(
    zip(X_train, y_train, X_valid, y_valid)):
  model_directory = Model_directory + '/lstm(kf)_' + str(fold) + '/'
  tensorboard_directory = Tensorboard_directory + '/lstm(kf)_' + str(fold) + '/'

  # Save weights only, and only when val_loss improves; the epoch number and
  # val_loss are part of the file name.
  CP = ModelCheckpoint(filepath=model_directory+'lstm(16)_batch64_epochs200-{epoch:03d}-{val_loss:.4f}.hdf5',
               monitor='val_loss', save_weights_only=True, verbose=1, save_best_only=True, mode='min')
  TB = TensorBoard(log_dir=tensorboard_directory, write_graph=True, write_images=True)
  # Multiply the learning rate by 0.8 after 3 epochs without val_loss improvement.
  LR = ReduceLROnPlateau(monitor='val_loss',factor=0.8,patience=3, verbose=1, min_lr=1e-7)
  CALLBACK = [CP, TB, LR]

  model = set_lstm()
  model.fit(fold_train_x, fold_train_y, batch_size=64, callbacks=CALLBACK, shuffle=True,
            validation_data=(fold_valid_x, fold_valid_y), epochs=200)

### Predict

- 모델들의 가중치 파일은 아래 링크에서 받을 수 있습니다.
- https://github.com/PHJoon/Weather-Bigdata-Contest/tree/master/model


In [None]:
def _load_lstm(weights_path):
  """Build the LSTM, load the weights stored at ``weights_path``, and return
  a new Sequential model made of the loaded model's layers."""
  base = set_lstm()
  base.load_weights(weights_path)
  return keras.Sequential(base.layers[:])


# Three saved checkpoints form the ensemble.
lstm1 = _load_lstm('lstm(16)_batch64_epochs200-156-0.5067.hdf5')
lstm2 = _load_lstm('lstm(16)_batch64_epochs200-135-0.5095.hdf5')
lstm3 = _load_lstm('lstm(16)_batch64_epochs200-133-0.5091.hdf5')

pred1, pred2, pred3 = (net.predict(test_x) for net in (lstm1, lstm2, lstm3))

# Weighted average of the three predictions (weights 0.4 / 0.3 / 0.3).
pred = pred1 * 0.4  + pred2 * 0.3 + pred3 * 0.3

## Submission

In [None]:
df_sub = pd.read_csv("1-1_검증데이터셋.csv")
# Order the rows by station (then by the remaining index level) before the
# predictions are written in by position.
df_sub = df_sub.set_index(['YearMonthDayHourMinute', 'STN']).sort_index(level='STN')

df_pred = pd.DataFrame(pred)
uv_col = df_sub.columns.get_loc("UV")
# Write by position with .iloc and a plain array. The earlier chained form
# df_sub["UV"][6:] = ... can end up writing to a temporary copy instead of df_sub.
# The first 6 rows have no prediction, so they are filled with 0.
# NOTE(review): build_dataset labels each window with its last row, so pred[k]
# appears to belong to row k + 5 rather than row k + 6 — confirm the intended alignment.
df_sub.iloc[6:, uv_col] = df_pred[0].to_numpy()
df_sub.iloc[:6, uv_col] = 0

# Restore the index columns and put the rows into time-then-station order.
df_sub = (
    df_sub.reset_index()
    .sort_values(by=['YearMonthDayHourMinute', 'STN'])
    .reset_index(drop=True)
)

df_sub.to_csv("220136.csv", index=False, encoding='utf-8')