In [34]:
import os

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt
import tensorflow as tf

plt.rcParams['figure.figsize']=(10,10)
plt.rcParams['font.family']='AppleGothic'

import warnings
warnings.filterwarnings(action='ignore')

In [35]:
def read_csv_by_dir(path, index_col=None):
    df_raw = pd.DataFrame()
    for files in os.listdir(path):
        if files.endswith('.csv'):
            df = pd.read_csv('/'.join([path,files]),
                            index_col=index_col)
        df_raw = pd.concat((df_raw,df),axis=0)
    return df_raw

In [36]:
path = 'Dataset'
_df_rf_raw = read_csv_by_dir('/'.join([path,'rf_data']),
                            index_col=0)

_df_water_raw = read_csv_by_dir('/'.join([path,'water_data']),
                               index_col=0)

_submission = pd.read_csv('/'.join([path,'sample_submission.csv']),
                             index_col=0)

In [37]:
# raw_data 보존하기
df_rf=_df_rf_raw.copy()
df_rf.name = "rain_data"

df_water=_df_water_raw.copy()
df_water.name = "water_data"

submission=_submission.copy()
submission.name = "submission"

In [38]:
def index_to_datetime(df,format):
    df.index = pd.to_datetime(df.index,
                              format=format)
    return df

In [39]:
df_rf=index_to_datetime(df=df_rf,format='%Y-%m-%d %H:%M')
df_water=index_to_datetime(df=df_water,format='%Y-%m-%d %H:%M')
submission=index_to_datetime(df=submission,format='%Y-%m-%d %H:%M')

In [40]:
df_water

Unnamed: 0_level_0,swl,inf,sfw,ecpc,tototf,tide_level,wl_1018662,fw_1018662,wl_1018680,fw_1018680,wl_1018683,fw_1018683,wl_1019630,fw_1019630
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2012-01-01 00:00:00,24.95,232,224.23,19.77,134,,278.7,269.4,273.2,0,261.0,175.16,253.3,310.31
2012-01-01 00:10:00,24.929,134.4,223.49,20.51,134.4,,279.7,274.78,273.2,0,261.0,176.6,258.3,356.89
2012-01-01 00:20:00,24.929,128.3,223.49,20.51,128.3,,279.7,274.78,273.2,0,261.0,179.49,261.3,386.43
2012-01-01 00:30:00,24.929,134.4,223.49,20.51,134.4,,278.7,269.4,273.2,0,261.0,173.72,262.3,396.55
2012-01-01 00:40:00,24.937,232.7,223.74,20.26,134.4,,279.7,274.78,273.2,0,261.0,76.99,262.3,396.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-18 23:10:00,25.04,259.23,212.86,31.14,259.23,510.0,0.0,319.84,0.0,,0.0,-456.41,0.0,974.4
2022-07-18 23:20:00,25.04,260.46,212.86,31.14,260.46,492.0,0.0,314.01,0.0,,0.0,-717.3,0.0,1006.88
2022-07-18 23:30:00,25.04,259.37,212.86,31.14,259.37,475.0,0.0,387.55,0.0,,0.0,-843.37,0.0,1039.9
2022-07-18 23:40:00,25.04,259.13,212.86,31.14,259.13,458.0,0.0,454.91,0.0,,0.0,-1023.37,0.0,1073.46


In [41]:
df_rf.sort_index(inplace=True)
df_water.sort_index(inplace=True)
submission.sort_index(inplace=True)

In [42]:
# data target 분리하기
target = df_water.loc[:,submission.columns]
data = pd.concat((df_rf,df_water.drop(submission.columns,axis=1)),axis=1)

In [43]:
# data와 target 하나 밀어주기 (과거데이터를 사용해야 함으로)
_target = target.reset_index(drop=True)
_data = data.reset_index(drop=True)

_data.index += 1

tot=pd.concat((_data,_target),axis=1)
tot=tot.sort_index()

tot=tot.iloc[1:-1]

target = tot.loc[:,submission.columns]
data = tot.drop(submission.columns,axis=1)

In [44]:
train_target=target.iloc[:-len(submission),:]
test_target=target.iloc[-len(submission):,:]

train_data=data.iloc[:-len(submission),:]
test_data=data.iloc[-len(submission):,:]

In [45]:
train_data.isna().sum()

rf_10184100         0
rf_10184110         0
rf_10184140         0
swl               707
inf               707
sfw               707
ecpc              707
tototf            707
tide_level      13384
fw_1018662      15164
fw_1018680     189936
fw_1018683       1279
fw_1019630         59
dtype: int64

In [46]:
#train_target.fillna(train_target.mean(),inplace=True)
#test_target.fillna(train_target.mean(),inplace=True)
train_data.fillna(train_data.mean(),inplace=True)
#test_data.fillna(train_data.mean(),inplace=True)

In [47]:
train_data

Unnamed: 0,rf_10184100,rf_10184110,rf_10184140,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018680,fw_1018683,fw_1019630
1,0.0,0.0,0.0,24.95,232,224.23,19.77,134,334.386259,269.4,0,175.16,310.31
2,0.0,0.0,0.0,24.929,134.4,223.49,20.51,134.4,334.386259,274.78,0,176.6,356.89
3,0.0,0.0,0.0,24.929,128.3,223.49,20.51,128.3,334.386259,274.78,0,179.49,386.43
4,0.0,0.0,0.0,24.929,134.4,223.49,20.51,134.4,334.386259,269.4,0,173.72,396.55
5,0.0,0.0,0.0,24.937,232.7,223.74,20.26,134.4,334.386259,274.78,0,76.99,396.55
...,...,...,...,...,...,...,...,...,...,...,...,...,...
530491,0.0,0.0,0.0,25.18,140.89,217.99,26.01,140.89,232.000000,314.01,,597.98,471.08
530492,0.0,0.0,0.0,25.18,140.94,217.99,26.01,140.94,220.000000,285.72,,575.57,493.58
530493,0.0,0.0,0.0,25.18,141.07,217.99,26.01,141.07,208.000000,274.78,,501.04,505.03
530494,0.0,0.0,0.0,25.18,141.01,217.99,26.01,141.01,196.000000,269.4,,425.89,505.03


In [48]:
train_data.isna().sum()

rf_10184100         0
rf_10184110         0
rf_10184140         0
swl               707
inf               707
sfw               707
ecpc              707
tototf            707
tide_level          0
fw_1018662      15164
fw_1018680     189936
fw_1018683       1279
fw_1019630         59
dtype: int64

In [33]:
train_target

Unnamed: 0,wl_1018662,wl_1018680,wl_1018683,wl_1019630
1,314.7,300.2,290.0,275.3
2,313.7,301.2,290.0,275.3
3,311.7,301.2,290.0,276.3
4,311.7,301.2,291.0,277.3
5,311.7,301.2,291.0,277.3
...,...,...,...,...
269419,281.7,281.2,278.0,271.3
269420,279.7,279.2,278.0,272.3
269421,278.7,277.2,277.0,272.3
269422,277.7,276.2,276.0,271.3


In [16]:
test_data

Unnamed: 0,rf_10184100,rf_10184110,rf_10184140,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018680,fw_1018683,fw_1019630
530496,0.0,0.0,0.0,25.19,140.56,218.36,25.64,140.56,173.0,269.4,,70.09,482.26
530497,0.0,0.0,0.0,25.19,140.6,218.36,25.64,140.6,162.0,269.4,,8.23,471.08
530498,0.0,0.0,0.0,25.19,140.78,218.36,25.64,140.78,151.0,280.22,,28.82,449.12
530499,0.0,0.0,0.0,25.2,755.9,218.73,25.27,140.9,141.0,296.87,,12.35,417.17
530500,0.0,0.0,0.0,25.19,0.0,218.36,25.64,140.94,130.0,302.53,,53.52,386.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...
537403,0.0,0.0,0.0,25.04,865.96,212.86,31.14,259.3,528.0,319.84,,25.27,926.7
537404,0.0,0.0,0.0,25.04,259.23,212.86,31.14,259.23,510.0,319.84,,-456.41,974.4
537405,0.0,0.0,0.0,25.04,260.46,212.86,31.14,260.46,492.0,314.01,,-717.3,1006.88
537406,0.0,0.0,0.0,25.04,259.37,212.86,31.14,259.37,475.0,387.55,,-843.37,1039.9


### LightGBM

In [None]:
train_target_1018662 = train_target['wl_1018662']
train_target_1018680 = train_target['wl_1018680']
train_target_1018683 = train_target['wl_1018683']
train_target_1019630 = train_target['wl_1019630']

In [None]:
from lightgbm import LGBMRegressor

lgbm =  LGBMRegressor(
            n_estimators=1000,
            n_jobs=-1,
            learning_rate=0.001,
            num_leaves = 32,
            max_depth = 128,
            min_child_samples = 100,
            subsample = 0.8,
            silent=-1,
            verbose=-1)
with tf.device("/device:GPU:0"):
    print('=====lgbm_1018662=====')
    lgbm.fit(train_data, train_target_1018662,verbose= 100)
    pred_1018662 = lgbm.predict(test_data)
    print('=====lgbm_1018680=====')
    lgbm.fit(train_data, train_target_1018680,verbose= 100)
    pred_1018680 = lgbm.predict(test_data)
    print('=====lgbm_1018683=====')
    lgbm.fit(train_data, train_target_1018683,verbose= 100)
    pred_1018683 = lgbm.predict(test_data)
    print('=====lgbm_1019630=====')
    lgbm.fit(train_data, train_target_1019630,verbose= 100)
    pred_1019630 = lgbm.predict(test_data)

In [None]:
_submission["wl_1018662"] = pred_1018662
_submission["wl_1018680"] = pred_1018680
_submission["wl_1018683"] = pred_1018683
_submission["wl_1019630"] = pred_1019630

sample_submission.to_csv('lgbm_test2.csv',index=False)