In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import os

import numpy as np
import pandas as pd

import seaborn as sns 
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize']=(10,10)
plt.rcParams['font.family']='AppleGothic'

import warnings
warnings.filterwarnings(action='ignore')

from sklearn.model_selection import KFold, train_test_split



In [25]:
def read_csv_by_dir(path, index_col=None):
    df_raw = pd.DataFrame()
    print("df_raw")
    print(df_raw)
    
    for files in os.listdir(path):
        if files.endswith('.csv'):
            print("files")
            print(files)
            df = pd.read_csv('/'.join([path,files]),
                            index_col=index_col)
            print("df")
            print(df)
        df_raw = pd.concat((df_raw,df),axis=0)
        print("df_raw")
        print(df_raw)
    return df_raw

In [26]:
path = '/content/drive/MyDrive/Colab Notebooks/팔방댐/competition_data'
_df_rf_raw = read_csv_by_dir('/'.join([path,'rf_data']),
                            index_col=0)

_df_water_raw = read_csv_by_dir('/'.join([path,'water_data']),
                               index_col=0)

_submission_raw = pd.read_csv('/'.join([path,'sample_submission.csv']),
                             index_col=0)

df_raw
Empty DataFrame
Columns: []
Index: []
files
rf_2012.csv
df
                  rf_10184100  rf_10184110  rf_10184140
ymdhm                                                  
2012-05-01 00:00          0.0          0.0          0.0
2012-05-01 00:10          0.0          0.0          0.0
2012-05-01 00:20          0.0          0.0          0.0
2012-05-01 00:30          0.0          0.0          0.0
2012-05-01 00:40          0.0          0.0          0.0
...                       ...          ...          ...
2012-10-31 23:10          0.0          0.0          0.0
2012-10-31 23:20          0.0          0.0          0.0
2012-10-31 23:30          0.0          0.0          0.0
2012-10-31 23:40          0.0          0.0          0.0
2012-10-31 23:50          0.0          0.0          0.0

[26496 rows x 3 columns]
df_raw
                  rf_10184100  rf_10184110  rf_10184140
ymdhm                                                  
2012-05-01 00:00          0.0          0.0          0.0
2012-

In [27]:
# raw_data 보존하기
df_rf=_df_rf_raw.copy()
df_rf.name = "rain_data"

df_water=_df_water_raw.copy()
df_water.name = "water_data"

submission=_submission_raw.copy()
submission.name = "submission"

In [28]:
def index_to_datetime(df,format):
    df.index = pd.to_datetime(df.index,format=format)
    # print("df ~~~~~~~~~~~~~~~~~~~~``")
    # print(df)
    return df

In [29]:
df_rf=index_to_datetime(df=df_rf,format='%Y-%m-%d %H:%M')
print("df_rf~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
print(df_rf)
df_water=index_to_datetime(df=df_water,format='%Y-%m-%d %H:%M')
submission=index_to_datetime(df=submission,format='%Y-%m-%d %H:%M')

df_rf~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
                     rf_10184100  rf_10184110  rf_10184140
ymdhm                                                     
2012-05-01 00:00:00          0.0          0.0          0.0
2012-05-01 00:10:00          0.0          0.0          0.0
2012-05-01 00:20:00          0.0          0.0          0.0
2012-05-01 00:30:00          0.0          0.0          0.0
2012-05-01 00:40:00          0.0          0.0          0.0
...                          ...          ...          ...
2022-07-18 23:10:00          0.0          0.0          0.0
2022-07-18 23:20:00          0.0          0.0          0.0
2022-07-18 23:30:00          0.0          0.0          0.0
2022-07-18 23:40:00          0.0          0.0          0.0
2022-07-18 23:50:00          0.0          0.0          0.0

[276336 rows x 3 columns]


In [30]:
df_rf.sort_index(inplace=True)
df_water.sort_index(inplace=True)
submission.sort_index(inplace=True)


In [31]:
# 데이터 시간대 확인하기
def check_datetime(df):
    print(df.name)
    print(df.select_dtypes('datetime64[ns]').head(1).index[0])
    print(df.select_dtypes('datetime64[ns]').tail(1).index[0])
    return None

check_datetime(df_rf)
check_datetime(df_water)
check_datetime(submission)

rain_data
2012-05-01 00:00:00
2022-07-18 23:50:00
water_data
2012-05-01 00:00:00
2022-07-18 23:50:00
submission
2022-06-01 00:00:00
2022-07-18 23:50:00


In [32]:
# data target 분리하기
target = df_water.loc[:,submission.columns]
print(target)
data = pd.concat((df_rf,df_water.drop(submission.columns,axis=1)),axis=1)
print("data")
print(data)

                     wl_1018662  wl_1018680  wl_1018683  wl_1019630
ymdhm                                                              
2012-05-01 00:00:00       310.7       300.2       290.0       275.3
2012-05-01 00:10:00       314.7       300.2       290.0       275.3
2012-05-01 00:20:00       313.7       301.2       290.0       275.3
2012-05-01 00:30:00       311.7       301.2       290.0       276.3
2012-05-01 00:40:00       311.7       301.2       291.0       277.3
...                         ...         ...         ...         ...
2022-07-18 23:10:00         0.0         0.0         0.0         0.0
2022-07-18 23:20:00         0.0         0.0         0.0         0.0
2022-07-18 23:30:00         0.0         0.0         0.0         0.0
2022-07-18 23:40:00         0.0         0.0         0.0         0.0
2022-07-18 23:50:00         0.0         0.0         0.0         0.0

[276336 rows x 4 columns]
data
                     rf_10184100  rf_10184110  rf_10184140     swl     inf  \
ymdhm 

In [11]:
# data와 target 하나 밀어주기 (과거데이터를 사용해야 함으로)
print("target")
print(target)

# _target = target.reset_index(drop=True)
# _data = data.reset_index(drop=True)
# # print("_target")
# # print(_target)
# print("_data")
# print(_data.head(-10))


# _data.index += 1
# print("_data!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
# print(_data.head(-10))



# tot=pd.concat((_data,_target),axis=1)
# tot=tot.sort_index()

# tot=tot.iloc[1:-1]

# target = tot.loc[:,submission.columns]
# data = tot.drop(submission.columns,axis=1)

target
                     wl_1018662  wl_1018680  wl_1018683  wl_1019630
ymdhm                                                              
2012-05-01 00:00:00       310.7       300.2       290.0       275.3
2012-05-01 00:10:00       314.7       300.2       290.0       275.3
2012-05-01 00:20:00       313.7       301.2       290.0       275.3
2012-05-01 00:30:00       311.7       301.2       290.0       276.3
2012-05-01 00:40:00       311.7       301.2       291.0       277.3
...                         ...         ...         ...         ...
2022-07-18 23:10:00         0.0         0.0         0.0         0.0
2022-07-18 23:20:00         0.0         0.0         0.0         0.0
2022-07-18 23:30:00         0.0         0.0         0.0         0.0
2022-07-18 23:40:00         0.0         0.0         0.0         0.0
2022-07-18 23:50:00         0.0         0.0         0.0         0.0

[276336 rows x 4 columns]


In [12]:
print(submission.columns)

Index(['wl_1018662', 'wl_1018680', 'wl_1018683', 'wl_1019630'], dtype='object')


In [13]:


target.head(10)

Unnamed: 0_level_0,wl_1018662,wl_1018680,wl_1018683,wl_1019630
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012-05-01 00:00:00,310.7,300.2,290.0,275.3
2012-05-01 00:10:00,314.7,300.2,290.0,275.3
2012-05-01 00:20:00,313.7,301.2,290.0,275.3
2012-05-01 00:30:00,311.7,301.2,290.0,276.3
2012-05-01 00:40:00,311.7,301.2,291.0,277.3
2012-05-01 00:50:00,311.7,301.2,291.0,277.3
2012-05-01 01:00:00,312.7,301.2,291.0,277.3
2012-05-01 01:10:00,311.7,301.2,291.0,277.3
2012-05-01 01:20:00,312.7,301.2,291.0,278.3
2012-05-01 01:30:00,312.7,301.2,291.0,278.3


In [14]:
target = target.iloc[269496:269496+6913]
target

Unnamed: 0_level_0,wl_1018662,wl_1018680,wl_1018683,wl_1019630
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-06-01 12:00:00,0.0,0.0,0.0,0.0
2022-06-01 12:10:00,0.0,0.0,0.0,0.0
2022-06-01 12:20:00,0.0,0.0,0.0,0.0
2022-06-01 12:30:00,0.0,0.0,0.0,0.0
2022-06-01 12:40:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2022-07-18 23:10:00,0.0,0.0,0.0,0.0
2022-07-18 23:20:00,0.0,0.0,0.0,0.0
2022-07-18 23:30:00,0.0,0.0,0.0,0.0
2022-07-18 23:40:00,0.0,0.0,0.0,0.0


In [15]:
data = data.iloc[269496:269496+6913]


In [16]:
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6840 entries, 2022-06-01 12:00:00 to 2022-07-18 23:50:00
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   rf_10184100  6840 non-null   float64
 1   rf_10184110  6840 non-null   float64
 2   rf_10184140  6840 non-null   float64
 3   swl          6804 non-null   float64
 4   inf          6804 non-null   float64
 5   sfw          6804 non-null   float64
 6   ecpc         6804 non-null   float64
 7   tototf       6804 non-null   float64
 8   tide_level   6836 non-null   float64
 9   fw_1018662   5640 non-null   float64
 10  fw_1018680   0 non-null      float64
 11  fw_1018683   6840 non-null   float64
 12  fw_1019630   6840 non-null   float64
dtypes: float64(13)
memory usage: 748.1 KB


In [17]:
# data = pd.DataFrame(data)
data = data[['swl','inf','sfw','ecpc','tototf','tide_level','fw_1018662','fw_1018683','fw_1018683']]

# data.fillna(0)
# data
# data.isna()

In [18]:
data

Unnamed: 0_level_0,swl,inf,sfw,ecpc,tototf,tide_level,fw_1018662,fw_1018683,fw_1018683
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-06-01 12:00:00,25.12,141.17,215.79,28.21,141.17,259.0,483.42,534.90,534.90
2022-06-01 12:10:00,25.13,752.52,216.15,27.85,140.85,246.0,461.96,565.01,565.01
2022-06-01 12:20:00,25.13,141.13,216.15,27.85,141.13,235.0,483.42,451.62,451.62
2022-06-01 12:30:00,25.13,140.78,216.15,27.85,140.78,222.0,476.21,295.55,295.55
2022-06-01 12:40:00,25.13,140.97,216.15,27.85,140.97,211.0,476.21,222.42,222.42
...,...,...,...,...,...,...,...,...,...
2022-07-18 23:10:00,25.04,259.23,212.86,31.14,259.23,510.0,319.84,-456.41,-456.41
2022-07-18 23:20:00,25.04,260.46,212.86,31.14,260.46,492.0,314.01,-717.30,-717.30
2022-07-18 23:30:00,25.04,259.37,212.86,31.14,259.37,475.0,387.55,-843.37,-843.37
2022-07-18 23:40:00,25.04,259.13,212.86,31.14,259.13,458.0,454.91,-1023.37,-1023.37


In [19]:
target

Unnamed: 0_level_0,wl_1018662,wl_1018680,wl_1018683,wl_1019630
ymdhm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-06-01 12:00:00,0.0,0.0,0.0,0.0
2022-06-01 12:10:00,0.0,0.0,0.0,0.0
2022-06-01 12:20:00,0.0,0.0,0.0,0.0
2022-06-01 12:30:00,0.0,0.0,0.0,0.0
2022-06-01 12:40:00,0.0,0.0,0.0,0.0
...,...,...,...,...
2022-07-18 23:10:00,0.0,0.0,0.0,0.0
2022-07-18 23:20:00,0.0,0.0,0.0,0.0
2022-07-18 23:30:00,0.0,0.0,0.0,0.0
2022-07-18 23:40:00,0.0,0.0,0.0,0.0


In [20]:
# train_target.fillna(train_target.mean(),inplace=True)
# test_target.fillna(train_target.mean(),inplace=True)
# train_data.fillna(train_data.mean(),inplace=True)
# test_data.fillna(train_data.mean(),inplace=True)

In [22]:
from sklearn.ensemble import RandomForestRegressor

X= data
y= target

# X.fillna(X.mean(),inplace=True)
# y.fillna(y.mean(),inplace=True)

X.fillna(X.mean())
y.fillna(y.mean())


X = X.fillna(X.mean())
y = y.fillna(y.mean())

#  random_state 인자는 수행시마다 동일한 결과를 얻기 위해 적용
X_train,X_test, y_train, y_test=train_test_split(X,y,test_size =0.2, random_state=42) 

# X_train.fillna(X_train.mean(),inplace=True)
# X_test.fillna(X_test.mean(),inplace=True)
# y_train.fillna(y_train.mean(),inplace=True)
# y_test.fillna(y_test.mean(),inplace=True)

X_train.fillna(X_train.mean())
X_test.fillna(X_test.mean())
y_train.fillna(y_train.mean())
y_test.fillna(y_test.mean())


model = RandomForestRegressor(n_estimators=10,random_state=42)

print("X_train")
print(X_train.info())
print("y_train")
print(y_train.info())


from sklearn.model_selection import GridSearchCV

# gridsearch가 찾을 파라미터 정의
# n_estimators: 반복 수행하는 트리의 개수
# min_samples_split: 노드를 분할하기 위한 최소한의 샘플 데이터 수
myparam = {'n_estimators': [10,20], 'min_samples_split':[2,3,4]}

gcv_model = GridSearchCV(model,param_grid=myparam,scoring='f1_macro',refit=True, cv=5)

gcv_model.fit(X_train,y_train)
print(gcv_model.best_score_) #학습된 데이터들 중 best score
print(gcv_model.best_params_) #학습된 데이터들 중 best_params
#print(gcv_model.cv_results_) #학습된 데이터들 중 cv_results  

X_train
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 2022-06-17 03:10:00 to 2022-06-07 11:20:00
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   swl         5442 non-null   float64
 1   inf         5442 non-null   float64
 2   sfw         5442 non-null   float64
 3   ecpc        5442 non-null   float64
 4   tototf      5442 non-null   float64
 5   tide_level  5468 non-null   float64
 6   fw_1018662  4514 non-null   float64
 7   fw_1018683  5472 non-null   float64
 8   fw_1018683  5472 non-null   float64
dtypes: float64(9)
memory usage: 427.5 KB
None
y_train
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5472 entries, 2022-06-17 03:10:00 to 2022-06-07 11:20:00
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   wl_1018662  5472 non-null   float64
 1   wl_1018680  5472 non-null   float64
 2   wl_1018683  5472 non-null   float

ValueError: ignored

In [None]:
model=gcv_model.best_estimator_
print(model)

model.fit(X_train,y_train)

In [None]:
y_pred=model.predict(X)
print(y_pred)
y_pred = pd.DataFrame(y_pred)

sample_submission = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/팔방댐/sample_submission.csv")

sample_submission.iloc[:,1:] = y_pred
sample_submission.index = sample_submission.index+1
sample_submission.to_csv('ans13.csv',index=False)

In [None]:
sample_submission

In [None]:
y_pred

In [None]:
print(submission)