## Reading Data

In [1]:
import pandas as pd
import numpy as np
import datetime

In [2]:
# 데이터 세트 로드
mosquitoDF = pd.read_csv('4. dms_seoul_avg.csv')

mosquitoDF

Unnamed: 0,date,mosquito,temp,rain_per_day,accum_rain,wind,humidity,sunshine
0,2015-04-06,199,12.214286,5.285714,0,3.371429,68.757143,8.880000
1,2015-04-07,146,11.571429,4.928571,0,3.428571,65.742857,9.891429
2,2015-04-08,90,10.914286,4.571429,0,3.257143,63.542857,9.927143
3,2015-04-09,172,10.171429,0.571429,0,2.871429,59.614286,10.884286
4,2015-04-10,249,10.314286,0.071429,0,2.871429,53.485714,12.777143
...,...,...,...,...,...,...,...,...
1168,2020-10-27,535,11.942857,0.000000,0,2.185714,63.242857,12.734286
1169,2020-10-28,884,11.900000,0.000000,0,2.271429,60.771429,14.241429
1170,2020-10-29,695,11.514286,0.000000,0,2.157143,57.100000,14.635714
1171,2020-10-30,543,11.985714,0.000000,0,1.942857,55.871429,14.294286


In [3]:
# pd.to_datetime(mosquitoDF['date'])

## Standardization

In [4]:
date = mosquitoDF['date']
mosquitoDF.drop(['date'], axis=1, inplace=True)

date.head()

0    2015-04-06
1    2015-04-07
2    2015-04-08
3    2015-04-09
4    2015-04-10
Name: date, dtype: object

In [5]:
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
mosquitoDF_sc = sc.fit_transform(mosquitoDF)

mosquitoDF_sc

array([[-1.27923383, -2.01584225,  0.07861879, ...,  2.64508713,
         0.52795174, -1.57229733],
       [-1.30085829, -2.15178105,  0.02924105, ...,  2.76861056,
         0.23124569, -1.34092865],
       [-1.32370677, -2.29074071, -0.02013669, ...,  2.39804028,
         0.01469247, -1.33275886],
       ...,
       [-1.07686158, -2.1638645 , -0.65217173, ...,  0.02021428,
        -0.61949913, -0.2556527 ],
       [-1.13887888, -2.06417604, -0.65217173, ..., -0.44299858,
        -0.74043145, -0.33375597],
       [-1.17519165, -1.91615379, -0.65217173, ..., -0.41211772,
        -0.56184535, -0.51741302]])

In [6]:
# DataFrame으로 변환
columns = ['mosquito', 'temp', 'rain_per_day', 'accum_rain', 'wind', 'humidity', 'sunshine']
mosquitoDF_sc = pd.DataFrame(mosquitoDF_sc, columns=columns)
mosquitoDF_sc = pd.concat(([date, mosquitoDF_sc]), axis=1)

mosquitoDF_sc

Unnamed: 0,date,mosquito,temp,rain_per_day,accum_rain,wind,humidity,sunshine
0,2015-04-06,-1.279234,-2.015842,0.078619,-0.353865,2.645087,0.527952,-1.572297
1,2015-04-07,-1.300858,-2.151781,0.029241,-0.353865,2.768611,0.231246,-1.340929
2,2015-04-08,-1.323707,-2.290741,-0.020137,-0.353865,2.398040,0.014692,-1.332759
3,2015-04-09,-1.290250,-2.447826,-0.573167,-0.353865,1.564257,-0.372010,-1.113808
4,2015-04-10,-1.258833,-2.417617,-0.642296,-0.353865,1.564257,-0.975265,-0.680809
...,...,...,...,...,...,...,...,...
1168,2020-10-27,-1.142143,-2.073239,-0.652172,-0.353865,0.081976,-0.014838,-0.690613
1169,2020-10-28,-0.999748,-2.082301,-0.652172,-0.353865,0.267261,-0.258108,-0.345847
1170,2020-10-29,-1.076862,-2.163864,-0.652172,-0.353865,0.020214,-0.619499,-0.255653
1171,2020-10-30,-1.138879,-2.064176,-0.652172,-0.353865,-0.442999,-0.740431,-0.333756


## Setting Target Variable & Seperating Train-Test Set

In [7]:
# 목표변수 설정
y_target = mosquitoDF_sc['mosquito']
X_data = mosquitoDF_sc.drop(['mosquito'], axis=1)

In [8]:
# Train-Test Set 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_data, y_target, test_size=0.2, random_state=0)

In [9]:
X_train

Unnamed: 0,date,temp,rain_per_day,accum_rain,wind,humidity,sunshine
1042,2020-06-23,0.817727,-0.650197,-0.353865,-0.412118,-0.509816,1.309354
1097,2020-08-17,1.098667,2.634410,-0.353865,0.699593,2.216786,-1.343870
858,2019-06-23,0.171262,-0.371706,-0.353865,-0.751807,-0.028899,0.786487
772,2018-10-15,-1.916154,-0.513914,-0.353865,-1.276782,-0.581532,-0.453362
97,2015-07-12,0.995957,-0.069514,0.951006,1.255449,0.127188,-0.253038
...,...,...,...,...,...,...,...
1033,2020-06-14,0.814706,-0.507989,-0.308870,0.020214,-0.062648,1.674380
763,2018-10-06,-0.961561,1.164929,3.785727,-0.535641,0.110313,-0.503688
835,2019-05-31,-0.284888,-0.511939,-0.353865,0.174619,-1.214317,1.272100
559,2017-09-17,0.077615,-0.178145,-0.353865,0.637831,-0.796679,0.571131


In [10]:
train_date = X_train['date']
X_train.drop(['date'], axis=1, inplace=True)
test_date = X_test['date']
X_test.drop(['date'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [11]:
y_test

907    -0.401607
975    -1.071965
597    -0.907946
328     2.326338
425    -0.867553
          ...   
45      5.204022
1046   -0.170267
592    -0.490145
1036   -0.137626
609    -0.659877
Name: mosquito, Length: 235, dtype: float64

## Training LGBM & GBR & RFR

In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

### 1. LGBM(Light Gradient Boosting Machine)

In [13]:
lgb_reg = LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=7,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=10, n_jobs=-1, num_leaves=31, objective=None,
               random_state=0, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

lgb_reg.fit(X_train, y_train)

y_preds_lgb = lgb_reg.predict(X_test)
print('MAE : {:.3f}'.format(mean_absolute_error(y_test, y_preds_lgb)))
print('MSE : {:.3f}'.format(mean_squared_error(y_test, y_preds_lgb)))
print('RMSE : {:.3f}'.format(np.sqrt(mean_squared_error(y_test, y_preds_lgb))))
print('R2 : {:.3f}'.format(r2_score(y_test, y_preds_lgb)))

MAE : 0.427
MSE : 0.415
RMSE : 0.644
R2 : 0.408


In [14]:
y_preds_lgb = pd.DataFrame(y_preds_lgb, columns=['LGBM'])

y_preds_lgb

Unnamed: 0,LGBM
0,-0.123286
1,-0.503023
2,-0.455094
3,1.130293
4,-0.500257
...,...
230,0.234826
231,0.252088
232,-0.501346
233,0.476186


In [15]:
y_preds_lgb.to_csv('(1) y_preds_lgb.csv', index=False)

### 2. GBR(Gradient Boosting Regressor)

In [16]:
gb_reg = GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                           init=None, learning_rate=0.1, loss='ls', max_depth=2,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=10,
                           n_iter_no_change=None,random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0, warm_start=False)

gb_reg.fit(X_train, y_train)

y_preds_gb = gb_reg.predict(X_test)
print('MAE : {:.3f}'.format(mean_absolute_error(y_test, y_preds_gb)))
print('MSE : {:.3f}'.format(mean_squared_error(y_test, y_preds_gb)))
print('RMSE : {:.3f}'.format(np.sqrt(mean_squared_error(y_test, y_preds_gb))))
print('R2 : {:.3f}'.format(r2_score(y_test, y_preds_gb)))

MAE : 0.477
MSE : 0.515
RMSE : 0.717
R2 : 0.265


In [17]:
y_preds_gb = pd.DataFrame(y_preds_gb, columns=['GBR'])

y_preds_gb

Unnamed: 0,GBR
0,0.152233
1,-0.567270
2,-0.567270
3,0.422473
4,-0.151115
...,...
230,-0.355700
231,0.190321
232,-0.520141
233,0.422473


In [18]:
y_preds_gb.to_csv('(2) y_preds_gb.csv', index=False)

### 3. RFR(Random Forest Regressor)

In [19]:
rf_reg = RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=50, n_jobs=-1, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

rf_reg.fit(X_train, y_train)

y_preds_rf = rf_reg.predict(X_test)
print('MAE : {:.3f}'.format(mean_absolute_error(y_test, y_preds_rf)))
print('MSE : {:.3f}'.format(mean_squared_error(y_test, y_preds_rf)))
print('RMSE : {:.3f}'.format(np.sqrt(mean_squared_error(y_test, y_preds_rf))))
print('R2 : {:.3f}'.format(r2_score(y_test, y_preds_rf)))

MAE : 0.466
MSE : 0.511
RMSE : 0.715
R2 : 0.270


In [20]:
y_preds_rf = pd.DataFrame(y_preds_rf, columns=['RFR'])

y_preds_rf

Unnamed: 0,RFR
0,0.258861
1,-0.846165
2,-0.846165
3,0.475359
4,-0.194254
...,...
230,-0.532312
231,0.355118
232,-0.846165
233,0.475359


In [21]:
y_preds_gb.to_csv('(3) y_preds_rf.csv', index=False)

## MAE, MSE, RMSE, $R^2$

In [22]:
print('##### y_test와 y_preds_lgb #####\n\nMAE : {}\nMSE : {}\nRMSE : {}\nR^2 : {}'.format(mean_absolute_error(y_test, y_preds_lgb), mean_squared_error(y_test, y_preds_lgb), np.sqrt(mean_squared_error(y_test, y_preds_lgb)), r2_score(y_test, y_preds_lgb)))

##### y_test와 y_preds_lgb #####

MAE : 0.4271417721807008
MSE : 0.41459246015374723
RMSE : 0.6438885463756497
R^2 : 0.4076141938247869


In [23]:
print('##### y_test와 y_preds_gb #####\n\nMAE : {}\nMSE : {}\nRMSE : {}\nR^2 : {}'.format(mean_absolute_error(y_test, y_preds_gb), mean_squared_error(y_test, y_preds_gb), np.sqrt(mean_squared_error(y_test, y_preds_gb)), r2_score(y_test, y_preds_gb)))

##### y_test와 y_preds_gb #####

MAE : 0.4771401466420605
MSE : 0.5145370699375001
RMSE : 0.7173123935479577
R^2 : 0.26480945440029346


In [24]:
print('##### y_test와 y_preds_rf #####\n\nMAE : {}\nMSE : {}\nRMSE : {}\nR^2 : {}'.format(mean_absolute_error(y_test, y_preds_rf), mean_squared_error(y_test, y_preds_rf), np.sqrt(mean_squared_error(y_test, y_preds_rf)), r2_score(y_test, y_preds_rf)))

##### y_test와 y_preds_rf #####

MAE : 0.4655783756370011
MSE : 0.5110388019001808
RMSE : 0.7148697796803141
R^2 : 0.26980791561383277


## Combining Data

In [25]:
y_test = pd.concat([test_date, y_test], axis=1)

In [26]:
y_test.to_csv('(4) y_test.csv', index=False)

In [27]:
combined_data = pd.concat([y_preds_lgb, y_preds_gb, y_preds_rf], axis=1)

combined_data.head()

Unnamed: 0,LGBM,GBR,RFR
0,-0.123286,0.152233,0.258861
1,-0.503023,-0.56727,-0.846165
2,-0.455094,-0.56727,-0.846165
3,1.130293,0.422473,0.475359
4,-0.500257,-0.151115,-0.194254


In [28]:
combined_data.to_csv('(5) combined_data.csv', index=False)

<span style="color:red">***y_test와 combined_data를 합칠 때 날짜 매칭이 잘 되지 않아서 수동으로 합쳐줘야 합니다...!***</span><br>
<br>
<span style="color:blue">***→ GitHub에 '(6) completed_data.csv'로 올렸어요!***</span>