### Decision Tree를 이용한 강수량 예측

In [135]:
# dataset 출처: https://www.kaggle.com/shivapan/rainprediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

data = pd.read_csv('rain.csv')
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [136]:
data.count()

Date             145460
Location         145460
MinTemp          143975
MaxTemp          144199
Rainfall         142199
Evaporation       82670
Sunshine          75625
WindGustDir      135134
WindGustSpeed    135197
WindDir9am       134894
WindDir3pm       141232
WindSpeed9am     143693
WindSpeed3pm     142398
Humidity9am      142806
Humidity3pm      140953
Pressure9am      130395
Pressure3pm      130432
Cloud9am          89572
Cloud3pm          86102
Temp9am          143693
Temp3pm          141851
RainToday        142199
RainTomorrow     142193
dtype: int64

In [137]:
# 결측치
data.isna().sum()

Date                 0
Location             0
MinTemp           1485
MaxTemp           1261
Rainfall          3261
Evaporation      62790
Sunshine         69835
WindGustDir      10326
WindGustSpeed    10263
WindDir9am       10566
WindDir3pm        4228
WindSpeed9am      1767
WindSpeed3pm      3062
Humidity9am       2654
Humidity3pm       4507
Pressure9am      15065
Pressure3pm      15028
Cloud9am         55888
Cloud3pm         59358
Temp9am           1767
Temp3pm           3609
RainToday         3261
RainTomorrow      3267
dtype: int64

In [139]:
# RainTomrrow, Rainfall, Date, Location은 필요 없어서 삭제
del data['RainTomorrow']
del data['Rainfall']
del data['Date']
del data['Location']

KeyError: 'RainTomorrow'

In [140]:
# 전체에서 상당수가 결측치인 Evaopartion, Sunshine, Cloud9am, Cloud3pm column 제거 & 결측치가 있는 나머지 row 제거 
data = data.drop(['Evaporation','Sunshine','Cloud9am','Cloud3pm'],axis=1)
data = data.dropna()

In [66]:
data.count()

Date             112925
Location         112925
MinTemp          112925
MaxTemp          112925
Rainfall         112925
WindGustDir      112925
WindGustSpeed    112925
WindDir9am       112925
WindDir3pm       112925
WindSpeed9am     112925
WindSpeed3pm     112925
Humidity9am      112925
Humidity3pm      112925
Pressure9am      112925
Pressure3pm      112925
Temp9am          112925
Temp3pm          112925
RainToday        112925
RainTomorrow     112925
dtype: int64

In [141]:
# 범주화 변수를 dummy 변수로 변환 
data = pd.get_dummies(data)
data.head()

Unnamed: 0,MinTemp,MaxTemp,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,...,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_No,RainToday_Yes
0,13.4,22.9,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,...,0,0,0,0,0,0,1,0,1,0
1,7.4,25.1,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,...,0,0,0,0,0,0,0,1,1,0
2,12.9,25.7,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,...,0,0,0,0,0,0,0,1,1,0
3,9.2,28.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,...,0,0,0,0,0,0,0,0,1,0
4,17.5,32.3,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,...,0,0,0,0,0,0,0,0,1,0


In [142]:
# dummy 변수가 n-1이 되도록 삭제 
del data['RainToday_No']
del data['WindDir3pm_WSW']

In [143]:
print(len(data[data['RainToday_Yes']==1]))
print(len(data[data['RainToday_Yes']==0]))

25613
88066


In [144]:
# Decision Tree 적용
from sklearn.tree import DecisionTreeClassifier
import sklearn.model_selection

# 비가 온 날과 오지 않은 날의 비율이 50:50이 되도록 비가 오지 않은 날의 데이터를 임의로 추출
rain = data[data['RainToday_Yes']==1]
no_rain = data[data['RainToday_Yes']==0].sample(len(rain))

X = pd.concat([rain,no_rain], ignore_index=True)
y = X['RainToday_Yes']
del X['RainToday_Yes']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y)

model = DecisionTreeClassifier(random_state=0)
model.fit(X_train,y_train)
y_test_pred = model.predict(X_test)
print(y_test_pred)

DecisionTreeClassifier(random_state=0)

[0 1 1 ... 1 1 1]


In [145]:
# 예측 모델 평가
results_test = pd.DataFrame({'y_test':y_test, 'y_pred':y_test_pred})
print(model.score(X_test,y_test))
print(model.score(X_train,y_train)) 

0.7116420707425627
1.0


In [146]:
# 모델이 굉장히 과적합되었음을 확인 -> 트리의 깊이를 얕게하여 모델 단순화
X = pd.concat([rain,no_rain], ignore_index=True)
y = X['RainToday_Yes']
del X['RainToday_Yes']
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X,y)

model = DecisionTreeClassifier(random_state=0, max_depth=5)
model.fit(X_train,y_train)
print(model.score(X_test,y_test))
print(model.score(X_train,y_train)) 

DecisionTreeClassifier(max_depth=5, random_state=0)

0.7594284375732021
0.7607954397563705


In [150]:
# 모델에 기여하는 변수 확인
importance = pd.DataFrame({'feauture_names':X.columns, "coefficient":model.feature_importances_})
importance.sort_values(by='coefficient',ascending=False)

Unnamed: 0,feauture_names,coefficient
5,Humidity9am,0.565103
6,Humidity3pm,0.114987
1,MaxTemp,0.108015
7,Pressure9am,0.092289
2,WindGustSpeed,0.053578
0,MinTemp,0.048626
10,Temp3pm,0.00773
30,WindDir9am_N,0.006106
32,WindDir9am_NNE,0.003566
53,WindDir3pm_SSE,0.0
