# 내일 호주에는 비가 올까? (이진분류분석)

In [110]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [134]:
weather = pd.read_csv("weatherAUS.csv")
weather.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No


- RISK_MM: The amount of next day rain in mm. Used to create response variable RainTomorrow. A kind of measure of the "risk".
- RainTomorrow(Target): 1 if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 0

In [135]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
Date             142193 non-null object
Location         142193 non-null object
MinTemp          141556 non-null float64
MaxTemp          141871 non-null float64
Rainfall         140787 non-null float64
Evaporation      81350 non-null float64
Sunshine         74377 non-null float64
WindGustDir      132863 non-null object
WindGustSpeed    132923 non-null float64
WindDir9am       132180 non-null object
WindDir3pm       138415 non-null object
WindSpeed9am     140845 non-null float64
WindSpeed3pm     139563 non-null float64
Humidity9am      140419 non-null float64
Humidity3pm      138583 non-null float64
Pressure9am      128179 non-null float64
Pressure3pm      128212 non-null float64
Cloud9am         88536 non-null float64
Cloud3pm         85099 non-null float64
Temp9am          141289 non-null float64
Temp3pm          139467 non-null float64
RainToday        140787 non-null obje

## 데이터 전처리

In [179]:
# 머신러닝 알고리즘에 불필요한 속성 제거, 인덱스 재설정
def drop_features(df):
    df.drop(['Location','WindGustDir','WindDir9am','WindDir3pm','Evaporation', 'Sunshine'],axis=1,inplace=True)
    df = df.set_index(['Date'])
    return df

# Null 처리 함수 - 데이터 갯수가 많으므로 null row 다 드랍
def fillna(df):
    df = df.dropna()
    return df

# 레이블 인코딩 수행 - RainToday, RainTomorrow 칼럼
def format_features(df):
    df['RainToday'] = df['RainToday'].replace(['Yes','No'],[1,0])
    df['RainTomorrow'] = df['RainTomorrow'].replace(['Yes','No'],[1,0])
    return df

# 앞에서 설정한 Data Preprocessing 함수 호출
def transform_features(df):
    df = drop_features(df)
    df = fillna(df)
    df = format_features(df)
    return df

In [189]:
# 원본 데이터를 재로딩 하고, Label 데이터 셋 추출. 
weather_df = pd.read_csv("weatherAUS.csv")
y_weather_df = transform_features(weather_df)
y_weather_df = y_weather_df['RainTomorrow']

In [190]:
# 원본 데이터를 재로딩 하고, feature 데이터 셋 추출. 
weather_df = pd.read_csv("weatherAUS.csv")
X_weather_df = transform_features(weather_df)
X_weather_df= X_weather_df.drop('RainTomorrow',axis=1)

In [191]:
y_weather_df

Date
2008-12-05    0
2008-12-12    1
2008-12-13    1
2008-12-17    1
2008-12-18    1
             ..
2017-05-25    0
2017-05-26    0
2017-05-29    0
2017-05-30    0
2017-06-24    0
Name: RainTomorrow, Length: 73411, dtype: int64

In [192]:
X_weather_df.head(3)

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2008-12-05,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,0,0.2
2008-12-12,15.9,21.7,2.2,31.0,15.0,13.0,89.0,91.0,1010.5,1004.2,8.0,8.0,15.9,17.0,1,15.6
2008-12-13,15.9,18.6,15.6,61.0,28.0,28.0,76.0,93.0,994.3,993.0,8.0,8.0,17.4,15.8,1,3.6


In [193]:
# test 셋과 training set 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_weather_df, y_weather_df, \
                                                  test_size=0.3, random_state=90)

In [194]:
y_test

Date
2014-07-19    1
2011-09-14    0
2009-05-29    1
2014-07-14    0
2009-09-02    0
             ..
2012-05-09    0
2016-12-01    0
2017-05-06    0
2014-06-05    0
2015-11-19    0
Name: RainTomorrow, Length: 22024, dtype: int64

In [195]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

dt_clf=DecisionTreeClassifier()

dt_clf.fit(X_train,y_train)
pred=dt_clf.predict(X_test)
#print(pred)
print("예측 정확도:{0:.4f}".format(accuracy_score(y_test,pred)))

예측 정확도:0.9999


In [149]:
from sklearn.model_selection import KFold

def exec_kfold(clf, folds=5):
    # 폴드 세트를 5개인 KFold객체를 생성, 폴드 수만큼 예측결과 저장을 위한  리스트 객체 생성.
    kfold = KFold(n_splits=folds)
    scores = []
    
    # KFold 교차 검증 수행. 
    for iter_count , (train_index, test_index) in enumerate(kfold.split(X_weather_df)):
        # X_titanic_df 데이터에서 교차 검증별로 학습과 검증 데이터를 가리키는 index 생성
        X_train, X_test = X_weather_df.values[train_index], X_weather_df.values[test_index]
        y_train, y_test = y_weather_df.values[train_index], y_weather_df.values[test_index]
        
        # Classifier 학습, 예측, 정확도 계산 
        clf.fit(X_train, y_train) 
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print("교차 검증 {0} 정확도: {1:.4f}".format(iter_count, accuracy))     
    
    # 5개 fold에서의 평균 정확도 계산. 
    mean_score = np.mean(scores)
    print("평균 정확도: {0:.4f}".format(mean_score)) 
# exec_kfold 호출
exec_kfold(dt_clf , folds=5)

교차 검증 0 정확도: 1.0000
교차 검증 1 정확도: 1.0000
교차 검증 2 정확도: 1.0000
교차 검증 3 정확도: 1.0000
교차 검증 4 정확도: 1.0000
평균 정확도: 1.0000
