# 내일 호주에는 비가 올까? (이진분류분석)

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics import f1_score, confusion_matrix, precision_recall_curve, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [25]:
# Load the weather observations from the local CSV file into a DataFrame.
weather = pd.read_csv("weatherAUS.csv")
# Preview the first 8 rows (last expression of the cell, so it is displayed).
weather.head(8)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
5,2008-12-06,Albury,14.6,29.7,0.2,,,WNW,56.0,W,...,23.0,1009.2,1005.4,,,20.6,28.9,No,0.0,No
6,2008-12-07,Albury,14.3,25.0,0.0,,,W,50.0,SW,...,19.0,1009.6,1008.2,1.0,,18.1,24.6,No,0.0,No
7,2008-12-08,Albury,7.7,26.7,0.0,,,W,35.0,SSE,...,19.0,1013.4,1010.1,,,16.3,25.5,No,0.0,No


- RISK_MM: The amount of next day rain in mm. Used to create response variable RainTomorrow. A kind of measure of the "risk".
- RainTomorrow(Target): 1 if precipitation (mm) in the 24 hours to 9am exceeds 1mm, otherwise 0

In [26]:
# Print each column's dtype and non-null count to see where values are missing.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 24 columns):
Date             142193 non-null object
Location         142193 non-null object
MinTemp          141556 non-null float64
MaxTemp          141871 non-null float64
Rainfall         140787 non-null float64
Evaporation      81350 non-null float64
Sunshine         74377 non-null float64
WindGustDir      132863 non-null object
WindGustSpeed    132923 non-null float64
WindDir9am       132180 non-null object
WindDir3pm       138415 non-null object
WindSpeed9am     140845 non-null float64
WindSpeed3pm     139563 non-null float64
Humidity9am      140419 non-null float64
Humidity3pm      138583 non-null float64
Pressure9am      128179 non-null float64
Pressure3pm      128212 non-null float64
Cloud9am         88536 non-null float64
Cloud3pm         85099 non-null float64
Temp9am          141289 non-null float64
Temp3pm          139467 non-null float64
RainToday        140787 non-null object

**Null 값 처리**

In [34]:
# Drop the location and wind-direction columns (object dtype) together with
# Evaporation and Sunshine, the two columns with the fewest non-null values
# in the info() listing above. The result is assigned back to `weather`.
weather = weather.drop(['Location', 'WindGustDir','WindDir9am','WindDir3pm','Evaporation', 'Sunshine'], axis=1)

In [35]:
# Display a Date-indexed version of the frame. The result is not assigned
# back, so `weather` itself keeps its default integer index and Date column.
weather.set_index(['Date'])

Unnamed: 0_level_0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2008-12-01,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
2008-12-02,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2008-12-03,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
2008-12-04,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
2008-12-05,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-06-20,3.5,21.8,0.0,31.0,15.0,13.0,59.0,27.0,1024.7,1021.2,,,9.4,20.9,No,0.0,No
2017-06-21,2.8,23.4,0.0,31.0,13.0,11.0,51.0,24.0,1024.6,1020.3,,,10.1,22.4,No,0.0,No
2017-06-22,3.6,25.3,0.0,22.0,13.0,9.0,56.0,21.0,1023.5,1019.1,,,10.9,24.5,No,0.0,No
2017-06-23,5.4,26.9,0.0,37.0,9.0,9.0,53.0,24.0,1021.0,1016.8,,,12.5,26.1,No,0.0,No


In [38]:
# Re-check dtypes and non-null counts of the columns that remain.
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142193 entries, 0 to 142192
Data columns (total 18 columns):
Date             142193 non-null object
MinTemp          141556 non-null float64
MaxTemp          141871 non-null float64
Rainfall         140787 non-null float64
WindGustSpeed    132923 non-null float64
WindSpeed9am     140845 non-null float64
WindSpeed3pm     139563 non-null float64
Humidity9am      140419 non-null float64
Humidity3pm      138583 non-null float64
Pressure9am      128179 non-null float64
Pressure3pm      128212 non-null float64
Cloud9am         88536 non-null float64
Cloud3pm         85099 non-null float64
Temp9am          141289 non-null float64
Temp3pm          139467 non-null float64
RainToday        140787 non-null object
RISK_MM          142193 non-null float64
RainTomorrow     142193 non-null object
dtypes: float64(15), object(3)
memory usage: 19.5+ MB


In [39]:
# Display only the rows that have no missing value in any column. The result
# is not assigned back, so `weather` is left unchanged.
weather.dropna()

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
4,2008-12-05,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
11,2008-12-12,15.9,21.7,2.2,31.0,15.0,13.0,89.0,91.0,1010.5,1004.2,8.0,8.0,15.9,17.0,Yes,15.6,Yes
12,2008-12-13,15.9,18.6,15.6,61.0,28.0,28.0,76.0,93.0,994.3,993.0,8.0,8.0,17.4,15.8,Yes,3.6,Yes
15,2008-12-17,14.1,20.9,0.0,22.0,11.0,9.0,69.0,82.0,1012.2,1010.4,8.0,1.0,17.2,18.1,No,16.8,Yes
16,2008-12-18,13.5,22.9,16.8,63.0,6.0,20.0,80.0,65.0,1005.8,1002.2,8.0,1.0,18.0,21.5,Yes,10.6,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142162,2017-05-25,14.6,26.3,0.0,37.0,19.0,20.0,61.0,36.0,1022.0,1018.6,7.0,1.0,15.4,25.0,No,0.4,No
142163,2017-05-26,14.3,27.6,0.4,39.0,0.0,20.0,68.0,27.0,1020.5,1016.1,4.0,7.0,16.1,27.2,No,0.0,No
142166,2017-05-29,12.7,22.2,0.0,37.0,19.0,13.0,59.0,34.0,1024.3,1021.7,8.0,8.0,13.9,21.0,No,0.0,No
142167,2017-05-30,9.4,22.7,0.0,35.0,13.0,17.0,62.0,32.0,1026.6,1023.6,8.0,4.0,11.8,22.3,No,0.0,No


In [41]:
def fillna(df):
    """Return a new frame holding only the rows of ``df`` without missing values.

    Despite the name, nothing is imputed: every row that contains at least
    one NaN is removed. The frame passed in is not modified.
    """
    return df.dropna()

# Display the NaN-free rows; the return value is not assigned to anything.
fillna(weather)

Unnamed: 0,Date,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
4,2008-12-05,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No
11,2008-12-12,15.9,21.7,2.2,31.0,15.0,13.0,89.0,91.0,1010.5,1004.2,8.0,8.0,15.9,17.0,Yes,15.6,Yes
12,2008-12-13,15.9,18.6,15.6,61.0,28.0,28.0,76.0,93.0,994.3,993.0,8.0,8.0,17.4,15.8,Yes,3.6,Yes
15,2008-12-17,14.1,20.9,0.0,22.0,11.0,9.0,69.0,82.0,1012.2,1010.4,8.0,1.0,17.2,18.1,No,16.8,Yes
16,2008-12-18,13.5,22.9,16.8,63.0,6.0,20.0,80.0,65.0,1005.8,1002.2,8.0,1.0,18.0,21.5,Yes,10.6,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142162,2017-05-25,14.6,26.3,0.0,37.0,19.0,20.0,61.0,36.0,1022.0,1018.6,7.0,1.0,15.4,25.0,No,0.4,No
142163,2017-05-26,14.3,27.6,0.4,39.0,0.0,20.0,68.0,27.0,1020.5,1016.1,4.0,7.0,16.1,27.2,No,0.0,No
142166,2017-05-29,12.7,22.2,0.0,37.0,19.0,13.0,59.0,34.0,1024.3,1021.7,8.0,8.0,13.9,21.0,No,0.0,No
142167,2017-05-30,9.4,22.7,0.0,35.0,13.0,17.0,62.0,32.0,1026.6,1023.6,8.0,4.0,11.8,22.3,No,0.0,No


In [None]:
from sklearn.preprocessing import LabelEncoder

# Remove attributes the model does not use and make Date the index.
def drop_features(df):
    """Drop the unused columns from ``df`` and index it by ``Date``.

    The frame is modified in place and also returned, so the call can be
    chained. Calling it again on an already-processed frame is a no-op.

    Args:
        df: Weather DataFrame; columns that are absent are skipped.

    Returns:
        The same DataFrame object, without the unused columns and with
        ``Date`` as its index (when a ``Date`` column was present).
    """
    unused_columns = ['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm',
                      'Evaporation', 'Sunshine']
    # errors='ignore' avoids a KeyError when some of these columns have
    # already been removed from the frame.
    df.drop(columns=unused_columns, errors='ignore', inplace=True)
    # set_index returns a new frame unless inplace=True; without it the
    # re-indexed result would be thrown away.
    if 'Date' in df.columns:
        df.set_index('Date', inplace=True)
    return df

# Null-handling function.
def fillna(df):
    """Return ``df`` without the rows that contain any missing value.

    Despite the name, nothing is imputed: rows with at least one NaN are
    dropped. ``dropna`` returns a new frame, so its result must be returned
    rather than discarded; the input frame is left unmodified.

    Args:
        df: DataFrame that may contain missing values.

    Returns:
        A new DataFrame holding only the complete rows.
    """
    return df.dropna()

# Perform label encoding.
def format_features(df, features=('RainToday', 'RainTomorrow')):
    """Replace the given categorical columns with integer label codes.

    The default columns are the two Yes/No columns of the weather data. The
    frame is modified in place and also returned.

    Args:
        df: DataFrame containing every column named in ``features``.
        features: Names of the columns to encode.

    Returns:
        The same DataFrame object with the listed columns label-encoded.

    Raises:
        KeyError: If a column named in ``features`` is not present.
    """
    # NOTE(review): presumably called after rows with missing values have
    # been removed; confirm how LabelEncoder should treat NaN otherwise.
    for feature in features:
        # A fresh encoder per column keeps each column's classes independent.
        le = LabelEncoder()
        le = le.fit(df[feature])
        df[feature] = le.transform(df[feature])
    return df

# Call the data preprocessing functions defined earlier.
def transform_features(df):
    """Run the preprocessing steps on ``df`` and return the result.

    Steps, in order: remove unused columns and index by Date
    (``drop_features``), remove rows with missing values (``fillna``), then
    label-encode the categorical columns (``format_features``).

    Args:
        df: Raw weather DataFrame.

    Returns:
        The preprocessed DataFrame.
    """
    # Remove the unused columns before discarding incomplete rows. Otherwise
    # rows would be thrown away only because of NaN in columns such as
    # Evaporation or Sunshine, which are dropped anyway.
    df = drop_features(df)
    df = fillna(df)
    df = format_features(df)
    return df