# FDA HW2 - Decision Tree
## Rain Tomorrow or not 
* dataset: [Rain in Australia](https://www.kaggle.com/jsphyg/weather-dataset-rattle-package)
* object: to predict whether it will rain tomorrow or not

### prepare data

In [30]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split

import os
print(os.listdir('./data'))

['weather-dataset-rattle-package.zip', 'suicide-rates-overview-1985-to-2016.zip', 'weatherAUS.csv', 'master.csv']


In [50]:
df = pd.read_csv('./data/weatherAUS.csv')
df.head(5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,25.0,1010.6,1007.8,,,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,0.0,No
3,2008-12-04,Albury,9.2,28.0,0.0,,,NE,24.0,SE,...,16.0,1017.6,1012.8,,,18.1,26.5,No,1.0,No
4,2008-12-05,Albury,17.5,32.3,1.0,,,W,41.0,ENE,...,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,0.2,No


In [51]:
df = df.fillna(-1)
df.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,-1.0,-1.0,W,44.0,W,...,22.0,1007.7,1007.1,8.0,-1.0,16.9,21.8,No,0.0,No
1,2008-12-02,Albury,7.4,25.1,0.0,-1.0,-1.0,WNW,44.0,NNW,...,25.0,1010.6,1007.8,-1.0,-1.0,17.2,24.3,No,0.0,No
2,2008-12-03,Albury,12.9,25.7,0.0,-1.0,-1.0,WSW,46.0,W,...,30.0,1007.6,1008.7,-1.0,2.0,21.0,23.2,No,0.0,No


In [64]:
# corr
df.corr()

pandas.core.frame.DataFrame

### drop the attribute (columns) that not significant

In [53]:
df = df.drop(['Date', 'Location', 'RISK_MM', 'WindDir9am', 
              #'WindDir3pm', ], axis=1)
df.head()

ValueError: At based indexing on an integer index can only have integer indexers

In [42]:
X = pd.get_dummies(df.drop('RainTomorrow', axis=1))
Y = pd.get_dummies(df['RainTomorrow'])

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [43]:
X_train.head(5)

Unnamed: 0,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,...,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_-1,RainToday_No,RainToday_Yes
17969,15.1,23.9,0.0,-1.0,-1.0,67.0,19.0,22.0,38.0,68.0,...,0,0,0,0,1,0,0,0,1,0
124769,9.7,14.2,7.6,-1.0,-1.0,50.0,15.0,28.0,91.0,56.0,...,0,0,0,0,1,0,0,0,0,1
39287,13.2,25.4,0.0,3.2,8.8,30.0,6.0,17.0,79.0,63.0,...,0,0,0,0,0,0,0,0,1,0
127749,7.6,14.8,0.0,4.0,7.0,94.0,30.0,35.0,52.0,45.0,...,0,0,0,0,0,1,0,0,1,0
40651,12.9,22.2,0.0,4.0,7.9,37.0,15.0,20.0,69.0,52.0,...,0,1,0,0,0,0,0,0,1,0


### Building the decision tree

In [44]:
dtree = DecisionTreeClassifier(max_depth=3)
dtree = dtree.fit(X_train, y_train)

In [45]:
from sklearn.metrics import accuracy_score

y_predict = dtree.predict(X_test)
accuracy_score(y_predict, y_test)


0.8271387882836949

In [46]:
from sklearn.externals.six import StringIO
import pydotplus

###

dot_data = StringIO()
export_graphviz(dtree, 
                out_file=dot_data,  
                filled=True, 
                feature_names=list(X_train),
                class_names=[X_train.columns],
                special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
graph.write_pdf("tree.pdf")
graph.write_png("tree.png")


###

True

In [47]:
dtree.feature_importances_

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.05243055, 0.        , 0.        , 0.        , 0.78218702,
       0.        , 0.        , 0.        , 0.04241798, 0.01152203,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.11144243, 0.        ])

## Decision Tree
![tree](tree.png)

### Comclusion
* G