## [作業重點]
目前你應該已經要很清楚資料集中，資料的型態是什麼樣子囉！包含特徵 (features) 與標籤 (labels)。因此要記得未來不管什麼專案，必須要把資料清理成相同的格式，才能送進模型訓練。
今天的作業開始踏入決策樹這個非常重要的模型，請務必確保你理解模型中每個超參數的意思，並試著調整看看，對最終預測結果的影響為何

## 作業

1. 試著調整 DecisionTreeClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型的結果進行比較

In [4]:
from sklearn import datasets, metrics

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [164]:
?DecisionTreeRegressor

In [124]:
import numpy as np

In [146]:
def DT_clf(name, dataset, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None):   
    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, random_state=4)
    clf = DecisionTreeClassifier(criterion=criterion, 
                                 splitter=splitter, 
                                 max_depth=max_depth, min_samples_split=min_samples_split, 
                                 min_samples_leaf=min_samples_split, 
                                 min_weight_fraction_leaf=min_weight_fraction_leaf, 
                                 max_features=max_features, 
                                 max_leaf_nodes=max_leaf_nodes)
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = metrics.accuracy_score(y_test, y_pred)
    #print(name)
    #print("Acuuracy: ", acc)
    #print(dataset['feature_names'])
    np.set_printoptions(edgeitems=30, linewidth=100000, 
                        formatter=dict(float=lambda x: "%.3g" % x))    
    #print("Feature importance", clf.feature_importances_)
    print(acc, clf.feature_importances_)

In [159]:
def DT_reg(name, dataset, criterion='mse', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, max_leaf_nodes=None):   
    x_train, x_test, y_train, y_test = train_test_split(dataset.data, dataset.target, test_size=0.25, random_state=4)
    regr = DecisionTreeRegressor(criterion=criterion, 
                                splitter=splitter, 
                                max_depth=max_depth, min_samples_split=min_samples_split, 
                                min_samples_leaf=1, min_weight_fraction_leaf=min_weight_fraction_leaf, 
                                max_features=max_features,  
                                max_leaf_nodes=max_leaf_nodes)
    regr.fit(x_train, y_train)
    y_pred = regr.predict(x_test)
    #print('name')
    np.set_printoptions(edgeitems=30, linewidth=100000, 
                        formatter=dict(float=lambda x: "%.3g" % x))     
    
    print("%.2f" % metrics.mean_squared_error(y_test, y_pred))

In [108]:
iris = datasets.load_iris()
boston = datasets.load_boston()
wine = datasets.load_wine()

In [119]:
def view_dataset(name, dataset):
    print(name)
    print(dataset.keys())
    print('data', dataset['data'].shape)
    print('target', dataset['target'].shape)
    try:
        print('target_names', dataset['target_names'])
    except:
        pass
    try:
        print('feature_names', dataset['feature_names'])
    except:
        pass
    #print('description', dataset['DESCR'])

In [110]:
view_dataset('iris', iris)

iris
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
data (150, 4)
target (150,)
target_names ['setosa' 'versicolor' 'virginica']
feature_names ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [111]:
view_dataset('wine', wine)

wine
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names'])
data (178, 13)
target (178,)
target_names ['class_0' 'class_1' 'class_2']
feature_names ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']


In [112]:
view_dataset('boston', boston)

boston
dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])
data (506, 13)
target (506,)
feature_names ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [127]:
DT_clf('iris', iris)

iris
Acuuracy:  0.973684210526
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance [0.0134 0 0.554 0.433]


In [134]:
DT_reg('iris', iris)

iris
Acuuracy:  0.973684210526
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance [0 0.00957 0.0319 0.959]


In [138]:
DT_clf('iris', iris, criterion='entropy')

iris
Acuuracy:  1.0
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance [0.0282 0 0.078 0.894]


In [148]:
for i in range(1,10):
    DT_clf('iris', iris, max_depth=i)

0.684210526316 [0 0 0 1]
0.973684210526 [0 0 0 1]
0.973684210526 [0 0 0.558 0.442]
0.973684210526 [0.0205 0 0.547 0.433]
0.973684210526 [0.0205 0 0.0627 0.917]
0.973684210526 [0.0134 0.00705 0.547 0.433]
0.973684210526 [0.0205 0 0.547 0.433]
0.973684210526 [0.0134 0.00705 0.547 0.433]
0.973684210526 [0.0205 0 0.0627 0.917]


In [150]:
for i in range(1,10):
    DT_clf('iris', iris, criterion='entropy', max_depth=i)

0.684210526316 [0 0 0 1]
0.973684210526 [0 0 0 1]
0.973684210526 [0 0 0.671 0.329]
0.973684210526 [0.0282 0.00733 0.647 0.317]
0.973684210526 [0.0282 0.00733 0.078 0.886]
0.973684210526 [0.0355 0 0.078 0.886]
0.973684210526 [0.0355 0 0.647 0.317]
0.973684210526 [0.0282 0.00733 0.647 0.317]
0.973684210526 [0.0282 0.00733 0.078 0.886]


In [149]:
DT_clf('wine', wine)

0.866666666667 [0.0138 0.0311 0.007 0 0.0446 0 0.0825 0 0 0.341 0 0.0868 0.393]


In [151]:
for i in range(1,10):
    DT_clf('wine', wine, max_depth=i)

0.644444444444 [0 0 0 0 0 0 0 0 0 0 0 0 1]
0.888888888889 [0 0 0 0 0 0.101 0 0 0 0.417 0 0 0.481]
0.911111111111 [0 0.00733 0 0 0 0 0.132 0 0 0.404 0 0.0454 0.412]
0.888888888889 [0.0142 0 0 0 0.046 0 0.0852 0 0.00723 0.352 0 0.0896 0.406]
0.888888888889 [0.0138 0.0311 0 0 0 0.0505 0.0825 0 0 0.386 0 0.0434 0.393]
0.866666666667 [0.0138 0 0 0 0.0446 0 0.133 0 0 0.372 0 0.0434 0.393]
0.888888888889 [0.0138 0.0311 0 0 0.0446 0.0825 0.0435 0 0 0.348 0 0.0434 0.393]
0.866666666667 [0.0138 0 0 0 0.0446 0.0825 0 0 0 0.372 0.007 0.0868 0.393]
0.888888888889 [0.0138 0.0311 0 0 0.0446 0 0 0 0 0.348 0 0.169 0.393]


In [152]:
for i in range(1,10):
    DT_clf('wine', wine, criterion='entropy', max_depth=i)

0.533333333333 [0 0 0 0 0 0 1 0 0 0 0 0 0]
0.955555555556 [0 0 0 0 0 0 0.465 0 0 0.159 0 0 0.376]
0.955555555556 [0 0 0 0 0.0787 0 0.42 0 0 0.144 0 0 0.357]
0.955555555556 [0 0 0 0 0 0 0.424 0 0 0.221 0 0 0.355]
0.977777777778 [0.0181 0 0 0 0 0 0.418 0.00609 0 0.221 0 0 0.337]
0.933333333333 [0 0 0 0 0.0783 0 0.418 0 0 0.143 0 0.00609 0.355]
0.933333333333 [0 0.0181 0 0.00609 0.0783 0 0.418 0 0 0.143 0 0 0.337]
0.955555555556 [0 0.0181 0 0 0.0783 0 0.418 0 0.00609 0.143 0 0 0.337]
0.955555555556 [0 0 0 0.0181 0.0783 0.00609 0.418 0 0 0.143 0 0 0.337]


In [162]:
for i in range(1,20):
    DT_reg('boston', boston, max_depth=i)

54.29
35.76
34.27
27.73
25.13
24.72
26.87
27.35
27.73
26.69
28.06
29.63
25.73
31.97
25.37
29.25
26.66
29.95
26.06
