## [作業重點]
目前你應該已經要很清楚資料集中，資料的型態是什麼樣子囉！包含特徵 (features) 與標籤 (labels)。因此要記得未來不管什麼專案，必須要把資料清理成相同的格式，才能送進模型訓練。
今天的作業開始踏入決策樹這個非常重要的模型，請務必確保你理解模型中每個超參數的意思，並試著調整看看，對最終預測結果的影響為何

## 作業

1. 試著調整 DecisionTreeClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型的結果進行比較

In [1]:
from sklearn import datasets, metrics
import numpy as np

# 如果是分類問題，請使用 DecisionTreeClassifier，若為回歸問題，請使用 DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split

# Iris, change test size

In [2]:
iris = datasets.load_iris()

sizes = np.linspace(0.1, 0.9, 9)
print(f'random_state: 4')

for size in sizes:

    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = size, random_state = 4)

    clf = DecisionTreeClassifier()

    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)

    print('**************************************')
    print(f'test_size: {size}')
    
    acc = metrics.accuracy_score(y_test, y_pred)
    print("Acuuracy: ", acc)

    print(iris.feature_names)
    print("Feature importance: ", clf.feature_importances_)

random_state: 4
**************************************
test_size: 0.1
Acuuracy:  0.9333333333333333
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.         0.01482213 0.0502622  0.93491566]
**************************************
test_size: 0.2
Acuuracy:  0.9666666666666667
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.01677501 0.         0.51670178 0.46652322]
**************************************
test_size: 0.30000000000000004
Acuuracy:  0.9782608695652174
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.01943199 0.         0.51203975 0.46852826]
**************************************
test_size: 0.4
Acuuracy:  0.9666666666666667
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.         0.02237971 0.08637986 0.89124043]
********************************

# Iris, change random_state

In [3]:
print(f'test_size: 0.2')

for i in range(4, 44, 2):
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size = 0.2, random_state = i)

    clf = DecisionTreeClassifier()

    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    
    print('**************************************')
    print(f'random_state: {i}')
    
    acc = metrics.accuracy_score(y_test, y_pred)
    print("Acuuracy: ", acc)

    print(iris.feature_names)
    print("Feature importance: ", clf.feature_importances_)

test_size: 0.2
**************************************
random_state: 4
Acuuracy:  0.9666666666666667
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.         0.01677501 0.05652535 0.92669965]
**************************************
random_state: 6
Acuuracy:  0.9333333333333333
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.04520172 0.         0.55501128 0.39978699]
**************************************
random_state: 8
Acuuracy:  0.9
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.01667014 0.         0.53122259 0.45210727]
**************************************
random_state: 10
Acuuracy:  0.9333333333333333
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
Feature importance:  [0.         0.02588186 0.90897639 0.06514174]
**************************************
random_state: 12
Acu

# Boston

In [4]:
boston = datasets.load_boston()

reg = DecisionTreeRegressor()
    
print('random_state: 4')
    
for size in sizes:

    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = size, random_state = 4)

    reg.fit(x_train, y_train)

    y_pred = reg.predict(x_test)

    print('**************************************')
    print(f'test_size: {size}')
    
    print("Mean square error: ", metrics.mean_squared_error(y_test, y_pred))

    print(boston.feature_names)
    print("Feature importance: ", reg.feature_importances_)

random_state: 4
**************************************
test_size: 0.1
Mean square error:  24.27156862745098
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Feature importance:  [0.05710086 0.00144388 0.00995851 0.00136985 0.01493315 0.56235628
 0.00635615 0.07840483 0.00236227 0.01160218 0.02489037 0.01875482
 0.21046683]
**************************************
test_size: 0.2
Mean square error:  25.679117647058828
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Feature importance:  [3.66119654e-02 6.73080439e-04 9.36102752e-03 1.09161468e-05
 3.22172537e-02 5.98445698e-01 1.06647144e-02 6.24050670e-02
 3.94575947e-04 1.45672492e-02 1.65573805e-02 7.09948727e-03
 2.10991584e-01]
**************************************
test_size: 0.30000000000000004
Mean square error:  30.376118421052638
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Feature importance:  [4.98427291e-02 1.5

In [5]:
print('test size: 0.2')

for i in range(4, 44, 2):

    x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = 0.2, random_state = i)

    reg.fit(x_train, y_train)

    y_pred = reg.predict(x_test)

    print('**************************************')
    print(f'random_state: {i}')
    
    print("Mean square error: ", metrics.mean_squared_error(y_test, y_pred))

    print(boston.feature_names)
    print("Feature importance: ", reg.feature_importances_)

test size: 0.2
**************************************
random_state: 4
Mean square error:  25.020392156862744
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Feature importance:  [5.31360991e-02 7.74283683e-04 6.00844816e-03 5.58232631e-04
 3.27029085e-02 5.82317103e-01 1.00166111e-02 6.04558888e-02
 6.64610086e-04 1.80126970e-02 1.69168235e-02 8.25321655e-03
 2.10183078e-01]
**************************************
random_state: 6
Mean square error:  39.95245098039216
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
Feature importance:  [6.22292956e-02 1.37026797e-03 1.50004604e-02 2.44524280e-04
 1.56574358e-02 5.84803528e-01 1.05292746e-02 6.84812246e-02
 6.14847676e-04 1.28425614e-02 5.61120894e-03 1.93797992e-02
 2.03235572e-01]
**************************************
random_state: 8
Mean square error:  27.415686274509806
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']
F

In [6]:
import sys

boston = datasets.load_boston()
sizes = np.linspace(0.1, 0.9, 9)

reg = DecisionTreeRegressor()

best_MSE = sys.maxsize
best_size = 0
best_random_state = 0
best_importance = []
    
    
for size in sizes:
    for state in range(4, 44, 2):

        x_train, x_test, y_train, y_test = train_test_split(boston.data, boston.target, test_size = size, random_state = state)

        reg.fit(x_train, y_train)

        y_pred = reg.predict(x_test)
        
        MSE = metrics.mean_squared_error(y_test, y_pred)
        
        if MSE < best_MSE:
            best_MSE = MSE
            best_size = size
            best_state = state
            best_importance = reg.feature_importances_
        
print(f'best size: {best_size}')
print(f'best random_state: {best_state}')
print(f'best MSE: {best_MSE}')
print(f'Feature importance: {best_importance}')

best size: 0.2
best random_state: 42
best MSE: 9.828529411764706
Feature importance: [5.02201870e-02 1.27500952e-03 1.05493897e-02 3.27237297e-04
 2.77182120e-02 5.89454224e-01 1.42655304e-02 6.54265388e-02
 2.13063592e-03 1.07963363e-02 2.28570578e-02 1.30906767e-02
 1.91888965e-01]


# wine

In [7]:
wine = datasets.load_wine()

sizes = np.linspace(0.1, 0.9, 9)

clf = DecisionTreeClassifier()

best_acc = 0
best_size = 0
best_random_state = 0
best_importance = []

for size in sizes:
    for state in range(4, 44, 2):

        x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size = size, random_state = state)

        clf.fit(x_train, y_train)

        y_pred = clf.predict(x_test)
        
        acc = metrics.accuracy_score(y_test, y_pred)
        
        if acc > best_acc:
            best_acc = acc
            best_size = size
            best_state = state
            best_importance = reg.feature_importances_
        
print(f'best size: {best_size}')
print(f'best random_state: {best_state}')
print(f'best acc: {best_acc}')
print(f'Feature importance: {best_importance}')

best size: 0.1
best random_state: 34
best acc: 1.0
Feature importance: [6.10287681e-02 7.23240813e-05 3.03861592e-05 0.00000000e+00
 2.90546869e-01 1.47358883e-01 2.32997429e-02 6.34027959e-03
 9.04603492e-04 2.90300826e-04 2.13958741e-04 3.47119883e-02
 4.35201897e-01]
