## 作業

1. 試著調整 RandomForestClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較

In [2]:
from sklearn import datasets, metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [3]:
wine = datasets.load_wine()
df = pd.DataFrame(wine.data , columns=wine.feature_names)
df.head(2)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0


In [4]:
#stratify=target 
# if variable 'y' is a binary categorical variable with values 0 and 1 and there are 25% of zeros and 75% of ones,
# 'stratify=y(target)' will make sure that your random split has 25% of 0's and 75% of 1's.
x_train,x_test,y_train,y_test = train_test_split(wine.data ,wine.target ,stratify = wine.target) #,random_state=42
rf = RandomForestClassifier(n_estimators=10) #n_estimators:default=10 #random_state:default=None
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print(f'Accuacy= {metrics.accuracy_score(y_test, y_pred):.3f}')
print(f'MSE = {mean_squared_error(y_test, y_pred):.3f}')

Accuacy= 0.933
MSE = 0.067


### n_estimators tuning (default=10)

In [5]:
n = [3,5,10,15,35,60]
Accuacy={}
for nn in n:
    rf = RandomForestClassifier(n_estimators = nn)
    rf.fit(x_train,y_train)
    y_pred = rf.predict(x_test)
    Accuacy[nn] = metrics.accuracy_score(y_test, y_pred)
    print(f'when n={nn} ACC={Accuacy[nn]:.3F}')
Accuacy_df = pd.DataFrame(data=Accuacy,index=[0])
Accuacy_df

when n=3 ACC=0.822
when n=5 ACC=0.889
when n=10 ACC=0.956
when n=15 ACC=0.911
when n=35 ACC=0.978
when n=60 ACC=0.978


Unnamed: 0,3,5,10,15,35,60
0,0.822222,0.888889,0.955556,0.911111,0.977778,0.977778


### criterion  (default=”gini”)

In [6]:
c = ['gini','entropy']
Accuacy={}
for cc in c:
    rf = RandomForestClassifier(n_estimators =5,criterion = cc)
    rf.fit(x_train,y_train)
    y_pred = rf.predict(x_test)
    Accuacy[cc] = metrics.accuracy_score(y_test, y_pred)
    print(f'when criterion={cc} ACC={Accuacy[cc]:.3F}')
Accuacy_df = pd.DataFrame(data=Accuacy,index=[0])
Accuacy_df

when criterion=gini ACC=0.933
when criterion=entropy ACC=0.889


Unnamed: 0,gini,entropy
0,0.933333,0.888889


### bootstrap (default=True)

In [7]:
b = ['True','False']
Accuacy={}
for bb in b:
    rf = RandomForestClassifier(n_estimators =5,bootstrap = bb)
    rf.fit(x_train,y_train)
    y_pred = rf.predict(x_test)
    Accuacy[bb] = metrics.accuracy_score(y_test, y_pred)
    print(f'when bootstrap={bb} ACC={Accuacy[bb]:.3F}')
Accuacy_df = pd.DataFrame(data=Accuacy,index=[0])
Accuacy_df

when bootstrap=True ACC=0.956
when bootstrap=False ACC=0.889


Unnamed: 0,True,False
0,0.955556,0.888889


## 2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較

In [8]:
wine = datasets.load_wine()
df = pd.DataFrame(wine.data , columns=wine.feature_names)
df.head(2)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0


In [9]:
X = wine.data  # wine用的是data!!!
Y = wine.target
X_train,X_test,Y_train,Y_test = train_test_split(X ,Y, test_size=0.25 ,random_state=4)

## RandomForestClassifier

In [10]:
rf = RandomForestClassifier()
rf.fit(X_train,Y_train)
Y_pred = rf.predict(X_test)
print(f'Accuacy= {metrics.accuracy_score(Y_test, Y_pred):.3f}')
print(f'MSE = {mean_squared_error(Y_test, Y_pred):.3f}')

Accuacy= 0.978
MSE = 0.022


## DecisionTreeClassifier

In [11]:
dt = DecisionTreeClassifier()
dt.fit(X_train,Y_train)
Y_pred = dt.predict(X_test)
print(f'Accuacy= {metrics.accuracy_score(Y_test, Y_pred):.3f}')
print(f'MSE = {mean_squared_error(Y_test, Y_pred):.3f}')

Accuacy= 0.911
MSE = 0.089
