# Julia 機器學習：DecisionTree 決策樹

## 作業 030：乳癌預測資料集

請使用隨機森林模型建立一個分類模型，預測乳癌資料集中的腫瘤為良性（benign）或惡性（malignant）。

In [1]:
using DecisionTree, RDatasets, DataFrames, MLDataUtils, Statistics

## 讀取資料

In [2]:
# Load the "biopsy" data set from the R package "MASS" via RDatasets.
biopsy = dataset("MASS", "biopsy")

Unnamed: 0_level_0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,Class
Unnamed: 0_level_1,String,Int32,Int32,Int32,Int32,Int32,Int32⍰,Int32,Int32,Int32,Categorical…
1,1000025,5,1,1,1,2,1,3,1,1,benign
2,1002945,5,4,4,5,7,10,3,2,1,benign
3,1015425,3,1,1,1,2,2,3,1,1,benign
4,1016277,6,8,8,1,3,4,3,7,1,benign
5,1017023,4,1,1,3,2,1,3,1,1,benign
6,1017122,8,10,10,8,7,10,9,7,1,malignant
7,1018099,1,1,1,1,2,10,3,1,1,benign
8,1018561,2,1,2,1,2,1,3,1,1,benign
9,1033078,2,1,1,1,2,1,1,1,5,benign
10,1033078,4,2,1,1,2,1,2,1,1,benign


## 處理遺失值

In [3]:
# Remove, in place, every row of `biopsy` that contains a missing value
# (column V6 is shown as allowing missing values when the data is loaded).
dropmissing!(biopsy)

Unnamed: 0_level_0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,Class
Unnamed: 0_level_1,String,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Categorical…
1,1000025,5,1,1,1,2,1,3,1,1,benign
2,1002945,5,4,4,5,7,10,3,2,1,benign
3,1015425,3,1,1,1,2,2,3,1,1,benign
4,1016277,6,8,8,1,3,4,3,7,1,benign
5,1017023,4,1,1,3,2,1,3,1,1,benign
6,1017122,8,10,10,8,7,10,9,7,1,malignant
7,1018099,1,1,1,1,2,10,3,1,1,benign
8,1018561,2,1,2,1,2,1,3,1,1,benign
9,1033078,2,1,1,1,2,1,1,1,5,benign
10,1033078,4,2,1,1,2,1,2,1,1,benign


## 分割train data和test data

In [5]:
# Shuffle the row indices of the data set so the split is random.
indices = MLDataUtils.shuffleobs(collect(1:nrow(biopsy)))
# Use 80% of the shuffled indices for training and the remaining 20% for testing.
train_ind, test_ind = MLDataUtils.splitobs(indices, at = 0.8);

In [33]:
train = biopsy[train_ind, :] # DataFrame holding the training rows
test = biopsy[test_ind, :] # DataFrame holding the test rows

Unnamed: 0_level_0,ID,V1,V2,V3,V4,V5,V6,V7,V8,V9,Class
Unnamed: 0_level_1,String,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Int32,Categorical…
1,657753,3,1,1,4,3,1,2,2,1,benign
2,167528,4,1,1,1,2,1,3,6,1,benign
3,764974,5,1,1,1,2,1,3,1,2,benign
4,1225799,10,6,4,3,10,10,9,10,1,malignant
5,1344449,1,1,1,1,1,1,2,1,1,benign
6,1033078,4,2,1,1,2,1,2,1,1,benign
7,837480,7,4,4,3,4,10,6,9,1,malignant
8,476903,10,5,7,3,3,7,3,3,8,malignant
9,1201870,4,1,1,3,1,1,2,1,1,benign
10,1110102,10,3,6,2,3,5,4,10,2,malignant


## 將原始資料分成 features 和 labels

In [34]:
# Feature matrix: columns 2 through 10 (V1–V9) of the training DataFrame.
# `train[:, 2:10]` replaces the deprecated column-only form `train[2:10]`.
features = Matrix(train[:, 2:10])
# Label vector: column 11 (Class), converted to plain strings.
labels = Vector{String}(train[:, 11])

│   caller = top-level scope at In[34]:1
└ @ Core In[34]:1
│   caller = top-level scope at In[34]:2
└ @ Core In[34]:2


546-element Array{String,1}:
 "benign"
 "malignant"
 "benign"
 "malignant"
 "benign"
 "benign"
 "malignant"
 "malignant"
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"
 ⋮
 "benign"
 "benign"
 "benign"
 "benign"
 "benign"
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 "malignant"

## 建立與訓練模型

In [7]:
# Create an untrained random forest classifier whose trees are limited to depth 5;
# all other hyperparameters keep the values shown in the printed summary.
model = DecisionTree.RandomForestClassifier(max_depth=5)

RandomForestClassifier
n_trees:             10
n_subfeatures:       -1
partial_sampling:    0.7
max_depth:           5
min_samples_leaf:    1
min_samples_split:   2
min_purity_increase: 0.0
classes:             nothing
ensemble:            nothing

In [20]:
# Train the random forest in place on the training feature matrix and label vector.
DecisionTree.fit!(model, features, labels)

RandomForestClassifier
n_trees:             10
n_subfeatures:       -1
partial_sampling:    0.7
max_depth:           5
min_samples_leaf:    1
min_samples_split:   2
min_purity_increase: 0.0
classes:             ["benign", "malignant"]
ensemble:            Ensemble of Decision Trees
Trees:      10
Avg Leaves: 13.5
Avg Depth:  5.0

## 建立測試資料並進行預測

In [35]:
# Feature matrix for the test set: columns 2 through 10 (V1–V9).
# `test[:, 2:10]` replaces the deprecated column-only form `test[2:10]`.
test_features = Matrix(test[:, 2:10])

│   caller = top-level scope at In[35]:1
└ @ Core In[35]:1


137×9 Array{Int32,2}:
  3   1   1   4   3   1   2   2   1
  4   1   1   1   2   1   3   6   1
  5   1   1   1   2   1   3   1   2
 10   6   4   3  10  10   9  10   1
  1   1   1   1   1   1   2   1   1
  4   2   1   1   2   1   2   1   1
  7   4   4   3   4  10   6   9   1
 10   5   7   3   3   7   3   3   8
  4   1   1   3   1   1   2   1   1
 10   3   6   2   3   5   4  10   2
  1   1   1   1   2   1   3   1   1
  5   3   3   1   2   1   2   1   1
  1   1   1   1   2   1   2   1   1
  ⋮                   ⋮          
  5   2   4   1   1   1   1   1   1
  3   1   1   1   2   1   1   1   1
  1   2   2   1   2   1   1   1   1
 10  10  10  10  10   1   8   8   8
  3  10   3  10   6  10   5   1   4
  3   3   5   2   3  10   7   1   1
  1   1   1   1   2   1   2   1   1
  1   1   1   1   2   1   2   1   1
  9  10  10  10  10   5  10  10  10
  2   1   1   1   3   1   2   1   1
  5   1   1   2   2   1   2   1   1
  7   3   2  10   5  10   5   4   4

In [36]:
# Predict a class label for every row of the test feature matrix.
# The matrix is passed directly; `test_features[:, :]` only made a needless copy.
result = DecisionTree.predict(model, test_features)

137-element Array{String,1}:
 "benign"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 "malignant"
 "malignant"
 "benign"
 "malignant"
 "benign"
 "benign"
 "benign"
 ⋮
 "benign"
 "benign"
 "benign"
 "malignant"
 "malignant"
 "malignant"
 "benign"
 "benign"
 "malignant"
 "benign"
 "benign"
 "malignant"

## 評估模型

In [41]:
"""
    accuracy(real, predict)

Return the fraction of element-wise (broadcast) comparisons between `real`
and `predict` that are equal.
"""
function accuracy(real, predict)
    matches = real .== predict
    return mean(matches)
end
# Build the vector of true labels from column 11 (Class) of the test set.
# `test[:, 11]` replaces the deprecated column-only form `test[11]`.
test_labels = Vector{String}(test[:, 11])
# Compare the true labels with the predictions. Arguments follow the
# `accuracy(real, predict)` order, and the label vector is passed as-is
# instead of being reshaped into an N×1 matrix with `[:, :]`.
accuracy(test_labels, result)

│   caller = top-level scope at In[41]:2
└ @ Core In[41]:2


0.9635036496350365