# Decision Tree Regressor

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer

In [2]:
grd = pd.read_csv("../data/graphene_data_final.csv")

In [3]:
X, Y = grd[['Graphene_percentage', 'FEED', 'RPM', 'DOC']], grd['MRR_gm_per_sec']
Y2 = grd['Ra']

In [4]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=1)
X_train, X_test, Y2_train, Y2_test = train_test_split(X, Y2, test_size = 0.3, random_state=1)

In [5]:
print(tree.DecisionTreeRegressor().fit(X_train,Y_train).score(X_test,Y_test))

0.8456605882420749


#### Maximum depth vs Test score

In [6]:
for i in range(1,10):
    tree_0 = tree.DecisionTreeRegressor(max_depth=i).fit(X_train,Y_train)   
    print(str(i)+"  "+str(tree_0.score(X_test,Y_test)))

1  0.34215504461832535
2  0.6584615859318392
3  0.7683696377168678
4  0.8427968886279225
5  0.8294883692112021
6  0.8316129305883023
7  0.8497825924673458
8  0.8456605882420749
9  0.8456605882420749


#### Minimum Sample Split vs Test score

In [7]:
for i in range(1,20):
    tree_0 = tree.DecisionTreeRegressor(min_samples_split=i+1).fit(X_train,Y_train)   
    print(str(i+1)+"  "+str(tree_0.score(X_test,Y_test)))

2  0.8456605882420749
3  0.8470616635414071
4  0.860390263796653
5  0.8701307304048008
6  0.8473999412257508
7  0.8467823180570306
8  0.8461547716851098
9  0.8383191686380057
10  0.8432851400474274
11  0.8432564135002852
12  0.8464894105095316
13  0.8513055680304796
14  0.8513055680304796
15  0.8513055680304796
16  0.8513055680304795
17  0.8513055680304796
18  0.8534638877180607
19  0.8332526763504087
20  0.8332526763504087


#### Leaf count vs Test score

In [8]:
for i in range(1,20):
    tree_0 = tree.DecisionTreeRegressor(min_samples_leaf=i+1).fit(X_train,Y_train)   
    print(str(i+1)+"  "+str(tree_0.score(X_test,Y_test)))

2  0.8629876868642068
3  0.8472328331455622
4  0.8522967533572814
5  0.8551022398855789
6  0.8493352290787524
7  0.8496606267024551
8  0.8496606267024551
9  0.8330052587584644
10  0.758129224778583
11  0.758129224778583
12  0.7272398824831834
13  0.7213185183452002
14  0.7213185183452002
15  0.7213185183452002
16  0.7213185183452002
17  0.7213185183452002
18  0.7213185183452002
19  0.5695004993981048
20  0.5695004993981045


In [9]:
best_tree = tree.DecisionTreeRegressor(min_samples_leaf=2,min_samples_split=5,max_depth=7)
best_tree.fit(X_train,Y_train)

DecisionTreeRegressor(max_depth=7, min_samples_leaf=2, min_samples_split=5)

In [10]:
print(best_tree.score(X_test,Y_test))

0.8701307304048008


Now let's save the best performing decision tree model as a "Pickle" file.

In [13]:
import pickle

# save
with open('../trained_models/best_decision_tree.pkl','wb') as f:
    pickle.dump(best_tree,f)


In [14]:
best_tree = tree.DecisionTreeRegressor(min_samples_leaf=2,min_samples_split=5,max_depth=7)
best_tree.fit(X_train,Y2_train)

DecisionTreeRegressor(max_depth=7, min_samples_leaf=2, min_samples_split=5)

In [15]:
print(best_tree.score(X_test,Y2_test))

0.28595401085148686


In [16]:
for i in range(1,20):
    tree_0 = tree.DecisionTreeRegressor(min_samples_leaf=i+1).fit(X_train,Y2_train)   
    print(str(i+1)+"  "+str(tree_0.score(X_test,Y2_test)))

2  0.28759496531981643
3  0.08718111471671053
4  0.3430460230811968
5  0.3668768010962986
6  0.34724831489740293
7  0.34599182608458545
8  0.2298433767390714
9  0.2438203750859521
10  0.2418367106224163
11  0.09577674907516054
12  0.09577674907516054
13  0.05366391035544793
14  0.05366391035544804
15  0.053663910355448374
16  0.053663910355448374
17  0.10742661123350994
18  0.10742661123351027
19  0.10742661123350983
20  0.09644700256230099


In [17]:
for i in range(1,20):
    tree_0 = tree.DecisionTreeRegressor(min_samples_split=i+1).fit(X_train,Y2_train)   
    print(str(i+1)+"  "+str(tree_0.score(X_test,Y2_test)))

2  -0.4381715753155493
3  -0.37211759527868304
4  -0.1622020882877082
5  -0.10812753300146727
6  0.2651152598904555
7  0.2651152598904555
8  0.3041259178415653
9  0.3338598489829536
10  0.3283335230337189
11  0.22171064898793924
12  0.22171064898793924
13  0.22951589835206887
14  0.2443557809184047
15  0.24133275086576855
16  0.1449890788897077
17  0.17546180204292827
18  0.23461732780400157
19  0.23461732780400157
20  0.23461732780400157


In [18]:
for i in range(1,10):
    tree_0 = tree.DecisionTreeRegressor(max_depth=i).fit(X_train,Y2_train)   
    print(str(i)+"  "+str(tree_0.score(X_test,Y2_test)))

1  0.02754132755869987
2  0.10230743951433563
3  0.23529894670289597
4  0.4006317412179581
5  0.25043046608885466
6  -0.35560013250944267
7  -0.4336708161559406
8  -0.4396137256545152
9  -0.439745350898771


In [19]:
best1 = tree.DecisionTreeRegressor(min_samples_leaf=5,min_samples_split=9,max_depth=4)
best1.fit(X_train,Y2_train)

DecisionTreeRegressor(max_depth=4, min_samples_leaf=5, min_samples_split=9)

In [20]:
print(best1.score(X_train,Y2_train))

0.5540966457072056
