# **Tutorial Decision Tree Regression - Scikit Learn**

# **Data Preparation Stage**

In [None]:
# import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the admission dataset from GitHub into a DataFrame and preview its first rows.
csv_url = 'https://github.com/YBIFoundation/Dataset/raw/main/Admission%20Chance.csv'
df = pd.read_csv(csv_url)
df.head()

In [None]:
# Split the frame into features (X) and target (y).
# The target column is referenced with a trailing space ('Chance of Admit ');
# keep the name exactly as written. 'Serial No' is dropped from the features.
X = df.drop(columns=['Serial No', 'Chance of Admit '])
y = df['Chance of Admit ']

# **Model Selection Stage**

In [None]:
# Keep 80% of the rows for training and hold out the rest for testing;
# the fixed random_state makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=2529,
)

In [None]:
# Choose the estimator: a regression tree capped at depth 3 with a fixed seed.
from sklearn.tree import DecisionTreeRegressor, plot_tree

dtr = DecisionTreeRegressor(random_state=2529, max_depth=3)

In [None]:
# Fit the tree on the training sample.
dtr.fit(X=X_train, y=y_train)

In [None]:
# Score the fitted tree on the same data it was trained on.
dtr.score(X=X_train, y=y_train)

In [None]:
# Run 5-fold cross-validation of the tree on the training sample.
from sklearn.model_selection import cross_validate

cross_validate(estimator=dtr, X=X_train, y=y_train, cv=5)

In [None]:
# Score the fitted tree on the held-out test sample.
dtr.score(X=X_test, y=y_test)

# **Model Validation Stage**

In [None]:
# List every hyperparameter of the tree together with its current value.
dtr.get_params(deep=True)

In [None]:
# Search space for the grid search. The 'model__' prefix addresses the
# pipeline step named 'model'.
params = {
    'model__max_depth': list(range(2, 11)),  # depths 2 through 10
    'model__criterion': ['squared_error', 'absolute_error'],
    'model__random_state': [2529],
}

In [None]:
# import pipeline
from sklearn.pipeline import Pipeline

In [None]:
# Wrap the tree in a single-step pipeline; the step is named 'model'.
pipe = Pipeline(steps=[('model', dtr)])

In [None]:
# Set up an exhaustive search over `params`, scored with 5-fold cross-validation.
from sklearn.model_selection import GridSearchCV

grid = GridSearchCV(estimator=pipe, param_grid=params, cv=5)

In [None]:
# display the configured grid search object (it is fitted in a later cell)
grid

In [None]:
# Run the grid search on the training sample.
grid.fit(X=X_train, y=y_train)

In [None]:
# show the best hyperparameter combination found by the grid search
grid.best_params_

In [None]:
# keep a handle on the best estimator (the pipeline) found by the grid search
best = grid.best_estimator_

In [None]:
# Fit the selected pipeline on the full training sample.
# NOTE: the grid search was built without a `refit` argument, and scikit-learn's
# default (refit=True) already refits the best estimator on the training data,
# so this call repeats that fit.
best.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for the held-out test sample with the tuned pipeline.
y_pred = best.predict(X=X_test)

In [None]:
# model error
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

In [None]:
# Mean absolute error of the test predictions.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

In [None]:
# Mean absolute percentage error of the test predictions.
mean_absolute_percentage_error(y_true=y_test, y_pred=y_pred)

In [None]:
# R-squared (coefficient of determination) of the test predictions.
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Plot a depth-3 regression tree for illustration.
# NOTE: `final` is a separate tree fitted here with max_depth=3; it is not the
# grid-searched `best` estimator that was evaluated on the test sample.
fig, ax = plt.subplots(figsize=(15, 10))
final = DecisionTreeRegressor(max_depth=3, random_state=2529)
final.fit(X_train, y_train)
# Pass the column names as a plain list (some scikit-learn releases reject a
# pandas Index for `feature_names`) and draw explicitly on the axes created above.
plot_tree(final, feature_names=list(X.columns), filled=True, ax=ax);

# **Save Model**

In [None]:
# save model
import pickle

In [None]:
# Serialize the tuned pipeline (`best`) to the binary file 'dtr_pkl'.
with open('dtr_pkl', 'wb') as out_file:
    pickle.dump(best, out_file)

In [None]:
# Read the pickled model back from 'dtr_pkl'.
# Only unpickle files you created or trust: pickle.load can execute arbitrary code.
with open('dtr_pkl', 'rb') as in_file:
    dtr2 = pickle.load(in_file)

In [None]:
# Predict on the test sample with the model restored from disk.
dtr2.predict(X=X_test)