# Decision Tree Regression

In [1]:
import pandas as pd
df = pd.read_csv('50_Startups.csv')

In [2]:
df

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94
5,131876.9,99814.71,362861.36,New York,156991.12
6,134615.46,147198.87,127716.82,California,156122.51
7,130298.13,145530.06,323876.68,Florida,155752.6
8,120542.52,148718.95,311613.29,New York,152211.77
9,123334.88,108679.17,304981.62,California,149759.96


### Seperate X and Y

In [3]:
X = df.drop(labels=['PROFIT'],axis=1)
Y = df[['PROFIT']]

In [4]:
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [5]:
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [6]:
cat = X.columns[X.dtypes=='object']
con = X.columns[X.dtypes!='object']

In [7]:
cat

Index(['STATE'], dtype='object')

In [8]:
con

Index(['RND', 'ADMIN', 'MKT'], dtype='object')

### Preprocessing with sklearn pipeline

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [10]:
num_pipe = Pipeline(steps=[('impute',SimpleImputer(strategy='mean'))])

cat_pipe = Pipeline(steps=[('imputer',SimpleImputer(strategy='most_frequent')),
                           ('ohe',OneHotEncoder(handle_unknown='ignore'))])

pre = ColumnTransformer([('num',num_pipe,con),
                         ('cat',cat_pipe,cat)])

In [11]:
X_pre = pre.fit_transform(X)
X_pre

array([[1.6534920e+05, 1.3689780e+05, 4.7178410e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.6259770e+05, 1.5137759e+05, 4.4389853e+05, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.5344151e+05, 1.0114555e+05, 4.0793454e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.4437241e+05, 1.1867185e+05, 3.8319962e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.4210734e+05, 9.1391770e+04, 3.6616842e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.3187690e+05, 9.9814710e+04, 3.6286136e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.3461546e+05, 1.4719887e+05, 1.2771682e+05, 1.0000000e+00,
        0.0000000e+00, 0.0000000e+00],
       [1.3029813e+05, 1.4553006e+05, 3.2387668e+05, 0.0000000e+00,
        1.0000000e+00, 0.0000000e+00],
       [1.2054252e+05, 1.4871895e+05, 3.1161329e+05, 0.0000000e+00,
        0.0000000e+00, 1.0000000e+00],
       [1.2333488e+05, 1.0867917e+05,

In [12]:
cols = pre.get_feature_names_out()
cols

array(['num__RND', 'num__ADMIN', 'num__MKT', 'cat__STATE_California',
       'cat__STATE_Florida', 'cat__STATE_New York'], dtype=object)

In [13]:
X_pre = pd.DataFrame(X_pre,columns=cols)

In [14]:
X_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,165349.2,136897.8,471784.1,0.0,0.0,1.0
1,162597.7,151377.59,443898.53,1.0,0.0,0.0
2,153441.51,101145.55,407934.54,0.0,1.0,0.0
3,144372.41,118671.85,383199.62,0.0,0.0,1.0
4,142107.34,91391.77,366168.42,0.0,1.0,0.0


### Train Test Split

In [15]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,test_size=0.2,random_state=21)

### Build final model

In [16]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(xtrain,ytrain)

In [17]:
model.score(xtrain,ytrain)

1.0

In [18]:
model.score(xtest,ytest)

0.8920725825901341

In [19]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt


In [20]:
plt.figure(figsize=(16,16))
plot_tree(model,feature_names=xtrain.columns,filled=True)
plt.show()

InvalidParameterError: The 'feature_names' parameter of plot_tree must be an instance of 'list' or None. Got Index(['num__RND', 'num__ADMIN', 'num__MKT', 'cat__STATE_California',
       'cat__STATE_Florida', 'cat__STATE_New York'],
      dtype='object') instead.

<Figure size 1600x1600 with 0 Axes>

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(16,16))
plot_tree(model,feature_names=xtrain.columns,filled=True,max_depth=2)
plt.show()

### Model Evaluation

In [None]:
ypred_tr = model.predict(xtrain)
ypred_ts = model.predict(xtest)

In [None]:
ypred_tr[0:5]

In [None]:
ytrain.head()

In [None]:
ypred_ts[0:5]

In [None]:
ytest.head()

In [None]:
from custom_def import evaluate_model
evaluate_model(xtrain,ytrain,xtest,ytest,model)

### Hyperparameter tuning

In [None]:
params = {'max_depth':[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16],
          'min_samples_split':[6,7,8,9,10,11,12,13,14,15],
          'criterion':['squared_error','absolute_error']}

In [None]:
from sklearn.model_selection import GridSearchCV
model1 = DecisionTreeRegressor()
gscv = GridSearchCV(model1,param_grid=params,cv=5,scoring='neg_mean_squared_error')
gscv.fit(xtrain,ytrain)

In [None]:
gscv.best_params_

In [None]:
gscv.best_score_

In [None]:
best_model = gscv.best_estimator_
best_model

In [None]:
best_model.score(xtrain,ytrain)

In [None]:
best_model.score(xtest,ytest)

In [None]:
evaluate_model(xtrain,ytrain,xtest,ytest,best_model)

In [None]:
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plt.figure(figsize=(16,16))
plot_tree(best_model,feature_names=xtrain.columns,filled=True,max_depth=2)
plt.show()

In [None]:
ypred_tr1 = best_model.predict(xtrain)
ypred_ts1 = best_model.predict(xtest)

In [None]:
ypred_tr1[0:5]

In [None]:
ytrain.head()

In [None]:
ypred_ts1[0:5]

In [None]:
ytest.head()

### Feature importance

In [None]:
imp = best_model.feature_importances_
imp

In [None]:
imp = pd.Series(imp)
imp.index = xtrain.columns
imp

In [None]:
imp = imp.sort_values(ascending=False)
imp

In [None]:
imp.plot(kind='bar',title='Feature Importance Plot')