## Decision Tree Regression

In [1]:
# Install a global 'ignore' filter so library warnings do not clutter
# the cell outputs below.
from warnings import filterwarnings

filterwarnings(action='ignore')

In [13]:
import pandas as pd

# Read the startup data set (path is relative to the working directory)
# and preview its first five rows.
df = pd.read_csv("50_Startups (1).csv")
df.head(n=5)

Unnamed: 0,RND,ADMIN,MKT,STATE,PROFIT
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [14]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   RND     50 non-null     float64
 1   ADMIN   50 non-null     float64
 2   MKT     50 non-null     float64
 3   STATE   50 non-null     object 
 4   PROFIT  50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [15]:
# Count missing values per column.
df.isna().sum()

RND       0
ADMIN     0
MKT       0
STATE     0
PROFIT    0
dtype: int64

In [16]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

In [23]:
# PROFIT is the regression target; every other column is a predictor.
# Selecting with a list keeps Y as a one-column DataFrame, not a Series.
Y = df.loc[:, ['PROFIT']]
X = df.drop('PROFIT', axis=1)

In [24]:
# Preview the predictor frame.
X.head()

Unnamed: 0,RND,ADMIN,MKT,STATE
0,165349.2,136897.8,471784.1,New York
1,162597.7,151377.59,443898.53,California
2,153441.51,101145.55,407934.54,Florida
3,144372.41,118671.85,383199.62,New York
4,142107.34,91391.77,366168.42,Florida


In [25]:
# Preview the target frame.
Y.head()

Unnamed: 0,PROFIT
0,192261.83
1,191792.06
2,191050.39
3,182901.99
4,166187.94


In [26]:
# Partition the predictor names by dtype: object columns go to `cat`
# (categorical), all remaining columns go to `con` (continuous).
cat = X.columns[X.dtypes == 'object'].tolist()
con = X.columns[X.dtypes != 'object'].tolist()

In [27]:
# Show the categorical column names.
cat

['STATE']

In [28]:
# Show the continuous column names.
con

['RND', 'ADMIN', 'MKT']

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import ColumnTransformer

In [32]:
# Numeric branch: fill gaps with the column mean, then standardise.
num_pipe = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
)

In [33]:
# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories not seen during fit are ignored at transform time, and
# the encoder returns a dense array.
cat_pipe = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('ohe', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
    ]
)

In [35]:
# Route the continuous columns through `num_pipe` and the categorical ones
# through `cat_pipe`, then ask for pandas DataFrames as transform output.
pre = ColumnTransformer(
    transformers=[
        ('num', num_pipe, con),
        ('cat', cat_pipe, cat),
    ]
)
pre = pre.set_output(transform='pandas')

In [36]:
# Fit the preprocessor on the predictors and transform them in one step,
# then preview the result.
# NOTE(review): `pre` is fitted on all of X here, before the train/test split
# in a later cell, so the imputer/scaler/encoder statistics are learned from
# rows that end up in the test set. Consider fitting on the training rows only.
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
0,2.016411,0.560753,2.153943,0.0,0.0,1.0
1,1.95586,1.082807,1.9236,1.0,0.0,0.0
2,1.754364,-0.728257,1.626528,0.0,1.0,0.0
3,1.554784,-0.096365,1.42221,0.0,0.0,1.0
4,1.504937,-1.079919,1.281528,0.0,1.0,0.0


In [37]:
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split
# repeatable across runs.
xtrain, xtest, ytrain, ytest = train_test_split(
    X_pre,
    Y,
    random_state=21,
    test_size=0.33,
)

In [38]:
# Preview the training predictors.
xtrain.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
13,0.402078,0.510179,0.343957,1.0,0.0,0.0
28,-0.168793,2.210141,-0.767189,0.0,1.0,0.0
49,-1.622362,-0.157226,-1.369985,1.0,0.0,0.0
40,-0.989577,-0.1009,-0.315786,1.0,0.0,0.0
45,-1.60035,0.101254,-1.7274,0.0,0.0,1.0


In [39]:
# Preview the training target.
ytrain.head()

Unnamed: 0,PROFIT
13,134307.35
28,103282.38
49,14681.4
40,78239.91
45,64926.08


In [40]:
# Preview the held-out predictors.
xtest.head()

Unnamed: 0,num__RND,num__ADMIN,num__MKT,cat__STATE_California,cat__STATE_Florida,cat__STATE_New York
7,1.245057,0.87198,0.932186,0.0,1.0,0.0
44,-1.134305,1.206419,-1.509074,1.0,0.0,0.0
43,-1.281134,0.217682,-1.449605,0.0,0.0,1.0
25,-0.199312,0.656489,-0.603517,1.0,0.0,0.0
14,1.017181,1.269199,0.375742,0.0,1.0,0.0


In [41]:
# Preview the held-out target.
ytest.head()

Unnamed: 0,PROFIT
7,155752.6
44,65200.33
43,69758.98
25,107404.34
14,132602.65


In [43]:
from sklearn.tree import DecisionTreeRegressor

# Baseline model: a depth-1 tree (a single split) that requires at least
# five samples to split a node and at least five in each leaf, using the
# absolute-error split criterion.
model1 = DecisionTreeRegressor(
    criterion='absolute_error',
    max_depth=1,
    min_samples_split=5,
    min_samples_leaf=5,
)
model1.fit(xtrain, ytrain)

In [44]:
# Score the baseline tree on the training rows
# (`score` reports R^2 for scikit-learn regressors).
model1.score(xtrain,ytrain)

0.6469035753352927

In [45]:
# Score the baseline tree on the held-out rows for comparison with the
# training score.
model1.score(xtest,ytest)

0.5597678327735671

In [46]:
# Hyper-parameter search space: tree depth and the split/leaf size limits
# each range over 2..7; both split criteria are tried.
params = {
    'max_depth': list(range(2, 8)),
    'min_samples_split': list(range(2, 8)),
    'min_samples_leaf': list(range(2, 8)),
    'criterion': ['absolute_error', 'squared_error'],
}

In [47]:
from sklearn.model_selection import GridSearchCV

# Exhaustive 5-fold cross-validated search over `params`, ranking candidates
# by negative mean absolute error (higher is better).
# The estimator's seed is pinned (same value as the train/test split) because
# a decision tree breaks ties between equally good splits at random; without
# it, repeated runs are not guaranteed to select the same tree.
dtr = DecisionTreeRegressor(random_state=21)
gscv = GridSearchCV(dtr, param_grid=params, cv=5, scoring='neg_mean_absolute_error')
gscv.fit(xtrain, ytrain)