## Decision Tree Regressor

#### Business Scenario: Build an efficient model that predicts Power Enegery of a powerplant with changes in Atmospheric temp,pressure,Vaccum and humidity taken into consideration

## Step1: Data Gathering 

In [4]:
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
import pandas as pd

In [22]:
path = r'https://raw.githubusercontent.com/sindhura-nk/Datasets/refs/heads/main/PowerPlant.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,AT,V,AP,RH,PE
0,8.34,40.77,1010.84,90.01,480.48
1,23.64,58.49,1011.4,74.2,445.75
2,29.74,56.9,1007.15,41.91,438.76
3,19.07,49.69,1007.22,76.79,453.09
4,11.8,40.66,1017.13,97.2,464.43



Dataset: Powerplant

    AT-Atmospheric Temperature
    V-Vaccum
    AP-Atmospheric Pressue
    RH-Relative Humidity
    PE - Power Energy generated

## Step2: Perform basic data quality checks

In [23]:
df.columns

Index(['AT', 'V', 'AP', 'RH', 'PE'], dtype='object')

In [24]:
df.shape

(9568, 5)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9568 entries, 0 to 9567
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   AT      9568 non-null   float64
 1   V       9568 non-null   float64
 2   AP      9568 non-null   float64
 3   RH      9568 non-null   float64
 4   PE      9568 non-null   float64
dtypes: float64(5)
memory usage: 373.9 KB


In [26]:
# missing data
df.isna().sum()

AT    0
V     0
AP    0
RH    0
PE    0
dtype: int64

In [27]:
# Chaeck the duplicates
df.duplicated().sum()

np.int64(41)

#### Drop The duplicates


In [30]:
df = df.drop_duplicates()

In [31]:
df.duplicated().sum()

np.int64(0)

## Separate X and Y features
    Y: target label - PE
    X: Remaining features

In [32]:
X = df.drop(columns="PE")
Y = df[["PE"]]

In [33]:
X.head()

Unnamed: 0,AT,V,AP,RH
0,8.34,40.77,1010.84,90.01
1,23.64,58.49,1011.4,74.2
2,29.74,56.9,1007.15,41.91
3,19.07,49.69,1007.22,76.79
4,11.8,40.66,1017.13,97.2


In [34]:
Y.head()

Unnamed: 0,PE
0,480.48
1,445.75
2,438.76
3,453.09
4,464.43


## Step3: Data Preprocessing and Data Cleaning

In [36]:
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

#### In this data set only continuous data that's why direct make continuous pipeline.

In [37]:
pre = make_pipeline(
    SimpleImputer(strategy='median'),
    StandardScaler()
).set_output(transform='pandas')

In [40]:
X_pre = pre.fit_transform(X)
X_pre.head()

Unnamed: 0,AT,V,AP,RH
0,-1.520448,-1.066041,-0.403535,1.141599
1,0.534897,0.330813,-0.309262,0.059223
2,1.354348,0.205475,-1.024725,-2.1514
3,-0.07902,-0.362884,-1.012941,0.236538
4,-1.055645,-1.074713,0.655349,1.633837


## Step4: Split the data into training and testing

In [42]:
from sklearn.model_selection import train_test_split

In [43]:
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,train_size=0.60,random_state=21)

In [44]:
xtrain.head()

Unnamed: 0,AT,V,AP,RH
6088,-1.595676,-1.19138,1.916247,0.808876
297,-0.754731,-0.433831,0.737838,0.131107
609,0.133231,0.963024,0.286676,-0.103031
4931,1.547792,1.162462,-0.849647,-1.616715
4010,-1.638664,-0.983271,3.271417,0.085923


In [45]:
xtest.head()

Unnamed: 0,AT,V,AP,RH
4903,0.340109,0.68318,1.061059,-0.367977
2944,0.023076,0.405701,-0.043279,1.225807
1771,-0.354409,-0.744418,0.192403,-0.730823
1450,-0.586811,-1.02032,-1.16445,0.631561
5399,-0.310078,-1.148812,-1.723353,-0.596639


In [46]:
ytrain.head()

Unnamed: 0,PE
6088,484.65
297,464.09
609,456.49
4931,441.26
4010,483.11


## Step5: Build the model

In [48]:
from sklearn.tree import DecisionTreeRegressor

In [51]:
model = DecisionTreeRegressor(
    max_depth=3,
    min_samples_leaf = 4,
    min_samples_split = 6,
    criterion='squared_error'
)
model.fit(xtrain,ytrain)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,3
,min_samples_split,6
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [52]:
model.score(xtrain,ytrain)

0.9122595866013398

In [53]:
model.score(xtest,ytest)

0.9065507170255205


## Hyperparameter tuning with GridsearchCV

In [55]:
params = {
    'max_depth':[3,4,5,6,7,8,9,10],
    'min_samples_split':[1,2,3,4,5,6,7,8,9],
    'min_samples_leaf':[1,2,3,4,5,6,7,8,9],
    'criterion':['squared_error','absolute_error']
}

In [56]:
from sklearn.model_selection import GridSearchCV

In [57]:
base_model = DecisionTreeRegressor()

In [58]:
gscv = GridSearchCV(estimator=base_model,param_grid=params,cv=3,scoring='r2')
gscv.fit(xtrain,ytrain)

0,1,2
,estimator,DecisionTreeRegressor()
,param_grid,"{'criterion': ['squared_error', 'absolute_error'], 'max_depth': [3, 4, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [1, 2, ...]}"
,scoring,'r2'
,n_jobs,
,refit,True
,cv,3
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,8
,min_samples_split,3
,min_samples_leaf,9
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [59]:
gscv.best_params_

{'criterion': 'squared_error',
 'max_depth': 8,
 'min_samples_leaf': 9,
 'min_samples_split': 3}

In [60]:
gscv.best_score_

np.float64(0.939409224977215)

In [None]:
best_dtr = gscv.best_estimator_

In [None]:
best_dtr.score(xtrain,ytrain)

In [61]:
best_dtr.score(xtest,ytest)

NameError: name 'best_dtr' is not defined

In [None]:
## Step6: Model Evaluation