#### `Decision Tree Regression`
##### `Power Plant CSV File Analysis`

    1. AT - Ambient Temperature in degrees Celsius
    2. V - Vacuum pressure in pascals
    3. AP - Atmospheric Pressure pascals
    4. RH - Relative Humidity %
    5. PE - Power Output of plant

In [1]:
from warnings import filterwarnings

# Suppress all warnings for the remainder of the session.
filterwarnings("ignore")

#----------------------
#Step-1: Data Ingestion
#----------------------
import pandas as pd

# Read the raw readings into a DataFrame and report its (rows, columns) shape.
source_csv = "PowerPlant.csv"
df = pd.read_csv(source_csv)
print('Actual PowerPlant.csv file data', df.shape)

#----------------------------------------
#Step-2: Data Sanity - Duplicate removals
#----------------------------------------

# Mark every row that repeats an earlier row; first occurrences stay unmarked.
dup_mask = df.duplicated()
duplicate_count = dup_mask.sum()

if duplicate_count == 0:
    print('No Duplicates Found')
else:
    print(f'Duplicates Found: {duplicate_count}', 'Removing Duplicates...')
    # Keep only the unmarked rows and renumber the index from 0.
    df = df.loc[~dup_mask].reset_index(drop=True)
    print('Removed Duplicates...')

#------------------------
#Step-3: Separate X and Y
#------------------------

# "PE" is the column to predict; all remaining columns are the predictors.
target_col = "PE"
Y = df[target_col]
X = df.drop(columns=[target_col])

#-----------------------------------
#Step-4: Remove High Unique Cat Cols
#-----------------------------------

# Cardinality ratio = distinct values / row count, for every object-dtype
# predictor. The scan runs on X rather than df: df still holds the target
# column, and a label that is flagged but absent from X would make the
# drop below raise KeyError.
object_cols = X.select_dtypes(include="object")
card = object_cols.nunique() / len(X)

# Columns whose values are (almost) unique per row behave like identifiers
# and carry no signal a tree can generalize from, so they are removed.
high_card = card[card >= 0.9]
X = X.drop(columns=high_card.index)

#------------------------
#Step-5: Train Test Split
#------------------------

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)
xtrain, xtest, ytrain, ytest = split_parts

#--------------------------------
#Step-6: Apply Preprocessing on X
#--------------------------------

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Median imputation followed by standardization.
pipe = make_pipeline(
    SimpleImputer(strategy="median"),
    StandardScaler(),
)
# Ask the pipeline to return DataFrames instead of bare arrays.
pipe.set_output(transform="pandas")

# The pipeline is fitted on the training split only; the test split is
# transformed with those same fitted statistics.
xtrain_pre = pipe.fit(xtrain).transform(xtrain)
xtest_pre = pipe.transform(xtest)

#--------------------------------------------
#Step-7: Build Model: Decision Tree Regressor
#--------------------------------------------

from sklearn.tree import DecisionTreeRegressor

# Baseline: a depth-1 tree (a single split) to compare the tuned model against.
# random_state is fixed because scikit-learn permutes the features at every
# split, so equally good splits could otherwise be resolved differently from
# run to run; it also matches the seed used for the tuned tree.
dtr_model = DecisionTreeRegressor(
    max_depth=1,
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=42,
)
dtr_model.fit(xtrain_pre, ytrain)

# .score() reports R^2 for regressors.
print(f'Train Model Score: {dtr_model.score(xtrain_pre, ytrain):.2f}')
print(f'Test Model Score: {dtr_model.score(xtest_pre, ytest):.2f}')

#-----------------------------
#Step-8: Hyperparameter Tuning
#-----------------------------
from sklearn.model_selection import RandomizedSearchCV

base = DecisionTreeRegressor(random_state=42)

# Candidate values sampled by the randomized search.
# NOTE(review): every min_samples_leaf candidate is >= 6, so a node needs at
# least 12 samples before it can be split at all; the min_samples_split
# candidates (5-8) can therefore never be the binding constraint. Confirm
# whether larger min_samples_split values were intended.
params = {
    "max_depth": [8, 9, 10, 11],
    "min_samples_leaf": [6, 7, 8, 9, 10],
    "min_samples_split": [5, 6, 7, 8],
}

# random_state pins which parameter combinations get sampled (n_iter is left
# at its default), so the selected model and the scores printed below are
# reproducible from run to run.
rscv = RandomizedSearchCV(base, params, cv=5, scoring="r2", random_state=42)
rscv.fit(xtrain_pre, ytrain)

# best_estimator_ is the winning configuration refitted on the full training split.
best_dtr = rscv.best_estimator_
print(f'Best DTR Train Model Score: {best_dtr.score(xtrain_pre, ytrain):.2f}')
print(f'Best DTR Test Model Score: {best_dtr.score(xtest_pre, ytest):.2f}')

#---------------------------------------------------
#Step-10: Model Inference (Out of Sample Prediction)
#---------------------------------------------------
xnew = pd.read_csv('test_PowerPlant.csv')

# Feed the pipeline exactly the columns it was fitted on, in the same order.
# Any column removed from X in Step-4, or any extra column present in the new
# file, would otherwise make transform() reject the frame.
xnew_pre = pipe.transform(xnew[xtrain.columns])

# Predictions for the unseen rows.
ynew_predict = best_dtr.predict(xnew_pre)
# Legacy alias: the original name is kept so anything that still reads it keeps
# working, even though these are out-of-sample (not training) predictions.
ytrain_predict = ynew_predict

# Attach the predicted power output, rounded to two decimals, to the new data.
xnew["PE"] = ynew_predict.round(2)

#------------------------------------------------------------
#Step-11:Save & Load Model using joblib.dump(), joblib.load()
#------------------------------------------------------------
import joblib
from pathlib import Path

# Persist the fitted preprocessor and tuned model, but only when the artifact
# is not already on disk: a fresh checkout can then run end to end, while
# existing files are never overwritten.
# NOTE(review): artifacts that already exist may come from an earlier fit and
# can differ from the objects trained above; delete them to force a re-save.
artifacts = {"pre.joblib": pipe, "plant_model.joblib": best_dtr}
for filename, fitted in artifacts.items():
    if not Path(filename).exists():
        joblib.dump(fitted, filename)

# SECURITY: joblib files are pickle-based; only load artifacts you created or trust.
p = joblib.load("pre.joblib")
m = joblib.load("plant_model.joblib")

Actual PowerPlant.csv file data (9568, 5)
Duplicates Found: 41 Removing Duplicates...
Removed Duplicates...
Train Model Score: 0.72
Test Model Score: 0.73
Best DTR Train Model Score: 0.96
Best DTR Test Model Score: 0.95
