In [103]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy.dialects.mssql.information_schema import columns
from sklearn.preprocessing import OneHotEncoder

# Experiment 1
In this experiment, apart from previous attempts, fuels are to be separated by the samples (Wood, Coal, Digestate, etc.), not their main categories (Biomass, Plastic, Mix).
## Aim of the action
- As a result of regular 80-20 train-test-split approach, part of the data points from the same sample measurement at the same parameters fall into the test split (statistically, around one-third of the same sample at same conditions). This leads data leakage, and unreliable accuracy performance of model on test dataset.

See below example:
- Same fuel type (characteristics), 3 measurements at the same conditions


sample temperature	residence_time	pressure	heat_rate	wc	vm	fc	ac	c	h	o	n	s	cl	hc	oc	lhv

sewage_82	600	0.5	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943

sewage_83	600	0.5	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943

sewage_84	600	0.5	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943

**sewage_85	600	1	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943**

**sewage_86	600	1	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943**

**sewage_87	600	1	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943**


## Details
- 90-10 train-test split
- test data contains only **HTC, ebs1, hemicellulose** fuels
- max r2: 0.57 with Gradient Boosting (Baselines w/o hyperparameter tuning)

## Comments | To be improved
- shuffled dataset(?)
- different fuel_type selection




In [104]:
features = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\preprocess\features.csv", delimiter=';')
labels = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\preprocess\labels.csv", delimiter=';')

In [105]:
features.columns

Index(['sample', 'wc_1', 'vm_1', 'fc_1', 'ac_1', 'c_1', 'h_1', 'o_1', 'n_1',
       's_1', 'cl_1', 'hc_1', 'oc_1', 'lhv_1', 'wc_2', 'vm_2', 'fc_2', 'ac_2',
       'c_2', 'h_2', 'o_2', 'n_2', 's_2', 'cl_2', 'hc_2', 'oc_2', 'lhv_2',
       'atmosphere', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'x_fuel1', 'x_fuel2'],
      dtype='object')

In [106]:
wc = features['wc_1']*features['x_fuel1'] + features['wc_2']*features['x_fuel2']
vm = features['vm_1']*features['x_fuel1'] + features['vm_2']*features['x_fuel2']
fc = features['fc_1']*features['x_fuel1'] + features['fc_2']*features['x_fuel2']
ac = features['ac_1']*features['x_fuel1'] + features['ac_2']*features['x_fuel2']
c = features['c_1']*features['x_fuel1'] + features['c_2']*features['x_fuel2']
h = features['h_1']*features['x_fuel1'] + features['h_2']*features['x_fuel2']
o = features['o_1']*features['x_fuel1'] + features['o_2']*features['x_fuel2']
n = features['n_1']*features['x_fuel1'] + features['n_2']*features['x_fuel2']
s = features['s_1']*features['x_fuel1'] + features['s_2']*features['x_fuel2']
cl = features['cl_1']*features['x_fuel1'] + features['cl_2']*features['x_fuel2']
hc = features['hc_1']*features['x_fuel1'] + features['hc_2']*features['x_fuel2']
oc = features['oc_1']*features['x_fuel1'] + features['oc_2']*features['x_fuel2']
lhv = features['lhv_1']*features['x_fuel1'] + features['lhv_2']*features['x_fuel2']

In [107]:
features.drop(['wc_1', 'vm_1', 'fc_1', 'ac_1', 'c_1', 'h_1', 'o_1', 'n_1',
       's_1', 'cl_1', 'hc_1', 'oc_1', 'lhv_1', 'wc_2', 'vm_2', 'fc_2', 'ac_2',
       'c_2', 'h_2', 'o_2', 'n_2', 's_2', 'cl_2', 'hc_2', 'oc_2', 'lhv_2',
       'atmosphere','x_fuel1', 'x_fuel2'], axis=1, inplace=True)

In [108]:
features = pd.concat([features, wc, vm, fc, ac, c,h,o,n,s,cl,hc,oc,lhv ],  axis=1)


In [109]:
features.columns = ['sample', 'temperature', 'residence_time', 'pressure',
            'heat_rate', 'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc', 'lhv']

In [110]:
features.head()

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,h,o,n,s,cl,hc,oc,lhv
0,brown_coal_1,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
1,brown_coal_2,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
2,brown_coal_3,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
3,brown_coal_4,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
4,brown_coal_5,600,1.0,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13


In [111]:
def classify_fuel(sample):
    sample = str(sample).lower()

    if "gumm" in sample or "gm" in sample:
        return "gummipulver"
    elif "wod" in sample or 'wood' in sample or 'wd' in sample:
        return "wood"
    elif "coal" in sample or 'bks' in sample:
        return "coal"
    elif "swage" in sample or 'swg' in sample or 'sewage' in sample:
        return "sewage"
    elif "refuse" in sample or 'ref' in sample:
        return "ebs2"
    elif "digestate" in sample or 'dgst' in sample or "dgt" in sample:
        if 'pe' not in sample and 'pp' not in sample and 'car' not in sample:
            return "digestate"
        if 'pe' in sample:
            return "digestate_pe"
        if 'car' in sample:
            return "digestate_car"
        if 'pp' in sample:
            return "digestate_pp"
    elif "htc" in sample:
        return "HTC"
    elif "cel" in sample:
        return "cellulose"
    elif "hc" in sample:
        return "Hemicellulose"
    elif "lig" in sample:
        return "Lignin"
    elif "ebs2" in sample:
        return "ebs2"
    elif "ebs1" in sample:
        return "ebs1"
    else:
        return "unknown"  # Default category


features['fuel_type'] = features['sample'].apply(classify_fuel)

print(features.head())

         sample  temperature  residence_time  pressure  heat_rate    wc    vm  \
0  brown_coal_1          600             0.5       1.0       1000  10.8  44.2   
1  brown_coal_2          600             0.5       1.0       1000  10.8  44.2   
2  brown_coal_3          600             0.5       1.0       1000  10.8  44.2   
3  brown_coal_4          600             0.5       1.0       1000  10.8  44.2   
4  brown_coal_5          600             1.0       1.0       1000  10.8  44.2   

     fc   ac     c    h     o    n    s   cl   hc   oc    lhv fuel_type  
0  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
1  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
2  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
3  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
4  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  


In [112]:
def fuel_category(sample):
    sample = str(sample).lower()

    if "gumm" in sample or "gm" in sample:
        return "plastic"
    elif "wod" in sample or 'wood' in sample or 'wd' in sample:
        return "biomass"
    elif "coal" in sample or 'bks' in sample:
        return "coal"
    elif "swage" in sample or 'swg' in sample or 'sewage' in sample:
        return "biomass"
    elif "refuse" in sample or 'ref' in sample:
        return "plastic"
    elif "digestate" in sample or 'dgst' in sample or "dgt" in sample:
        if 'pe' not in sample and 'pp' not in sample and 'car' not in sample:
            return "plastic"
        if 'pe' in sample:
            return "mix"
        if 'car' in sample:
            return "mix"
        if 'pp' in sample:
            return "mix"
    elif "htc" in sample:
        return "biomass"
    elif "cel" in sample:
        return "biomass"
    elif "hc" in sample:
        return "biomass"
    elif "lig" in sample:
        return "biomass"
    elif "ebs2" in sample:
        return "plastic"
    elif "ebs1" in sample:
        return "plastic"
    else:
        return "unknown"  # Default category

features['fuel_category'] = features['sample'].apply(fuel_category)

In [113]:
features.columns

Index(['sample', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc',
       'lhv', 'fuel_type', 'fuel_category'],
      dtype='object')

In [114]:
features.isnull().sum()

sample              0
temperature         0
residence_time      0
pressure            0
heat_rate           0
wc                  0
vm                  0
fc                  0
ac                  0
c                   0
h                   0
o                   0
n                   0
s                   0
cl                  0
hc                  0
oc                  0
lhv               206
fuel_type           0
fuel_category       0
dtype: int64

In [115]:
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(features[['fuel_category']])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
features_encoded = pd.concat([features, one_hot_df], axis=1)
features_encoded.drop(columns=['fuel_category'], inplace=True)
print(features_encoded.head())


         sample  temperature  residence_time  pressure  heat_rate    wc    vm  \
0  brown_coal_1          600             0.5       1.0       1000  10.8  44.2   
1  brown_coal_2          600             0.5       1.0       1000  10.8  44.2   
2  brown_coal_3          600             0.5       1.0       1000  10.8  44.2   
3  brown_coal_4          600             0.5       1.0       1000  10.8  44.2   
4  brown_coal_5          600             1.0       1.0       1000  10.8  44.2   

     fc   ac     c  ...    s   cl   hc   oc    lhv  fuel_type  \
0  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
1  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
2  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
3  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
4  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   

   fuel_category_biomass  fuel_category_coal fuel_category_mix  \
0                    0.0                 1.0            

In [142]:
features_encoded = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\features_encoded.csv")

In [143]:
fuel_type_counts = features_encoded['fuel_type'].value_counts()
print(fuel_type_counts)

fuel_type
sewage           241
digestate        236
wood             236
digestate_car    206
coal             195
digestate_pp     183
digestate_pe     174
ebs2             138
HTC              134
ebs1              62
gummipulver       56
Hemicellulose     44
Lignin            44
cellulose         41
Name: count, dtype: int64


In [144]:
data = pd.concat([features_encoded, labels['devol_yield']], axis=1)

In [145]:
data.columns

Index(['sample', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc',
       'lhv', 'fuel_type', 'fuel_category_biomass', 'fuel_category_coal',
       'fuel_category_mix', 'fuel_category_plastic', 'devol_yield'],
      dtype='object')

# Train-Test Split
- **HTC, ebs1, hemicellulose** -> Test Data
- 90-10 train-test split


In [146]:
train_data = data.loc[(data['fuel_type'] != 'ebs1') & (data['fuel_type'] != 'HTC') & (data['fuel_type'] != 'hemicellulose')].reset_index(drop=True).drop(columns=['fuel_type'])
train_data.describe()

Unnamed: 0,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,h,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
count,1794.0,1794.0,1794.0,1794.0,1794.0,1794.0,1794.0,1794.0,1794.0,1794.0,...,1794.0,1794.0,1794.0,1794.0,1588.0,1794.0,1794.0,1794.0,1794.0,1794.0
mean,806.962096,5.26728,2.479933,935.41806,6.70026,62.545532,15.718578,15.0465,50.182138,6.059713,...,0.606277,0.296887,1.415988,1.699834,21.300064,0.337793,0.108696,0.313824,0.239688,51.713729
std,194.46918,4.398707,4.087087,230.697597,2.722819,12.919325,10.642208,11.699581,12.518899,1.796539,...,0.731098,0.581173,0.355244,5.155856,6.454991,0.473089,0.311344,0.464175,0.427012,22.490343
min,200.0,0.5,0.5,10.0,2.6,44.2,3.064326,0.0,32.4,3.9,...,0.04642,0.0,0.461416,0.0,13.943,0.0,0.0,0.0,0.0,-20.537715
25%,600.0,1.0,1.0,1000.0,4.998,53.7,7.546,4.2,37.5,4.6,...,0.1465,0.0,1.4,0.2,15.055,0.0,0.0,0.0,0.0,36.270342
50%,800.0,5.0,1.0,1000.0,7.6,59.635834,15.076181,12.561,50.4,5.7,...,0.2,0.0,1.5,0.325901,19.843,0.0,0.0,0.0,0.0,53.535348
75%,1000.0,10.0,1.0,1000.0,7.949662,75.3875,16.4,23.7,61.224,7.1004,...,0.6,0.0035,1.675,0.4,23.18104,1.0,0.0,1.0,0.0,67.950371
max,1200.0,20.0,20.0,1000.0,10.8,91.735674,40.9,36.69,82.3,10.13205,...,2.5,1.585535,1.857,26.10225,35.8,1.0,1.0,1.0,1.0,99.756944


In [147]:
train_data

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
0,brown_coal_1,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,...,0.6,0.0,0.7,0.2,20.13,0.0,1.0,0.0,0.0,3.437227
1,brown_coal_2,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,...,0.6,0.0,0.7,0.2,20.13,0.0,1.0,0.0,0.0,3.163017
2,brown_coal_3,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,...,0.6,0.0,0.7,0.2,20.13,0.0,1.0,0.0,0.0,4.827940
3,brown_coal_4,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,...,0.6,0.0,0.7,0.2,20.13,0.0,1.0,0.0,0.0,4.509894
4,brown_coal_5,600,1.0,1.0,1000,10.8,44.2,40.9,4.2,63.4,...,0.6,0.0,0.7,0.2,20.13,0.0,1.0,0.0,0.0,4.128819
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1789,gm_39,1000,1.0,10.0,10,2.6,58.9,27.6,10.9,82.3,...,2.5,0.1,1.0,0.0,35.80,0.0,0.0,0.0,1.0,66.583182
1790,gm_40,1000,1.0,10.0,50,2.6,58.9,27.6,10.9,82.3,...,2.5,0.1,1.0,0.0,35.80,0.0,0.0,0.0,1.0,68.838085
1791,gm_41,1000,1.0,10.0,100,2.6,58.9,27.6,10.9,82.3,...,2.5,0.1,1.0,0.0,35.80,0.0,0.0,0.0,1.0,68.634051
1792,gm_42,1000,1.0,10.0,500,2.6,58.9,27.6,10.9,82.3,...,2.5,0.1,1.0,0.0,35.80,0.0,0.0,0.0,1.0,69.513635


In [148]:
test_data = data.loc[data['fuel_type'].isin(['ebs1', 'HTC', 'hemicellulose'])].reset_index(drop=True).drop(columns=['fuel_type'])

In [149]:
test_data

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
0,htc_1,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,17.427386
1,htc_9,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,11.277050
2,htc_15,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,16.384181
3,htc_96,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,6.785137
4,htc_152,200,1.0,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,2.878561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,_ebs1_39,1000,1.0,10.0,10,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,81.351024
192,_ebs1_40,1000,1.0,10.0,50,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,86.075198
193,_ebs1_41,1000,1.0,10.0,100,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,86.096187
194,_ebs1_42,1000,1.0,10.0,500,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,85.577563


In [150]:
X_train = train_data.drop(columns=['sample', 'devol_yield'])
y_train = train_data['devol_yield']

X_test = test_data.drop(columns=['sample', 'devol_yield'])
y_test = test_data['devol_yield']

In [151]:
X_train.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\X_train.csv", index=False)
X_test.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\X_test.csv", index=False)
y_train.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\y_train.csv", index=False)
y_test.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\y_test.csv", index=False)

# kNN Imputation & Standard Scaling

In [152]:
import shap
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

In [155]:
# Step 1: Apply KNN Imputer
knn_imputer = KNNImputer(n_neighbors=5)  # You can adjust n_neighbors
X_train_imputed = knn_imputer.fit_transform(X_train)
X_test_imputed = knn_imputer.transform(X_test)  # Do not fit on test data!

# Step 2: Apply Standard Scaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)  # Do not fit on test data!

# Convert back to DataFrame (optional, if needed)
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Baseline Models
## Simple Baselines

In [157]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor

# Dummy models
mean_model = DummyRegressor(strategy="mean")
median_model = DummyRegressor(strategy="median")

# KNN Regression
knn_model = KNeighborsRegressor(n_neighbors=5)

mean_model.fit(X_train_scaled, y_train)
median_model.fit(X_train_scaled, y_train)
knn_model.fit(X_train_scaled, y_train)
mean_predictions = mean_model.predict(X_test_scaled)
median_predictions = median_model.predict(X_test_scaled)
knn_predictions = knn_model.predict(X_test_scaled)

print('Mean Model Score:', mean_model.score(X_test_scaled, y_test))
print('Median Model Score:', median_model.score(X_test_scaled, y_test))
print('KNN Model Score:', knn_model.score(X_test_scaled, y_test))
from sklearn.metrics import mean_absolute_error, mean_squared_error

print("Mean Model MAE:", mean_absolute_error(y_test, mean_predictions))
print("Median Model MAE:", mean_absolute_error(y_test, median_predictions))
print("KNN Model MAE:", mean_absolute_error(y_test, knn_predictions))

Mean Model Score: -0.048695508919001984
Median Model Score: -0.016344168297513173
KNN Model Score: 0.24190642249238115
Mean Model MAE: 15.57198958617905
Median Model MAE: 15.031100576239796
KNN Model MAE: 12.339735015287758


## 2. Linear Models
Linear Regression (OLS)

Ridge Regression (L2 regularization)

Lasso Regression (L1 regularization)

Elastic Net (Combination of L1 and L2)

In [158]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

lin_model = LinearRegression()
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=0.1)
elastic_model = ElasticNet(alpha=0.1, l1_ratio=0.5)
lin_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)
lasso_model.fit(X_train_scaled, y_train)
elastic_model.fit(X_train_scaled, y_train)
print('Linear Model Score:', lin_model.score(X_test_scaled, y_test))
print('Ridge Model Score:', ridge_model.score(X_test_scaled, y_test))
print('Lasso Model Score:', lasso_model.score(X_test_scaled, y_test))
print('Elastic Model Score:', elastic_model.score(X_test_scaled, y_test))

Linear Model Score: -0.5331588995454577
Ridge Model Score: 0.13028924662527708
Lasso Model Score: 0.4375609780409363
Elastic Model Score: 0.4191626678331708


## 3. Tree-Based Models
Decision Tree Regression: Simple non-linear model.

Random Forest Regression: Ensemble of decision trees.

Gradient Boosting Regression (GBR)

XGBoost

LightGBM
CatBoost

In [160]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

dt_model = DecisionTreeRegressor(max_depth=5)
rf_model = RandomForestRegressor(n_estimators=100, max_depth=5)
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)

xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1)
dt_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
gbr_model.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train)
lgb_model.fit(X_train_scaled, y_train)
dt_predictions = dt_model.predict(X_test_scaled)
gbr_predictions = gbr_model.predict(X_test_scaled)
lgb_predictions = lgb_model.predict(X_test_scaled)
xgb_predictions = xgb_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test_scaled)
print('Decision Tree Regressor Score:', mean_squared_error(y_test, dt_predictions))
print('Gradient Boosting Regressor Score:', mean_squared_error(y_test, gbr_predictions))
print('lightGBM Regressor Score:', mean_squared_error(y_test, lgb_predictions))
print('xgboost Regressor Score:', mean_squared_error(y_test, xgb_predictions))
print('RF Regressor Score:', mean_squared_error(y_test, rf_predictions))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1525
[LightGBM] [Info] Number of data points in the train set: 1794, number of used features: 21
[LightGBM] [Info] Start training from score 51.713729
Decision Tree Regressor Score: 254.66124820031087
Gradient Boosting Regressor Score: 162.21887997993437
lightGBM Regressor Score: 194.6909178822075
xgboost Regressor Score: 193.96822951289573
RF Regressor Score: 259.7891118551846


In [161]:
print('Decision Tree Regressor Score:', dt_model.score(X_test_scaled, y_test))
print('Gradient Boosting Regressor Score:', gbr_model.score(X_test_scaled, y_test))
print('lightGBM Regressor Score:', lgb_model.score(X_test_scaled, y_test))
print('XGBoost Regressor Score:', xgb_model.score(X_test_scaled, y_test))
print('RF Regressor Score:', rf_model.score(X_test_scaled, y_test))

Decision Tree Regressor Score: 0.33871368646797984
Gradient Boosting Regressor Score: 0.5787614885055249
lightGBM Regressor Score: 0.4944403976877504
XGBoost Regressor Score: 0.49631702371920194
RF Regressor Score: 0.3253980128953795


## 4. Nonlinear & Probabilistic Models
Gaussian Process Regression (GPR)

Support Vector Regression (SVR)

Neural Networks (MLP)

In [162]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

gpr_model = GaussianProcessRegressor()
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', max_iter=2000)
gpr_model.fit(X_train_scaled, y_train)
svr_model.fit(X_train_scaled, y_train)
mlp_model.fit(X_train_scaled, y_train)
print('Gaussian Process Regressor Score:', gpr_model.score(X_test_scaled, y_test))
print('SVR Score:', svr_model.score(X_test_scaled, y_test))
print('MLP Score:', mlp_model.score(X_test_scaled, y_test))

Gaussian Process Regressor Score: -7.770767061490957
SVR Score: 0.49780001034597887
MLP Score: 0.2966012184275231
