In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy.dialects.mssql.information_schema import columns
from sklearn.preprocessing import OneHotEncoder

# Experiment 2
In this experiment, apart from previous attempts, fuels are to be separated by the samples (Wood, Coal, Digestate, etc.), not their main categories (Biomass, Plastic, Mix).
## Aim of the action
- As a result of regular 80-20 train-test-split approach, part of the data points from the same sample measurement at the same parameters fall into the test split (statistically, around one-third of the same sample at same conditions). This leads data leakage, and unreliable accuracy performance of model on test dataset.

See below example:
- Same fuel type (characteristics), 3 measurements at the same conditions


sample temperature	residence_time	pressure	heat_rate	wc	vm	fc	ac	c	h	o	n	s	cl	hc	oc	lhv

sewage_82	600	0.5	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943

sewage_83	600	0.5	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943

sewage_84	600	0.5	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943

**sewage_85	600	1	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943**

**sewage_86	600	1	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943**

**sewage_87	600	1	1	1000	3.2	53.7	6.41	36.69	32.4	4.6	17.11	3.9	2.1	1.5	1.703703704	0.396064815	13.943**


## Details & Key Changes
- test data contains only **coal** fuels
- 90-10 train-test split
- max r2: 0.90 with Gaussian Process (Baselines w/o hyperparameter tuning)


## Comments | To be improved
- Some of the models (decision tree, basic linear models etc.) are sensitive to outliers or prone to overfitting. Proper tuning is promising to increase model performances.
- Outlier removal **by grouping each fuel type by the temperatures**

In [46]:
features = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\preprocess\features.csv", delimiter=';')
labels = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\preprocess\labels.csv", delimiter=';')

In [47]:
features.columns

Index(['sample', 'wc_1', 'vm_1', 'fc_1', 'ac_1', 'c_1', 'h_1', 'o_1', 'n_1',
       's_1', 'cl_1', 'hc_1', 'oc_1', 'lhv_1', 'wc_2', 'vm_2', 'fc_2', 'ac_2',
       'c_2', 'h_2', 'o_2', 'n_2', 's_2', 'cl_2', 'hc_2', 'oc_2', 'lhv_2',
       'atmosphere', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'x_fuel1', 'x_fuel2'],
      dtype='object')

In [48]:
wc = features['wc_1']*features['x_fuel1'] + features['wc_2']*features['x_fuel2']
vm = features['vm_1']*features['x_fuel1'] + features['vm_2']*features['x_fuel2']
fc = features['fc_1']*features['x_fuel1'] + features['fc_2']*features['x_fuel2']
ac = features['ac_1']*features['x_fuel1'] + features['ac_2']*features['x_fuel2']
c = features['c_1']*features['x_fuel1'] + features['c_2']*features['x_fuel2']
h = features['h_1']*features['x_fuel1'] + features['h_2']*features['x_fuel2']
o = features['o_1']*features['x_fuel1'] + features['o_2']*features['x_fuel2']
n = features['n_1']*features['x_fuel1'] + features['n_2']*features['x_fuel2']
s = features['s_1']*features['x_fuel1'] + features['s_2']*features['x_fuel2']
cl = features['cl_1']*features['x_fuel1'] + features['cl_2']*features['x_fuel2']
hc = features['hc_1']*features['x_fuel1'] + features['hc_2']*features['x_fuel2']
oc = features['oc_1']*features['x_fuel1'] + features['oc_2']*features['x_fuel2']
lhv = features['lhv_1']*features['x_fuel1'] + features['lhv_2']*features['x_fuel2']

In [49]:
features.drop(['wc_1', 'vm_1', 'fc_1', 'ac_1', 'c_1', 'h_1', 'o_1', 'n_1',
       's_1', 'cl_1', 'hc_1', 'oc_1', 'lhv_1', 'wc_2', 'vm_2', 'fc_2', 'ac_2',
       'c_2', 'h_2', 'o_2', 'n_2', 's_2', 'cl_2', 'hc_2', 'oc_2', 'lhv_2',
       'atmosphere','x_fuel1', 'x_fuel2'], axis=1, inplace=True)

In [50]:
features = pd.concat([features, wc, vm, fc, ac, c,h,o,n,s,cl,hc,oc,lhv ],  axis=1)


In [51]:
features.columns = ['sample', 'temperature', 'residence_time', 'pressure',
            'heat_rate', 'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc', 'lhv']

In [52]:
features.head()

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,h,o,n,s,cl,hc,oc,lhv
0,brown_coal_1,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
1,brown_coal_2,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
2,brown_coal_3,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
3,brown_coal_4,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
4,brown_coal_5,600,1.0,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13


In [53]:
def classify_fuel(sample):
    sample = str(sample).lower()

    if "gumm" in sample or "gm" in sample:
        return "gummipulver"
    elif "wod" in sample or 'wood' in sample or 'wd' in sample:
        return "wood"
    elif "coal" in sample or 'bks' in sample:
        return "coal"
    elif "swage" in sample or 'swg' in sample or 'sewage' in sample:
        return "sewage"
    elif "refuse" in sample or 'ref' in sample:
        return "ebs2"
    elif "digestate" in sample or 'dgst' in sample or "dgt" in sample:
        if 'pe' not in sample and 'pp' not in sample and 'car' not in sample:
            return "digestate"
        if 'pe' in sample:
            return "digestate_pe"
        if 'car' in sample:
            return "digestate_car"
        if 'pp' in sample:
            return "digestate_pp"
    elif "htc" in sample:
        return "HTC"
    elif "cel" in sample:
        return "cellulose"
    elif "hc" in sample:
        return "Hemicellulose"
    elif "lig" in sample:
        return "Lignin"
    elif "ebs2" in sample:
        return "ebs2"
    elif "ebs1" in sample:
        return "ebs1"
    else:
        return "unknown"  # Default category


features['fuel_type'] = features['sample'].apply(classify_fuel)

print(features.head())

         sample  temperature  residence_time  pressure  heat_rate    wc    vm  \
0  brown_coal_1          600             0.5       1.0       1000  10.8  44.2   
1  brown_coal_2          600             0.5       1.0       1000  10.8  44.2   
2  brown_coal_3          600             0.5       1.0       1000  10.8  44.2   
3  brown_coal_4          600             0.5       1.0       1000  10.8  44.2   
4  brown_coal_5          600             1.0       1.0       1000  10.8  44.2   

     fc   ac     c    h     o    n    s   cl   hc   oc    lhv fuel_type  
0  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
1  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
2  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
3  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
4  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  


In [54]:
def fuel_category(sample):
    sample = str(sample).lower()

    if "gumm" in sample or "gm" in sample:
        return "plastic"
    elif "wod" in sample or 'wood' in sample or 'wd' in sample:
        return "biomass"
    elif "coal" in sample or 'bks' in sample:
        return "coal"
    elif "swage" in sample or 'swg' in sample or 'sewage' in sample:
        return "biomass"
    elif "refuse" in sample or 'ref' in sample:
        return "plastic"
    elif "digestate" in sample or 'dgst' in sample or "dgt" in sample:
        if 'pe' not in sample and 'pp' not in sample and 'car' not in sample:
            return "plastic"
        if 'pe' in sample:
            return "mix"
        if 'car' in sample:
            return "mix"
        if 'pp' in sample:
            return "mix"
    elif "htc" in sample:
        return "biomass"
    elif "cel" in sample:
        return "biomass"
    elif "hc" in sample:
        return "biomass"
    elif "lig" in sample:
        return "biomass"
    elif "ebs2" in sample:
        return "plastic"
    elif "ebs1" in sample:
        return "plastic"
    else:
        return "unknown"  # Default category

features['fuel_category'] = features['sample'].apply(fuel_category)

In [55]:
set(features['fuel_category'])

{'biomass', 'coal', 'mix', 'plastic'}

In [56]:
features.columns

Index(['sample', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc',
       'lhv', 'fuel_type', 'fuel_category'],
      dtype='object')

In [57]:
features.isnull().sum()

sample              0
temperature         0
residence_time      0
pressure            0
heat_rate           0
wc                  0
vm                  0
fc                  0
ac                  0
c                   0
h                   0
o                   0
n                   0
s                   0
cl                  0
hc                  0
oc                  0
lhv               206
fuel_type           0
fuel_category       0
dtype: int64

In [58]:
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(features[['fuel_category']])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
features_encoded = pd.concat([features, one_hot_df], axis=1)
features_encoded.drop(columns=['fuel_category'], inplace=True)
print(features_encoded.head())


         sample  temperature  residence_time  pressure  heat_rate    wc    vm  \
0  brown_coal_1          600             0.5       1.0       1000  10.8  44.2   
1  brown_coal_2          600             0.5       1.0       1000  10.8  44.2   
2  brown_coal_3          600             0.5       1.0       1000  10.8  44.2   
3  brown_coal_4          600             0.5       1.0       1000  10.8  44.2   
4  brown_coal_5          600             1.0       1.0       1000  10.8  44.2   

     fc   ac     c  ...    s   cl   hc   oc    lhv  fuel_type  \
0  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
1  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
2  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
3  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
4  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   

   fuel_category_biomass  fuel_category_coal fuel_category_mix  \
0                    0.0                 1.0            

In [59]:
features_encoded = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\features_encoded.csv")

In [60]:
fuel_type_counts = features_encoded['fuel_type'].value_counts()
print(fuel_type_counts)

fuel_type
sewage           241
digestate        236
wood             236
digestate_car    206
coal             195
digestate_pp     183
digestate_pe     174
ebs2             138
HTC              134
ebs1              62
gummipulver       56
Hemicellulose     44
Lignin            44
cellulose         41
Name: count, dtype: int64


In [61]:
data = pd.concat([features_encoded, labels['devol_yield']], axis=1)

In [62]:
data.columns

Index(['sample', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc',
       'lhv', 'fuel_type', 'fuel_category_biomass', 'fuel_category_coal',
       'fuel_category_mix', 'fuel_category_plastic', 'devol_yield'],
      dtype='object')

# Train-Test Split
- **coal** -> Test Data
- 90-10 train-test split


In [64]:
train_data = data.loc[data['fuel_type'] != 'coal'].reset_index(drop=True).drop(columns=['fuel_type'])
train_data.describe()

Unnamed: 0,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,h,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
count,1792.0,1792.0,1792.0,1792.0,1792.0,1792.0,1792.0,1792.0,1792.0,1792.0,...,1792.0,1792.0,1792.0,1792.0,1586.0,1792.0,1792.0,1792.0,1792.0,1792.0
mean,802.449777,5.224888,2.518415,926.489955,6.166711,64.116932,12.030067,17.686663,48.114919,6.185491,...,0.594473,0.295717,1.481106,1.694793,21.217813,0.41183,0.0,0.314174,0.273996,53.890596
std,198.394372,4.416346,4.061221,245.294319,2.426139,12.481599,6.250875,12.506266,12.931507,1.889008,...,0.735614,0.58044,0.264275,5.131881,7.159653,0.492302,0.0,0.464316,0.446131,21.785119
min,200.0,0.5,0.5,10.0,2.6,48.24,3.064326,0.0,32.4,3.265,...,0.04642,0.0,0.461416,0.0,13.528,0.0,0.0,0.0,0.0,0.606673
25%,600.0,1.0,1.0,1000.0,3.2,53.7,6.9,8.2,37.5,4.6,...,0.1465,0.0,1.4,0.250702,15.055,0.0,0.0,0.0,0.0,40.047269
50%,800.0,2.0,1.0,1000.0,5.81,59.804231,11.55,17.775,49.517,5.7,...,0.2,0.0,1.5,0.396065,19.843,0.0,0.0,0.0,0.0,55.975698
75%,1000.0,10.0,1.0,1000.0,7.65,75.4,15.4,23.7,55.100221,7.19625,...,0.8,0.0035,1.675,0.4,29.824045,1.0,0.0,1.0,1.0,69.030604
max,1200.0,20.0,20.0,1000.0,10.2,91.735674,35.249915,37.52,82.3,10.13205,...,2.5,1.585535,1.857,26.10225,35.8,1.0,0.0,1.0,1.0,99.756944


In [65]:
train_data

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
0,sewage_82,600,0.5,1.0,1000,3.2,53.7,6.41,36.69,32.4,...,2.1,1.5,1.703704,0.396065,13.943,1.0,0.0,0.0,0.0,9.630702
1,sewage_83,600,0.5,1.0,1000,3.2,53.7,6.41,36.69,32.4,...,2.1,1.5,1.703704,0.396065,13.943,1.0,0.0,0.0,0.0,20.481050
2,sewage_84,600,0.5,1.0,1000,3.2,53.7,6.41,36.69,32.4,...,2.1,1.5,1.703704,0.396065,13.943,1.0,0.0,0.0,0.0,7.439287
3,sewage_85,600,1.0,1.0,1000,3.2,53.7,6.41,36.69,32.4,...,2.1,1.5,1.703704,0.396065,13.943,1.0,0.0,0.0,0.0,5.423839
4,sewage_86,600,1.0,1.0,1000,3.2,53.7,6.41,36.69,32.4,...,2.1,1.5,1.703704,0.396065,13.943,1.0,0.0,0.0,0.0,17.044366
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1787,gm_39,1000,1.0,10.0,10,2.6,58.9,27.60,10.90,82.3,...,2.5,0.1,1.000000,0.000000,35.800,0.0,0.0,0.0,1.0,66.583182
1788,gm_40,1000,1.0,10.0,50,2.6,58.9,27.60,10.90,82.3,...,2.5,0.1,1.000000,0.000000,35.800,0.0,0.0,0.0,1.0,68.838085
1789,gm_41,1000,1.0,10.0,100,2.6,58.9,27.60,10.90,82.3,...,2.5,0.1,1.000000,0.000000,35.800,0.0,0.0,0.0,1.0,68.634051
1790,gm_42,1000,1.0,10.0,500,2.6,58.9,27.60,10.90,82.3,...,2.5,0.1,1.000000,0.000000,35.800,0.0,0.0,0.0,1.0,69.513635


In [66]:
test_data = data.loc[data['fuel_type'].isin(['ebs1', 'HTC', 'hemicellulose'])].reset_index(drop=True).drop(columns=['fuel_type'])

In [67]:
test_data

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
0,htc_1,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,17.427386
1,htc_9,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,11.277050
2,htc_15,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,16.384181
3,htc_96,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,6.785137
4,htc_152,200,1.0,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.342,0.0,1.2,0.4,13.528,1.0,0.0,0.0,0.0,2.878561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191,_ebs1_39,1000,1.0,10.0,10,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,81.351024
192,_ebs1_40,1000,1.0,10.0,50,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,86.075198
193,_ebs1_41,1000,1.0,10.0,100,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,86.096187
194,_ebs1_42,1000,1.0,10.0,500,2.700,82.10,7.0,8.20,68.700,...,0.800,0.0,1.5,0.1,32.177,0.0,0.0,0.0,1.0,85.577563


In [68]:
X_train = train_data.drop(columns=['sample', 'devol_yield'])
y_train = train_data['devol_yield']

X_test = test_data.drop(columns=['sample', 'devol_yield'])
y_test = test_data['devol_yield']

In [69]:
X_train.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\X_train.csv", index=False)
X_test.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\X_test.csv", index=False)
y_train.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\y_train.csv", index=False)
y_test.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\y_test.csv", index=False)

# kNN Imputation & Standard Scaling

In [70]:
import shap
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

In [71]:
# Step 1: Apply KNN Imputer
knn_imputer = KNNImputer(n_neighbors=5)
X_train_imputed = knn_imputer.fit_transform(X_train)
X_test_imputed = knn_imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Baseline Models
## Simple Baselines

In [72]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor

# Dummy models
mean_model = DummyRegressor(strategy="mean")
median_model = DummyRegressor(strategy="median")

# KNN Regression
knn_model = KNeighborsRegressor(n_neighbors=5)

mean_model.fit(X_train_scaled, y_train)
median_model.fit(X_train_scaled, y_train)
knn_model.fit(X_train_scaled, y_train)
mean_predictions = mean_model.predict(X_test_scaled)
median_predictions = median_model.predict(X_test_scaled)
knn_predictions = knn_model.predict(X_test_scaled)

print('Mean Model Score:', mean_model.score(X_test_scaled, y_test))
print('Median Model Score:', median_model.score(X_test_scaled, y_test))
print('KNN Model Score:', knn_model.score(X_test_scaled, y_test))
from sklearn.metrics import mean_absolute_error, mean_squared_error

print("Mean Model MAE:", mean_absolute_error(y_test, mean_predictions))
print("Median Model MAE:", mean_absolute_error(y_test, median_predictions))
print("KNN Model MAE:", mean_absolute_error(y_test, knn_predictions))

Mean Model Score: -0.01204320917166668
Median Model Score: -1.2171062144794575e-05
KNN Model Score: 0.7889137260358863
Mean Model MAE: 14.94223461190517
Median Model MAE: 14.614752681596938
KNN Model MAE: 6.1274879709765315


## 2. Linear Models
Linear Regression (OLS)

Ridge Regression (L2 regularization)

Lasso Regression (L1 regularization)

Elastic Net (Combination of L1 and L2)

In [73]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

lin_model = LinearRegression()
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=0.1)
elastic_model = ElasticNet(alpha=0.1, l1_ratio=0.5)
lin_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)
lasso_model.fit(X_train_scaled, y_train)
elastic_model.fit(X_train_scaled, y_train)

print('Linear Model Score:', lin_model.score(X_test_scaled, y_test))
print('Ridge Model Score:', ridge_model.score(X_test_scaled, y_test))
print('Lasso Model Score:', lasso_model.score(X_test_scaled, y_test))
print('Elastic Model Score:', elastic_model.score(X_test_scaled, y_test))

Linear Model Score: 0.5106170789530056
Ridge Model Score: 0.5116493770026761
Lasso Model Score: 0.5073228260763099
Elastic Model Score: 0.5149778262391527


## 3. Tree-Based Models
Decision Tree Regression: Simple non-linear model.

Random Forest Regression: Ensemble of decision trees.

Gradient Boosting Regression (GBR)

XGBoost

LightGBM
CatBoost

In [74]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

dt_model = DecisionTreeRegressor(max_depth=5)
rf_model = RandomForestRegressor(n_estimators=100, max_depth=5)
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)

xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1)
dt_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
gbr_model.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train)
lgb_model.fit(X_train_scaled, y_train)
dt_predictions = dt_model.predict(X_test_scaled)
gbr_predictions = gbr_model.predict(X_test_scaled)
lgb_predictions = lgb_model.predict(X_test_scaled)
xgb_predictions = xgb_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test_scaled)
print('Decision Tree Regressor Score:', mean_squared_error(y_test, dt_predictions))
print('Gradient Boosting Regressor Score:', mean_squared_error(y_test, gbr_predictions))
print('lightGBM Regressor Score:', mean_squared_error(y_test, lgb_predictions))
print('xgboost Regressor Score:', mean_squared_error(y_test, xgb_predictions))
print('RF Regressor Score:', mean_squared_error(y_test, rf_predictions))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1364
[LightGBM] [Info] Number of data points in the train set: 1792, number of used features: 20
[LightGBM] [Info] Start training from score 53.890596
Decision Tree Regressor Score: 133.35639565194106
Gradient Boosting Regressor Score: 81.84243497472974
lightGBM Regressor Score: 54.09901074492769
xgboost Regressor Score: 45.11613872319001
RF Regressor Score: 113.16460941973253


In [75]:
print('Decision Tree Regressor Score:', dt_model.score(X_test_scaled, y_test))
print('Gradient Boosting Regressor Score:', gbr_model.score(X_test_scaled, y_test))
print('lightGBM Regressor Score:', lgb_model.score(X_test_scaled, y_test))
print('XGBoost Regressor Score:', xgb_model.score(X_test_scaled, y_test))
print('RF Regressor Score:', rf_model.score(X_test_scaled, y_test))

Decision Tree Regressor Score: 0.6537095459564233
Gradient Boosting Regressor Score: 0.7874773547314411
lightGBM Regressor Score: 0.8595195160863163
XGBoost Regressor Score: 0.8828456026666835
RF Regressor Score: 0.7061421479933887


## 4. Nonlinear & Probabilistic Models
Gaussian Process Regression (GPR)

Support Vector Regression (SVR)

Neural Networks (MLP)

In [76]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

gpr_model = GaussianProcessRegressor()
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', max_iter=2000)
gpr_model.fit(X_train_scaled, y_train)
svr_model.fit(X_train_scaled, y_train)
mlp_model.fit(X_train_scaled, y_train)
print('Gaussian Process Regressor Score:', gpr_model.score(X_test_scaled, y_test))
print('SVR Score:', svr_model.score(X_test_scaled, y_test))
print('MLP Score:', mlp_model.score(X_test_scaled, y_test))

Gaussian Process Regressor Score: 0.9036725388819906
SVR Score: 0.5905216526872621
MLP Score: 0.8162590353022333
