In [208]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy.dialects.mssql.information_schema import columns
from sklearn.preprocessing import OneHotEncoder

# Experiment 3
In this experiment, apart from previous attempts, samples are to be split into train-test sets by the samples (Wood, Coal, Digestate, etc.), not random 80-20 split.

## Aim of the action
- As a result of regular 80-20 train-test-split approach, part of the data points from the same sample measurement at the same parameters fall into the test split (statistically, around one-third of the same sample at same conditions). This leads data leakage, and unreliable accuracy performance of model on test dataset.

## Details & Key Changes
### Outlier Removal
Biomass Data Before: 740 	After: 733	 Removed: 7
Mix Data Before:	 563 	After: 555	 Removed: 8
Coal Data Before:	 195 	After: 191	 Removed: 4
Plastic Data Before: 492 	After: 482	 Removed: 10

- test data contains only **mixed** fuels (digestate_car, digestate_pp, digestate_pe)
- max r2: 0.50 with Gradient Boosting (Baselines w/o hyperparameter tuning)


## Comments | To be improved
- Other fuel combinations are to be selected as test set.

In [209]:
features = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\preprocess\features.csv", delimiter=';')
labels = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\preprocess\labels.csv", delimiter=';')

In [210]:
features.columns

Index(['sample', 'wc_1', 'vm_1', 'fc_1', 'ac_1', 'c_1', 'h_1', 'o_1', 'n_1',
       's_1', 'cl_1', 'hc_1', 'oc_1', 'lhv_1', 'wc_2', 'vm_2', 'fc_2', 'ac_2',
       'c_2', 'h_2', 'o_2', 'n_2', 's_2', 'cl_2', 'hc_2', 'oc_2', 'lhv_2',
       'atmosphere', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'x_fuel1', 'x_fuel2'],
      dtype='object')

In [211]:
wc = features['wc_1']*features['x_fuel1'] + features['wc_2']*features['x_fuel2']
vm = features['vm_1']*features['x_fuel1'] + features['vm_2']*features['x_fuel2']
fc = features['fc_1']*features['x_fuel1'] + features['fc_2']*features['x_fuel2']
ac = features['ac_1']*features['x_fuel1'] + features['ac_2']*features['x_fuel2']
c = features['c_1']*features['x_fuel1'] + features['c_2']*features['x_fuel2']
h = features['h_1']*features['x_fuel1'] + features['h_2']*features['x_fuel2']
o = features['o_1']*features['x_fuel1'] + features['o_2']*features['x_fuel2']
n = features['n_1']*features['x_fuel1'] + features['n_2']*features['x_fuel2']
s = features['s_1']*features['x_fuel1'] + features['s_2']*features['x_fuel2']
cl = features['cl_1']*features['x_fuel1'] + features['cl_2']*features['x_fuel2']
hc = features['hc_1']*features['x_fuel1'] + features['hc_2']*features['x_fuel2']
oc = features['oc_1']*features['x_fuel1'] + features['oc_2']*features['x_fuel2']
lhv = features['lhv_1']*features['x_fuel1'] + features['lhv_2']*features['x_fuel2']

In [212]:
features.drop(['wc_1', 'vm_1', 'fc_1', 'ac_1', 'c_1', 'h_1', 'o_1', 'n_1',
       's_1', 'cl_1', 'hc_1', 'oc_1', 'lhv_1', 'wc_2', 'vm_2', 'fc_2', 'ac_2',
       'c_2', 'h_2', 'o_2', 'n_2', 's_2', 'cl_2', 'hc_2', 'oc_2', 'lhv_2',
       'atmosphere','x_fuel1', 'x_fuel2'], axis=1, inplace=True)

In [213]:
features = pd.concat([features, wc, vm, fc, ac, c,h,o,n,s,cl,hc,oc,lhv ],  axis=1)


In [214]:
features.columns = ['sample', 'temperature', 'residence_time', 'pressure',
            'heat_rate', 'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc', 'lhv']

In [215]:
features.head()

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,h,o,n,s,cl,hc,oc,lhv
0,brown_coal_1,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
1,brown_coal_2,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
2,brown_coal_3,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
3,brown_coal_4,600,0.5,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13
4,brown_coal_5,600,1.0,1.0,1000,10.8,44.2,40.9,4.2,63.4,3.9,16.2,0.9,0.6,0.0,0.7,0.2,20.13


In [216]:
def classify_fuel(sample):
    sample = str(sample).lower()

    if "gumm" in sample or "gm" in sample:
        return "gummipulver"
    elif "wod" in sample or 'wood' in sample or 'wd' in sample:
        return "wood"
    elif "coal" in sample or 'bks' in sample:
        return "coal"
    elif "swage" in sample or 'swg' in sample or 'sewage' in sample:
        return "sewage"
    elif "refuse" in sample or 'ref' in sample:
        return "ebs2"
    elif "digestate" in sample or 'dgst' in sample or "dgt" in sample:
        if 'pe' not in sample and 'pp' not in sample and 'car' not in sample:
            return "digestate"
        if 'pe' in sample:
            return "digestate_pe"
        if 'car' in sample:
            return "digestate_car"
        if 'pp' in sample:
            return "digestate_pp"
    elif "htc" in sample:
        return "HTC"
    elif "cel" in sample:
        return "cellulose"
    elif "hc" in sample:
        return "Hemicellulose"
    elif "lig" in sample:
        return "Lignin"
    elif "ebs2" in sample:
        return "ebs2"
    elif "ebs1" in sample:
        return "ebs1"
    else:
        return "unknown"  # Default category


features['fuel_type'] = features['sample'].apply(classify_fuel)

print(features.head())

         sample  temperature  residence_time  pressure  heat_rate    wc    vm  \
0  brown_coal_1          600             0.5       1.0       1000  10.8  44.2   
1  brown_coal_2          600             0.5       1.0       1000  10.8  44.2   
2  brown_coal_3          600             0.5       1.0       1000  10.8  44.2   
3  brown_coal_4          600             0.5       1.0       1000  10.8  44.2   
4  brown_coal_5          600             1.0       1.0       1000  10.8  44.2   

     fc   ac     c    h     o    n    s   cl   hc   oc    lhv fuel_type  
0  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
1  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
2  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
3  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  
4  40.9  4.2  63.4  3.9  16.2  0.9  0.6  0.0  0.7  0.2  20.13      coal  


In [217]:
def fuel_category(sample):
    sample = str(sample).lower()

    if "gumm" in sample or "gm" in sample:
        return "plastic"
    elif "wod" in sample or 'wood' in sample or 'wd' in sample:
        return "biomass"
    elif "coal" in sample or 'bks' in sample:
        return "coal"
    elif "swage" in sample or 'swg' in sample or 'sewage' in sample:
        return "biomass"
    elif "refuse" in sample or 'ref' in sample:
        return "plastic"
    elif "digestate" in sample or 'dgst' in sample or "dgt" in sample:
        if 'pe' not in sample and 'pp' not in sample and 'car' not in sample:
            return "plastic"
        if 'pe' in sample:
            return "mix"
        if 'car' in sample:
            return "mix"
        if 'pp' in sample:
            return "mix"
    elif "htc" in sample:
        return "biomass"
    elif "cel" in sample:
        return "biomass"
    elif "hc" in sample:
        return "biomass"
    elif "lig" in sample:
        return "biomass"
    elif "ebs2" in sample:
        return "plastic"
    elif "ebs1" in sample:
        return "plastic"
    else:
        return "unknown"  # Default category

features['fuel_category'] = features['sample'].apply(fuel_category)

In [218]:
set(features['fuel_category'])

{'biomass', 'coal', 'mix', 'plastic'}

In [219]:
features.head

<bound method NDFrame.head of             sample  temperature  residence_time  pressure  heat_rate    wc  \
0     brown_coal_1          600             0.5       1.0       1000  10.8   
1     brown_coal_2          600             0.5       1.0       1000  10.8   
2     brown_coal_3          600             0.5       1.0       1000  10.8   
3     brown_coal_4          600             0.5       1.0       1000  10.8   
4     brown_coal_5          600             1.0       1.0       1000  10.8   
...            ...          ...             ...       ...        ...   ...   
1985         gm_39         1000             1.0      10.0         10   2.6   
1986         gm_40         1000             1.0      10.0         50   2.6   
1987         gm_41         1000             1.0      10.0        100   2.6   
1988         gm_42         1000             1.0      10.0        500   2.6   
1989         gm_43         1000             1.0      10.0       1000   2.6   

        vm    fc    ac     c    h

In [220]:
features.isnull().sum()

sample              0
temperature         0
residence_time      0
pressure            0
heat_rate           0
wc                  0
vm                  0
fc                  0
ac                  0
c                   0
h                   0
o                   0
n                   0
s                   0
cl                  0
hc                  0
oc                  0
lhv               206
fuel_type           0
fuel_category       0
dtype: int64

In [221]:
encoder = OneHotEncoder(sparse_output=False)
one_hot_encoded = encoder.fit_transform(features[['fuel_category']])

one_hot_df = pd.DataFrame(one_hot_encoded, columns=encoder.get_feature_names_out())
features_encoded = pd.concat([features, one_hot_df], axis=1)
features_encoded.drop(columns=['fuel_category'], inplace=True)
print(features_encoded.head())


         sample  temperature  residence_time  pressure  heat_rate    wc    vm  \
0  brown_coal_1          600             0.5       1.0       1000  10.8  44.2   
1  brown_coal_2          600             0.5       1.0       1000  10.8  44.2   
2  brown_coal_3          600             0.5       1.0       1000  10.8  44.2   
3  brown_coal_4          600             0.5       1.0       1000  10.8  44.2   
4  brown_coal_5          600             1.0       1.0       1000  10.8  44.2   

     fc   ac     c  ...    s   cl   hc   oc    lhv  fuel_type  \
0  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
1  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
2  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
3  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   
4  40.9  4.2  63.4  ...  0.6  0.0  0.7  0.2  20.13       coal   

   fuel_category_biomass  fuel_category_coal fuel_category_mix  \
0                    0.0                 1.0            

In [222]:
features_encoded = pd.read_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Experiment_1\features_encoded.csv")

In [223]:
fuel_type_counts = features_encoded['fuel_type'].value_counts()
print(fuel_type_counts)

fuel_type
sewage           241
digestate        236
wood             236
digestate_car    206
coal             195
digestate_pp     183
digestate_pe     174
ebs2             138
HTC              134
ebs1              62
gummipulver       56
Hemicellulose     44
Lignin            44
cellulose         41
Name: count, dtype: int64


In [224]:
data = pd.concat([features_encoded, labels['devol_yield']], axis=1)

In [225]:
data.columns

Index(['sample', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc', 'oc',
       'lhv', 'fuel_type', 'fuel_category_biomass', 'fuel_category_coal',
       'fuel_category_mix', 'fuel_category_plastic', 'devol_yield'],
      dtype='object')

In [261]:
def remove_outliers_iqr(group, cols, multiplier=1.5):
    # If cols is a string, convert it to a list
    if isinstance(cols, str):
        cols = [cols]
    Q1 = group[cols].quantile(0.25)
    Q3 = group[cols].quantile(0.75)
    IQR = Q3 - Q1
    # Keep rows where all specified columns are within the acceptable range
    mask = ~((group[cols] < (Q1 - multiplier * IQR)) | (group[cols] > (Q3 + multiplier * IQR))).any(axis=1)
    return group[mask]

# Assuming `data` is your DataFrame:
biomass_data = data[data['fuel_category_biomass'] == 1]
mix_data = data[data['fuel_category_mix'] == 1]
coal_data = data[data['fuel_category_coal'] == 1]
plastic_data = data[data['fuel_category_plastic'] == 1]

# Group each category by 'temperature'
biomass = biomass_data.groupby('temperature')
mix = mix_data.groupby('temperature')
coal = coal_data.groupby('temperature')
plastic = plastic_data.groupby('temperature')

biomass_clean = biomass.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)
mix_clean = mix.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)
coal_clean = coal.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)
plastic_clean = plastic.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)

data_clean = data.groupby('temperature').apply(lambda g: remove_outliers_iqr(g, 'devol_yield'))

  biomass_clean = biomass.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)
  mix_clean = mix.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)
  coal_clean = coal.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)
  plastic_clean = plastic.apply(lambda g: remove_outliers_iqr(g, 'devol_yield')).reset_index(drop=True)
  data_clean = data.groupby('temperature').apply(lambda g: remove_outliers_iqr(g, 'devol_yield'))


In [227]:
print(f'Biomass Data Before: {biomass_data.shape[0]}', f'\tAfter: {biomass_clean.shape[0]}\t Removed: {biomass_data.shape[0] - biomass_clean.shape[0]}')
print(f'Mix Data Before:\t {mix_data.shape[0]}', f'\tAfter: {mix_clean.shape[0]}\t Removed: {mix_data.shape[0] - mix_clean.shape[0]}')
print(f'Coal Data Before:\t {coal_data.shape[0]}', f'\tAfter: {coal_clean.shape[0]}\t Removed: {coal_data.shape[0] - coal_clean.shape[0]}')
print(f'Plastic Data Before: {plastic_data.shape[0]}', f'\tAfter: {plastic_clean.shape[0]}\t Removed: {plastic_data.shape[0] - plastic_clean.shape[0]}')

Biomass Data Before: 740 	After: 733	 Removed: 7
Mix Data Before:	 563 	After: 555	 Removed: 8
Coal Data Before:	 195 	After: 191	 Removed: 4
Plastic Data Before: 492 	After: 482	 Removed: 10


In [228]:
clean_data = pd.concat([biomass_clean, mix_clean, coal_clean, plastic_clean])
clean_data.head()

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,...,cl,hc,oc,lhv,fuel_type,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
0,htc_96,200,0.5,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.0,1.2,0.4,13.528,HTC,1.0,0.0,0.0,0.0,6.785137
1,htc_152,200,1.0,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.0,1.2,0.4,13.528,HTC,1.0,0.0,0.0,0.0,2.878561
2,htc_152,200,1.0,1.0,1000,7.345,48.24,6.9,37.52,33.085,...,0.0,1.2,0.4,13.528,HTC,1.0,0.0,0.0,0.0,2.878561
3,cel_18,200,1.0,1.0,1000,5.725,87.9005,6.3745,0.0,40.305405,...,1.345032,1.020847,16.51605,17.562938,cellulose,1.0,0.0,0.0,0.0,2.018886
4,cel_19,200,1.0,1.0,1000,5.725,87.9005,6.3745,0.0,40.305405,...,1.345032,1.020847,16.51605,17.562938,cellulose,1.0,0.0,0.0,0.0,2.324709


In [229]:
fuel_type_counts_after = clean_data['fuel_type'].value_counts()
print(f'Before:{fuel_type_counts}\tAfter: {fuel_type_counts_after}')

Before:fuel_type
sewage           241
digestate        236
wood             236
digestate_car    206
coal             195
digestate_pp     183
digestate_pe     174
ebs2             138
HTC              134
ebs1              62
gummipulver       56
Hemicellulose     44
Lignin            44
cellulose         41
Name: count, dtype: int64	After: fuel_type
sewage           241
wood             236
digestate        233
digestate_car    205
coal             191
digestate_pp     181
digestate_pe     169
ebs2             133
HTC              131
ebs1              60
gummipulver       56
Lignin            44
Hemicellulose     41
cellulose         40
Name: count, dtype: int64


# Train-Test Split
- **coal** -> Test Data
- 90-10 train-test split


In [292]:
train_data = clean_data[(clean_data['fuel_type'] != 'digestate_car')&(clean_data['fuel_type'] != 'digestate_pe')&(clean_data['fuel_type'] != 'digestate_pp')].drop(columns=['fuel_type']).reset_index(drop=True)
print(clean_data.shape)
train_data.describe()


(1961, 24)


Unnamed: 0,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,h,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
count,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,...,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0,1406.0
mean,813.435277,5.592461,2.598506,886.628734,6.655715,61.086884,16.12491,16.146542,48.474232,5.247704,...,0.745751,0.373956,1.300237,2.056329,19.679618,0.521337,0.135846,0.0,0.342817,53.819926
std,193.453918,4.506775,4.087356,296.862143,3.072736,14.752519,11.928295,14.648016,14.804057,1.587047,...,0.77891,0.630066,0.347562,5.710993,6.602662,0.499722,0.342747,0.0,0.47482,22.629499
min,200.0,0.5,0.5,10.0,2.6,44.2,3.064326,0.0,32.4,3.265,...,0.04642,0.0,0.461416,0.0,13.528,0.0,0.0,0.0,0.0,-2.871983
25%,600.0,1.0,1.0,1000.0,3.2,50.7,6.9,1.707454,33.085,4.410371,...,0.2,0.0,1.04402,0.2,13.943,0.0,0.0,0.0,0.0,40.05336
50%,800.0,5.0,1.0,1000.0,7.345,53.7,15.4,8.2,50.4,4.8,...,0.369425,0.0,1.4,0.4,19.843,1.0,0.0,0.0,0.0,54.790839
75%,1000.0,10.0,1.0,1000.0,10.2,75.4,16.4,36.69,63.4,5.7,...,0.8,0.906227,1.5,0.5,20.13,1.0,0.0,0.0,1.0,70.311004
max,1200.0,20.0,20.0,1000.0,10.8,91.735674,40.9,37.52,82.3,8.9,...,2.5,1.585535,1.703704,26.10225,35.8,1.0,1.0,0.0,1.0,99.756944


In [293]:
train_data

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
0,htc_96,200,0.5,1.0,1000,7.345,48.2400,6.9000,37.52,33.085000,...,0.34200,0.000000,1.200000,0.40000,13.528000,1.0,0.0,0.0,0.0,6.785137
1,htc_152,200,1.0,1.0,1000,7.345,48.2400,6.9000,37.52,33.085000,...,0.34200,0.000000,1.200000,0.40000,13.528000,1.0,0.0,0.0,0.0,2.878561
2,htc_152,200,1.0,1.0,1000,7.345,48.2400,6.9000,37.52,33.085000,...,0.34200,0.000000,1.200000,0.40000,13.528000,1.0,0.0,0.0,0.0,2.878561
3,cel_18,200,1.0,1.0,1000,5.725,87.9005,6.3745,0.00,40.305405,...,0.04642,1.345032,1.020847,16.51605,17.562938,1.0,0.0,0.0,0.0,2.018886
4,cel_19,200,1.0,1.0,1000,5.725,87.9005,6.3745,0.00,40.305405,...,0.04642,1.345032,1.020847,16.51605,17.562938,1.0,0.0,0.0,0.0,2.324709
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,digestate_309,1200,10.0,1.0,1000,10.200,50.7000,15.4000,23.70,37.500000,...,0.20000,0.000000,1.500000,0.40000,15.055000,0.0,0.0,0.0,1.0,60.374640
1402,digestate_310,1200,10.0,1.0,1000,10.200,50.7000,15.4000,23.70,37.500000,...,0.20000,0.000000,1.500000,0.40000,15.055000,0.0,0.0,0.0,1.0,54.277600
1403,digestate_311,1200,10.0,1.0,1000,10.200,50.7000,15.4000,23.70,37.500000,...,0.20000,0.000000,1.500000,0.40000,15.055000,0.0,0.0,0.0,1.0,59.020408
1404,dgt_5,1200,1.0,10.0,1000,10.200,50.7000,15.4000,23.70,37.500000,...,0.20000,0.000000,1.500000,0.40000,15.055000,0.0,0.0,0.0,1.0,79.080180


In [294]:
test_data = clean_data[
    clean_data['fuel_type'].isin(['digestate_car', 'digestate_pe', 'digestate_pp'])
].drop(columns=['fuel_type']).reset_index(drop=True)

In [295]:
test_data

Unnamed: 0,sample,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,...,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic,devol_yield
0,digestate_pe_97,400,10.0,1.0,1000,7.650000,63.025000,11.550000,17.775000,49.362000,...,0.165250,0.00100,1.675000,0.300000,22.9250,0.0,0.0,1.0,0.0,9.983292
1,digestate_pe_99,400,10.0,1.0,1000,7.650000,63.025000,11.550000,17.775000,49.362000,...,0.165250,0.00100,1.675000,0.300000,22.9250,0.0,0.0,1.0,0.0,8.571429
2,digestate_pe_100,400,10.0,1.0,1000,7.752000,62.532000,11.704000,18.012000,48.887520,...,0.166640,0.00096,1.668000,0.304000,22.6102,0.0,0.0,1.0,0.0,15.764057
3,digestate_pe_101,400,10.0,1.0,1000,7.650000,63.025000,11.550000,17.775000,49.362000,...,0.165250,0.00100,1.675000,0.300000,22.9250,0.0,0.0,1.0,0.0,5.234657
4,digestate_pe_119,400,10.0,1.0,1000,5.100000,75.350000,7.700000,11.850000,61.224000,...,0.130500,0.00200,1.850000,0.200000,30.7950,0.0,0.0,1.0,0.0,9.507524
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,dgst_car_178,1200,10.0,1.0,1000,7.915350,55.223959,15.236271,21.624420,44.035753,...,0.289996,0.00000,1.554412,0.325950,,0.0,0.0,1.0,0.0,61.062907
551,dgst_car_184,1200,10.0,1.0,1000,7.937800,55.179506,15.237880,21.644815,43.971532,...,0.289112,0.00000,1.553877,0.326678,,0.0,0.0,1.0,0.0,54.058908
552,dgst_car_186,1200,10.0,1.0,1000,7.884637,55.284776,15.234070,21.596517,44.123615,...,0.291206,0.00000,1.555143,0.324955,,0.0,0.0,1.0,0.0,59.220872
553,dgst_car_303,1200,10.0,1.0,1000,7.890998,55.272180,15.234526,21.602296,44.105417,...,0.290956,0.00000,1.554992,0.325161,,0.0,0.0,1.0,0.0,56.876122


In [296]:
X_train = train_data.drop(columns=['sample', 'devol_yield'])
y_train = train_data['devol_yield']

X_test = test_data.drop(columns=['sample', 'devol_yield'])
y_test = test_data['devol_yield']

# kNN Imputation & Standard Scaling

In [297]:
import shap
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb

In [298]:
X_train.isnull().sum()

temperature              0
residence_time           0
pressure                 0
heat_rate                0
wc                       0
vm                       0
fc                       0
ac                       0
c                        0
h                        0
o                        0
n                        0
s                        0
cl                       0
hc                       0
oc                       0
lhv                      0
fuel_category_biomass    0
fuel_category_coal       0
fuel_category_mix        0
fuel_category_plastic    0
dtype: int64

In [299]:
knn_imputer = KNNImputer(n_neighbors=3)
X_train_imputed = knn_imputer.fit_transform(X_train)
X_test_imputed = knn_imputer.transform(X_test)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

X_train_scaled

Unnamed: 0,temperature,residence_time,pressure,heat_rate,wc,vm,fc,ac,c,h,...,n,s,cl,hc,oc,lhv,fuel_category_biomass,fuel_category_coal,fuel_category_mix,fuel_category_plastic
0,-3.172092,-1.130359,-0.391225,0.382035,0.224403,-0.871136,-0.773639,1.459656,-1.039898,-1.249748,...,0.168439,-0.518538,-0.593730,-0.288504,-0.290128,-0.932019,0.958199,-0.396487,0.0,-0.722250
1,-3.172092,-1.019375,-0.391225,0.382035,0.224403,-0.871136,-0.773639,1.459656,-1.039898,-1.249748,...,0.168439,-0.518538,-0.593730,-0.288504,-0.290128,-0.932019,0.958199,-0.396487,0.0,-0.722250
2,-3.172092,-1.019375,-0.391225,0.382035,0.224403,-0.871136,-0.773639,1.459656,-1.039898,-1.249748,...,0.168439,-0.518538,-0.593730,-0.288504,-0.290128,-0.932019,0.958199,-0.396487,0.0,-0.722250
3,-3.172092,-1.019375,-0.391225,0.382035,-0.303002,1.818209,-0.817709,-1.102694,-0.551993,-0.460159,...,-1.133261,-0.898152,1.541778,-0.804145,2.532811,-0.320694,0.958199,-0.396487,0.0,-0.722250
4,-3.172092,-1.019375,-0.391225,0.382035,-0.303002,1.818209,-0.817709,-1.102694,-0.551993,-0.460159,...,-1.133261,-0.898152,1.541778,-0.804145,2.532811,-0.320694,0.958199,-0.396487,0.0,-0.722250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,1.998937,0.978329,-0.391225,0.382035,1.153873,-0.704326,-0.060794,0.515848,-0.741563,-0.282199,...,0.591989,-0.700909,-0.593730,0.574959,-0.290128,-0.700666,-1.043625,-0.396487,0.0,1.384562
1402,1.998937,0.978329,-0.391225,0.382035,1.153873,-0.704326,-0.060794,0.515848,-0.741563,-0.282199,...,0.591989,-0.700909,-0.593730,0.574959,-0.290128,-0.700666,-1.043625,-0.396487,0.0,1.384562
1403,1.998937,0.978329,-0.391225,0.382035,1.153873,-0.704326,-0.060794,0.515848,-0.741563,-0.282199,...,0.591989,-0.700909,-0.593730,0.574959,-0.290128,-0.700666,-1.043625,-0.396487,0.0,1.384562
1404,1.998937,-1.019375,1.811471,0.382035,1.153873,-0.704326,-0.060794,0.515848,-0.741563,-0.282199,...,0.591989,-0.700909,-0.593730,0.574959,-0.290128,-0.700666,-1.043625,-0.396487,0.0,1.384562


# Baseline Models
## Simple Baselines

In [300]:
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor

# Dummy models
mean_model = DummyRegressor(strategy="mean")
median_model = DummyRegressor(strategy="median")

# KNN Regression
knn_model = KNeighborsRegressor(n_neighbors=5)

mean_model.fit(X_train_scaled, y_train)
median_model.fit(X_train_scaled, y_train)
knn_model.fit(X_train_scaled, y_train)
mean_predictions = mean_model.predict(X_test_scaled)
median_predictions = median_model.predict(X_test_scaled)
knn_predictions = knn_model.predict(X_test_scaled)

print('Mean Model Score:', mean_model.score(X_test_scaled, y_test))
print('Median Model Score:', median_model.score(X_test_scaled, y_test))
print('KNN Model Score:', knn_model.score(X_test_scaled, y_test))
from sklearn.metrics import mean_absolute_error, mean_squared_error

print("Mean Model MAE:", mean_absolute_error(y_test, mean_predictions))
print("Median Model MAE:", mean_absolute_error(y_test, median_predictions))
print("KNN Model MAE:", mean_absolute_error(y_test, knn_predictions))

Mean Model Score: -0.0568699952805094
Median Model Score: -0.08225382308000606
KNN Model Score: 0.4195718368770608
Mean Model MAE: 16.84028818456307
Median Model MAE: 16.855352936187387
KNN Model MAE: 12.2396465115009


## 2. Linear Models
Linear Regression (OLS)

Ridge Regression (L2 regularization)

Lasso Regression (L1 regularization)

Elastic Net (Combination of L1 and L2)

In [301]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

lin_model = LinearRegression()
ridge_model = Ridge(alpha=1.0)
lasso_model = Lasso(alpha=0.1)
elastic_model = ElasticNet(alpha=0.1, l1_ratio=0.5)
lin_model.fit(X_train_scaled, y_train)
ridge_model.fit(X_train_scaled, y_train)
lasso_model.fit(X_train_scaled, y_train)
elastic_model.fit(X_train_scaled, y_train)

print('Linear Model Score:', lin_model.score(X_test_scaled, y_test))
print('Ridge Model Score:', ridge_model.score(X_test_scaled, y_test))
print('Lasso Model Score:', lasso_model.score(X_test_scaled, y_test))
print('Elastic Model Score:', elastic_model.score(X_test_scaled, y_test))

Linear Model Score: 0.4658722407493747
Ridge Model Score: 0.47234335766688773
Lasso Model Score: 0.4521872235617278
Elastic Model Score: 0.4475460314583476


## 3. Tree-Based Models
Decision Tree Regression: Simple non-linear model.

Random Forest Regression: Ensemble of decision trees.

Gradient Boosting Regression (GBR)

XGBoost

LightGBM
CatBoost

In [302]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

dt_model = DecisionTreeRegressor(max_depth=5)
rf_model = RandomForestRegressor(n_estimators=100, max_depth=5)
gbr_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1)

xgb_model = xgb.XGBRegressor(n_estimators=100, learning_rate=0.1)
lgb_model = lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1)
dt_model.fit(X_train_scaled, y_train)
rf_model.fit(X_train_scaled, y_train)
gbr_model.fit(X_train_scaled, y_train)
xgb_model.fit(X_train_scaled, y_train)
lgb_model.fit(X_train_scaled, y_train)
dt_predictions = dt_model.predict(X_test_scaled)
gbr_predictions = gbr_model.predict(X_test_scaled)
lgb_predictions = lgb_model.predict(X_test_scaled)
xgb_predictions = xgb_model.predict(X_test_scaled)
rf_predictions = rf_model.predict(X_test_scaled)
print('Decision Tree Regressor Score:', mean_squared_error(y_test, dt_predictions))
print('Gradient Boosting Regressor Score:', mean_squared_error(y_test, gbr_predictions))
print('lightGBM Regressor Score:', mean_squared_error(y_test, lgb_predictions))
print('xgboost Regressor Score:', mean_squared_error(y_test, xgb_predictions))
print('RF Regressor Score:', mean_squared_error(y_test, rf_predictions))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 188
[LightGBM] [Info] Number of data points in the train set: 1406, number of used features: 20
[LightGBM] [Info] Start training from score 53.819926
Decision Tree Regressor Score: 477.905450216217
Gradient Boosting Regressor Score: 201.18534342572468
lightGBM Regressor Score: 347.1639381223634
xgboost Regressor Score: 243.5078879597977
RF Regressor Score: 297.3781886950375


In [303]:
print('Decision Tree Regressor Score:', dt_model.score(X_test_scaled, y_test))
print('Gradient Boosting Regressor Score:', gbr_model.score(X_test_scaled, y_test))
print('lightGBM Regressor Score:', lgb_model.score(X_test_scaled, y_test))
print('XGBoost Regressor Score:', xgb_model.score(X_test_scaled, y_test))
print('RF Regressor Score:', rf_model.score(X_test_scaled, y_test))

Decision Tree Regressor Score: -0.1839267186947271
Gradient Boosting Regressor Score: 0.5015986878079605
lightGBM Regressor Score: 0.1399623881159041
XGBoost Regressor Score: 0.39675202566094603
RF Regressor Score: 0.2632978280665985


## 4. Nonlinear & Probabilistic Models
Gaussian Process Regression (GPR)

Support Vector Regression (SVR)

Neural Networks (MLP)

In [304]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

gpr_model = GaussianProcessRegressor()
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)
mlp_model = MLPRegressor(hidden_layer_sizes=(100,), activation='relu', max_iter=2000)
gpr_model.fit(X_train_scaled, y_train)
svr_model.fit(X_train_scaled, y_train)
mlp_model.fit(X_train_scaled, y_train)
print('Gaussian Process Regressor Score:', gpr_model.score(X_test_scaled, y_test))
print('SVR Score:', svr_model.score(X_test_scaled, y_test))
print('MLP Score:', mlp_model.score(X_test_scaled, y_test))

Gaussian Process Regressor Score: -5.727921490202301
SVR Score: 0.023418902042940393
MLP Score: -1.0568009863117838
