# Single Fuel Train-Test Split | FEATURE ENGINEERING!!!


**Difference from v1 is that the coal sample is totally excluded.**

In this experiment, each fuel in turn has been used as the test set and excluded from the training set. A selected set of models was trained on each split and evaluated on the held-out fuel. The model performances for each trial have been saved to a CSV file at the end.

File loc: "C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Exp_data\experiment_results.csv"

In [34]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

# Read the experiment dataset from disk into a DataFrame.
data_path = r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Final Results\Final Data Used\final_data_filtered_ohe.csv"
df = pd.read_csv(data_path, sep=',')
# Final expression of the cell: shows which columns were loaded.
df.columns

Index(['sample', 'wc', 'vm', 'fc', 'ac', 'c', 'h', 'o', 'n', 's', 'cl', 'hc',
       'oc', 'lhv', 'temperature', 'residence_time', 'pressure', 'heat_rate',
       'fuel_type', 'devol_yield', 'fuel_category_Biomass',
       'fuel_category_Coal', 'fuel_category_Mix', 'fuel_category_Plastic'],
      dtype='object')

In [35]:
# Distinct fuel labels; the experiment loop holds each one out once as test set.
fuel_types = df['fuel_type'].unique()

# Tiny offset added to the denominator so that an 'fc' value of zero cannot
# trigger a division by zero.
epsilon = 1e-6
fc_offset = df['fc'] + epsilon

# Engineered features: 'vm' and 'ac' expressed relative to 'fc'.
df['vm_fc_ratio'] = df['vm'].div(fc_offset)
df['ac_fc_ratio'] = df['ac'].div(fc_offset)

# Observed limits of the two new ratios over the full dataset; they feed the
# min/max tables that are built afterwards.
min_vm_fc_ratio, max_vm_fc_ratio = df['vm_fc_ratio'].min(), df['vm_fc_ratio'].max()
min_ac_fc_ratio, max_ac_fc_ratio = df['ac_fc_ratio'].min(), df['ac_fc_ratio'].max()

# The raw columns are replaced by their ratios, so remove them in place.
df.drop(['vm', 'fc', 'ac'], axis='columns', inplace=True)

# Feature list after the replacement (raw columns out, ratio columns in).
feature_cols = [
    'h', 'wc', 'c', 'lhv',
    'o', 'n', 's', 'cl',
    'hc', 'oc',
    'temperature', 'residence_time', 'pressure', 'heat_rate',
    'vm_fc_ratio', 'ac_fc_ratio',
]

# Reference minimum / maximum per feature. The limits of the two ratio
# features come from the data; every other value is hard-coded. Entries for
# 'vm', 'fc' and 'ac' are still listed but are never looked up, because those
# names are not in feature_cols.
min_values = {
    'wc': 2.6, 'vm': 44.2, 'fc': 3.064326, 'ac': 0.0, 'c': 32.4, 'h': 3.265,
    'lhv': 13.528, 'o': 7.0, 'n': 0.269662, 's': 0.04642, 'cl': 0.0,
    'hc': 0.7, 'oc': 0.0,
    'temperature': 200, 'residence_time': 0.5, 'pressure': 0.5, 'heat_rate': 10,
    'vm_fc_ratio': min_vm_fc_ratio, 'ac_fc_ratio': min_ac_fc_ratio,
}
max_values = {
    'wc': 10.8, 'vm': 91.735674, 'fc': 44.5, 'ac': 37.52, 'c': 82.3, 'h': 10.13205,
    'lhv': 35.8, 'o': 54.936839, 'n': 3.9, 's': 2.5, 'cl': 1.5,
    'hc': 1.857, 'oc': 1.04402,
    'temperature': 1200, 'residence_time': 20, 'pressure': 20, 'heat_rate': 1000,
    'vm_fc_ratio': max_vm_fc_ratio, 'ac_fc_ratio': max_ac_fc_ratio,
}

# Fraction of each feature's range that is added on both sides (0.4 = 40 %).
margin_ratio = 0.4

# Widen every [min, max] interval by the margin, clamp the lower end at zero,
# and round both ends to three decimals.
expanded_bounds = {}
for feat in feature_cols:
    lo, hi = min_values[feat], max_values[feat]
    pad = (hi - lo) * margin_ratio
    expanded_bounds[feat] = (round(max(lo - pad, 0), 3), round(hi + pad, 3))

# Two-row frame (lower bounds, upper bounds) used as artificial fitting data.
X_bounds = pd.DataFrame({col: list(expanded_bounds[col]) for col in feature_cols})

# Global scaler fitted on the expanded bounds.
# NOTE: the per-fuel loop later in this cell rebinds `scaler` to a fresh
# MinMaxScaler fitted on each training split, so this bounds-based scaler is
# not the one applied to the model inputs.
scaler = MinMaxScaler().fit(X_bounds)

# Leave-one-fuel-out experiment: every fuel type is held out once as the test
# set, all remaining fuels form the training set, and each model is fitted and
# scored on that split. One row per (fuel, model) pair is written to CSV.

# Seed shared by all stochastic estimators so that re-running the cell
# reproduces the saved scores.
RANDOM_STATE = 42

# Columns that never enter the model inputs: the sample identifier, the
# target, and the three columns excluded in this experiment variant.
# Defined once so the train and test feature sets cannot drift apart.
non_feature_cols = ['sample', 'devol_yield', 'wc', 'h', 'c']

# Collected scores, one dict per (fuel, model) pair.
results = []

for test_fuel in fuel_types:
    print(f"Processing fuel type: {test_fuel}")

    # Split by fuel: the current fuel is the test set, everything else trains.
    is_test_fuel = df['fuel_type'] == test_fuel
    train_data = df[~is_test_fuel].drop(columns=['fuel_type']).reset_index(drop=True)
    test_data = df[is_test_fuel].drop(columns=['fuel_type']).reset_index(drop=True)

    X_train = train_data.drop(columns=non_feature_cols)
    y_train = train_data['devol_yield']
    X_test = test_data.drop(columns=non_feature_cols)
    y_test = test_data['devol_yield']
    print(X_train.head())

    # Scaling is learned from the training split only and then applied to the
    # held-out fuel. This rebinds the module-level name `scaler`.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Restore column names, which the scaler's array output does not carry.
    X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

    # Fresh estimators for every split; stochastic ones receive the shared seed.
    models = {
        "Dummy Mean": DummyRegressor(strategy="mean"),
        "Dummy Median": DummyRegressor(strategy="median"),
        "KNN": KNeighborsRegressor(n_neighbors=5),
        "Linear": LinearRegression(),
        "Ridge": Ridge(alpha=1.0),
        "Lasso": Lasso(alpha=0.1),
        "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
        "DecisionTree": DecisionTreeRegressor(max_depth=5, random_state=RANDOM_STATE),
        "RandomForest": RandomForestRegressor(
            n_estimators=100, max_depth=5, random_state=RANDOM_STATE
        ),
        "GradientBoosting": GradientBoostingRegressor(
            n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE
        ),
        "XGBoost": xgb.XGBRegressor(
            n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE
        ),
        "LightGBM": lgb.LGBMRegressor(
            n_estimators=100, learning_rate=0.1, random_state=RANDOM_STATE
        ),
        "GaussianProcess": GaussianProcessRegressor(),
        "SVR": SVR(kernel='rbf', C=1.0, epsilon=0.1),
        "MLP": MLPRegressor(
            hidden_layer_sizes=(100,), activation='relu', max_iter=2000,
            random_state=RANDOM_STATE,
        ),
    }

    # Fit on the training fuels and record the estimator's own `score` on the
    # held-out fuel (R^2 for scikit-learn regressors).
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        score = model.score(X_test_scaled, y_test)
        results.append({"Fuel Type": test_fuel, "Model": name, "Score": score})

# Persist the collected scores.
results_df = pd.DataFrame(results)
results_df.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Final Results\Systematic_Experiments\Feat_Eng\OHEsyst_exp_results_1.2_wc_h_c_vm_fc_ac+vmfc+acfc.csv", index=False)
print("Experiment completed! Results saved!")


Processing fuel type: Cellulose
      o      n      s   cl   hc   oc     lhv  temperature  residence_time  \
0  16.6  1.877  0.342  0.0  1.2  0.4  13.528          200             0.5   
1  16.6  1.877  0.342  0.0  1.2  0.4  13.528          200             0.5   
2  16.6  1.877  0.342  0.0  1.2  0.4  13.528          200             0.5   
3  16.6  1.877  0.342  0.0  1.2  0.4  13.528          200             0.5   
4  16.6  1.877  0.342  0.0  1.2  0.4  13.528          200             1.0   

   pressure  heat_rate  fuel_category_Biomass  fuel_category_Coal  \
0       1.0       1000                      1                   0   
1       1.0       1000                      1                   0   
2       1.0       1000                      1                   0   
3       1.0       1000                      1                   0   
4       1.0       1000                      1                   0   

   fuel_category_Mix  fuel_category_Plastic  vm_fc_ratio  ac_fc_ratio  
0                 



Processing fuel type: Digestate_PE
           o         n        s   cl        hc        oc       lhv  \
0  54.860844  0.269662  0.04642  0.0  1.345032  1.020847  16.51605   
1  54.860844  0.269662  0.04642  0.0  1.345032  1.020847  16.51605   
2  16.600000  1.877000  0.34200  0.0  1.200000  0.400000  13.52800   
3  16.600000  1.877000  0.34200  0.0  1.200000  0.400000  13.52800   
4  16.600000  1.877000  0.34200  0.0  1.200000  0.400000  13.52800   

   temperature  residence_time  pressure  heat_rate  fuel_category_Biomass  \
0          200             1.0       1.0       1000                      1   
1          200             1.0       1.0       1000                      1   
2          200             0.5       1.0       1000                      1   
3          200             0.5       1.0       1000                      1   
4          200             0.5       1.0       1000                      1   

   fuel_category_Coal  fuel_category_Mix  fuel_category_Plastic  vm_fc_rati



Processing fuel type: Torr-Wood
           o         n        s   cl        hc        oc       lhv  \
0  54.860844  0.269662  0.04642  0.0  1.345032  1.020847  16.51605   
1  54.860844  0.269662  0.04642  0.0  1.345032  1.020847  16.51605   
2  16.600000  1.877000  0.34200  0.0  1.200000  0.400000  13.52800   
3  16.600000  1.877000  0.34200  0.0  1.200000  0.400000  13.52800   
4  16.600000  1.877000  0.34200  0.0  1.200000  0.400000  13.52800   

   temperature  residence_time  pressure  heat_rate  fuel_category_Biomass  \
0          200             1.0       1.0       1000                      1   
1          200             1.0       1.0       1000                      1   
2          200             0.5       1.0       1000                      1   
3          200             0.5       1.0       1000                      1   
4          200             0.5       1.0       1000                      1   

   fuel_category_Coal  fuel_category_Mix  fuel_category_Plastic  vm_fc_ratio  