# Fuel Pairs Combinations Train-Test Split
In this experiment each possible fuel combination has been included as test set and excluded from training set. Each trained model was tried on selected models. Model performances within each trial has been saved as csv file at the end.

File loc: "C:\Users\demir\OneDrive\Desktop\MSc Thesis\Data\!Exp_data\experiment2_results.csv"

In [6]:
fuel_types = clean_data['fuel_type'].unique()
feature_cols = [
    'h', 'wc', 'vm', 'ac', 'c', 'lhv', 'o', 'n', 's', 'cl',
    'hc', 'oc', 'fc', 'temperature', 'residence_time', 'pressure', 'heat_rate'
]

# These are your original min and max values (as from your summary)
min_values = {
    'wc': 2.6, 'vm': 44.2, 'fc': 3.064326, 'ac': 0.0, 'c': 32.4, 'h': 3.265 , 'lhv': 13.528,
    'o': 7.0, 'n': 0.269662, 's': 0.04642, 'cl': 0.0, 'hc': 0.7, 'oc': 0.0,
    'temperature': 200, 'residence_time': 0.5, 'pressure': 0.5, 'heat_rate': 10
}
max_values = {
    'wc': 10.8, 'vm': 91.735674, 'fc': 44.5, 'ac': 37.52, 'c': 82.3, 'h': 10.13205, 'lhv': 35.8,
    'o': 54.936839, 'n': 3.9, 's': 2.5, 'cl': 1.5, 'hc': 1.857, 'oc': 1.04402, 'temperature': 1200, 'residence_time': 20, 'pressure': 20, 'heat_rate': 1000
}

# Optional: define a margin (e.g. 10%)
margin_ratio = 0.4

# Create expanded bounds
expanded_bounds = {}
for feat in feature_cols:
    min_val = min_values[feat]
    max_val = max_values[feat]
    range_val = max_val - min_val
    new_min = min_val - range_val * margin_ratio
    if new_min < 0:
        new_min = 0
    new_max = max_val + range_val * margin_ratio
    expanded_bounds[feat] = (round(new_min, 3), round(new_max, 3))

# Prepare fake data to fit the scaler
X_bounds = pd.DataFrame({col: [expanded_bounds[col][0], expanded_bounds[col][1]] for col in feature_cols})

# Fit a global scaler based on the expanded bounds
scaler = MinMaxScaler()
scaler.fit(X_bounds)

# Store results
results = []

# Iterate over all combinations of 2 fuel types
for test_comb in combinations(fuel_types, 2):

    # Split data
    train_data = clean_data[~clean_data['fuel_type'].isin(test_comb)].drop(columns=['fuel_type']).reset_index(drop=True)
    test_data = clean_data[clean_data['fuel_type'].isin(test_comb)].drop(columns=['fuel_type']).reset_index(drop=True)

    # Extract features and target
    X_train = train_data.drop(columns=['sample', 'devol_yield'])
    y_train = train_data['devol_yield']
    X_test = test_data.drop(columns=['sample', 'devol_yield'])
    y_test = test_data['devol_yield']

    # Train-test split ratio
    train_ratio = len(X_train) / (len(X_train) + len(X_test))
    test_ratio = 1 - train_ratio

    # Imputation and scaling
    knn_imputer = KNNImputer(n_neighbors=3)
    X_train_imputed = knn_imputer.fit_transform(X_train)
    X_test_imputed = knn_imputer.transform(X_test)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train_imputed)
    X_test_scaled = scaler.transform(X_test_imputed)

    X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
    X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

    # Models
    models = {
        "Dummy Mean": DummyRegressor(strategy="mean"),
        "Dummy Median": DummyRegressor(strategy="median"),
        "KNN": KNeighborsRegressor(n_neighbors=5),
        "Linear": LinearRegression(),
        "Ridge": Ridge(alpha=1.0),
        "Lasso": Lasso(alpha=0.1),
        "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
        "Decision Tree": DecisionTreeRegressor(max_depth=5),
        "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=5),
        "Gradient Boosting": GradientBoostingRegressor(n_estimators=100, learning_rate=0.1),
        "XGBoost": xgb.XGBRegressor(n_estimators=100, learning_rate=0.1),
        "LightGBM": lgb.LGBMRegressor(n_estimators=100, learning_rate=0.1),
        "Gaussian Process": GaussianProcessRegressor(),
        "SVR": SVR(kernel='rbf', C=1.0, epsilon=0.1),
        "MLP": MLPRegressor(hidden_layer_sizes=(100,), activation='relu', max_iter=2000)
    }

    # Train and evaluate models
    scores = {}
    for model_name, model in models.items():
        model.fit(X_train_scaled, y_train)
        scores[model_name] = model.score(X_test_scaled, y_test)

    # Store results
    results.append({
        "Test Fuel Types": test_comb,
        "Train-Test Ratio": (train_ratio, test_ratio),
        **scores
    })

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results to CSV
results_df.to_csv(r"C:\Users\demir\OneDrive\Desktop\MSc Thesis\Final Results\Systematic_Experiments\experiment2_results.csv", index=False)

print("Experiment completed. Results saved.")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000201 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1178
[LightGBM] [Info] Number of data points in the train set: 1417, number of used features: 21
[LightGBM] [Info] Start training from score 53.213209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1095
[LightGBM] [Info] Number of data points in the train set: 1478, number of used features: 21
[LightGBM] [Info] Start training from score 52.697112
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000324 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1094
[LightGBM] [Info] Number of data points in the train set: 1481, number of used features: 21
[LightGBM] [Info] Start trai