In [7]:

# ==============================================================================
# 0. SETUP & INSTALLATION
# ==============================================================================
import subprocess
import sys
import os

def install(package):
    """Install ``package`` with pip into the environment of the running interpreter.

    Args:
        package: Requirement string handed to ``pip install`` unchanged.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # Go through ``sys.executable -m pip`` so the package lands in the same
    # environment this script is executing in.
    pip_command = [sys.executable, "-m", "pip", "install", package]
    subprocess.run(pip_command, check=True)

import importlib

# Importable module name -> PyPI distribution that provides it.
# RDKit is published on PyPI as "rdkit"; "rdkit-pypi" is the legacy
# distribution name and no longer receives releases.
_REQUIRED_PACKAGES = {
    "rdkit": "rdkit",
    "optuna": "optuna",
    "xgboost": "xgboost",
}

# Probe every dependency on its own so that one missing package does not
# trigger a reinstall of the packages that are already importable.
_missing_packages = []
for _module_name, _pip_name in _REQUIRED_PACKAGES.items():
    try:
        importlib.import_module(_module_name)
    except ImportError:
        _missing_packages.append(_pip_name)

if _missing_packages:
    print(f"Installing dependencies... ({', '.join(_missing_packages)})")
    for _pip_name in _missing_packages:
        install(_pip_name)
    # Let the import system of this (already running) process see the
    # freshly installed distributions.
    importlib.invalidate_caches()

# Bind the module names at top level, as the original probe did.
import rdkit
import optuna
import xgboost

import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.backends.backend_pdf import PdfPages
from scipy.signal import savgol_filter
from scipy.stats import ttest_rel, wilcoxon
import requests
import io

from rdkit import Chem
from rdkit.Chem import Draw
import optuna
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor

# Suppress every Python warning for the rest of the run and restrict Optuna's
# logger to WARNING and above.
warnings.filterwarnings('ignore')
optuna.logging.set_verbosity(optuna.logging.WARNING)

# --- Benchmark configuration -------------------------------------------------
# Names of the models that run_benchmark tunes and evaluates, in this order.
MODEL_LIST = ['PLS', 'Ridge', 'ElasticNet', 'SVR', 'XGBoost']
# Number of Optuna trials per study (one study per outer fold and model).
N_OPTUNA_TRIALS = 30
# Number of folds of the inner cross-validation scored inside each trial.
INNER_CV_FOLDS = 3
# Number of folds of the outer KFold split used for the reported metrics.
OUTER_CV_FOLDS = 3

# Start-of-run banner.
print(f"--- ðŸš€ GRAND MASTER PIPELINE (GASOLINE EDITION - FINAL) STARTED ---")

# ==============================================================================
# PHASE 1: DATA LOADING
# ==============================================================================
# Phase banner; the actual loading happens in load_gasoline_data() below.
print("\n--- ðŸ“‚ PHASE 1: Loading Gasoline Data ---")

def load_gasoline_data():
    """Load the gasoline spectra and split them into features and target.

    A file in the current working directory whose name ends in ``.csv`` and
    contains ``gasoline`` (case-insensitive) is preferred; when several match,
    the alphabetically first one is used so the choice is reproducible.
    Otherwise the CSV is downloaded from the hard-coded GitHub URL.

    Cleaning drops ``Unnamed*`` columns, then columns that are entirely NaN,
    then every row that still contains a NaN.

    The target is the first column whose name contains ``octane``
    (case-insensitive), falling back to the last column.

    Returns:
        tuple: ``(X, Y, wavelengths)`` where ``X`` is a DataFrame with all
        non-target columns, ``Y`` is a one-column DataFrame with the target
        and ``wavelengths`` is a float ndarray with one entry per column of
        ``X``. If the column names are not all numeric, ``wavelengths`` is an
        evenly spaced 900-1700 axis instead.

    Raises:
        SystemExit: If the download (or parsing of the downloaded file) fails.
    """
    # Sort so the selected file does not depend on os.listdir() ordering.
    local_files = sorted(
        f for f in os.listdir('.')
        if f.endswith('.csv') and 'gasoline' in f.lower()
    )

    if local_files:
        f_path = local_files[0]
        print(f"   > Found local file: {f_path}")
        df = pd.read_csv(f_path)
    else:
        url = "https://raw.githubusercontent.com/elnegmelnegm/gasoline/main/gasoline.csv"
        print(f"   > Downloading from: {url}")
        try:
            # A timeout keeps the script from hanging forever, and
            # raise_for_status() stops an HTTP error page (404/500) from being
            # parsed as if it were the CSV.
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            df = pd.read_csv(io.StringIO(response.content.decode('utf-8')))
        except Exception as e:
            sys.exit(f"   > Download failed: {e}. Please manually upload 'gasoline.csv'.")

    # CLEANING
    # Remove exported index columns *before* the row-wise NaN filter, so a
    # hole in a column that is discarded anyway cannot delete valid samples.
    df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
    df = df.dropna(axis=1, how='all')
    df = df.dropna(axis=0, how='any')

    target_col = next((c for c in df.columns if 'octane' in c.lower()), None)
    if target_col is None:
        target_col = df.columns[-1]

    Y = df[[target_col]]
    X = df.drop(columns=[target_col])

    try:
        wavelengths = np.array([float(c) for c in X.columns])
    except (TypeError, ValueError):
        # Column names are not plain numbers: fall back to an evenly spaced axis.
        wavelengths = np.linspace(900, 1700, X.shape[1])

    print(f"   > Loaded {X.shape[0]} Clean Samples with {X.shape[1]} Wavelengths.")
    return X, Y, wavelengths

# Load the cleaned spectra, the target column and the wavelength axis once.
X_raw, Y_raw, wavs = load_gasoline_data()

# FEATURE SELECTION
# Boolean mask over the wavelength axis: True inside the 1100-1350 window
# (both bounds inclusive).
mask_struct = np.logical_and(wavs >= 1100, wavs <= 1350)

# Feature matrices to benchmark, keyed by scenario name.
scenarios = dict(
    Structure=X_raw.loc[:, mask_struct],
    FullSpec=X_raw,
)

n_struct_features = mask_struct.sum()
n_full_features = X_raw.shape[1]
print("   > Scenarios defined:")
print(f"     1. Structure (1100-1350 nm): {n_struct_features} features")
print(f"     2. FullSpec  (900-1700 nm):  {n_full_features} features")

# ==============================================================================
# PHASE 2: CHEMICAL STRUCTURES
# ==============================================================================
print("\n--- ðŸ§ª PHASE 2: Generating Structures ---")

# Legend text -> SMILES string of each compound drawn on the report cover.
octane_mols = {
    'Isooctane (100)': 'CC(C)CC(C)(C)C',
    'n-Heptane (0)': 'CCCCCCC',
    'Toluene': 'CC1=CC=CC=C1',
    'Ethanol': 'CCO'
}
mols = list(map(Chem.MolFromSmiles, octane_mols.values()))

# Draw only when there is at least one molecule and every entry is truthy
# (presumably RDKit yields None for a SMILES it cannot parse -- confirm).
all_parsed = bool(mols) and all(mols)
if all_parsed:
    grid_options = dict(
        molsPerRow=4,
        subImgSize=(300, 300),
        legends=list(octane_mols),
        returnPNG=False,
    )
    img = Draw.MolsToGridImage(mols, **grid_options)
    img.save('gasoline_mols.png')
    print("   > Molecules saved to gasoline_mols.png")

# ==============================================================================
# PHASE 3: MODEL CLASSES & EXECUTION
# ==============================================================================
class SpectralPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless, row-wise spectral pre-treatment for use as a pipeline step.

    Parameters
    ----------
    method : str
        ``'snv'`` centres every row on its own mean and divides it by its own
        standard deviation; ``'deriv'`` applies a Savitzky-Golay derivative
        (polynomial order 2) along every row; any other value returns the
        data unchanged.
    window : int
        Requested Savitzky-Golay window length. The value actually used is
        the largest odd length that is at least 3, at most ``window`` (when
        ``window`` >= 3) and at most the number of features.
    deriv : int
        Derivative order passed to ``savgol_filter``.
    """

    def __init__(self, method='none', window=15, deriv=1):
        self.method = method
        self.window = window
        self.deriv = deriv

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return the pre-treated spectra as a NumPy array.

        Raises:
            ValueError: If ``method == 'deriv'`` and ``X`` has fewer than
                three features.
        """
        X_n = pd.DataFrame(X).copy()
        if self.method == 'snv':
            stds = X_n.std(axis=1)
            # Guard against division by zero for perfectly flat spectra.
            stds[stds == 0] = 1e-8
            X_n = X_n.sub(X_n.mean(axis=1), axis=0).div(stds, axis=0)
        elif self.method == 'deriv':
            w = self._effective_window(X_n.shape[1])
            X_n = savgol_filter(X_n, w, 2, deriv=self.deriv, axis=1)
        return np.array(X_n)

    def _effective_window(self, n_features):
        """Return a valid Savitzky-Golay window length for ``n_features`` columns."""
        w = min(int(self.window), n_features)
        # The window has to be odd; round down so it never exceeds the
        # requested length or the number of features.
        if w % 2 == 0:
            w -= 1
        # A 2nd-order polynomial needs a window of at least 3 points.
        w = max(w, 3)
        if w > n_features:
            raise ValueError(
                "Savitzky-Golay derivative needs at least 3 features, "
                f"got {n_features}."
            )
        return w

# Seed for the Optuna sampler so repeated runs explore the same trials.
_OPTUNA_SEED = 42


def _suggest_model_params(trial, name, n_components_max):
    """Sample the hyper-parameters of model ``name`` from an Optuna ``trial``."""
    if name == 'PLS':
        return {'n': trial.suggest_int('n', 2, max(2, n_components_max))}
    if name == 'Ridge':
        return {'a': trial.suggest_float('a', 0.01, 100, log=True)}
    if name == 'ElasticNet':
        return {
            'a': trial.suggest_float('a', 0.01, 10, log=True),
            'l1': trial.suggest_float('l1', 0.1, 0.9),
        }
    if name == 'SVR':
        return {'c': trial.suggest_float('c', 0.1, 100, log=True)}
    if name == 'XGBoost':
        return {
            'n': trial.suggest_int('n', 50, 150),
            'd': trial.suggest_int('d', 2, 6),
            'lr': trial.suggest_float('lr', 0.01, 0.2),
        }
    raise ValueError(f"Unknown model name: {name!r}")


def _build_pipeline(name, prep, params, xgb_n_jobs):
    """Build the imputer -> preprocessor -> scaler -> estimator pipeline."""
    if name == 'PLS':
        est = PLSRegression(n_components=params['n'], scale=False)
    elif name == 'Ridge':
        est = Ridge(alpha=params['a'])
    elif name == 'ElasticNet':
        est = ElasticNet(alpha=params['a'], l1_ratio=params['l1'])
    elif name == 'SVR':
        est = SVR(C=params['c'], epsilon=0.1)
    elif name == 'XGBoost':
        est = XGBRegressor(n_estimators=params['n'], max_depth=params['d'],
                           learning_rate=params['lr'], n_jobs=xgb_n_jobs,
                           verbosity=0)
    else:
        raise ValueError(f"Unknown model name: {name!r}")

    return Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('p', SpectralPreprocessor(method=prep)),
        ('s', StandardScaler()),
        ('e', est)
    ])


def run_benchmark(X_data, Y, scenario_name):
    """Benchmark every model in ``MODEL_LIST`` with nested cross-validation.

    For each outer fold and each model, an Optuna study tunes the
    preprocessing method and the model's hyper-parameters by inner
    cross-validated R2 on the training split. The best configuration is then
    refitted on the whole training split and scored on the held-out split.

    Args:
        X_data: Feature matrix (DataFrame or array), one row per sample.
        Y: Target values (DataFrame or array); flattened to 1-D internally.
        scenario_name: Label used in the progress output only.

    Returns:
        tuple: ``(stats, best_plot)``.

        ``stats[model]`` holds the per-outer-fold lists ``'R2'``, ``'RMSE'``
        and ``'MAE'``, plus ``'Params'``, the best Optuna parameters of the
        last outer fold.

        ``best_plot`` describes the model with the highest mean outer-fold R2:
        ``'Model'``, ``'R2_mean'``, the out-of-fold ``'Y_true'`` / ``'Y_pred'``
        pooled over all outer folds (1-D arrays), and ``'Pipe'``, the pipeline
        fitted in the last outer fold.
    """
    print(f"\n🔬 SCENARIO: {scenario_name} (Features: {X_data.shape[1]})")
    stats = {m: {'R2': [], 'RMSE': [], 'MAE': [], 'Params': None} for m in MODEL_LIST}
    # Out-of-fold predictions per model; the winner is picked only after all
    # folds are done, so every model is judged on the same number of folds.
    oof = {m: {'y_true': [], 'y_pred': [], 'pipe': None} for m in MODEL_LIST}

    outer = KFold(n_splits=OUTER_CV_FOLDS, shuffle=True, random_state=42)
    X_arr = X_data.values if isinstance(X_data, pd.DataFrame) else X_data
    Y_arr = Y.values if isinstance(Y, pd.DataFrame) else Y

    for fold, (t_ix, v_ix) in enumerate(outer.split(X_arr, Y_arr)):
        print(f"   > Fold {fold+1}...", end=" ")
        X_t, X_v = X_arr[t_ix], X_arr[v_ix]
        y_t_flat = np.ravel(Y_arr[t_ix])
        y_v_flat = np.ravel(Y_arr[v_ix])

        # PLS component cap: stay below the smallest inner-CV training size
        # and the feature count, and never above 20 to limit overfitting on
        # small data sets.
        inner_train_size = int(len(X_t) * (INNER_CV_FOLDS - 1) / INNER_CV_FOLDS)
        n_components_max = min(20, inner_train_size - 1, X_t.shape[1])

        for name in MODEL_LIST:
            # Bind the loop variable as a default so the closure cannot pick
            # up a later value of ``name``.
            def obj(trial, name=name):
                prep = trial.suggest_categorical('prep', ['none', 'snv', 'deriv'])
                params = _suggest_model_params(trial, name, n_components_max)
                # cross_val_score already parallelises over folds, so the
                # estimator itself stays single-threaded here.
                pipe = _build_pipeline(name, prep, params, xgb_n_jobs=1)
                return cross_val_score(pipe, X_t, y_t_flat, cv=INNER_CV_FOLDS,
                                       scoring='r2', n_jobs=-1).mean()

            study = optuna.create_study(
                direction='maximize',
                sampler=optuna.samplers.TPESampler(seed=_OPTUNA_SEED),
            )
            study.optimize(obj, n_trials=N_OPTUNA_TRIALS)

            bp = dict(study.best_params)
            prep = bp.pop('prep')

            final = _build_pipeline(name, prep, bp, xgb_n_jobs=-1)
            final.fit(X_t, y_t_flat)
            # Flatten: depending on the estimator/version the prediction may
            # come back as a column vector.
            p_v = np.ravel(final.predict(X_v))

            stats[name]['R2'].append(r2_score(y_v_flat, p_v))
            stats[name]['RMSE'].append(np.sqrt(mean_squared_error(y_v_flat, p_v)))
            stats[name]['MAE'].append(mean_absolute_error(y_v_flat, p_v))
            stats[name]['Params'] = study.best_params

            oof[name]['y_true'].append(y_v_flat)
            oof[name]['y_pred'].append(p_v)
            oof[name]['pipe'] = final

        print("Done.")

    best_plot = {'Model': None, 'R2_mean': -np.inf, 'Y_true': None, 'Y_pred': None, 'Pipe': None}
    for name in MODEL_LIST:
        if not stats[name]['R2']:
            continue
        mean_r2 = float(np.mean(stats[name]['R2']))
        if mean_r2 > best_plot['R2_mean']:
            best_plot = {
                'Model': name,
                'R2_mean': mean_r2,
                'Y_true': np.concatenate(oof[name]['y_true']),
                'Y_pred': np.concatenate(oof[name]['y_pred']),
                'Pipe': oof[name]['pipe'],
            }

    return stats, best_plot

# Run the benchmark once per feature scenario. The best-model plot data is
# kept for the "Structure" scenario only; for "FullSpec" just the per-model
# statistics are used and the second return value is discarded.
res_Struct, plots_Struct = run_benchmark(scenarios['Structure'], Y_raw, "Structure")
res_Full, _ = run_benchmark(scenarios['FullSpec'], Y_raw, "FullSpec")

# ==============================================================================
# PHASE 4: PDF REPORT GENERATION
# ==============================================================================
print("\n--- 📄 PHASE 4: Generating Report ---")
pdf_path = 'Gasoline_Benchmark_Report.pdf'


def _mean_sd_cell(values, decimals, width=22):
    """Format ``values`` as 'mean ± sd', left-aligned in ``width`` characters.

    The standard deviation is ``np.std`` with its default settings. The
    default ``width`` matches the 22-character columns of the table header.
    """
    summary = f"{np.mean(values):.{decimals}f} ± {np.std(values):.{decimals}f}"
    return f"{summary:<{width}}"


with PdfPages(pdf_path) as pdf:
    # --- Page 1: title, molecule grid (if it was drawn) and mean spectrum ---
    fig = plt.figure(figsize=(11, 8.5))
    plt.axis('off')
    plt.text(0.5, 0.95, "GASOLINE OCTANE BENCHMARK REPORT", ha='center', fontsize=18, weight='bold')

    if os.path.exists('gasoline_mols.png'):
        mol_img = mpimg.imread('gasoline_mols.png')
        ax_mol = fig.add_axes([0.1, 0.55, 0.8, 0.35])
        ax_mol.imshow(mol_img)
        ax_mol.axis('off')

    ax_spec = fig.add_axes([0.1, 0.1, 0.8, 0.4])
    mean_spec = X_raw.mean(0)
    ax_spec.plot(wavs, mean_spec, c='gray', alpha=0.3, label='Full NIR Spectrum')
    ax_spec.plot(wavs[mask_struct], mean_spec[mask_struct], c='orange', lw=2, label='Structure Selected')
    ax_spec.set_xlabel("Wavelength (nm)")
    ax_spec.set_ylabel("Absorbance")
    ax_spec.legend()
    ax_spec.set_title("Chemistry-Driven Feature Selection")
    pdf.savefig(fig)
    plt.close(fig)

    # --- Page 2: per-model comparison table for both scenarios ---
    fig = plt.figure(figsize=(11, 8.5))
    plt.axis('off')
    table_lines = [
        "COMPARATIVE RESULTS (Mean +/- SD)",
        f"{'Model':<12} | {'Scenario':<10} | {'R2':<22} | {'RMSE':<22} | {'MAE':<22}",
        "=" * 100,
    ]
    for m in MODEL_LIST:
        for scenario_label, results in (("Struct", res_Struct), ("FullSpec", res_Full)):
            # Every cell is padded to its header column width so the rows
            # line up under the header in the monospace font.
            row = " | ".join([
                f"{m:<12}",
                f"{scenario_label:<10}",
                _mean_sd_cell(results[m]['R2'], 4),
                _mean_sd_cell(results[m]['RMSE'], 2),
                _mean_sd_cell(results[m]['MAE'], 2),
            ])
            table_lines.append(row.rstrip())
        table_lines.append("-" * 100)
    txt = "\n".join(table_lines) + "\n"
    plt.text(0.05, 0.95, txt, family='monospace', fontsize=10, va='top')
    pdf.savefig(fig)
    plt.close(fig)

    # --- Page 3: predicted-vs-actual diagnostics for the best Structure model ---
    win = plots_Struct['Model']
    y_t = plots_Struct['Y_true']
    y_p = plots_Struct['Y_pred']
    if win is not None:
        # Flatten both arrays so a column-vector prediction cannot broadcast
        # the residuals into a 2-D matrix.
        y_t = np.ravel(y_t)
        y_p = np.ravel(y_p)

        fig, ax = plt.subplots(figsize=(8, 8))
        fig.suptitle(f"Diagnostics: {win} (Structure-Driven)", fontsize=16)
        ax.scatter(y_t, y_p, alpha=0.6, c='orange', edgecolor='k', s=50)
        mx = max(y_t.max(), y_p.max())
        mn = min(y_t.min(), y_p.min())
        # Identity line: perfect predictions fall on it.
        ax.plot([mn, mx], [mn, mx], 'k--', lw=2)
        ax.set_xlabel("Actual Octane")
        ax.set_ylabel("Predicted Octane")

        # Inset: residuals against the predicted value.
        resid = y_t - y_p
        ins = ax.inset_axes([0.6, 0.1, 0.35, 0.35])
        ins.scatter(y_p, resid, s=20, c='crimson', alpha=0.5)
        ins.axhline(0, c='k', lw=1)
        pdf.savefig(fig)
        plt.close(fig)

print(f"✅ FINAL REPORT GENERATED: {pdf_path}")

--- 🚀 GRAND MASTER PIPELINE (GASOLINE EDITION - FINAL) STARTED ---

--- 📂 PHASE 1: Loading Gasoline Data ---
   > Downloading from: https://raw.githubusercontent.com/elnegmelnegm/gasoline/main/gasoline.csv
   > Loaded 60 Clean Samples with 401 Wavelengths.
   > Scenarios defined:
     1. Structure (1100-1350 nm): 126 features
     2. FullSpec  (900-1700 nm):  401 features

--- 🧪 PHASE 2: Generating Structures ---
   > Molecules saved to gasoline_mols.png

🔬 SCENARIO: Structure (Features: 126)
   > Fold 1... Done.
   > Fold 2... Done.
   > Fold 3... Done.

🔬 SCENARIO: FullSpec (Features: 401)
   > Fold 1... Done.
   > Fold 2... Done.
   > Fold 3... Done.

--- 📄 PHASE 4: Generating Report ---
✅ FINAL REPORT GENERATED: Gasoline_Benchmark_Report.pdf
