In [145]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.metrics import r2_score
from gpytorch.mlls import ExactMarginalLogLikelihood
import gpytorch
import altair as alt
import matplotlib.pyplot as plt
import re
from scipy.stats import norm
from datetime import datetime
import os
import csv

In [146]:
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact Gaussian-process regressor: constant mean, scaled RBF covariance."""

    def __init__(self, train_x, train_y, likelihood):
        """Register the training data and likelihood, then build mean and kernel modules."""
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal built from the mean and covariance evaluated at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

def train_gp(xt_train, yt_train, training_iterations=100):
    """Fit an ``ExactGPModel`` by minimising the negative exact marginal log-likelihood.

    Args:
        xt_train: Training inputs handed to the GP.
        yt_train: Training targets handed to the GP.
        training_iterations: Number of Adam steps to take (learning rate 0.2).

    Returns:
        Tuple ``(model, likelihood, losses)``: the model and likelihood switched
        to eval mode, plus the list of per-step loss values as Python floats.
    """
    noise_model = gpytorch.likelihoods.GaussianLikelihood()
    gp = ExactGPModel(xt_train, yt_train, noise_model)

    # Both modules must be in training mode while hyperparameters are optimised.
    gp.train()
    noise_model.train()

    adam = torch.optim.Adam(gp.parameters(), lr=0.2)
    marginal_ll = ExactMarginalLogLikelihood(noise_model, gp)

    loss_history = []
    for _ in range(training_iterations):
        adam.zero_grad()
        neg_mll = -marginal_ll(gp(xt_train), yt_train)
        loss_history.append(neg_mll.item())
        neg_mll.backward()
        adam.step()

    return gp.eval(), noise_model.eval(), loss_history

In [147]:
def is_bin_column(col) -> bool:
    """Tell whether a column name denotes a histogram-bin feature.

    Returns True for:
      - integer column labels (Python ``int`` or any ``np.integer``),
      - strings made only of digits, e.g. ``'0'``, ``'17'``,
      - strings of the exact form ``'bin_<digits>'``, e.g. ``'bin_0'``.
    Everything else (after conversion with ``str``) yields False.
    """
    # Integer labels are accepted as-is.
    if isinstance(col, (int, np.integer)):
        return True

    name = str(col)
    # Either a purely numeric name or the full 'bin_<n>' pattern qualifies.
    return name.isdigit() or re.fullmatch(r"bin_\d+", name) is not None

In [148]:
def data_gen(
    temperature,
    pressure,
    vext_str,
    dft_path="/Users/danielbock/MASTERTHESIS/MASTA/DataArchiv/dft_fckin_clean_kond_64grid.csv",
    vext_dir="/Users/danielbock/MASTERTHESIS/MASTA/DataArchiv/Vext_allcsv",
):
    """Load, merge and filter the DFT and Vext tables for one (T, P) state point.

    Args:
        temperature: Value that ``temperature_kelvin`` must equal.
        pressure: Value that ``pressure_bar`` must equal.
        vext_str: Suffix selecting the file ``Vext_allTEMP_64grid_<vext_str>.csv``.
        dft_path: CSV with the DFT results (defaults to the original location).
        vext_dir: Directory containing the Vext CSV files (defaults to the
            original location).

    Returns:
        Tuple ``(data, feature_columns)``. ``data`` is an independent copy of the
        merged rows with ``beladung_mol_per_kg > 0`` at the requested state
        point; the merged index labels are kept. ``feature_columns`` lists the
        merged columns accepted by ``is_bin_column``.
    """
    dft_data = pd.read_csv(dft_path)
    expV_data = pd.read_csv(os.path.join(vext_dir, f"Vext_allTEMP_64grid_{vext_str}.csv"))
    data = pd.merge(dft_data, expV_data, how="inner", on=["structure_name", "temperature_kelvin"])

    feature_columns = [col for col in data.columns if is_bin_column(col)]

    keep = (
        (data["beladung_mol_per_kg"] > 0)
        & (data["temperature_kelvin"] == temperature)
        & (data["pressure_bar"] == pressure)
    )
    # Return a real copy: downstream code assigns into this frame, which on a
    # filtered slice triggers pandas' SettingWithCopy ambiguity.
    return data.loc[keep].copy(), feature_columns

In [149]:
def label_feature(data, feature_columns):
    """Add the target column and rescale the feature columns, without touching the input.

    The target ``beladung_pro_vol`` is ``beladung_atoms / volume_kubAng``. Every
    column in ``feature_columns`` is multiplied row-wise by ``grid.dv`` and
    divided row-wise by ``volume_kubAng``.

    Args:
        data: DataFrame providing ``beladung_atoms``, ``volume_kubAng``,
            ``grid.dv`` and the feature columns.
        feature_columns: Names of the columns to rescale.

    Returns:
        Tuple ``(data, label)``: a transformed copy of ``data`` and the name of
        the target column (``"beladung_pro_vol"``).
    """
    label = "beladung_pro_vol"

    # Work on a copy so the caller's frame is not modified; in-place rescaling
    # would be applied twice if this function were run again on the same frame.
    data = data.copy()
    data[label] = data["beladung_atoms"] / data["volume_kubAng"]

    if len(feature_columns) > 0:
        data[feature_columns] = (
            data[feature_columns]
            .multiply(data["grid.dv"], axis=0)
            .div(data["volume_kubAng"], axis=0)
        )
    return data, label

In [150]:
from torch.distributions import Normal

def af_log_expIm(mean, var, best_f, xi=0.01):
    """Logarithm of the Expected Improvement acquisition function (maximisation).

    Args:
        mean: Tensor of predictive means.
        var: Tensor of predictive variances, same shape as ``mean``.
        best_f: Incumbent (best observed) value.
        xi: Exploration margin subtracted from the improvement.

    Returns:
        Tensor of ``log(max(EI, 1e-9))``, same shape as ``mean``.

    Note:
        EI values below ``1e-9`` are floored, so such candidates all share the
        same score ``log(1e-9)``.
    """
    # Slightly negative variances would make sqrt() return NaN, which then
    # propagates through the whole score.
    std = torch.sqrt(torch.clamp(var, min=0.0))
    std_safe = torch.clamp(std, min=1e-9)  # Avoid division by zero
    z = (mean - best_f - xi) / std_safe
    normal = Normal(torch.zeros_like(z), torch.ones_like(z))
    cdf = normal.cdf(z)
    pdf = torch.exp(normal.log_prob(z))

    # Use the clamped std here too: with std == 0 the product would be 0 even
    # when the mean lies above the incumbent.
    ei = std_safe * (z * cdf + pdf)

    ei_safe = torch.clamp(ei, min=1e-9)  # Avoid log(0)
    return torch.log(ei_safe)

In [151]:
def BO_run(data, label, feature_columns, max_iterations=100, patience=10, n_initial=1):
    """Run a pool-based Bayesian-optimisation loop that maximises ``label``.

    Starts from the ``n_initial`` rows with the smallest ``label``. In each
    iteration a GP is fitted on the rows selected so far (features and label
    min-max scaled on those rows) and the candidate with the highest
    log-Expected-Improvement is added to the selection.

    The loop ends when one of the following happens:
      - ``max_iterations`` iterations were performed,
      - the running best value did not change over the last ``patience`` entries,
      - no candidates are left to choose from.

    Args:
        data: DataFrame with ``structure_name``, ``label`` and the feature columns.
        label: Name of the target column to maximise.
        feature_columns: Names of the feature columns fed to the GP.
        max_iterations: Upper bound on the number of BO iterations.
        patience: Length of the no-improvement window for early stopping.
        n_initial: Number of initial training rows.

    Returns:
        List with the running best ``label`` value: one entry for the initial
        selection plus one entry per completed iteration.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("BO_run needs at least one candidate row.")

    candidates = data.copy()

    # --- Initialisation: seed the training set with the lowest-label rows ---
    initial_indices = candidates.nsmallest(n_initial, label).index

    print(f"Initial training points:")
    for idx in initial_indices:
        print(f"  Index {idx}, Structure {candidates.loc[idx, 'structure_name']}, {label}: {candidates.loc[idx, label]:.4f}")

    selected = candidates.loc[initial_indices]
    candidates = candidates.drop(initial_indices)
    best = [selected[label].max()]

    # Index labels of every selected row; test_max() checks membership in this list.
    best_indices = list(initial_indices)

    # --- BO loop ---
    for i in range(max_iterations):
        if len(best) >= patience and len(np.unique(best[-patience:])) == 1:
            print(f"Early stopping at iteration {i} due to no improvement in the last {patience} iterations.")
            break

        # With small pools every row can be consumed before early stopping
        # triggers; scaling an empty candidate matrix would raise.
        if candidates.empty:
            print(f"Candidate pool exhausted at iteration {i}; stopping.")
            break

        feature_transformer = MinMaxScaler()
        label_transformer = MinMaxScaler()

        # Scalers are fitted on the selected rows only and applied to the candidates.
        train_x = torch.tensor(feature_transformer.fit_transform(selected[feature_columns].values))
        train_y = torch.tensor(label_transformer.fit_transform(selected[[label]].values)).flatten()
        test_x = torch.tensor(feature_transformer.transform(candidates[feature_columns].values))

        model, likelihood, _ = train_gp(train_x, train_y, 250)
        with torch.no_grad():
            prediction = model(test_x)
            mean, var = prediction.mean, prediction.variance

        # Incumbent and exploration margin live in the scaled label space.
        best_f = train_y.max()
        log_ei = af_log_expIm(mean, var, best_f, 0.01 * best_f)

        # --- Pick the next structure ---
        index = torch.argmax(log_ei).item()   # positional index into candidates
        new_index = candidates.index[index]   # corresponding index label
        best_indices.append(new_index)

        selected = pd.concat([selected, candidates.iloc[[index]]])
        best.append(selected[label].max())

        candidates = candidates.drop(new_index)

    print(f"Best Value after {len(best)} iterations: {best[-1]}")

    # Report whether the global optimum of `data` is among the selected rows.
    test_max(data, label, best_indices)

    return best


In [152]:
def test_max(data, label, best_indices):
    best_global = data[label].max()
    best_global_index = data[label].idxmax()

    print("\n=== Ergebnis-Check ===")
    print(f"Beste globale Struktur im Datensatz:")
    print(f"  Index: {best_global_index}")
    print(f"  Name: {data.loc[best_global_index, 'structure_name']}")
    print(f"  {label}: {best_global:.6f}")

    found_best = best_global_index in best_indices

    if found_best:
        print("\n✅ BO hat die globale beste Struktur gefunden!")
        return True
    else:
        print("\n⚠️ BO hat die globale beste Struktur NICHT gefunden!")
        return False


In [153]:
def get_temp_pressure_combinations(
    dft_path="/Users/danielbock/MASTERTHESIS/MASTA/DataArchiv/dft_fckin_clean_kond_64grid.csv",
    vext_str=None
):
    """List every (temperature_kelvin, pressure_bar) pair present after the merge.

    The DFT CSV at ``dft_path`` is inner-joined with the Vext CSV selected by
    ``vext_str`` on ``structure_name`` and ``temperature_kelvin``. If the column
    ``beladung_mol_per_kg`` exists, only rows with a positive value are kept.

    Returns:
        List of ``(temperature, pressure)`` tuples, de-duplicated, without NaN
        entries, sorted by temperature and then pressure.

    Raises:
        KeyError: If ``temperature_kelvin`` or ``pressure_bar`` is missing from
            the merged table.
    """
    vext_path = f"/Users/danielbock/MASTERTHESIS/MASTA/DataArchiv/Vext_allcsv/Vext_allTEMP_64grid_{vext_str}.csv"

    # Same join as in data_gen.
    merged = pd.merge(
        pd.read_csv(dft_path),
        pd.read_csv(vext_path),
        how='inner',
        on=["structure_name", "temperature_kelvin"],
    )

    # Keep only rows with a positive loading, when that column is available.
    if "beladung_mol_per_kg" in merged.columns:
        merged = merged[merged["beladung_mol_per_kg"] > 0]

    state_columns = ["temperature_kelvin", "pressure_bar"]
    if not set(state_columns) <= set(merged.columns):
        raise KeyError("Spalten 'temperature_kelvin' und/oder 'pressure_bar' fehlen nach dem Merge.")

    unique_pairs = merged[state_columns].dropna().drop_duplicates()
    unique_pairs = unique_pairs.sort_values(state_columns)

    return [tuple(row) for row in unique_pairs.to_numpy()]

In [154]:
def append_log_row(
    log_path,
    row_dict,
    header_order=(
        "timestamp",
        "temperature_kelvin",
        "pressure_bar",
        "n_candidates",
        "n_iterations",
        "best_value",
        "global_best_value",
        "found_global_best",
        "vext_str"
    )
):
    """Append one row to a CSV log, writing the header first when the file is new or empty.

    Args:
        log_path: Path of the CSV file to append to.
        row_dict: Mapping of column name to value. Keys not listed in
            ``header_order`` are ignored; missing keys are written as ``""``.
        header_order: Column names, in the order they are written.
    """
    # A file that exists but has zero bytes still needs its header, otherwise
    # the first data row would later be read as the column names.
    needs_header = not os.path.exists(log_path) or os.path.getsize(log_path) == 0

    with open(log_path, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=header_order)
        if needs_header:
            writer.writeheader()
        writer.writerow({k: row_dict.get(k, "") for k in header_order})

In [155]:
def run_all_BO(
    log_path="bo_results_fr.csv",
    vext_str=None,
    verbose=True
):
    """Run the BO pipeline for every (T, P) combination and log one CSV row per run.

    For each combination returned by ``get_temp_pressure_combinations`` this
    calls ``data_gen`` → ``label_feature`` → ``BO_run``, compares the best value
    reached with the maximum of the label in that subset, and appends the
    outcome to ``log_path``. Combinations without data, and runs in which
    ``BO_run`` raises, are logged with ``n_iterations = 0`` and empty values.

    Args:
        log_path: CSV file the results are appended to.
        vext_str: Suffix selecting the Vext file; also written to the log.
        verbose: If True, print progress and a per-run summary.
    """
    def log_result(T, P, n_candidates, n_iterations=0,
                   best_value="", global_best_value="", found_global_best=False):
        # Single place that defines the layout of a log row.
        append_log_row(log_path, {
            "timestamp": datetime.now().isoformat(timespec="seconds"),
            "temperature_kelvin": T,
            "pressure_bar": P,
            "n_candidates": n_candidates,
            "n_iterations": n_iterations,
            "best_value": best_value,
            "global_best_value": global_best_value,
            "found_global_best": found_global_best,
            "vext_str": str(vext_str)
        })

    combos = get_temp_pressure_combinations(vext_str=vext_str)
    if verbose:
        print(f"Gefundene Kombinationen: {len(combos)}")
        if len(combos) > 0:
            print("Erste 5:", combos[:5])

    for (T, P) in combos:
        if verbose:
            print("\n" + "="*72)
            print(f"Starte BO für T={T} K, P={P} bar")
            print("="*72)

        # --- Build the data set for this state point ---
        data, feature_cols = data_gen(temperature=T, pressure=P, vext_str=vext_str)

        if data is None or len(data) == 0:
            if verbose:
                print(f"[SKIP] Keine Daten für T={T}, P={P}.")
            log_result(T, P, n_candidates=0)
            continue

        if verbose:
            print(f"→ Kandidaten vor Filter: {len(data)}")

        # Pass the feature list explicitly instead of routing it through a
        # module-level global.
        data, label = label_feature(data, feature_cols)

        # Re-check in case label_feature returned nothing usable.
        if data is None or len(data) == 0:
            if verbose:
                print(f"[SKIP] Nach label_feature keine Daten mehr für T={T}, P={P}.")
            log_result(T, P, n_candidates=0)
            continue

        # --- Run BO; a failure for one state point must not abort the sweep ---
        try:
            best_list = BO_run(data, label, feature_cols)
        except Exception as e:
            if verbose:
                print(f"[ERROR] BO_run fehlgeschlagen für T={T}, P={P}: {e}")
            log_result(T, P, n_candidates=len(data))
            continue

        # --- Success check by value comparison (no index bookkeeping needed) ---
        if len(best_list):
            # Casting to a float array first also copes with object-typed entries.
            best_value = float(np.max(np.asarray(best_list, dtype=float)))
        else:
            best_value = float("nan")

        global_best_value = float(data[label].max())
        found_global = bool(np.isclose(best_value, global_best_value, rtol=1e-10, atol=1e-14))
        # best_list holds the initial value plus one entry per BO iteration.
        n_iterations = max(len(best_list) - 1, 0)

        if verbose:
            print(f"\n[SUMMARY] T={T} K, P={P} bar")
            print(f"  Kandidaten:            {len(data)}")
            print(f"  Iterationen (eff.):    {n_iterations}")
            print(f"  BO best_value:         {best_value:.12g}")
            print(f"  Global best in subset: {global_best_value:.12g}")
            print("  Treffer global best?:  " + ("✅ JA" if found_global else "❌ NEIN"))

        # --- Write the log row ---
        log_result(
            T, P,
            n_candidates=len(data),
            n_iterations=n_iterations,
            best_value=best_value,
            global_best_value=global_best_value,
            found_global_best=found_global,
        )

    if verbose:
        print("\nFertig. Ergebnisse geloggt nach:", os.path.abspath(log_path))

In [156]:
from glob import glob
import os

# Run the full (T, P) sweep once per Vext file; each molecule gets its own log.
vext_prefix = "Vext_allTEMP_64grid_"

# glob() returns matches in arbitrary order; sort so the run order is reproducible.
vext_files = sorted(glob("/Users/danielbock/MASTERTHESIS/MASTA/DataArchiv/Vext_allcsv/Vext_allTEMP_64grid_*.csv"))

for file_path in vext_files:
    # Extract the molecule name: strip only the leading prefix and the trailing extension.
    file_stem = os.path.splitext(os.path.basename(file_path))[0]
    vext_str = file_stem[len(vext_prefix):]

    run_all_BO(
        log_path=f"bo_log_cleankond_{vext_str}.csv",
        vext_str=vext_str,
        verbose=False
    )



Initial training points:
  Index 105, Structure LIT, beladung_pro_vol: 0.0000
Early stopping at iteration 12 due to no improvement in the last 10 iterations.
Best Value after 13 iterations: 0.001960839017015396

=== Ergebnis-Check ===
Beste globale Struktur im Datensatz:
  Index: 58
  Name: DOH
  beladung_pro_vol: 0.003369

⚠️ BO hat die globale beste Struktur NICHT gefunden!
Initial training points:
  Index 232, Structure LIT, beladung_pro_vol: 0.0000
Early stopping at iteration 12 due to no improvement in the last 10 iterations.
Best Value after 13 iterations: 0.0037008755204272323

=== Ergebnis-Check ===
Beste globale Struktur im Datensatz:
  Index: 205
  Name: AFY
  beladung_pro_vol: 0.005249

⚠️ BO hat die globale beste Struktur NICHT gefunden!
Initial training points:
  Index 359, Structure LIT, beladung_pro_vol: 0.0000
Early stopping at iteration 17 due to no improvement in the last 10 iterations.
Best Value after 18 iterations: 0.004580186055383727

=== Ergebnis-Check ===
Beste