In [None]:
import pandas as pd
import torch
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler 
from sklearn.metrics import r2_score
from gpytorch.mlls import ExactMarginalLogLikelihood
import gpytorch
import altair as alt
import matplotlib.pyplot as plt
import re
from scipy.stats import norm

UTILITY

In [None]:
def plot_gp_results(candidates, mean_np, std_np, ei_np, pi_np, iteration, label="target_value"):
    """Draw a three-panel overview of a GP prediction over a candidate pool.

    Panels (top to bottom, sharing the x axis):
      1. predicted mean with a ``mean ± std`` band,
      2. expected improvement,
      3. probability of improvement.

    Parameters
    ----------
    candidates : DataFrame-like with a ``"structure_name"`` column; its length
        defines the x axis (one position per candidate).
    mean_np, std_np, ei_np, pi_np : array-likes plotted against the candidate
        positions.
    iteration : value interpolated into the title of the first panel.
    label : accepted for interface compatibility; not used by the function.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    structure_names = candidates["structure_name"].values
    positions = np.arange(len(candidates))  # one x position per candidate

    fig, (ax_mean, ax_ei, ax_pi) = plt.subplots(3, 1, figsize=(12, 10), sharex=True)
    plt.subplots_adjust(hspace=0.3)

    # Panel 1: posterior mean and a one-standard-deviation band.
    band_lower = mean_np - std_np
    band_upper = mean_np + std_np
    ax_mean.plot(positions, mean_np, color="C0", label="Predicted Mean")
    ax_mean.fill_between(positions, band_lower, band_upper, color="C0", alpha=0.3, label="Uncertainty (±1σ)")
    ax_mean.set_title(f"Iteration {iteration}: GP Prediction (mean ± std)")
    ax_mean.set_ylabel("Predicted Mean")
    ax_mean.legend()
    ax_mean.grid(alpha=0.3)

    # Panel 2: expected improvement.
    ax_ei.plot(positions, ei_np, color="C1")
    ax_ei.set_title("Expected Improvement over Candidates")
    ax_ei.set_ylabel("Expected Improvement")
    ax_ei.grid(alpha=0.3)

    # Panel 3: probability of improvement.
    ax_pi.plot(positions, pi_np, color="C2")
    ax_pi.set_title("Probability of Improvement")
    ax_pi.set_ylabel("Probability of Improvement")
    ax_pi.set_xlabel("Candidate Structure Index")
    ax_pi.grid(alpha=0.3)

    # Structure names as tick labels. Up to 30 candidates every name is shown
    # (stride 1); beyond that only every ``len // 30``-th name is labelled.
    stride = max(1, len(structure_names) // 30)
    ax_pi.set_xticks(positions[::stride])
    ax_pi.set_xticklabels(structure_names[::stride], rotation=90)

    plt.tight_layout()
    plt.show()


In [None]:
# Matches 'bin_<digits>' with an optional '_high' / '_low' suffix.
_BIN_COLUMN_PATTERN = re.compile(r"bin_\d+(_high|_low)?")


def is_bin_column(col) -> bool:
    """Return True if ``col`` is the name of a histogram-bin column.

    A column name counts as a bin column when it is

    * an integer (Python ``int`` or NumPy integer),
    * a string made up only of digits (``'0'``, ``'1'``, ...), or
    * of the form ``bin_<digits>``, optionally followed by ``_high`` or
      ``_low`` (``'bin_0'``, ``'bin_12_high'``, ``'bin_3_low'``).

    Anything else is converted with ``str()`` and tested against the string
    rules above.
    """
    # Integer column labels are accepted directly.
    if isinstance(col, (int, np.integer)):
        return True

    name = str(col)

    # Purely numeric names, or the bin_X[_high|_low] scheme. The plain
    # 'bin_X' case is covered by the optional suffix group, so a second
    # pattern for it is not needed.
    return name.isdigit() or _BIN_COLUMN_PATTERN.fullmatch(name) is not None

MODELL GP

In [None]:
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP with a constant mean and a scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood):
        """Register the training data, the likelihood and the mean/kernel modules."""
        super().__init__(train_x, train_y, likelihood)
        rbf_kernel = gpytorch.kernels.RBFKernel()
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(rbf_kernel)

    def forward(self, x):
        """Return the multivariate normal built from the mean and covariance at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )

def train_gp(xt_train, yt_train, training_iterations=100):
    """Fit an ``ExactGPModel`` by minimising the negative exact marginal log likelihood.

    Parameters
    ----------
    xt_train, yt_train : training inputs and targets handed to ``ExactGPModel``.
    training_iterations : number of Adam steps (learning rate 0.2).

    Returns
    -------
    (model, likelihood, losses) : model and likelihood switched to eval mode,
        and the list of loss values (one float per iteration).
    """
    likelihood = gpytorch.likelihoods.GaussianLikelihood()
    model = ExactGPModel(xt_train, yt_train, likelihood)

    for module in (model, likelihood):
        module.train()

    mll = ExactMarginalLogLikelihood(likelihood, model)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.2)

    losses = []
    for _ in range(training_iterations):
        optimizer.zero_grad()
        negative_mll = -mll(model(xt_train), yt_train)
        negative_mll.backward()
        losses.append(negative_mll.item())
        optimizer.step()

    return model.eval(), likelihood.eval(), losses

In [None]:
def _prepare_state(data, temp, pres):
    """Select one (temperature, pressure) state and derive per-volume quantities.

    Keeps the rows of ``data`` whose ``temperature_kelvin`` / ``pressure_bar``
    equal ``temp`` / ``pres``, drops duplicate
    (structure_name, temperature_kelvin, pressure_bar) rows, and returns an
    independent copy in which

    * ``beladung_pro_vol`` = ``beladung_atoms`` / ``volume_kubAng``
    * every bin column (see ``is_bin_column``) is multiplied by ``grid.dv``
      and divided by ``volume_kubAng``.
    """
    in_state = (data.temperature_kelvin == temp) & (data.pressure_bar == pres)
    # Explicit copy: the column assignments below must neither touch the
    # caller's frame nor trigger pandas' SettingWithCopyWarning.
    state = (
        data[in_state]
        .drop_duplicates(subset=["structure_name", "temperature_kelvin", "pressure_bar"])
        .copy()
    )
    # Determined before 'beladung_pro_vol' is added, as in the original order.
    bin_columns = [col for col in state.columns if is_bin_column(col)]

    state["beladung_pro_vol"] = state["beladung_atoms"].div(state["volume_kubAng"], axis=0)

    if bin_columns:
        state[bin_columns] = (
            state[bin_columns]
            .multiply(state["grid.dv"], axis=0)
            .div(state["volume_kubAng"], axis=0)
        )
    return state


def datenfilternundfertigmachenfür(data, temp1, temp2, pres1, pres2, add_features=True):
    """Build the working-capacity data set for a pair of thermodynamic states.

    Parameters
    ----------
    data : DataFrame with at least the columns ``structure_name``,
        ``temperature_kelvin``, ``pressure_bar``, ``beladung_atoms``,
        ``volume_kubAng``, ``grid.dv`` and the bin columns.
    temp1, pres1 : temperature / pressure of the "high" state.
    temp2, pres2 : temperature / pressure of the "low" state.
    add_features : if True (default, the previous hard-coded behaviour) the
        pressure and temperature columns of both states are appended to the
        returned feature list.

    Returns
    -------
    merged : inner join of both states on ``structure_name``; columns present
        in both states carry the suffix ``_high`` / ``_low``. Additional
        columns: ``working_capacity`` (absolute difference of
        ``beladung_pro_vol`` between the states), ``delta_T`` and ``delta_p``
        (absolute temperature / pressure differences).
    feature_columns : names of the bin columns in ``merged`` plus, when
        ``add_features`` is set, the four state columns.
    """
    data_high = _prepare_state(data, temp1, pres1)
    data_low = _prepare_state(data, temp2, pres2)

    additional_features = [
        "pressure_bar_high",
        "pressure_bar_low",
        "temperature_kelvin_high",
        "temperature_kelvin_low",
    ]

    merged = pd.merge(
        data_high,
        data_low,
        on="structure_name",
        suffixes=("_high", "_low"),
    )

    # Working capacity and state differences between the high and low state.
    merged["working_capacity"] = (merged["beladung_pro_vol_high"] - merged["beladung_pro_vol_low"]).abs()
    merged["delta_T"] = (merged["temperature_kelvin_high"] - merged["temperature_kelvin_low"]).abs()
    merged["delta_p"] = (merged["pressure_bar_high"] - merged["pressure_bar_low"]).abs()

    feature_columns = [col for col in merged.columns if is_bin_column(col)]
    if add_features:
        feature_columns += additional_features

    # Non-numeric entries become NaN instead of raising.
    merged["working_capacity"] = pd.to_numeric(merged["working_capacity"], errors="coerce")

    return merged, feature_columns

NORMALIZE

In [None]:
# Notebook-level switches read as globals by GPGPGP:
# when True, features / labels are min-max scaled per fold before GP training.
normalize_feature = True
normalize_labels = True

FOLD - TRAINING - PREDICTION

In [61]:
def GPGPGP(merged, feature_columns):
    """Cross-validate a GP regression of ``working_capacity`` with 10 folds.

    For every fold a fresh GP is trained (``train_gp``, 200 iterations) on the
    training part and evaluated on the held-out part. Scaling is controlled by
    the notebook-level flags ``normalize_feature`` and ``normalize_labels``;
    scalers are fitted on the training part of each fold only.

    Parameters
    ----------
    merged : DataFrame containing ``feature_columns`` and ``working_capacity``.
    feature_columns : list of column names used as GP inputs.

    Returns
    -------
    DataFrame with all held-out rows of all folds (index reset) and the extra
    columns ``fold``, ``working_capacity_pred`` (negative predictions are set
    to 0) and ``abs_rel_deviation`` (absolute relative deviation in percent).
    """
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    label = "working_capacity"
    X = merged[feature_columns].values
    y = merged[label].values

    fold_frames = []

    for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        x_train = np.asarray(X[train_idx], dtype=np.float64)
        x_test = np.asarray(X[test_idx], dtype=np.float64)
        y_train = np.asarray(y[train_idx], dtype=np.float64)

        test_df = merged.iloc[test_idx].copy()
        test_df["fold"] = fold

        # Feature scaling (fitted on the training part only).
        if normalize_feature:
            feature_transformer = MinMaxScaler()
            feature_transformer.fit(x_train)
            x_train = feature_transformer.transform(x_train)
            x_test = feature_transformer.transform(x_test)
        xt_train = torch.tensor(x_train, dtype=torch.float64)
        xt_test = torch.tensor(x_test, dtype=torch.float64)

        # Label scaling. The transformer stays None when scaling is disabled
        # so that the prediction step below knows not to invert anything.
        label_transformer = None
        if normalize_labels:
            label_transformer = MinMaxScaler()  # or StandardScaler()
            y_train = label_transformer.fit_transform(y_train.reshape(-1, 1)).flatten()
        yt_train = torch.tensor(y_train, dtype=torch.float64)

        # Training
        model, likelihood, losses = train_gp(xt_train, yt_train, training_iterations=200)

        # Prediction (posterior mean), mapped back to the original label scale.
        with torch.no_grad():
            predicted_mean = model(xt_test).mean
        predicted = predicted_mean.detach().cpu().numpy()
        if label_transformer is not None:
            predicted = label_transformer.inverse_transform(predicted.reshape(-1, 1)).squeeze()
        # Negative predictions are replaced by 0.
        predicted = np.where(predicted > 0, predicted, 0)

        # Results. The relative deviation divides by the true label, so rows
        # whose label is 0 yield inf/NaN here.
        test_df[f"{label}_pred"] = predicted
        test_df["abs_rel_deviation"] = np.abs(
            (test_df[label] - test_df[f"{label}_pred"]) / test_df[label] * 100
        )

        fold_frames.append(test_df)

    results = pd.concat(fold_frames, ignore_index=True)
    return results

ALLE DATEN

In [None]:
def printmirmeinenscheiss(results, label="working_capacity"):
    """Print summary metrics for a results frame.

    ``results`` must contain the columns ``label``, ``f"{label}_pred"`` and
    ``abs_rel_deviation``. Printed: R² between label and prediction, median,
    mean and maximum of ``abs_rel_deviation``, and how many rows exceed 20.
    """
    r2 = r2_score(results[label], results[f'{label}_pred'])
    print(f"R²                        : {r2:.4f}")

    ape = results['abs_rel_deviation']
    print(f"Median APE                : {ape.median():.2f}%")
    print(f"Mean APE                  : {ape.mean():.2f}%")

    count = (ape > 20).sum()
    print(f"Abs rel dev > 20%         : {count} out of {len(results)}")
    print(f"Max abs rel dev           : {ape.max():.2f}%")


BAYESIAN OPTIMIZATION

In [None]:
from torch.distributions import Normal

def af_log_expIm(mean, var, best_f, xi=0.01):
    """Logarithm of the Expected Improvement acquisition function.

    Computes ``EI = std * (z * Phi(z) + phi(z))`` with
    ``z = (mean - best_f - xi) / std`` and ``std = sqrt(var)``, then returns
    ``log(EI)``. The divisor and the EI value are both clamped to a minimum of
    1e-9, so the result is bounded below by ``log(1e-9)``.
    """
    sigma = torch.sqrt(var)
    # Clamp only the divisor; the unclamped sigma scales the EI below.
    z = (mean - best_f - xi) / sigma.clamp(min=1e-9)

    standard_normal = Normal(torch.zeros_like(z), torch.ones_like(z))
    density = standard_normal.log_prob(z).exp()
    cumulative = standard_normal.cdf(z)

    expected_improvement = sigma * (z * cumulative + density)

    # Clamp before the logarithm to avoid log(0).
    return expected_improvement.clamp(min=1e-9).log()

In [None]:
def bimbamBO(data_merged=None, features=None, label="working_capacity",
             n_initial=1, patience=10, max_iterations=100):
    """Run a GP-based Bayesian optimisation loop over a pool of candidates.

    Starting from the ``n_initial`` rows with the smallest ``label`` value, a
    GP is refitted in every iteration on the rows selected so far
    (min-max scaled features and labels), the log expected improvement is
    evaluated for all remaining candidates, and the candidate with the highest
    value is moved from the pool to the selection.

    All parameters are optional so that the original call ``bimbamBO()`` keeps
    working: ``data_merged`` / ``features`` fall back to the notebook-level
    variables ``merged`` / ``feature_columns``.

    Parameters
    ----------
    data_merged : DataFrame with the feature columns and ``label``.
    features : list of feature column names.
    label : name of the target column to maximise.
    n_initial : number of initial training rows.
    patience : stop once the last ``patience`` recorded best values are equal.
    max_iterations : upper bound on the number of iterations.

    Returns
    -------
    dict with the keys
        ``selected``   - rows chosen so far (initial rows included),
        ``best``       - recorded best label value per iteration,
        ``candidates`` - the pool that was scored in the last iteration,
        ``mean``, ``std``, ``ei``, ``pi`` - NumPy arrays aligned with
            ``candidates`` (GP mean/std in scaled label units, expected
            improvement, probability of improvement).
    The last five entries are None if no iteration was executed.
    """
    if data_merged is None:
        data_merged = merged
    if features is None:
        features = feature_columns

    candidates = data_merged.copy()

    # Initial training points: the rows with the smallest label values
    # (a random choice would work here as well).
    initial_indices = candidates.nsmallest(n_initial, label).index

    # Transfer from candidates to selection.
    selected = candidates.loc[initial_indices]
    candidates = candidates.drop(initial_indices)
    best = [selected[label].max()]

    scored_candidates = None
    mean = var = log_ei = best_f = None

    for _ in range(max_iterations):
        # Early stopping: no change of the best value in the last `patience` records.
        if len(best) >= patience and len(np.unique(best[-patience:])) == 1:
            break
        # Nothing left to evaluate.
        if candidates.empty:
            break

        feature_transformer = MinMaxScaler()
        label_transformer = MinMaxScaler()

        train_x = torch.tensor(feature_transformer.fit_transform(selected[features].values))
        train_y = torch.tensor(label_transformer.fit_transform(selected[[label]].values)).flatten()
        test_x = torch.tensor(feature_transformer.transform(candidates[features].values))

        model, likelihood, _losses = train_gp(train_x, train_y, 250)
        with torch.no_grad():
            prediction = model(test_x)
            mean, var = prediction.mean, prediction.variance

        best_f = train_y.max()
        log_ei = af_log_expIm(mean, var, best_f, 0.01 * best_f)

        # Select the candidate with the highest acquisition value.
        index = torch.argmax(log_ei).item()
        best.append(selected[label].max())

        # Remember the pool these predictions belong to before shrinking it.
        scored_candidates = candidates
        selected = pd.concat([selected, candidates.iloc[[index]]])
        # Remove the chosen row so it cannot be selected again.
        candidates = candidates.drop(candidates.index[index])

    outcome = {
        "selected": selected,
        "best": best,
        "candidates": scored_candidates,
        "mean": None,
        "std": None,
        "ei": None,
        "pi": None,
    }
    if scored_candidates is None:
        return outcome

    mean_np = mean.detach().cpu().numpy().flatten()
    var_np = var.detach().cpu().numpy().flatten()
    std_np = np.sqrt(var_np)
    ei_np = torch.exp(log_ei).detach().cpu().numpy().flatten()

    # --- Probability of Improvement ---
    # Guard the divisor so that a zero predictive std does not divide by zero.
    z = (mean_np - best_f.item()) / np.maximum(std_np, 1e-12)
    pi_np = norm.cdf(z)

    outcome.update(mean=mean_np, std=std_np, ei=ei_np, pi=pi_np)
    return outcome

In [None]:
def tempunddruckunique(data):
    """Return the distinct temperatures, pressures and (T, p) pairs in ``data``.

    Returns
    -------
    temperatures : ascending list of the unique ``temperature_kelvin`` values.
    pressures : ascending list of the unique ``pressure_bar`` values.
    combinations : list of ``(temperature_kelvin, pressure_bar)`` tuples, one
        per distinct pair, in order of first appearance.
    """
    temperatures = list(data['temperature_kelvin'].unique())
    temperatures.sort()

    pressures = list(data['pressure_bar'].unique())
    pressures.sort()

    distinct_pairs = data[['temperature_kelvin', 'pressure_bar']].drop_duplicates()
    combinations = [pair for pair in distinct_pairs.itertuples(index=False, name=None)]

    return temperatures, pressures, combinations

In [None]:
# Directory holding all input CSV files; adjust this single constant when the
# data lives elsewhere.
DATA_DIR = "/Users/danielbock/MASTERTHESIS/MASTA/DataArchiv"
# Columns identifying one (structure, state) record in every input table.
MERGE_KEYS = ["structure_name", "temperature_kelvin", "pressure_bar"]

# DFT data - loadings, grid etc.
dft_data1 = pd.read_csv(f"{DATA_DIR}/DFT_Data_clean_06_10.csv")
dft_data2 = pd.read_csv(f"{DATA_DIR}/dft_fckin_clean_kond_64grid.csv")
dft_data_all = pd.concat([dft_data1, dft_data2], ignore_index=True)

# Feature data - bins for Vext, Vext+chem_res etc.
expV_data = pd.read_csv(f"{DATA_DIR}/Vext_chem_res_allTEMP_pressure_20b_exp.csv")
# Alternative feature file:
#expV_data = pd.read_csv(f"{DATA_DIR}/Vext_allcsv/Vext_allTEMP_64grid_20b.csv")

# Bulk chem_res given explicitly - for the additional feature
chem_res = pd.read_csv(f"{DATA_DIR}/bulk_potentials.csv")

# Combination of the tables above (inner joins on structure and state),
# keeping only rows with a positive loading.
data = pd.merge(dft_data_all, expV_data, how="inner", on=MERGE_KEYS)
data = pd.merge(data, chem_res, how="inner", on=MERGE_KEYS)
data = data[data.beladung_mol_per_kg > 0]

In [55]:
# Show the distinct pressure values present in the merged data.
data["pressure_bar"].unique()

array([  0.1       ,   1.        ,   8.07142857,  15.14285714,
        22.21428571,  29.28571429,  36.35714286,  43.42857143,
        50.5       ,  57.57142857,  64.64285714,  71.71428571,
        78.78571429,  85.85714286,  92.92857143, 100.        ])

In [None]:
temperatures, pressures, combis = tempunddruckunique(data)

# Result containers and counters; they are initialised here but not updated
# by the loop below.
results_gp, results_bo = [], []
counter_bad = counter_good = 0

# For every temperature, evaluate the GP cross-validation for each ordered
# pair of distinct pressures (p1 = high state, p2 = low state).
for T in temperatures:
    sub = (
        data[data['temperature_kelvin'] == T]
        .drop_duplicates(subset=['pressure_bar', 'structure_name'], keep='first')
        .reset_index(drop=True)
    )
    print(sub.shape)
    for p1 in pressures:
        for p2 in pressures:
            if p1 == p2:
                continue
            merged, feature_columns = datenfilternundfertigmachenfür(sub, T, T, p1, p2)
            # Frames of exactly this shape are skipped.
            # NOTE(review): the reason for (245, 88) is not visible here - confirm.
            if merged.shape == (245, 88):
                continue
            results = GPGPGP(merged, feature_columns)
            print(f"Zustand --> {T}K, {p1}bar und {p2}bar")
            printmirmeinenscheiss(results)

In [62]:
# Single example: one temperature, with p1 as the high and p2 as the low state.
T, p1, p2 = 400, 1, 100
merged, feature_columns = datenfilternundfertigmachenfür(data, T, T, p1, p2)
results = GPGPGP(merged, feature_columns)
printmirmeinenscheiss(results)

R²                        : 0.6373
Median APE                : 12.34%
Mean APE                  : 2994.92%
Abs rel dev > 20%         : 34 out of 127
Max abs rel dev           : 317099.30%


In [69]:
# Rows of the merged data at 400 K and 100 bar.
doto = data[(data["temperature_kelvin"] == 400) & (data["pressure_bar"] == 100)]
doto

Unnamed: 0,structure_name,pressure_bar,temperature_kelvin,volume_kubAng,grid.dv,density_Atmos_per_kubAng,density_bulk,fraction_of_used_points,beladung_mol_per_kg,beladung_atoms,...,bin_13,bin_14,bin_15,bin_16,bin_17,bin_18,bin_19,x_max_y,x_min_y,chem_potential_bulk
14890,DDR,100.0,400.0,6715.860313,0.025619,855.049395,[0.0018837],0.082912,2.940194,21.905488,...,915,838,696,673,619,675,238341,10.1,-15.0,-0.087041
14891,RRO,100.0,400.0,1007.699890,0.003844,565.118379,[0.0018837],0.045502,2.008644,2.172355,...,1000,952,920,840,864,700,247720,10.1,-15.0,-0.087041
14892,MER,100.0,400.0,1954.329977,0.007455,727.048515,[0.0018837],0.072937,2.819137,5.420276,...,1392,720,1120,1040,832,1040,240032,10.1,-15.0,-0.087041
14893,EOS,100.0,400.0,682.685342,0.002604,855.860658,[0.0018837],0.064011,3.091343,2.228865,...,1076,1056,848,756,772,896,242856,10.1,-15.0,-0.087041
14894,CFI,100.0,400.0,1908.333111,0.007280,508.661083,[0.0018837],0.115356,1.925917,3.702907,...,896,864,864,752,624,736,229664,10.1,-15.0,-0.087041
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15012,LIO,100.0,400.0,2041.797373,0.007789,975.031449,[0.0018837],0.063141,3.410128,7.594363,...,788,700,694,608,576,550,243778,10.1,-15.0,-0.087041
15013,SVV,100.0,400.0,3106.520566,0.011850,521.881574,[0.0018837],0.068390,1.838074,6.184524,...,744,784,752,600,600,496,242440,10.1,-15.0,-0.087041
15014,PUN,100.0,400.0,2405.249497,0.009175,872.706332,[0.0018837],0.115387,3.701951,8.007341,...,1608,1664,1536,1280,1240,1296,227928,10.1,-15.0,-0.087041
15015,FER,100.0,400.0,2051.260998,0.007825,678.754814,[0.0018837],0.076477,2.455479,5.311216,...,960,1056,784,736,800,752,239712,10.1,-15.0,-0.087041
