# HFRD data generation method

This method is a hybrid flamelet / random-data approach to generate training data for neural networks. The idea is to compute standard flamelets (0D, 1D premixed, 1D diffusion) and augment them using a random-based technique.

Using Google Colab:

In [None]:
# Set to True when running on Google Colab: this enables the Colab setup cell
# and selects the Colab import paths further down.
use_colab = False

## Google colab preparation

These lines make the tools runnable on Colab. A git clone is needed to access the Python scripts, and it must be repeated at each new runtime because the clone is lost.

In [None]:
import os

if use_colab:
    !git clone -b cost_course_exercices https://github.com/cmehl/ML_chem.git
    !pip install PyDOE
    !pip install cantera

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # Create a folder in the root directory
    if not os.path.isdir("/content/drive/MyDrive/ML_chem_data"):
        !mkdir -p "/content/drive/MyDrive/ML_chem_data"
    else:
        print("Folder /content/drive/MyDrive/ML_chem_data already exists")

## Imports

In [None]:
# IPython autoreload in mode 2: imported modules are reloaded before each cell
# runs, so edits to the project scripts are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [None]:
import joblib
import json
import pickle

import numpy as np

import matplotlib.pyplot as plt

# The project classes are imported through the cloned ML_chem folder on Colab
# and directly from the chem_ai package otherwise.
# NOTE(review): on Colab DatabaseFlamelets is imported from
# ML_chem.database_flamelets, whereas StandardScaler comes from
# ML_chem.chem_ai.utils and the local branch uses chem_ai.database_flamelets --
# confirm that the Colab path is not missing the chem_ai package level.
if use_colab:
    from ML_chem.database_flamelets import DatabaseFlamelets
    from ML_chem.chem_ai.utils import StandardScaler
else:
    from chem_ai.database_flamelets import DatabaseFlamelets
    from chem_ai.utils import StandardScaler

# Flames computation

We first define the parameters common to all flame computations, such as the fuel, the chemical mechanism, etc.

In [None]:
# Fuel and chemical mechanism file
# NOTE(review): mech_file is an absolute path on a specific machine -- adapt it
# to the local (or Colab) location of the mechanism.
fuel = "H2"
mech_file = "/work/mehlc/Lecture_IA_chem_accel/chem_AI_project/data/mechanisms/mech_h2.yaml"

# Pressure handed to the database (presumably in Pa, i.e. 1 atm -- TODO confirm)
p = 101325.0

# Time step handed to the database (presumably the CFD time step in seconds -- TODO confirm)
dt_CFD = 1.0e-6

# Case folder; also the root of the processed database written further down
folder = "_".join(("case_multi", "test_case_flamelets"))

In [None]:
# Database object used by every computation and post-processing cell below
dtb = DatabaseFlamelets(mech_file, fuel, folder, p, dt_CFD)

## 0D reactors

In [None]:
# Number of 0D reactor samples
n_samples = 200

# Sampling ranges (presumably equivalence ratio and initial temperature in K -- TODO confirm)
phi_bounds = (0.8, 1.0)
T0_bounds = (1500.0, 1600.0)

# Solver settings passed to compute_0d_reactors
# (max_sim_time presumably in seconds; meaning of "dt_cfd" to be checked in DatabaseFlamelets)
solve_mode = "dt_cfd"
max_sim_time = 10.0e-3

In [None]:
# Run the 0D reactor step of the database with the settings of the previous cell
dtb.compute_0d_reactors(phi_bounds, T0_bounds, n_samples, max_sim_time, solve_mode)

## 1D PREMIXED FLAMES

In [None]:
# Number of 1D premixed flame samples
n_samples = 200

# Sampling ranges (presumably equivalence ratio and fresh-gas temperature in K -- TODO confirm)
phi_bounds = (0.5, 1.0)
T0_bounds = (300.0, 400.0)

In [None]:
# Run the 1D premixed flame step of the database with the settings of the previous cell
dtb.compute_1d_premixed(phi_bounds, T0_bounds, n_samples)

## 1D DIFFUSION FLAMES

In [None]:
# Number of 1D diffusion flame samples
n_samples = 200

# Sampling ranges (presumably strain rate and inlet temperature in K -- TODO confirm units)
strain_bounds = (0.0, 1000.0)
T0_bounds = (300.0, 500.0)

# Width passed to compute_1d_diffusion (presumably the domain width in m -- TODO confirm)
width = 0.02

In [None]:
# Run the 1D diffusion flame step of the database with the settings of the previous cell
dtb.compute_1d_diffusion(strain_bounds, T0_bounds, n_samples, width)

## Postprocessing simulations database

In [None]:
# Scatter plot of the "OH" column against the "Temperature" column of dtb.df
dtb.df.plot.scatter(x="Temperature", y="OH")

In [None]:
# Data augmentation step; the next cell reads dtb.df_augmented and
# dtb.df_flamelet (presumably filled in by this call -- confirm in DatabaseFlamelets)
dtb.augment_data()

In [None]:
fig, ax = plt.subplots()

x_var = "Temperature"
y_var = "H2O"

# One entry per reactor family:
# (value in the "reactor_type" column, is it in the database?, colour, legend label)
reactor_styles = (
    (0, dtb.includes_0d_reactors, "blue", "0D"),
    (1, dtb.includes_1d_prem, "green", "1D premixed"),
    (2, dtb.includes_1d_diff, "purple", "1D diffusion"),
)

# Augmented points are drawn first, faded and without legend entry
for rtype, included, colour, _ in reactor_styles:
    if not included:
        continue
    rows = dtb.df_augmented["reactor_type"] == rtype
    ax.scatter(dtb.df_augmented[x_var][rows], dtb.df_augmented[y_var][rows], color=colour, alpha=0.2, s=3)

# Original flamelet points are drawn afterwards, opaque and labelled
for rtype, included, colour, tag in reactor_styles:
    if not included:
        continue
    rows = dtb.df_flamelet["reactor_type"] == rtype
    ax.scatter(dtb.df_flamelet[x_var][rows], dtb.df_flamelet[y_var][rows], color=colour, s=3, label=tag)

for set_label, text in ((ax.set_xlabel, x_var), (ax.set_ylabel, y_var)):
    set_label(text, fontsize=14)

ax.legend()

In [None]:
# Persist the database (destination is decided inside DatabaseFlamelets)
dtb.save_database()

# Generation of train and test databases

In [None]:
# Ratios passed to the split (presumably fractions of the whole database kept
# for validation and testing -- confirm in DatabaseFlamelets).
# The cells below read the resulting dtb.X_train, dtb.Y_train, dtb.X_val and dtb.Y_val.
valid_ratio = 0.15
test_ratio = 0.15
dtb.generate_train_valid_test(valid_ratio, test_ratio)

In [None]:
# Preview the first rows of the training inputs
dtb.X_train.head()

In [None]:
# Preview the first rows of the training outputs
dtb.Y_train.head()

# Pre-processing of database

By default, pre-processing uses K-means clustering. If no clustering is needed, set *n_clusters=1*.

In [None]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

We set a flag to decide if we apply the logarithm or not:

In [None]:
# log_transform: when True, the logarithm is applied before scaling (to every
# input column except the first one, and to all output columns).
# threshold: values below it are raised to it before the logarithm is taken.
log_transform = True
threshold = 1.0e-10

In [None]:
# Output folder of the clustered / pre-processed database.
# makedirs(exist_ok=True) replaces the isdir check followed by mkdir: it also
# creates `folder` when it is missing and does not fail if the directory
# already exists.
folder_p_2 = os.path.join(folder, "processed_database_cluster")
os.makedirs(folder_p_2, exist_ok=True)

Number of clusters:

In [None]:
# Number of K-means clusters; 1 means no clustering (a single cluster holds all points)
n_clusters = 1

We perform k-means clustering:

In [None]:
def _prepare_for_kmeans(frame):
    """Return a copy of *frame* with the optional threshold + log applied.

    When ``log_transform`` is set, values below ``threshold`` are raised to
    ``threshold`` and the logarithm is taken on every column except the first
    one (presumably the temperature -- TODO confirm column order).
    """
    prepared = frame.copy()
    if log_transform:
        prepared[prepared < threshold] = threshold
        prepared.iloc[:, 1:] = np.log(prepared.iloc[:, 1:])
    return prepared


# --- Training data: threshold/log, then scaling fitted on the training set
X_kmeans = _prepare_for_kmeans(dtb.X_train)

Xscaler = StandardScaler()
Xscaler.fit(X_kmeans)
X_kmeans = Xscaler.transform(X_kmeans)

# --- K-means fitted on the scaled training data (fixed seed for reproducibility)
kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_kmeans)
kmeans_clusters_train = kmeans.labels_

# --- Persist everything needed to assign clusters later on
# K-means model
kmeans_model_path = os.path.join(folder_p_2, "kmeans_model.pkl")
with open(kmeans_model_path, "wb") as f:
    pickle.dump(kmeans, f)

# Scaler used in front of K-means
joblib.dump(Xscaler, os.path.join(folder_p_2,"Xscaler_kmeans.pkl"))

# Plain-text copies: normalization parameters (mean, std) and centroids
norm_table = np.vstack([Xscaler.mean, Xscaler.std]).T
np.savetxt(os.path.join(folder_p_2, 'kmeans_norm.dat'), norm_table)
np.savetxt(os.path.join(folder_p_2, 'km_centroids.dat'), kmeans.cluster_centers_.T)

# --- Validation data: same preparation, scaler and model fitted on training data
X_kmeans_val = Xscaler.transform(_prepare_for_kmeans(dtb.X_val))
kmeans_clusters_val = kmeans.predict(X_kmeans_val)

We do the log transformation and scaling for each cluster separately:

In [None]:
for i_cluster in range(n_clusters):

    # One sub-folder per cluster
    dtb_folder_i = os.path.join(folder_p_2, f"cluster_{i_cluster}")
    if not os.path.isdir(dtb_folder_i):
        os.mkdir(dtb_folder_i)

    # Column labels of inputs and outputs (not used further in this cell)
    Xcols = dtb.X_train.columns
    Ycols = dtb.Y_train.columns

    # Rows of the training / validation sets assigned to this cluster
    in_cluster_train = kmeans_clusters_train == i_cluster
    in_cluster_val = kmeans_clusters_val == i_cluster

    X_train_i = dtb.X_train[in_cluster_train].copy()
    Y_train_i = dtb.Y_train[in_cluster_train].copy()
    X_val_i = dtb.X_val[in_cluster_val].copy()
    Y_val_i = dtb.Y_val[in_cluster_val].copy()

    n_train_i = X_train_i.shape[0]
    n_val_i = X_val_i.shape[0]
    print(f"CLUSTER {i_cluster}")
    print(f" >> {n_train_i} points in training set")
    print(f" >> {n_val_i} points in validation set \n")

    if log_transform:
        # Raise small values to the threshold so that the logarithm is defined
        for frame in (X_train_i, X_val_i, Y_train_i, Y_val_i):
            frame[frame < threshold] = threshold

        # Inputs: the first column is left untouched, the others are logged
        for frame in (X_train_i, X_val_i):
            frame.iloc[:, 1:] = np.log(frame.iloc[:, 1:])

        # Outputs: every column is logged
        Y_train_i, Y_val_i = np.log(Y_train_i), np.log(Y_val_i)

    # Scalers are fitted on the training part of the cluster only and then
    # applied to both training and validation data
    Xscaler = StandardScaler()
    Xscaler.fit(X_train_i)
    X_train_i = Xscaler.transform(X_train_i)
    X_val_i = Xscaler.transform(X_val_i)

    Yscaler = StandardScaler()
    Yscaler.fit(Y_train_i)
    Y_train_i = Yscaler.transform(Y_train_i)
    Y_val_i = Yscaler.transform(Y_val_i)

    # Scalers are stored next to the data for later use
    for scaler_file, scaler in (("Xscaler.pkl", Xscaler), ("Yscaler.pkl", Yscaler)):
        joblib.dump(scaler, os.path.join(dtb_folder_i, scaler_file))

    # Transformed data, written without the index column
    csv_outputs = (
        ("X_train.csv", X_train_i),
        ("Y_train.csv", Y_train_i),
        ("X_val.csv", X_val_i),
        ("Y_val.csv", Y_val_i),
    )
    for csv_file, frame in csv_outputs:
        frame.to_csv(os.path.join(dtb_folder_i, csv_file), index=False)

In [None]:
# Settings used to build the processed database, stored as JSON in the case folder
params = dict(
    fuel=fuel,
    mech_file=mech_file,
    log_transform=log_transform,
    threshold=threshold,
    p=p,
    dt=dt_CFD,
    n_clusters=n_clusters,
)

# Save to file
params_path = os.path.join(folder, "dtb_params.json")
with open(params_path, "w") as json_file:
    json.dump(params, json_file)

We compute PCA to analyze the clusters:

In [None]:
# PCA fitted on the training database and applied to the validation database

def _states_for_pca(frame):
    """Return the values of *frame* as a new array, thresholded and logged if requested.

    When ``log_transform`` is set, values below ``threshold`` are raised to
    ``threshold`` and the logarithm is taken on every column except the first.
    """
    states = frame.values.copy()
    if log_transform:
        states[states < threshold] = threshold
        states[:, 1:] = np.log(states[:, 1:])
    return states


# Number of PCA dimensions here forced to 2
k = 2

# States only (temperature and Yk's according to the original note)
data = _states_for_pca(dtb.X_train)
data_val = _states_for_pca(dtb.X_val)

# Scaling fitted on the training data, applied to both sets
pca_scaler = StandardScaler()
pca_scaler.fit(data)
data = pca_scaler.transform(data)
data_val = pca_scaler.transform(data_val)

# PCA fitted on the training data, then both sets are projected
pca_algo = PCA(n_components=k, svd_solver="full").fit(data)
PC_train = pca_algo.transform(data)
PC_val = pca_algo.transform(data_val)

Clusters in PCA space:

In [None]:
# Training points in the plane of the first two principal components,
# coloured by K-means cluster label
fig, ax = plt.subplots()
pc1, pc2 = PC_train[:, 0], PC_train[:, 1]
im = ax.scatter(pc1, pc2, c=kmeans_clusters_train, s=2)
fig.colorbar(im, ax=ax)
for set_label, text in ((ax.set_xlabel, "PC 1"), (ax.set_ylabel, "PC 2")):
    set_label(text, fontsize=16)
fig.tight_layout()

In [None]:
# Training points in the (Temperature_X, H2O_X) plane, coloured by K-means cluster label
fig, ax = plt.subplots()
temperature = dtb.X_train["Temperature_X"]
h2o = dtb.X_train["H2O_X"]
im = ax.scatter(temperature, h2o, c=kmeans_clusters_train, s=2)
fig.colorbar(im, ax=ax)
for set_label, text in ((ax.set_xlabel, "T"), (ax.set_ylabel, "H2O")):
    set_label(text, fontsize=16)
fig.tight_layout()

In [None]:
# Kernel density estimate of the "Temperature_X" column of the training inputs
T_X = dtb.X_train['Temperature_X']
ax = T_X.plot.kde()
