# 0D database clustering

In this notebook we load a database (generated with file *0D_database_generation.ipynb*) and apply K-means clustering. 

In [None]:
# Set to True when running on Google Colab: this activates the Colab-specific
# setup, import path and data folder branches guarded by `use_colab` below.
use_colab = False

## Google colab preparation

The following cell makes it possible to run this notebook on Google Colab. We clone the git repository in order to get access to the Python scripts. This must be done again for each new runtime, as the clone is lost when the runtime ends. 

In [None]:
import os

if use_colab:
    !git clone -b cost_course_exercices https://github.com/cmehl/COST_lecture.git
    !pip install PyDOE
    !pip install cantera

    # Mount Google Drive
    from google.colab import drive
    drive.mount('/content/drive')

    # Create a folder in the root directory
    if not os.path.isdir("/content/drive/MyDrive/COST_lecture_data"):
        !mkdir -p "/content/drive/MyDrive/COST_lecture_data"
    else:
        print("Folder /content/drive/MyDrive/COST_lecture_data already exists")

## Imports

In [None]:
# Load the IPython autoreload extension and use mode 2, so that edits made to
# the imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os
import pickle
import joblib

from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

if use_colab:
    from COST_lecture.chem_ai.utils import StandardScaler
else:
    from chem_ai.utils import StandardScaler


## Loading database

We first load the database. We define the folder to consider:

In [None]:
# Folder holding the database to cluster (Google Drive on Colab, local otherwise).
# NOTE(review): the Colab setup cell creates ".../MyDrive/COST_lecture_data",
# whereas this path points to ".../MyDrive/ML_chem_data" -- confirm which one
# the database generation notebook actually writes to.
folder = "/content/drive/MyDrive/ML_chem_data/case_0D_highT" if use_colab else "./case_0D_test"

We extract associated parameters:

In [None]:
# Read the parameters that were stored alongside the database
params_path = os.path.join(folder, "dtb_params.json")
with open(params_path, "r") as file:
    dtb_params = json.load(file)

# Expose the individual entries as notebook-level variables
fuel, mech_file = dtb_params["fuel"], dtb_params["mech_file"]
log_transform, threshold = dtb_params["log_transform"], dtb_params["threshold"]
p, dt = dtb_params["p"], dtb_params["dt"]

We load the data, which is here the raw (unscaled) data. We load both the training and validation data, but only the training data will be used to define the clusters.

In [None]:
def _read_raw_csv(filename):
    """Read one raw CSV file located in the database folder."""
    return pd.read_csv(os.path.join(folder, filename))


# Training set (used to define the clusters)
X_train = _read_raw_csv("X_train_raw.csv")
Y_train = _read_raw_csv("Y_train_raw.csv")

# Validation set
X_val = _read_raw_csv("X_val_raw.csv")
Y_val = _read_raw_csv("Y_val_raw.csv")

## Clustering

We define a copy of *X_train* which will be transformed for application of K-means:

In [None]:
# Work on a copy so that the raw training inputs in X_train stay untouched
X_kmeans = X_train.copy()

We apply logarithm transform (if needed) and standard scaler:

In [None]:
# Always restart from the raw training inputs: X_kmeans is overwritten with the
# scaled data at the end of this cell, so without this reset re-running the
# cell would clip and log already-scaled values and silently corrupt the
# K-means input.
X_kmeans = X_train.copy()

if log_transform:
    # Clip values below the threshold so that the logarithm stays defined
    X_kmeans[X_kmeans < threshold] = threshold

    # Apply log to every column except the first one
    # (presumably the temperature column -- confirm against the database layout)
    X_kmeans.iloc[:, 1:] = np.log(X_kmeans.iloc[:, 1:])

# Fit the scaler on the (possibly log-transformed) training data and apply it.
# Xscaler is reused later to transform the validation data and is saved to disk.
Xscaler = StandardScaler()
Xscaler.fit(X_kmeans)
X_kmeans = Xscaler.transform(X_kmeans)

We then apply the K-means clustering. The number of clusters has to be manually prescribed with this method.

In [None]:
# Number of clusters, prescribed by the user
n_clusters = 3

# Seeded K-means so that the clustering is repeatable from run to run.
# NOTE(review): n_init is left at the library default here -- confirm whether
# it should be pinned for reproducibility across scikit-learn versions.
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans = kmeans.fit(X_kmeans)

We can then get the list of labels for the training data:

In [None]:
# Cluster index assigned to each training sample by the fitted model
kmeans_clusters_train = kmeans.labels_

In order to get the labels for the validation data, we apply the same transformations to *X_val* and use the fitted K-means model to predict the cluster of each validation point:

In [None]:
# Cluster assignment of the validation data, using the model and scaler
# obtained from the training data
X_kmeans_val = X_val.copy()

if log_transform:
    # Clip values below the threshold before taking the logarithm
    below_threshold = X_kmeans_val < threshold
    X_kmeans_val[below_threshold] = threshold

    # Log of every column except the first one
    logged_columns = np.log(X_kmeans_val.iloc[:, 1:])
    X_kmeans_val.iloc[:, 1:] = logged_columns

# Scale with the scaler fitted on the training data, then predict the labels
X_kmeans_val = Xscaler.transform(X_kmeans_val)
kmeans_clusters_val = kmeans.predict(X_kmeans_val)

We store the K-means model and the associated scaler in the folder for later use:

In [None]:
# Sub-folder of the database folder receiving all K-means related files
folder_kmeans = os.path.join(folder, "KMEANS")
kmeans_dir_missing = not os.path.isdir(folder_kmeans)
if kmeans_dir_missing:
    os.mkdir(folder_kmeans)

# K-means model, serialized with pickle
kmeans_model_path = os.path.join(folder_kmeans, "kmeans_model.pkl")
with open(kmeans_model_path, "wb") as model_file:
    pickle.dump(kmeans, model_file)

# Scaler used on the K-means inputs, serialized with joblib
kmeans_scaler_path = os.path.join(folder_kmeans, "Xscaler_kmeans.pkl")
joblib.dump(Xscaler, kmeans_scaler_path)

We can visualize the clusters on a scatter plot for instance:

In [None]:
from matplotlib.colors import ListedColormap

fig, ax = plt.subplots()

# One color per cluster. The red/green/blue palette is kept for up to three
# clusters; beyond that, evenly spaced viridis colors are used so that no two
# clusters share a color when n_clusters is changed.
base_colors = ['red', 'green', 'blue']
if n_clusters <= len(base_colors):
    colors = base_colors[:n_clusters]
else:
    colors = plt.cm.viridis(np.linspace(0.0, 1.0, n_clusters))
cmap = ListedColormap(colors)

# vmin/vmax are placed half a unit outside the label range so that every
# integer label falls at the centre of its own color band, which also centres
# the colorbar ticks on the bands.
scatter = ax.scatter(
    X_train["Temperature_X"],
    X_train["H2_X"],
    c=kmeans_clusters_train,
    cmap=cmap,
    vmin=-0.5,
    vmax=n_clusters - 0.5,
)

ax.set_ylabel(r"$Y_{H_2}$ $[-]$", fontsize=14)
ax.set_xlabel(r"$T$ $[K]$", fontsize=14)

# One colorbar tick per cluster label present in the training data
cbar = fig.colorbar(scatter, ax=ax, ticks=np.unique(kmeans_clusters_train))
cbar.ax.tick_params(size=0, labelsize=14)

# The figure is stored next to the saved K-means model
fig.savefig(os.path.join(folder, "KMEANS", "clustering_plot.png"), dpi=500)

## Preparing data for training

If we want to use the K-means clustered data for training ANN models, we need to define separate scalers for each cluster.

In [None]:
def preproc_cluster_i(log_transform, threshold, i_cluster):
    """Pre-process and store the training/validation data of one cluster.

    The function reads the notebook-level variables ``folder``, ``X_train``,
    ``Y_train``, ``X_val``, ``Y_val``, ``kmeans_clusters_train`` and
    ``kmeans_clusters_val``. It selects the samples labelled ``i_cluster``,
    optionally applies thresholding and a logarithm, fits dedicated input and
    output scalers on the training samples of the cluster, and writes the
    scalers and the transformed data into ``<folder>/KMEANS/cluster_<i_cluster>``.

    Parameters
    ----------
    log_transform : bool
        If true, values below ``threshold`` are replaced by ``threshold`` and a
        logarithm is applied (to all input columns except the first one, and to
        all output columns).
    threshold : float
        Lower bound applied to the data before taking the logarithm.
    i_cluster : int
        Label of the cluster to process.

    Returns
    -------
    None
    """

    # Output folder of this cluster; exist_ok makes the function re-runnable
    dtb_folder_i = os.path.join(folder, f"KMEANS/cluster_{i_cluster}")
    os.makedirs(dtb_folder_i, exist_ok=True)

    # Getting data for cluster
    in_cluster_train = kmeans_clusters_train == i_cluster
    in_cluster_val = kmeans_clusters_val == i_cluster
    #
    X_train_i = X_train[in_cluster_train].copy()
    Y_train_i = Y_train[in_cluster_train].copy()
    #
    X_val_i = X_val[in_cluster_val].copy()
    Y_val_i = Y_val[in_cluster_val].copy()

    print(f"CLUSTER {i_cluster}")
    print(f" >> {X_train_i.shape[0]} points in training set")
    print(f" >> {X_val_i.shape[0]} points in validation set \n")

    # Apply threshold if log
    if log_transform:
        X_train_i[X_train_i < threshold] = threshold
        X_val_i[X_val_i < threshold] = threshold
        #
        Y_train_i[Y_train_i < threshold] = threshold
        Y_val_i[Y_val_i < threshold] = threshold

        # Apply log: the first input column is left untouched, whereas every
        # output column is transformed
        X_train_i.iloc[:, 1:] = np.log(X_train_i.iloc[:, 1:])
        X_val_i.iloc[:, 1:] = np.log(X_val_i.iloc[:, 1:])
        #
        Y_train_i = np.log(Y_train_i)
        Y_val_i = np.log(Y_val_i)

    # Apply scaling: scalers are fitted on the training samples of the cluster
    # only and then applied to both the training and validation samples
    Xscaler = StandardScaler()
    Xscaler.fit(X_train_i)
    X_train_i = Xscaler.transform(X_train_i)
    X_val_i = Xscaler.transform(X_val_i)

    Yscaler = StandardScaler()
    Yscaler.fit(Y_train_i)
    Y_train_i = Yscaler.transform(Y_train_i)
    Y_val_i = Yscaler.transform(Y_val_i)

    # Saving scalers for later use
    joblib.dump(Xscaler, os.path.join(dtb_folder_i,'Xscaler.pkl'))
    joblib.dump(Yscaler, os.path.join(dtb_folder_i,'Yscaler.pkl'))

    # Saving data (transformed)
    X_train_i.to_csv(os.path.join(dtb_folder_i,"X_train.csv"), index=False)
    Y_train_i.to_csv(os.path.join(dtb_folder_i,"Y_train.csv"), index=False)
    X_val_i.to_csv(os.path.join(dtb_folder_i,"X_val.csv"), index=False)
    Y_val_i.to_csv(os.path.join(dtb_folder_i,"Y_val.csv"), index=False)

In [None]:
# Pre-process and store the data of every cluster, one cluster at a time
for i_cluster in range(n_clusters):
    preproc_cluster_i(log_transform, threshold, i_cluster)