# Generating force field parameters to predict liquid densities for R-32 by fitting a GP model 

In this notebook we fit a Gaussian process model to predict the liquid density resulting from a set of {$\sigma$, $\epsilon$} parameters for R-32. Then we generate a new parameter set to use in liquid density MD. 

## Step 0: Load Libraries

In [None]:
import gpflow
import tensorflow as tf
import matplotlib.pyplot as plt
import unyt as u
import numpy as np
import pandas as pd
import seaborn

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

from sklearn import svm

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import silhouette_score


from fffit.utils import (
    shuffle_and_split,
    values_real_to_scaled,
    values_scaled_to_real,
    variances_scaled_to_real,
)

from fffit.plot import(
    plot_model_performance,
    plot_slices_temperature,
    plot_slices_params,
    plot_model_vs_test
)

from fffit.gpflow import run_gpflow_scipy

import sys
sys.path.append('../')

from utils.r32 import R32
# Instantiate the R-32 description. This rebinds the imported class name
# `R32` to an instance; every later cell uses the instance.
R32 = R32()

# Number of force field parameters being fitted
n_params = len(R32.param_names)

# MD densities above this value (kg/m^3) are flagged as liquid
liquid_density_threshold = 600

# Location of the CSV file that is read in Step 1
csv_dir = '/scratch365/rdefever/hfcs-fffit/analysis/csv/'
csv_name = 'r32-density-iter1-results.csv'
csv_path = csv_dir + csv_name

## Step 1: Extract density

In [None]:
# Load the MD results; the first CSV column becomes the index
df = pd.read_csv(csv_path, index_col=0)

# Raw (unscaled) inputs and outputs
param_values = df[list(R32.param_names)].values
temperature_values = df["temperature"].values
md_density_values = df["density"].values

# Flag every simulation whose density exceeds the liquid threshold, and look
# up the experimental liquid density at each (rounded) temperature
md_is_liquid = list(md_density_values > liquid_density_threshold)
expt_density_values = [
    R32.expt_liq_density[round(temp)] for temp in temperature_values
]

# Rescale each quantity with the corresponding bounds stored on R32
scaled_param_values = values_real_to_scaled(param_values, R32.param_bounds)
scaled_temperature_values = values_real_to_scaled(
    temperature_values, R32.temperature_bounds
)
scaled_md_density_values = values_real_to_scaled(
    md_density_values, R32.liq_density_bounds
)
scaled_expt_density_values = values_real_to_scaled(
    expt_density_values, R32.liq_density_bounds
)

# Assemble one table: parameters | temperature | MD density |
# experimental density | liquid flag. Each non-parameter quantity is forced
# into a single column before stacking.
columns_to_stack = (
    scaled_param_values,
    scaled_temperature_values.reshape(-1, 1),
    scaled_md_density_values.reshape(-1, 1),
    scaled_expt_density_values.reshape(-1, 1),
    np.array(md_is_liquid).reshape(-1, 1),
)
scaled_data = np.hstack(columns_to_stack)

column_names = [
    *R32.param_names,
    "temperature",
    "md_density",
    "exptl_density",
    "is_liquid",
]

df_all = pd.DataFrame(scaled_data, columns=column_names)

# Split on the liquid flag (stored alongside the numeric columns, so it is
# compared by value rather than by identity)
df_liquid = df_all.loc[df_all["is_liquid"] == True]
df_vapor = df_all.loc[df_all["is_liquid"] == False]

### Display parameter sets

In [None]:
# Raise the row display limit so the whole table is rendered
pd.set_option("display.max_rows", 1000)
df_all

## Step 2: Fit classifier and GP models

### Classifier
An SVM classifier with an RBF kernel, trained on density data from all temperature points, is used to predict which parameter sets yield liquid densities.

In [None]:
# Inputs are the force field parameters plus temperature; the target is the
# liquid/vapor flag
param_names = [*R32.param_names, "temperature"]
property_name = "is_liquid"
x_train, y_train, x_test, y_test = shuffle_and_split(
    df_all, param_names, property_name, shuffle_seed=295734
)

# Build the RBF-kernel SVM and fit it on the training split
# (SVC.fit returns the fitted estimator itself)
classifier = svm.SVC(kernel='rbf').fit(x_train, y_train)

# Report accuracy on the held-out split
test_score = classifier.score(x_test, y_test)
print(f"Classifer is {test_score*100.0}% accurate on the test set.")

### GP Model

RBF kernel with linear mean and ARD (multiple length scales). Trained on parameter sets which produced densities greater than 600 kg/m$^3$ (liquid densities only).

In [None]:
# Same inputs as the classifier, but only the liquid rows and with the
# scaled MD density as the target
param_names = [*R32.param_names, "temperature"]
property_name = "md_density"
x_train, y_train, x_test, y_test = shuffle_and_split(
    df_liquid, param_names, property_name, shuffle_seed=957473
)

# One length scale per input dimension (n_params parameters + temperature)
ard_kernel = gpflow.kernels.RBF(lengthscales=np.ones(n_params + 1))

# Fit model
model = run_gpflow_scipy(x_train, y_train, ard_kernel)

## Step 3: Find new parameters for MD simulations

### Load in Latin hypercube

In [None]:
# Comma-separated Latin hypercube sample, read relative to the working directory
lhs_file = "LHS_1e6x6.csv"
latin_hypercube = np.loadtxt(lhs_file, delimiter=",")

print("Shape of Latin hypercube sample:", latin_hypercube.shape)

### Use a classifier to predict which LH parameter sets give liquid and vapor

Classification done using only the highest temperature.

In [None]:
# Append a temperature column fixed at 1.0 (used here as the highest scaled
# temperature) to every Latin hypercube row
n_lh_samples = latin_hypercube.shape[0]
samples = np.hstack((latin_hypercube, np.ones((n_lh_samples, 1))))

# Apply classifier
pred = classifier.predict(samples)

# Separate LH samples (without the temperature column) by predicted phase
is_pred_liquid = pred == 1
is_pred_vapor = pred == 0

liquid_samples = latin_hypercube[is_pred_liquid]
print("Shape of the predicted liquid samples:", liquid_samples.shape)

vapor_samples = latin_hypercube[is_pred_vapor]
print("Shape of the predicted vapor samples:", vapor_samples.shape)

### Apply model and calculate mean squared errors (MSE) for all parameter sets

In [None]:
def _liquid_density_errors(samples):
    """Return GP-predicted minus experimental liquid density for each sample.

    For every temperature in ``R32.expt_liq_density`` the scaled temperature
    is appended as an extra column to ``samples``, the GP ``model`` is
    evaluated, and the predicted mean is converted back to real density units
    before subtracting the experimental value.

    Parameters
    ----------
    samples : np.ndarray
        2-D array with one scaled parameter set per row (the temperature
        column must NOT be included).

    Returns
    -------
    np.ndarray
        Array of shape (number of rows in ``samples``, number of entries in
        ``R32.expt_liq_density``) holding the signed errors, one column per
        temperature in the dictionary's iteration order.
    """
    n_samples = samples.shape[0]
    errs = np.empty(shape=(n_samples, len(R32.expt_liq_density)))
    for col_idx, (temp, density) in enumerate(R32.expt_liq_density.items()):
        scaled_temp = values_real_to_scaled(temp, R32.temperature_bounds)
        xx = np.hstack((samples, np.tile(scaled_temp, (n_samples, 1))))
        # Only the predictive mean is needed; the variance is discarded
        means_scaled, _ = model.predict_f(xx)
        means = values_scaled_to_real(means_scaled, R32.liq_density_bounds)
        err = means - density
        errs[:, col_idx] = err[:, 0]
    return errs


# Liquid MSE (mean over temperatures of the squared error, per parameter set)
liquid_errs = _liquid_density_errors(liquid_samples)
liquid_mean_sq_errs = np.mean(liquid_errs**2, axis=1)

# Vapor MSE
vapor_errs = _liquid_density_errors(vapor_samples)
vapor_mean_sq_errs = np.mean(vapor_errs**2, axis=1)

#### Create parameter sets for predicted liquid and predicted vapor
Sort parameters from lowest to highest MSE.

In [None]:
# Column layout shared by both tables: the parameters followed by the MSE
mse_columns = [*R32.param_names, "mse"]

# Attach each sample's MSE as a final column and wrap in a DataFrame
liquid_samples_mse = pd.DataFrame(
    np.column_stack((liquid_samples, liquid_mean_sq_errs)),
    columns=mse_columns,
)
vapor_samples_mse = pd.DataFrame(
    np.column_stack((vapor_samples, vapor_mean_sq_errs)),
    columns=mse_columns,
)

# Rank by MSE, lowest first
liquid_samples_mse_ranked = liquid_samples_mse.sort_values(by="mse")
vapor_samples_mse_ranked = vapor_samples_mse.sort_values(by="mse")

### Visualization: Low MSE parameter sets

Parameter sets whose predicted densities have a root-mean-square error below 25 kg/m$^3$ relative to the experimental values (i.e. MSE < 625 (kg/m$^3$)$^2$)

In [None]:
# Make a set of the lowest MSE parameter sets.
# The cutoff is expressed as an RMSE in kg/m^3 and squared to compare against
# the MSE column (25**2 == 625).
rmse_cutoff = 25.0
mse_threshold = rmse_cutoff**2

top_liquid_samples = liquid_samples_mse[liquid_samples_mse["mse"] < mse_threshold]
top_vapor_samples = vapor_samples_mse[vapor_samples_mse["mse"] < mse_threshold]

# Densities are in kg/m^3; plain text is used because LaTeX markup is not
# rendered by print()
print(
    "There are:", top_liquid_samples.shape[0],
    f"liquid parameter sets which produce densities within {rmse_cutoff:g} kg/m^3 (RMSE) of experimental densities",
)
print(
    "There are:", top_vapor_samples.shape[0],
    f"vapor parameter sets which produce densities within {rmse_cutoff:g} kg/m^3 (RMSE) of experimental densities",
)

In [None]:
# Pairplot of the parameter columns (everything except "mse") for the
# low-MSE predicted-liquid sets, with every axis padded slightly beyond [0, 1]
column_names = list(R32.param_names)
axis_limits = (-0.1, 1.1)
g = seaborn.pairplot(top_liquid_samples[column_names])
g.set(xlim=axis_limits, ylim=axis_limits);

In [None]:
# Pairplot of the parameter columns (everything except "mse") for the
# low-MSE predicted-vapor sets, with every axis padded slightly beyond [0, 1]
column_names = list(R32.param_names)
axis_limits = (-0.1, 1.1)
g = seaborn.pairplot(top_vapor_samples[column_names])
g.set(xlim=axis_limits, ylim=axis_limits);

### Combine top 100 lowest MSE for parameter sets predicted as liquid and vapor

In [None]:
# Number of lowest-MSE parameter sets to keep from each predicted phase
n_top = 100

# Drop the MSE column, keep the best n_top rows of each ranked table, and
# stack liquid on top of vapor (original row indices are preserved)
new_params = pd.concat(
    [
        liquid_samples_mse_ranked.drop(columns=["mse"]).head(n_top),
        vapor_samples_mse_ranked.drop(columns=["mse"]).head(n_top),
    ]
)

# Display
pd.set_option("display.max_rows", 200)
new_params

In [None]:
# Save to csv
# csv_path is the full path of the iter1 results FILE, so appending a file
# name to it directly would yield
# ".../r32-density-iter1-results.csvr32-density-iter2-params.csv".
# Keep only the directory part (and its separator, if any) and write the
# iter2 parameters next to the iter1 results.
output_dir, output_sep = csv_path.rpartition('/')[:2]
output_path = output_dir + output_sep + 'r32-density-iter2-params.csv'
new_params.to_csv(output_path)