In [2]:
!git clone  https://github.com/daniela-figueroa/CHEMENG177

%cd CHEMENG177
!pip install tensorflow

Cloning into 'CHEMENG177'...
remote: Enumerating objects: 45, done.[K
remote: Counting objects: 100% (45/45), done.[K
remote: Compressing objects: 100% (37/37), done.[K
remote: Total 45 (delta 21), reused 9 (delta 6), pack-reused 0 (from 0)[K
Receiving objects: 100% (45/45), 688.37 KiB | 6.68 MiB/s, done.
Resolving deltas: 100% (21/21), done.
/content/CHEMENG177


In [3]:
ls

'CALiSol-23 Dataset.csv'         lasso_final.ipynb                 ridge.ipynb
 CALiSol-23.ipynb               'Pre-processed CALiSol Data.csv'
'Data_Preprocessing (1).ipynb'   README.md


Imports

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
import os
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# 0. Neural Network Class Defined

In [5]:
class ConductivityNN:
    def __init__(self, input_dim):
        """Initialize the Feedforward Neural Network"""
        self.model = Sequential([
            Input(shape=(input_dim,)),
            Dense(64, activation='relu'),
            Dense(128, activation='relu'),
            Dropout(0.2),
            Dense(64, activation='relu'),
            Dense(1)  # Output layer for regression
        ])
        self.model.compile(optimizer='adam', loss='mse', metrics=['r2_score'])

    def train(self, X_train, y_train, epochs=50, batch_size=32):
        """Train the model using a validation set"""
        self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)

    def predict(self, X_test):
        """Predict conductivity"""
        return self.model.predict(X_test)

    def evaluate(self, X_test, y_test):
        """Evaluate model"""
        return self.model.evaluate(X_test, y_test, verbose=0)


# 1. Data Wrangling

In [6]:
dataset = pd.read_csv("Pre-processed CALiSol Data.csv")
print(dataset.shape)

(13302, 46)


In [7]:
dataset = dataset.drop(columns=['Unnamed: 0'])
print(dataset.shape)

(13302, 45)


### Dictionaries

In [8]:
molar_weights = {'EC': 88.06,
       'PC' : 102.08, 'DMC' : 90.08,'EMC' : 104.10, 'DEC' : 118.132,
        'DME' : 90.12, 'DMSO' : 78.13, 'AN': 41.05, 'MOEMC': 134.13, 'TFP': 344.07, 'EA' : 88.10,
       'MA': 74.08, 'FEC': 106.05, 'DOL': 74.08, '2-MeTHF' : 86.13, 'DMM': 162.2, 'Freon 11' : 137.36,
       'Methylene chloride' : 84.93,
       'THF' : 72.10, 'Toluene' : 92.14, 'Sulfolane' : 120.17, '2-Glyme' : 134.17, '3-Glyme' : 178.22,
                 '4-Glyme' : 222.28,
       '3-Me-2-Oxazolidinone' : 101.10, '3-MeSulfolane' : 134.20, 'Ethyldiglyme': 134.17, 'DMF' : 73.09,
       'Ethylbenzene' : 106.17, 'Ethylmonoglyme' : 76.10, 'Benzene' : 78.11, 'g-Butyrolactone' : 86.09,
       'Cumene' : 120.19, 'Propylsulfone' : 150.24, 'Pseudocumeme' : 120.19, 'TEOS' : 208.33, 'm-Xylene' : 106.17,
       'o-Xylene' : 106.16} # Molar weights of different organic solvents  [g / mol]
molar_weights_salts = {'LiPF6' : 151.91, 'LiBF4': 93.75,
                      'LiFSI': 187.7, 'LiTDI' : 192.1, 'LiPDI' : 242.1, 'LiTFSI' : 287.07, 'LiClO4' : 160.44, 'LiAsF6' : 195.9,
 'LiBOB' : 193.79, 'LiCF3SO3' : 156.01, 'LiBPFPB' : 193.8, 'LiBMB': 221.85, 'LiN(CF3SO2)2' : 287.07} # Molar weights of salts in [g / mol]. Note that LiClO4 exists in hydrous and anyhydrous form
# Note that the weight of LiBMB was calculated theoretically
names_salts = {'LiPF6' : "Lithium hexafluorophosphate", 'LiBF4': "Lithium tetrafluoroborate",
                      'LiFSI': "Lithium Bis(fluorosulfonyl)imide", 'LiTDI' : "lithium 2-trifluoromethyl-4,5-dicyanoimidazole",
               'LiPDI' : "lithium 4,5-dicyano-2-(pentafluoroethyl)imidazolide", 'LiTFSI': "Lithium bis(trifluoromethanesulfonyl)imide",
               'LiClO4' : "Lithium perchlorate", 'LiAsF6' : "Lithium hexafluoroarsenate(V)",
 'LiBOB' : "Lithium bis(oxalato)borate", 'LiCF3SO3' : "Lithium triflate", 'LiBPFPB' : "Lithium bis(oxalate)borate",
               'LiBMB' : "lithium bis(malonato)borate", 'LiN(CF3SO2)2' : "lithium bis(trifluoromethanesulfonimide)"}

names_solvents = {'EC' : "Ethylene carbonate",
       'PC' : "Propylene carbonate", 'DMC' : "Dimethyl carbonate", 'EMC' : "Ethyl Methyl Carbonate",
         'DEC' : "​Diethyl carbonate", 'DME' : "Dimethoxyethane", 'DMSO' : "Dimethyl sulfoxide", 'AN' : "Acetonitrile", 'MOEMC' : "2-Methoxyethyl (methyl) carbonate",
         'TFP' : "Tris(2,2,2-trifluoroethyl) phosphate", 'EA' : "Ethyl acetate",
       'MA' : "Methyl acetate", 'FEC' : "Fluoroethylene carbonate", 'DOL': "Dioxolane"
         , '2-MeTHF': "2-Methyltetrahydrofuran", 'DMM' : "Dipropylene glycol dimethyl ether",
         'Freon 11' : "Trichlorofluoromethane", 'Methylene chloride' : 'Methylene chloride',
       'THF' : "Tetrahydrofuran", 'Toluene' : "Toluene", 'Sulfolane' : "Sulfolane",
         '2-Glyme': "Diglyme", '3-Glyme': "Triglyme", '4-Glyme': "tetraglyme",
       '3-Me-2-Oxazolidinone' : "3-Me-2-Oxazolidinone", '3-MeSulfolane' : "3-Methylsulfolane",
         'Ethyldiglyme' : "2-(2-Ethoxyethoxy)ethanol", 'DMF' : "Dimethylformamide",
       'Ethylbenzene': 'Ethylbenzene', 'Ethylmonoglyme': "ethylene glycol monomethyl", 'Benzene' : "Benzene", 'g-Butyrolactone' : "gamma-Butyrolactone",
       'Cumene' : "Cumene", 'Propylsulfone' : 'Propylsulfone', 'Pseudocumeme' : "1,2,4-Trimethylbenzene", 'TEOS' : "Tetraethyl orthosilicate", 'm-Xylene' : 'm-Xylene',
       'o-Xylene' : 'o-Xylene'
}

densities = {'EC': 1.3210,
       'PC' : 1.205, 'DMC' : 1.07, 'EMC' : 0.902, 'DEC' : 0.975,
        'DME' : 0.86, 'DMSO' : 1.1004, 'AN': 0.786, 'MOEMC': 1.5, 'TFP': 1.487, 'EA' : 0.902,
       'MA': 0.932, 'FEC': 1.454, 'DOL': 1.06, '2-MeTHF' : 0.854, 'DMM': 0.902, 'Freon 11': 1.49, 'Methylene chloride': 1.33,
       'THF': 0.888, 'Toluene' : 0.867, 'Sulfolane' : 1.26, '2-Glyme': 0.937, '3-Glyme': 0.986, '4-Glyme': 1.009,
       '3-Me-2-Oxazolidinone': 1.17, '3-MeSulfolane': 1.20, 'Ethyldiglyme' : 0.937, 'DMF': 0.944,
       'Ethylbenzene': 0.866, 'Ethylmonoglyme': 0.965, 'Benzene': 0.876, 'g-Butyrolactone' : 1.13,
       'Cumene': 0.862, 'Propylsulfone' : 1.109, 'Pseudocumeme': 0.876, 'TEOS': 0.940, 'm-Xylene' : 0.860,
       'o-Xylene': 0.87596} # g / cm3 = g / ml at 25 C

#NOTE: The salt density values for LiPDI is approximated based off of LiTDI, and the salt desnity values for LiBPFPB and LiBMB are approximated off of LiBOB.
densities_salts = {'LiPF6' : 2.84, 'LiBF4': 0.852,
                      'LiFSI': 1.052, 'LiTDI' : 2.2, 'LiPDI' : 2.1, 'LiTFSI' : 1.33, 'LiClO4' : 2.42, 'LiAsF6' : 2.32,
 'LiBOB' : 2.021, 'LiCF3SO3' : 1.9, 'LiBPFPB' : 2.10, 'LiBMB': 2.00, 'LiN(CF3SO2)2' : 1.33}


## Saving the data of the 4 salts with the most data



In [9]:
#separating data by salt
salt_datas = {}
X = {}
y = {}
relevant_salts = np.array([])
for salt in names_salts.keys():
  salt_datas[salt] = dataset[dataset['salt'] == salt]
  trials = len(salt_datas[salt])
  if trials > 1000:
    relevant_salts = np.append(relevant_salts, salt)
    print(f"{salt} data has shape {salt_datas[salt].shape}")
    X[salt] = salt_datas[salt].iloc[:, 2:]
    X[salt] = X[salt].drop(['salt', 'c units', 'solvent ratio type'],axis=1)
    y[salt] = salt_datas[salt]['k']
print(X.keys())
print(y.keys())
print(relevant_salts)

LiPF6 data has shape (4706, 45)
LiBF4 data has shape (2943, 45)
LiAsF6 data has shape (1040, 45)
LiBOB data has shape (3699, 45)
dict_keys(['LiPF6', 'LiBF4', 'LiAsF6', 'LiBOB'])
dict_keys(['LiPF6', 'LiBF4', 'LiAsF6', 'LiBOB'])
['LiPF6' 'LiBF4' 'LiAsF6' 'LiBOB']


# 2. Train and Evaluate Models For Hyperparameter Optimization

## Cross Validation Process to Select Hyperparameters

In [12]:
from itertools import product

k_folds = 5  # Number of splits
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Hyperparameter grid
epochs_list = [50, 100]
batch_sizes = [32, 64]

best_models = {}
best_hyperparams = {}
best_model_mse = {}
best_model_std = {}
best_model_r2 = {}

X_fulltrain = {}
X_test = {}
y_fulltrain = {}
y_test = {}

for salt in X.keys():
    print(f"\nPerforming Grid Search with {k_folds}-Fold CV for {salt}...\n")

    X_data = X[salt]
    y_data = y[salt]

    # 10% of data held out for testing
    X_fulltrain[salt], X_test[salt], y_fulltrain[salt], y_test[salt] = train_test_split(X_data, y_data, test_size=0.1,train_size=0.9, random_state=42)
    best_mse = float("inf")
    best_r2 = float("-inf")
    best_model = None
    best_params = None


    # Iterate over all hyperparameter combinations
    for epochs, batch_size in product(epochs_list, batch_sizes):
        print(f"  \nEvaluating: epochs={epochs}, batch_size={batch_size}")

        fold_mse_scores = []
        fold_r2 = []

        # Cross-validation
        for fold, (train_idx, test_idx) in enumerate(kf.split(X_fulltrain[salt])):
            print(f"  Fold {fold + 1}/{k_folds}...")
            # Split data into train/test for this fold
            X_tr, X_val = X_fulltrain[salt].iloc[train_idx], X_fulltrain[salt].iloc[test_idx]
            y_tr, y_val = y_fulltrain[salt].iloc[train_idx], y_fulltrain[salt].iloc[test_idx]

            # Scale the data
            scaler = StandardScaler() # should scaler be per fold or when data is initialized
            X_tr = scaler.fit_transform(X_tr)
            X_val = scaler.transform(X_val)

            # Train the model
            model = ConductivityNN(input_dim=X_tr.shape[1])
            model.train(X_tr, y_tr, epochs=epochs, batch_size=batch_size)

            # Evaluate the model on this fold
            mse, r2 = model.evaluate(X_val, y_val)
            fold_mse_scores.append(mse)
            fold_r2.append(r2)

        std_mse = np.std(fold_mse_scores)
        mean_mse = np.mean(fold_mse_scores)
        mean_r2 = np.mean(fold_r2)
        print(f"\nFinal results for {salt} with {epochs} epochs and {batch_size} batch size:")
        print(f"  Mean MSE: {mean_mse:.4f} ± {std_mse:.4f}")
        print(f"  Mean R2: {mean_r2:.4f}")

        # Update best model if this one is better
        if (mean_mse < best_mse and mean_r2 == best_r2) or (mean_r2 > best_r2):
            best_mse = mean_mse
            best_model = model
            best_params = (epochs, batch_size)
            best_std = std_mse
            best_r2 = mean_r2

    # Store the best model, parameters, and scaler
    best_models[salt] = best_model
    best_hyperparams[salt] = best_params
    best_model_mse[salt] = best_mse
    best_model_std[salt] = best_std
    best_model_r2[salt] = best_r2
    print(f"\nBest Model for {salt}:")
    print(f"  Epochs: {best_hyperparams[salt][0]}, Batch Size: {best_hyperparams[salt][1]}")
    print(f"  Mean MSE: {best_model_mse[salt]:.4f} ± {best_model_std[salt]:.4f}")
    print(f"  Mean R2: {best_model_r2[salt]:.4f}")


Performing Grid Search with 5-Fold CV for LiPF6...

  
Evaluating: epochs=50, batch_size=32
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...

Final results for LiPF6 with 50 epochs and 32 batch size:
  Mean MSE: 0.2173 ± 0.0714
  Mean R2: 0.9873
  
Evaluating: epochs=50, batch_size=64
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...

Final results for LiPF6 with 50 epochs and 64 batch size:
  Mean MSE: 0.2522 ± 0.0740
  Mean R2: 0.9853
  
Evaluating: epochs=100, batch_size=32
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...

Final results for LiPF6 with 100 epochs and 32 batch size:
  Mean MSE: 0.2130 ± 0.0847
  Mean R2: 0.9876
  
Evaluating: epochs=100, batch_size=64
  Fold 1/5...
  Fold 2/5...
  Fold 3/5...
  Fold 4/5...
  Fold 5/5...

Final results for LiPF6 with 100 epochs and 64 batch size:
  Mean MSE: 0.2029 ± 0.0388
  Mean R2: 0.9882

Best Model for LiPF6:
  Epochs: 100, Batch Size: 64
  Mean MSE: 0.2029 ± 0.0388
  

# 3. Final Training and Evaluation for Each Salt Model on the Full Dataset

In [14]:
salt_mse = {}
salt_r2 = {}
model_paths = {}
repo_path = "CHEMENG177"
model_dir = os.path.join(repo_path, "saved_models")
os.makedirs(model_dir, exist_ok=True)
scalers = {}
for salt in X.keys():
    print(f"\nTraining on best hyperparameters for {salt}...\n")

    X_train = X_fulltrain[salt]
    y_train = y_fulltrain[salt]
    X_eval = X_test[salt]
    y_eval = y_test[salt]

    (epochs, batch_size) = best_hyperparams[salt]

    # Scale the data
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_eval = scaler.transform(X_eval)

    model = best_models[salt]
    model.train(X_train, y_train, epochs=epochs, batch_size=batch_size)
    # Evaluate Model
    mse, r2 = model.evaluate(X_eval, y_eval)

    salt_mse[salt] = mse
    salt_r2[salt] = r2
    scalers[salt] = scaler

    print(f"\nFinal results for {salt} with {epochs} epochs and {batch_size} batch size:")
    print(f"  MSE: {mse:.4f}")
    print(f"  R2: {r2:.4f}")
    model_path = os.path.join(model_dir, f"{salt}_nn_model.h5")  # Model save path
    model.model.save(model_path)

    # Store model path
    model_paths[salt] = model_path


Training on best hyperparameters for LiPF6...






Final results for LiPF6 with 100 epochs and 64 batch size:
  MSE: 0.1957
  R2: 0.9890

Training on best hyperparameters for LiBF4...






Final results for LiBF4 with 100 epochs and 32 batch size:
  MSE: 0.2856
  R2: 0.8931

Training on best hyperparameters for LiAsF6...






Final results for LiAsF6 with 100 epochs and 32 batch size:
  MSE: 0.1970
  R2: 0.9903

Training on best hyperparameters for LiBOB...






Final results for LiBOB with 100 epochs and 64 batch size:
  MSE: 0.7348
  R2: 0.9584
