# Gibbs-Helmholtz Graph Neural Network (GH-GNN)

An illustration of how to use GH-GNN to predict infinite dilution activity coefficients (IDACs).

### 1. Activate the conda environment

Make sure you have activated an Anaconda environment with all the necessary dependencies installed, as described in the [README file](https://github.com/edgarsmdn/GH-GNN/blob/main/README.md) on GitHub.

You can create such an environment in a [Google Colab](https://colab.research.google.com/) notebook by simply running the first code cell below:

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googlecolab/colabtools/blob/master/notebooks/colab-github-demo.ipynb)

In [3]:
try:
  import google.colab
  IN_COLAB = True
except:
  IN_COLAB = False

if IN_COLAB:
    import torch
    !rm -rf sample_data/
    !pip install rdkit
    !pip install torch-scatter torch-sparse torch-cluster torch-spline-conv torch-geometric -f https://data.pyg.org/whl/torch-1.12.0+cpu.html
    !pip install mordred 
    !git clone https://github.com/beangoben/molecular_screening_lecture.git
    

### 2. Import the model

In [None]:
import 

In [1]:
import pandas as pd
import plotly.express as px
import molplotly

In [2]:
def select_df(df, comparison, methods):
    """Return the rows of ``df`` selected by the ``comparison`` mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter.
    comparison : str
        One of:

        * ``'all'`` -- return ``df`` itself, unfiltered.
        * ``'intersection_all'`` -- keep rows where every column listed in
          ``methods`` is non-null.
        * ``'feasible_UNIFAC_Do'`` -- keep rows where column ``'UNIFAC_Do'``
          is non-null.
        * ``'feasible_MOSCED'`` -- keep rows where column ``'MOSCED'`` is
          non-null.
    methods : iterable of str
        Column names; only consulted for ``'intersection_all'``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If ``comparison`` is not one of the modes listed above.
    """
    if comparison == 'all':
        return df
    if comparison == 'intersection_all':
        # Row-wise "all non-null" computed with pandas only, so the function
        # does not depend on numpy being imported.
        return df[df[list(methods)].notna().all(axis=1)]
    if comparison == 'feasible_UNIFAC_Do':
        return df[df['UNIFAC_Do'].notna()]
    if comparison == 'feasible_MOSCED':
        return df[df['MOSCED'].notna()]
    # Fail loudly instead of implicitly returning None for a mistyped mode.
    raise ValueError(f"Unknown comparison mode: {comparison!r}")

In [3]:
# Load the stored prediction tables for the training and test sets.
df_train = pd.read_csv('../Temperature_dependency/results/train_predictions.csv')
df_test = pd.read_csv('../Temperature_dependency/results/test_predictions.csv')

# When True, the '_kfolds' suffix is appended to every method name below.
kfolds = True

# Names of the prediction columns to analyse.
methods = ['GNNGH', 'SolvGNNGH', 'GNNGH_T']
if kfolds:
    methods = [f'{name}_kfolds' for name in methods]

# 'all' keeps every row of each table.
df_train = select_df(df_train, 'all', methods)
df_test = select_df(df_test, 'all', methods)

# Stack train on top of test and tag each row with the split it came from.
df_complete = pd.concat([df_train, df_test])
df_complete['split'] = ['Train'] * len(df_train) + ['Test'] * len(df_test)

In [4]:
def plot_parity_interactive(method, df_complete=df_complete):
    """Show an interactive parity plot for one method, coloured by split.

    Plots column ``method`` (y) against column ``'log-gamma'`` (x), colours
    the points by column ``'split'``, draws a dashed diagonal, attaches the
    molecules from the SMILES columns with molplotly and runs the resulting
    app inline on port 8002.

    Parameters
    ----------
    method : str
        Name of the prediction column in ``df_complete``.
    df_complete : pandas.DataFrame
        Must contain ``'log-gamma'``, ``method``, ``'split'``, ``'T'``,
        ``'Solvent_SMILES'``, ``'Solute_SMILES'``, ``'Solvent_name'`` and
        ``'Solute_name'``. The default is the module-level ``df_complete``
        as it exists when this function is defined.
    """
    def two_decimals(value):
        return f"{value:.2f}"

    axis_labels = {method: 'Predicted IDAC',
                   'log-gamma': 'Experimental IDAC',
                   'split': 'Split'}
    parity_fig = px.scatter(
        df_complete,
        x="log-gamma",
        y=method,
        color='split',
        title=method+' predictions',
        labels=axis_labels,
        width=800,
        height=700,
    )

    # Dashed diagonal: where the points of a perfect model would lie.
    experimental = df_complete["log-gamma"].values
    lower, upper = experimental.min(), experimental.max()
    parity_fig.add_shape(
        type="line",
        line={'dash': 'dash'},
        x0=lower, y0=lower,
        x1=upper, y1=upper,
    )

    # NOTE(review): of the caption_transform keys only 'T' is listed in
    # caption_cols -- confirm whether the other two entries have any effect.
    molecule_app = molplotly.add_molecules(
        fig=parity_fig,
        df=df_complete,
        smiles_col=['Solvent_SMILES', 'Solute_SMILES'],
        caption_cols=['T', 'Solvent_name','Solute_name'],
        caption_transform={'Predicted IDAC': two_decimals,
                           'Experimental IDAC': two_decimals,
                           'T': two_decimals},
        color_col='split',
    )

    # Adjust these arguments to serve the Dash app externally or resize it.
    molecule_app.run_server(mode='inline', port=8002, height=800)
    
def plot_parity_interactive_allmethods(df_complete):
    """Show an interactive parity plot of all methods in one figure.

    Plots column ``'prediction'`` (y) against column ``'log-gamma'`` (x),
    colours the points by column ``'method'``, draws a dashed diagonal,
    attaches the molecules from the SMILES columns with molplotly and runs
    the resulting app inline on port 8003.

    Parameters
    ----------
    df_complete : pandas.DataFrame
        Must contain ``'log-gamma'``, ``'prediction'``, ``'method'``,
        ``'T'``, ``'Solvent_SMILES'``, ``'Solute_SMILES'``,
        ``'Solvent_name'`` and ``'Solute_name'``.
    """
    def two_decimals(value):
        return f"{value:.2f}"

    axis_labels = {'prediction': 'Predicted IDAC',
                   'log-gamma': 'Experimental IDAC',
                   'method': 'Model (split)'}
    parity_fig = px.scatter(
        df_complete,
        x="log-gamma",
        y='prediction',
        color='method',
        title='All models predictions',
        labels=axis_labels,
        width=800,
        height=700,
    )

    # Dashed diagonal: where the points of a perfect model would lie.
    experimental = df_complete["log-gamma"].values
    lower, upper = experimental.min(), experimental.max()
    parity_fig.add_shape(
        type="line",
        line={'dash': 'dash'},
        x0=lower, y0=lower,
        x1=upper, y1=upper,
    )

    # NOTE(review): of the caption_transform keys only 'T' is listed in
    # caption_cols -- confirm whether the other two entries have any effect.
    molecule_app = molplotly.add_molecules(
        fig=parity_fig,
        df=df_complete,
        smiles_col=['Solvent_SMILES', 'Solute_SMILES'],
        caption_cols=['T', 'Solvent_name','Solute_name'],
        caption_transform={'Predicted IDAC': two_decimals,
                           'Experimental IDAC': two_decimals,
                           'T': two_decimals},
        color_col='method',
    )

    # Adjust these arguments to serve the Dash app externally or resize it.
    molecule_app.run_server(mode='inline', port=8003, height=800)
    
def plot_parity_interactive_T(method, df_complete=df_complete):
    """Show an interactive parity plot for one method, coloured by ``'T'``.

    Plots column ``method`` (y) against column ``'log-gamma'`` (x), colours
    the points by column ``'T'``, draws a dashed diagonal, attaches the
    molecules from the SMILES columns with molplotly and runs the resulting
    app inline on port 8004.

    Parameters
    ----------
    method : str
        Name of the prediction column in ``df_complete``.
    df_complete : pandas.DataFrame
        Must contain ``'log-gamma'``, ``method``, ``'T'``,
        ``'Solvent_SMILES'``, ``'Solute_SMILES'``, ``'Solvent_name'`` and
        ``'Solute_name'``. The default is the module-level ``df_complete``
        as it exists when this function is defined.
    """
    def two_decimals(value):
        return f"{value:.2f}"

    axis_labels = {method: 'Predicted IDAC',
                   'log-gamma': 'Experimental IDAC',
                   'T': 'Temperature'}
    parity_fig = px.scatter(
        df_complete,
        x="log-gamma",
        y=method,
        color='T',
        title=method+' predictions',
        labels=axis_labels,
        width=800,
        height=700,
    )

    # Dashed diagonal: where the points of a perfect model would lie.
    experimental = df_complete["log-gamma"].values
    lower, upper = experimental.min(), experimental.max()
    parity_fig.add_shape(
        type="line",
        line={'dash': 'dash'},
        x0=lower, y0=lower,
        x1=upper, y1=upper,
    )

    # NOTE(review): of the caption_transform keys only 'T' is listed in
    # caption_cols -- confirm whether the other two entries have any effect.
    molecule_app = molplotly.add_molecules(
        fig=parity_fig,
        df=df_complete,
        smiles_col=['Solvent_SMILES', 'Solute_SMILES'],
        caption_cols=['T', 'Solvent_name','Solute_name'],
        caption_transform={'Predicted IDAC': two_decimals,
                           'Experimental IDAC': two_decimals,
                           'T': two_decimals},
        color_col='T',
    )

    # Adjust these arguments to serve the Dash app externally or resize it.
    molecule_app.run_server(mode='inline', port=8004, height=800)
    

In [5]:
# Build a long-format table: the rows of df_complete are repeated once per
# method, with that method's predictions in 'prediction' and a
# "<method> (<split>)" label in 'method'.
log_gamma_all = []
prediction_all = []
method_all = []
solvent_smiles_all = []
solute_smiles_all = []
solvent_name_all = []
solute_name_all = []
T_all = []

# The split labels are the same for every method, so read them once instead
# of doing a per-row .iloc lookup inside the loop.
split_labels = df_complete['split'].tolist()

for method in methods:
    log_gamma_all.extend(df_complete['log-gamma'].tolist())
    prediction_all.extend(df_complete[method].tolist())
    solvent_smiles_all.extend(df_complete['Solvent_SMILES'].tolist())
    solute_smiles_all.extend(df_complete['Solute_SMILES'].tolist())
    solvent_name_all.extend(df_complete['Solvent_name'].tolist())
    solute_name_all.extend(df_complete['Solute_name'].tolist())
    T_all.extend(df_complete['T'].tolist())
    method_all.extend(f'{method} ({split})' for split in split_labels)

df_all = pd.DataFrame({
    'log-gamma': log_gamma_all,
    'prediction': prediction_all,
    'Solvent_SMILES': solvent_smiles_all,
    'Solute_SMILES': solute_smiles_all,
    'Solvent_name': solvent_name_all,
    'Solute_name': solute_name_all,
    'T': T_all,
    'method': method_all,
})

## Parity plot for all methods

In [6]:
# Launch the combined parity plot from the long-format table df_all
# (the rows of df_complete repeated once per method).
plot_parity_interactive_allmethods(df_all)

## Parity plot with temperatures

In [7]:
# Temperature-coloured parity plot for the third entry of `methods`,
# restricted to the test set (df_test).
plot_parity_interactive_T(methods[2], df_complete=df_test)

## Individual methods parity plots

In [8]:
# Split-coloured parity plot for the first entry of `methods`; df_complete is
# left at the function's default.
plot_parity_interactive(methods[0])