# Evaluate FCHL Models for a Molecule Size Extrapolation Test
In this notebook, we build FCHL models and test their ability to predict the properties of molecules larger than those in the training set.

In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.metrics import mean_absolute_error
from ase.units import Hartree, eV
from time import perf_counter
from tqdm import tqdm
import pickle as pkl
import pandas as pd
import numpy as np
import gzip
import json
import os

In [2]:
# Exclusive upper bounds on `n_electrons` for the training subsets: one model is
# fit per cutoff, using only training molecules with fewer electrons than it.
electron_cuts = [41, 51, 61]

Get the original benchmarking directory, which has the data in FCHL format

In [3]:
# Relative path (resolved against the current working directory) to the
# directory that the pickled train/test data and the model are read from.
fchl_dir = os.path.join('..', '..', 'benchmark', 'qml')

## Load in the Training and Test Data
Load the training and test sets, complete with the representations.

In [4]:
# Training set, stored as a gzip-compressed pickle of a DataFrame.
# Pickle files can run arbitrary code when loaded: only read trusted files.
train_path = os.path.join(fchl_dir, 'train_data.pkl.gz')
train_data = pd.read_pickle(train_path)

In [5]:
# Test set, stored as a gzip-compressed pickle of a DataFrame.
# Pickle files can run arbitrary code when loaded: only read trusted files.
test_path = os.path.join(fchl_dir, 'test_data.pkl.gz')
test_data = pd.read_pickle(test_path)

## Load in the Model
Use the model from the previous calculation

In [6]:
# Unpickle the model object from the benchmark directory. Unpickling can run
# arbitrary code, so this file must come from a trusted source.
model_path = os.path.join(fchl_dir, 'fchl-model.pkl.gz')
with gzip.open(model_path, 'rb') as model_file:
    model = pkl.load(model_file)

## Train a Model on G4MP2 Atomization Energies
Train only on G4MP2 atomization energy, not using the B3LYP results

In [7]:
# Benchmark the model at each training-set cutoff. Every pass refits the same
# `model` object, so after the loop it holds the fit for the last cutoff.
big_molecule_cutoff = 67  # "big" test molecules have more electrons than this
results = []
for train_split in tqdm(electron_cuts):
    # Keep only the training molecules with fewer than `train_split` electrons
    train_subset = train_data.query('n_electrons < {}'.format(train_split))

    # Fit on the G4MP2 atomization energies, timing only the fit itself
    train_time = perf_counter()
    model.fit(train_subset['rep'].tolist(), train_subset['g4mp2_atomization'])
    train_time = perf_counter() - train_time

    # Predict the G4MP2 atomization energy of every test molecule and keep the
    # predictions as a new column of `test_data` (one column per cutoff)
    test_time = perf_counter()
    pred_y = 'pred_y_{}'.format(train_split)
    test_data[pred_y] = model.predict(test_data['rep'].tolist())
    test_time = perf_counter() - test_time

    # Test molecules larger than every molecule allowed into the training subsets;
    # selected after the prediction column exists so the subset includes it
    big_test = test_data.query('n_electrons > {}'.format(big_molecule_cutoff))

    # scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the
    # reported values are the same as with the arguments swapped
    results.append({
        'electron_cut': train_split,
        'train_size': len(train_subset),
        'mae': mean_absolute_error(test_data['g4mp2_atomization'], test_data[pred_y]),
        'big_mae': mean_absolute_error(big_test['g4mp2_atomization'], big_test[pred_y]),
        'train_time': train_time,
        'test_time': test_time,
    })

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [34:48:31<00:00, 37295.27s/it]


In [8]:
# Turn the list of per-cutoff records into a table and display it.
# NOTE: this rebinds `results` from a list to a DataFrame; the cell that writes
# the JSON summary relies on the DataFrame form.
results = pd.DataFrame(results)
results

Unnamed: 0,big_mae,electron_cut,mae,test_time,train_size,train_time
0,0.021625,41,0.024528,315.703409,152,0.662593
1,0.004446,51,0.004381,3900.41589,1161,80.977722
2,0.00099,61,0.000902,85958.878878,14317,35054.672712


In [9]:
# Write the benchmark table, plus a short description of the model, as JSON
with open('fchl.json', 'w') as out_file:
    summary = {
        'name': 'FCHL',
        'description': 'Model built using the FCHL representation, as implemented in QML, and KRR',
        # One dict per row of the results table
        'g4mp2_benchmark': results.to_dict('records'),
    }
    json.dump(summary, out_file, indent=2)

In [10]:
# Save the model as a gzip-compressed pickle. It was refit once per cutoff in
# the loop above, so this presumably stores the fit for the last cutoff.
with gzip.open('fchl.pkl.gz', 'wb') as model_file:
    pkl.dump(model, model_file)

## Train a $\Delta$-Learning Model on G4MP2 Atomization Energies
Train on the difference between B3LYP and G4MP2

In [11]:
# Repeat the benchmark with the `delta` column as the learning target (per the
# section text, the difference between B3LYP and G4MP2). Every pass refits the
# same `model` object, so after the loop it holds the fit for the last cutoff.
big_molecule_cutoff = 67  # "big" test molecules have more electrons than this
delta_results = []
for train_split in tqdm(electron_cuts):
    # Keep only the training molecules with fewer than `train_split` electrons
    train_subset = train_data.query('n_electrons < {}'.format(train_split))

    # Fit on the delta values, timing only the fit itself
    train_time = perf_counter()
    model.fit(train_subset['rep'].tolist(), train_subset['delta'])
    train_time = perf_counter() - train_time

    # Predict the delta of every test molecule and keep the predictions as a
    # new column of `test_data` (one column per cutoff)
    test_time = perf_counter()
    pred_y = 'pred_y_{}-delta'.format(train_split)
    test_data[pred_y] = model.predict(test_data['rep'].tolist())
    test_time = perf_counter() - test_time

    # Test molecules larger than every molecule allowed into the training subsets;
    # selected after the prediction column exists so the subset includes it
    big_test = test_data.query('n_electrons > {}'.format(big_molecule_cutoff))

    # scikit-learn metrics take (y_true, y_pred); MAE is symmetric, so the
    # reported values are the same as with the arguments swapped
    delta_results.append({
        'electron_cut': train_split,
        'mae': mean_absolute_error(test_data['delta'], test_data[pred_y]),
        'big_mae': mean_absolute_error(big_test['delta'], big_test[pred_y]),
        'train_size': len(train_subset),
        'train_time': train_time,
        'test_time': test_time,
    })

100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [35:08:52<00:00, 37656.50s/it]


In [12]:
# Turn the list of per-cutoff records into a table and display it.
# NOTE: this rebinds `delta_results` from a list to a DataFrame; the cell that
# writes the JSON summary relies on the DataFrame form.
delta_results = pd.DataFrame(delta_results)
delta_results

Unnamed: 0,big_mae,electron_cut,mae,test_time,train_size,train_time
0,0.002999,41,0.003492,319.339844,152,0.699618
1,0.00094,51,0.000843,3962.213988,1161,83.415066
2,0.000236,61,0.000207,86642.034707,14317,35524.706634


*Finding*: The FCHL representation performs remarkably well, even for small dataset sizes.

In [13]:
# Write the delta-learning benchmark table, plus a short description of the
# model, as JSON. The backslashes are escaped because `\D` is not a valid escape
# sequence in a normal string literal (newer Python versions warn about it);
# the text written to the file is unchanged.
with open('fchl-delta.json', 'w') as fp:
    json.dump({
        'name': 'FCHL $\\Delta$-Learning',
        'description': '$\\Delta$-Learning model built using the FCHL representation, as implemented in QML, and KRR',
        # One dict per row of the results table
        'g4mp2_with_b3lyp_results': delta_results.to_dict('records')
    }, fp, indent=2)

In [14]:
# Save the delta-learning model as a gzip-compressed pickle. It was refit once
# per cutoff in the loop above, so this presumably stores the last cutoff's fit.
with gzip.open('fchl-delta.pkl.gz', 'wb') as model_file:
    pkl.dump(model, model_file)

In [15]:
# Save the test set, including the prediction columns added by the benchmark
# loops, without the `xyz` column. The column is named by keyword because
# pandas 2.0 no longer accepts the axis as a positional argument to `drop`.
test_data.drop(columns='xyz').to_pickle('predictions.pkl.gz')