# Comparing performance of different crystal structure representations

In this notebook, we compare the performance of three crystal structure representations: the Coulomb Matrix (CM), the Partial Radial Distribution Function (PRDF), and the Voronoi tessellation features described in [Ward et al.'s paper](https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104).

In [None]:
import numpy as np
import pandas as pd
import os
import pickle
import time
import matplotlib.pyplot as plt

from tqdm import tqdm_notebook as tqdm

from sklearn.utils import shuffle
from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

Load data

In [None]:
%%time
data = pd.read_pickle('./X_cm.pkl')

X_ward = pd.read_pickle('./X_ward.pkl')
X_cm = data['coulomb matrix']
X_prdf = pd.read_pickle('./X_prdf.pkl')

y = data['delta_e']

Form a vector descriptor from the sorted eigenvalues of each Coulomb matrix, then zero-pad every descriptor so that they all have the same length.

In [None]:
# Start again from the raw matrices: X_cm is overwritten further down, so
# re-reading it here keeps the cell re-runnable.
X_cm = data['coulomb matrix']

# One sorted eigenvalue vector per matrix, keeping the original index.
X_cm = pd.Series(
    [np.sort(np.linalg.eigvals(matrix)) for matrix in X_cm],
    X_cm.index,
)

# Length of the longest eigenvalue vector; every vector is padded up to it.
nt = max(X_cm.apply(len))

# Append trailing zeros so that all rows have length nt, then stack them.
XLIST = [np.append(eigs, np.zeros(nt - eigs.shape[0])) for eigs in X_cm]
X_cm = np.array(XLIST)
print("CM input data shape:", X_cm.shape)

In [None]:
# Convert the PRDF features to an ndarray and replace NaN/inf entries with
# finite numbers.
# The cleaned array must be stored back into X_prdf: assigning it to X_ward
# would overwrite the Voronoi features with PRDF data and leave X_prdf
# uncleaned.
X_prdf = np.array(X_prdf)
X_prdf = np.nan_to_num(X_prdf, copy=True)
print("PRDF input data shape:", X_prdf.shape)

In [None]:
# Convert the Voronoi tessellation features to an ndarray and replace NaN/inf
# entries with finite numbers in a single step.
X_ward = np.nan_to_num(np.array(X_ward), copy=True)
print("Voronoi tessellation input data shape:", X_ward.shape)

In [None]:
# Sanity check on the PRDF features: does the array contain any NaN, and are
# all of its entries finite? The resulting pair is displayed as the cell output.
has_nan = np.any(np.isnan(X_prdf))
all_finite = np.all(np.isfinite(X_prdf))
has_nan, all_finite

## Training RandomForestRegressor model

A randomly selected set of 30,000 entries is used to train the model. The performance of the model is then evaluated on a distinct set of 1,000 entries. Each cross-validation test is repeated 20 times.

In [None]:
# Keys naming the three feature sets; they are used as dictionary keys for the
# per-representation inputs and results.
ft = [
    'ward',
    'cm',
    'prdf',
]

In [None]:
# Shuffle the three feature matrices and the targets in a single call so that
# all four share one permutation. The fixed random_state makes that
# permutation reproducible across kernel restarts.
X_ward, X_cm, X_prdf, y = shuffle(X_ward, X_cm, X_prdf, y, random_state=0)

# Look-up table from feature-set key to its shuffled feature matrix.
X = {"ward": X_ward, "cm": X_cm, "prdf": X_prdf}

Train a RandomForestRegressor on the training set of each representation and compute the performance statistics.

In [None]:
# Per-representation accumulators for the averaged metrics, each starting at 0.0.
# The comprehension variable is deliberately not called `ft`: looping with
# `for ft in stats` rebinds the module-level list `ft` to a string, so
# re-running this cell would build dictionaries keyed by single characters.
stats = {
    name: dict.fromkeys(['mae', 'rmse', 'r2', 'time_used'], 0.0)
    for name in ft
}

# Predicted / reference targets from the last split of each representation
# (filled in by the training loop; None until then).
pred, calc = dict.fromkeys(ft), dict.fromkeys(ft)

In [None]:
# Cross-validation settings, shared by the splitter and the metric averaging.
N_SPLITS = 20        # random train/test splits per representation
TRAIN_SIZE = 30000   # entries used to fit each model
TEST_SIZE = 1000     # held-out entries used to score each model
RANDOM_STATE = 0     # fixed seed so that splits and forests are reproducible

# Index the targets by position. After shuffling, `y` may be a pandas Series
# whose integer labels no longer match row positions, in which case
# `y[train_idx]` would select by label and misalign targets and features.
y_values = np.asarray(y)

# The loop variable is not called `ft`, so the module-level list `ft` survives.
for name, x in X.items():
    # Start from zero so that re-running this cell does not add to old totals.
    stats[name] = dict.fromkeys(['mae', 'rmse', 'r2', 'time_used'], 0.0)

    # The same seed for every representation yields identical splits, so the
    # three representations are compared on the same train/test rows.
    splitter = ShuffleSplit(train_size=TRAIN_SIZE, test_size=TEST_SIZE,
                            n_splits=N_SPLITS, random_state=RANDOM_STATE)

    for train_idx, test_idx in tqdm(splitter.split(x), total=N_SPLITS, desc=name):
        # split dataset (positional indexing for both features and targets)
        x_train, x_test = x[train_idx], x[test_idx]
        y_train, y_test = y_values[train_idx], y_values[test_idx]
        print(name, np.any(np.isnan(x_train)), np.all(np.isfinite(x_train)))

        # time the fit and the prediction together
        start = time.perf_counter()

        # The pipeline has to be bound to `model`; otherwise the fit below
        # raises NameError.
        # NOTE(review): sklearn.preprocessing.Imputer was removed in
        # scikit-learn 0.22 (replaced by sklearn.impute.SimpleImputer, which
        # has no `axis` argument) - confirm the targeted scikit-learn version.
        model = Pipeline([
            ('imputer', Imputer(missing_values='NaN', strategy='mean', axis=1)),  # For the failed structures
            ('model', RandomForestRegressor(n_estimators=100, n_jobs=-1,
                                            random_state=RANDOM_STATE)),
        ])

        # train model
        model.fit(x_train, y_train)

        # run model
        y_pred = model.predict(x_test)

        finish = time.perf_counter()

        # Accumulate the averages over all splits. The metrics take
        # (y_true, y_pred); the order matters for r2_score, which is not
        # symmetric in its arguments.
        stats[name]['mae'] += mean_absolute_error(y_test, y_pred) / N_SPLITS
        stats[name]['rmse'] += np.sqrt(mean_squared_error(y_test, y_pred)) / N_SPLITS
        stats[name]['r2'] += r2_score(y_test, y_pred) / N_SPLITS
        stats[name]['time_used'] += (finish - start) / N_SPLITS

    # save predicted and reference formation enthalpies from the last split
    pred[name], calc[name] = y_pred, y_test

Plot predictions

In [None]:
# One parity plot (reference vs. predicted target) per representation,
# with shared axes.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, sharex=True)

# (axes, feature-set key, panel title, marker colour) for each panel.
panels = [
    (ax1, 'cm', "CM", 'g'),
    (ax2, 'prdf', "PRDF", 'b'),
    (ax3, 'ward', "Voronoi", 'r'),
]

for ax, key, title, color in panels:
    ax.set_title(title)
    ax.scatter(calc[key], pred[key], color=color, marker='.')

    # Summary box with the split-averaged metrics. Each box is positioned with
    # its own axes transform, so the same relative coordinates work in every
    # panel and no hand-tuned offsets relative to the first panel are needed.
    ax.text(0.495, 0.026,
            'MAE: {:.0f} meV/atom\nRMSE:{:.0f} meV/atom\n$R^2$: {:.3f}'.format(
                stats[key]['mae'] * 1e3, stats[key]['rmse'] * 1e3, stats[key]['r2']),
            transform=ax.transAxes, fontsize=8,
            bbox={'facecolor': 'w', 'edgecolor': 'k'})

# Raw strings keep the LaTeX backslash literal ("\D" is an invalid escape
# sequence in a normal string literal). The axes are shared, so the y label is
# set on the left panel and the x label on the middle panel only.
ax1.set_ylabel(r"ML $\Delta H_f (eV/atom)$")
ax2.set_xlabel(r" DFT $\Delta H_f (eV/atom)$")

fig.tight_layout()