# Compare Runs
Compare the test performance of models trained with different configurations

In [None]:
%matplotlib inline
from matplotlib import pyplot as plt
from datetime import datetime
from pathlib import Path
from ase import units
import pandas as pd
import numpy as np
import json

Configuration

In [None]:
# Value of the `test_hash` run parameter to select: only runs recorded with this
#  hash are kept when choosing and evaluating the best model below
data_hash = '4380cfde'

## Pull Runs from Local Storage
Each run is stored in its own subdirectory of the local `runs` directory, which holds its `params.json` and `performance.json` files

In [None]:
def gather_run_summaries(run_dir: "str | Path" = 'runs') -> list[dict]:
    """Collect a summary record for every completed run stored on disk

    A run is any directory under ``run_dir`` that contains a ``performance.json`` file.
    The directory name must have the form ``<ISO-format date>-<hash>``.

    Each record holds the run directory (``path``), its ``hash`` and ``date``,
    every entry of ``params.json`` under a ``param.`` prefix and
    every entry of ``performance.json`` under a ``metric.`` prefix.

    Args:
        run_dir: Directory to search recursively for runs
    Returns:
        List of dictionaries describing each run, ordered by the path of the run
    Raises:
        FileNotFoundError: If a run directory has no ``params.json``
        ValueError: If a run directory name does not match ``<ISO-format date>-<hash>``
    """

    output = []
    # Sort the matches so the order of the records does not depend on the filesystem
    for perf_path in sorted(Path(run_dir).rglob('performance.json')):
        run_path = perf_path.parent

        # Split on the last dash only, because the date portion contains dashes itself
        date, run_hash = run_path.name.rsplit("-", 1)
        record = {
            'path': run_path,
            'hash': run_hash,
            'date': datetime.fromisoformat(date)
        }

        # Flatten the input parameters, then the performance metrics, into the record
        for prefix, filename in [('param', 'params.json'), ('metric', 'performance.json')]:
            with open(run_path / filename) as fp:
                for key, val in json.load(fp).items():
                    record[f'{prefix}.{key}'] = val

        output.append(record)

    return output

In [None]:
# One row per run, oldest first, so that `tail` shows the most recent runs
summary = pd.DataFrame(gather_run_summaries())
summary = summary.sort_values(by='date')
summary.tail()

In [None]:
# Show the `test_hash` parameter recorded for each run (the value that `data_hash` is matched against)
summary['param.test_hash']

## Evaluate Best Model
Compare the best model's predictions against the reference values and inspect its learning curve during training

In [None]:
# Only compare runs that were recorded with the selected `test_hash`
top_runs = summary[summary['param.test_hash'] == data_hash]
if top_runs.empty:
    # Fail with an actionable message rather than the bare IndexError from `.iloc[0]`
    raise ValueError(f'No runs found with test_hash == {data_hash!r}')

# The best run is the one with the lowest force error
best_run = top_runs.sort_values('metric.force_mean_error', ascending=True).iloc[0]

Pull out the parameters

In [None]:
# Keep only the `param.*` entries of the best run and drop the prefix from their names
params = {
    name[len('param.'):]: value
    for name, value in best_run.to_dict().items()
    if name.startswith('param.')
}
params

Load the model's predictions and the reference values for the test set

In [None]:
# Predictions saved by the best run; read below through the
#  'energy', 'count', 'forces' and 'stress' keys
preds = np.load(best_run['path'] / 'test_pred.npz')

In [None]:
# Reference values saved alongside the predictions, read through the same keys as `preds`
true = np.load(best_run['path'] / 'test_true.npz')

Predicted vs actual

In [None]:
# Energy per atom for the reference and the predictions
#  (assumes the 'energy' and 'count' entries are arrays of matching shape - TODO confirm)
true_e_pera = true['energy'] / true['count']
pred_e_pera = preds['energy'] / preds['count']

# Shift both by the lowest reference energy so that the reference minimum sits at zero.
#  The predictions are shifted first because the offset is read from the unshifted reference.
pred_e_pera = pred_e_pera - true_e_pera.min()
true_e_pera = true_e_pera - true_e_pera.min()

In [None]:
# Parity plots (ML prediction on x, DFT reference on y) for energy, forces and stress
fig, axs = plt.subplots(1, 3, figsize=(6.4, 2.3))

# Energy per atom
ax = axs[0]
ax.scatter(pred_e_pera, true_e_pera, s=2)
lim = [
    min(pred_e_pera.min(), true_e_pera.min()),
    max(pred_e_pera.max(), true_e_pera.max())
]
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.plot(lim, lim, 'k--', lw=1)  # y = x guide line

# `np.corrcoef` yields Pearson's r; square it so the number matches its R^2 label
r2_score = np.corrcoef(pred_e_pera, true_e_pera)[0, 1] ** 2
ax.text(0.1, 0.8, f'$R^2$: {r2_score: .2f}', transform=ax.transAxes, fontsize=10)

# NOTE(review): no eV -> meV conversion is visible in this notebook; confirm the unit in this title
ax.set_title('Energy (meV/atom)', loc='left', fontsize=10)

# Forces, with every component of every atom treated as one point
ax = axs[1]
pred_forces = preds['forces'].flatten()
true_forces = true['forces'].flatten()
ax.scatter(pred_forces, true_forces, s=2)
ax.set_title('Forces (eV/$\\AA$)', loc='left', fontsize=10)

# Use one range covering both axes so the plot is square and no point is clipped
#  (copying only the y-range onto x would hide predictions outside the DFT range)
lim = [
    min(ax.get_xlim()[0], ax.get_ylim()[0]),
    max(ax.get_xlim()[1], ax.get_ylim()[1])
]
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.plot(lim, lim, 'k--', lw=1)

rmse = np.sqrt(np.power(pred_forces - true_forces, 2).mean())
ax.text(0.1, 0.9, f'RMSE: {rmse:.2f}', transform=ax.transAxes, fontsize=10)

# Stress, converted to GPa for display
ax = axs[2]
pred_stress = preds['stress'] / units.GPa
true_stress = true['stress'] / units.GPa
ax.scatter(pred_stress, true_stress, s=2)
ax.set_title('Stress (GPa)', loc='left', fontsize=10)

rmse = np.sqrt(np.power(preds['stress'] - true['stress'], 2).mean()) / units.GPa
ax.text(0.1, 0.9, f'RMSE: {rmse:.2f}', transform=ax.transAxes, fontsize=10)

# Same shared-range treatment as for the forces
lim = [
    min(ax.get_xlim()[0], ax.get_ylim()[0]),
    max(ax.get_xlim()[1], ax.get_ylim()[1])
]
ax.set_xlim(lim)
ax.set_ylim(lim)
ax.plot(lim, lim, 'k--', lw=1)

for ax in axs:
    ax.set_xlabel('ML')
    ax.set_ylabel('DFT')

fig.tight_layout()

Plot the training curve

In [None]:
# Training log of the best run; its `*_rmse_valid` columns are plotted below
train_log = pd.read_csv(best_run['path'] / "log.csv")

In [None]:
# One stacked panel per validation metric logged during training
fig, axs = plt.subplots(3, 1, figsize=(3.5, 2.))

columns = ['s_rmse_valid', 'f_rmse_valid', 'e_rmse_valid']
colors = ['red', 'blue', 'gray']
for panel, column, line_color in zip(axs, columns, colors):
    panel.plot(train_log[column], color=line_color)
    # Label each panel with the leading letter of its column name ("s", "f" or "e")
    panel.set_ylabel(column.split("_")[0])
    # NOTE(review): this sets the label of the Axes object itself, and no legend is drawn here;
    #  confirm whether a title or axis label was intended
    panel.set_label('Loss')

axs[-1].set_xlabel('Epoch')

## Compare Pre-Trained and Random Start
See the test performance for the same architecture

In [None]:
# Force error of the selected runs against the number of training epochs, on log-log axes
fig, ax = plt.subplots(figsize=(3.5, 2.))

# Order the runs by epoch count so the dashed line connects the points from left to right
top_runs = top_runs.sort_values(by='param.num_epochs')
ax.loglog(top_runs['param.num_epochs'], top_runs['metric.force_mean_error'], '--o')

ax.set_xlabel('Epochs')
# NOTE(review): the plotted column is `force_mean_error` while the label says RMSE; confirm which is right
ax.set_ylabel('Force RMSE (eV/$\\AA$)')
fig.tight_layout()