#  Computation of crystal structure representations 

We want to compare the performance of the featurizations and ML algorithms used in [Ward (2016)](https://www.nature.com/articles/npjcompumats201628), [Ward (2017)](https://journals.aps.org/prb/abstract/10.1103/PhysRevB.96.024104), [Deml (2016)](https://journals.aps.org/prb/abstract/10.1103/PhysRevB.93.085142), [Faber (2015)](https://arxiv.org/abs/1503.07406) and [Schutt (2014)](https://journals.aps.org/prb/abstract/10.1103/PhysRevB.89.205118) for predicting the formation enthalpies of compounds, $\Delta H_f$.

Hence, in this notebook, data is featurized differently according to the papers mentioned above.

Note: This notebook takes at least 3 CPU hours to run.

In [14]:
import numpy as np
import pandas as pd
import os
import pickle
import warnings

from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers.composition import ElementProperty, Stoichiometry, \
        ValenceOrbital, IonProperty, TMetalFraction, CationProperty, \
        OxidationStates, ElectronAffinity, ElectronegativityDiff
from matminer.featurizers.structure import SiteStatsFingerprint, \
        StructuralHeterogeneity, ChemicalOrdering, StructureComposition, \
        CoulombMatrix, PartialRadialDistributionFunction 
from matminer.featurizers.structure import MaximumPackingEfficiency

from matminer.utils.conversions import str_to_composition, composition_to_oxidcomposition

Load data

In [15]:
%%time
# Read the pickled dataset `oqmd_icsd_subset.pkl` from the current working directory.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
path = os.path.join(os.getcwd(), "oqmd_icsd_subset.pkl")
data = pd.read_pickle(path)

CPU times: user 13.1 s, sys: 1.68 s, total: 14.7 s
Wall time: 15.3 s


Drop data without formation enthalpy value

In [16]:
# Keep only the rows whose 'delta_e' value is present, then renumber the rows;
# the previous index is retained as a new 'index' column.
has_delta_e = data['delta_e'].notnull()
data = data[has_delta_e].reset_index()

In [17]:
# Display the column labels of the loaded dataset.
data.columns

Index(['index', 'band_gap', 'delta_e', 'magnetic_moment', 'path', 'stability',
       'structure', 'total_energy', 'volume_pa', 'structure_obj',
       'composition', 'is_ICSD'],
      dtype='object')

Compute pymatgen composition objects (the ionic states are computed later, in the Deml section)

In [18]:
# Convert the 'composition' column with matminer's str_to_composition and keep
# the result in a new 'composition_obj' column (presumably pymatgen Composition
# objects -- confirm against the matminer version in use).
data['composition_obj'] = str_to_composition(data['composition'])

Remove compounds that cannot be featurized (due to their oxidation states for Deml's model)

In [19]:
# Row labels of the compounds that cannot be featurized. They are removed in
# one call: the original loop passed every label twice and copied the whole
# DataFrame once per label.
unfeaturizable = [952, 1214, 1217, 1311, 1315, 1710, 1963]
data = data.drop(unfeaturizable)

In [20]:
print ("Shape of data: ", data.shape)
# Renumber the rows in place. Because an 'index' column already exists, the
# old index is stored in an additional 'level_0' column.
data.reset_index(inplace=True)
# Preview the first row.
data.head(1)

Shape of data:  (31156, 13)


Unnamed: 0,level_0,index,band_gap,delta_e,magnetic_moment,path,stability,structure,total_energy,volume_pa,structure_obj,composition,is_ICSD,composition_obj
0,0,234975,3.879,-3.579764,-3.2e-05,/home/oqmd/libraries/icsd/31750/static,-1.0848,Ac O\n 1.0\n4.067812 -0.000030 0.000026\n-2.03...,-7.936143,17.988,[[5.0000001e-05 2.3486100e+00 1.5314600e+00] A...,Ac2O3,True,"(Ac, O)"


## Create featurizer
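Here we featurize the data with the Coulomb matrix (CM) of Faber et al. (2015), the composition-based features of Ward et al. (2016), the Voronoi-tessellation features of Ward et al. (2017), the features of Deml et al. (2016), and the partial radial distribution function (PRDF) of Schutt et al. (2014).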

### 1) Coulomb Matrix features (Faber et al 2015)

In [None]:
%%time
cm = CoulombMatrix()
data = cm.featurize_dataframe(data,'structure_obj')

Form a vector descriptor from the sorted eigenvalues of each Coulomb matrix, and zero-pad the descriptors so that they all have the same length

In [None]:
# Eigenvalue spectrum of every Coulomb matrix, sorted in ascending order.
spectra = [np.sort(np.linalg.eigvals(matrix))
           for matrix in data['coulomb matrix']]

# Length of the longest spectrum; shorter ones are zero-padded up to it so
# that all rows of the final array have the same width.
nt = max(len(spectrum) for spectrum in spectra)
X_cm = np.array([np.pad(spectrum, (0, nt - len(spectrum)), mode='constant')
                 for spectrum in spectra])
print ("CM input data shape:", X_cm.shape)

Save Coulomb Matrix featurized data

In [None]:
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes.
with open("X_faber.pkl", "wb") as handle:
    pickle.dump(X_cm, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 2) Composition based features (Ward et al 2016) 

In [None]:
# Composition-based featurizers applied to the 'composition_obj' column.
ft = MultipleFeaturizer([Stoichiometry(), ElementProperty.from_preset("magpie"),
                         ValenceOrbital(props=['avg']), IonProperty(fast=True)])
# Store the features under their own name: assigning them to `data` would
# replace the DataFrame that the later cells still need, and would leave
# X_ward2016 undefined.
X_ward2016 = ft.featurize_many(data['composition_obj'], ignore_errors=True)

Process data to remove NaN values

In [None]:
# Convert the feature list to an array and replace NaN entries with 0
# (np.nan_to_num also maps +/-inf to very large finite numbers).
X_ward2016 = np.array(X_ward2016)
X_ward2016 = np.nan_to_num(X_ward2016, copy=True)
# These are the composition-based features, so label the output accordingly.
print ("Composition-based input data shape:", X_ward2016.shape)

Save featurized data

In [None]:
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes.
with open("X_ward2016.pkl", "wb") as handle:
    pickle.dump(X_ward2016, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 3) Voronoi tessellation features (Ward et al 2017) 

In [None]:
# Featurizers that operate on the structure itself. The order of the two lists
# below fixes the order of the resulting feature columns.
structure_featurizers = [
    SiteStatsFingerprint.from_preset("CoordinationNumber_ward-prb-2017"),
    StructuralHeterogeneity(),
    ChemicalOrdering(),
    MaximumPackingEfficiency(),
    SiteStatsFingerprint.from_preset("LocalPropertyDifference_ward-prb-2017"),
]

# Composition featurizers, each wrapped in StructureComposition.
composition_featurizers = [
    StructureComposition(Stoichiometry()),
    StructureComposition(ElementProperty.from_preset("magpie")),
    StructureComposition(ValenceOrbital(props=['frac'])),
    StructureComposition(IonProperty(fast=True)),
]

ward = MultipleFeaturizer(structure_featurizers + composition_featurizers)

In [None]:
# Featurize the structure stored under row label 0 and report how many
# features the combined featurizer produces.
first_structure = data['structure_obj'][0]
first_features = ward.featurize(first_structure)
print ("Total number of Ward features:", len(first_features))

In [None]:
%%time
# Featurize every entry of 'structure_obj' with the combined `ward` featurizer.
# NOTE(review): ignore_errors=True presumably lets entries that fail to
# featurize yield NaN instead of aborting the run -- confirm in matminer docs.
X_ward2017 = ward.featurize_many(data['structure_obj'], ignore_errors=True)

Process data to remove NaN values

In [None]:
# Turn the feature list into an array and zero out the NaN entries in a
# single expression (np.nan_to_num also maps +/-inf to large finite numbers).
X_ward2017 = np.nan_to_num(np.array(X_ward2017), copy=True)
print ("Voronoi tessellation input data shape:", X_ward2017.shape)

Save Voronoi tessellation featurized data

In [None]:
# The Voronoi features live in X_ward2017; the name X_ward is never defined.
# The output file name is left unchanged so existing readers keep working.
# The context manager flushes and closes the file handle after the dump.
with open("X_ward.pkl", "wb") as handle:
    pickle.dump(X_ward2017, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 4) Deml-based features 

Compute ionic states

In [None]:
# Convert the 'composition_obj' column with matminer's
# composition_to_oxidcomposition and keep the result in 'oxidation_states',
# the column that the Deml featurizers read below.
data['oxidation_states'] = composition_to_oxidcomposition(data['composition_obj'])

In [None]:
%%time
ft = MultipleFeaturizer([ElementProperty.from_preset('deml'), 
                         TMetalFraction(),
                         ValenceOrbital(),
                         CationProperty.from_preset('deml'),
                         OxidationStates.from_preset('deml'),
                         ElectronAffinity(),
                         ElectronegativityDiff()])
data = ft.featurize_dataframe(data, col_id='oxidation_states', ignore_errors=True)

Drop stats of f orbital valence electrons

In [None]:
# Remove the two f-orbital valence-electron statistics. The axis is passed by
# keyword because pandas 2.0 removed the positional form.
data = data.drop(['frac f valence electrons', 'avg f valence electrons'], axis=1)

Calculate number of atoms in a formula unit

In [None]:
# Read the `num_atoms` attribute of every composition object into a new column.
data['num_atoms'] = [composition.num_atoms for composition in data['composition_obj']]

Fill in NaN values with zeros

In [None]:
# Replace every missing value in the DataFrame with 0, modifying it in place.
data.fillna(value=0, inplace=True)

Square root and inverse of each term

In [None]:
def inv(x):
    """Return the reciprocal of ``x``, or 0.0 where it is undefined.

    Args:
        x: Value to invert.

    Returns:
        ``1.0 / x``; 0.0 when ``x`` is zero or does not support division.
    """
    try:
        # NumPy scalars do not raise on division by zero; they emit a warning
        # and return +/-inf. Silence the warning and handle the result below.
        with np.errstate(divide='ignore'):
            output = 1.0 / x
    except (ZeroDivisionError, TypeError):
        # Only the expected failures are caught, not every exception.
        return 0.0
    # Treat a NumPy zero the same way as a Python zero.
    if np.isscalar(output) and np.isinf(output):
        return 0.0
    return output

In [None]:
# Columns that are not Deml features: dataset metadata, the target ('delta_e'),
# helper objects, the leftover 'level_0' index column and the Coulomb-matrix
# column. They must not receive inverse/sqrt terms: np.sqrt fails on the
# string/object columns, and terms derived from 'delta_e' would leak the
# target into the features.
non_feature_cols = ['level_0', 'index', 'band_gap', 'delta_e', 'magnetic_moment',
                    'path', 'stability', 'structure', 'total_energy', 'volume_pa',
                    'structure_obj', 'composition', 'is_ICSD', 'composition_obj',
                    'oxidation_states', 'coulomb matrix']
# Boolean mask keeps the original column order; names missing from `data` are ignored.
col = data.columns[~data.columns.isin(non_feature_cols)]
# Filled later with the names of the columns that contain "mean".
mean_col = []

In [None]:
# For every selected column add an element-wise inverse and square-root
# column, and remember the columns whose name contains "mean".
for name in col:
    values = data[name]
    data["inverse %s" % name] = values.apply(inv)
    data["sqrt %s" % name] = values.apply(lambda value: np.sqrt(value))
    if "mean" in name:
        mean_col.append(name)

Products of the primary (those without an asterisk) and stoichiometric weighted mean values.

In [None]:
# Names of the primary feature columns used when building product terms.
primary = [
    'num_atoms',
    'transition metal fraction',
    'avg anion electron affinity',
    'avg s valence electrons',
    'avg p valence electrons',
    'avg d valence electrons',
    'frac s valence electrons',
    'frac p valence electrons',
    'frac d valence electrons',
]

In [None]:
# Select the "mean" columns followed by the primary columns as the input for
# the product terms.
product = data[mean_col + primary]
# Remember the input column labels; `col` is reused here and no longer holds
# the earlier feature-column selection.
col = product.columns

Use PolynomialFeatures with degree 2 from scikit-learn package

In [None]:
# PolynomialFeatures is used here but not imported by the notebook's import
# cell, so bring it into scope locally.
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=True)
# Keep the original row labels so that merging back into `data` aligns row by row.
product = pd.DataFrame(poly.fit_transform(product), index=product.index)
# scikit-learn >= 1.0 provides get_feature_names_out and later removed
# get_feature_names; support both.
if hasattr(poly, "get_feature_names_out"):
    product.columns = poly.get_feature_names_out(col)
else:
    product.columns = poly.get_feature_names(col)
# Drop the bias term ("1") and the degree-1 terms so only squares and pairwise
# products remain. The axis is a keyword because pandas 2.0 removed the
# positional form.
product = product.drop(["1"] + mean_col + primary, axis=1)
print (product.shape)

Merge into original dataframe

In [None]:
# Add every product column to `data` under the same column label.
data[product.columns] = product

In [None]:
# Report the size of the DataFrame after all Deml feature columns were added.
print ("Shape of featurized data: ", data.shape)

In [None]:
# Columns that are not features: dataset metadata, the target ('delta_e') and
# the helper object columns.
non_features = ['index', 'band_gap', 'delta_e', 'magnetic_moment', 'path', 'stability',
                'structure', 'total_energy', 'volume_pa', 'structure_obj',
                'composition', 'is_ICSD', 'composition_obj', 'oxidation_states']
# 'level_0' (left behind by the second reset_index) and 'coulomb matrix'
# (written by the Coulomb-matrix featurizer) are not Deml features either;
# remove them when they are present.
non_features += [name for name in ('level_0', 'coulomb matrix') if name in data.columns]
# The axis is a keyword because pandas 2.0 removed the positional form.
X_deml = data.drop(non_features, axis=1)

Save into pickle file

In [None]:
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes.
with open("X_deml.pkl", "wb") as handle:
    pickle.dump(X_deml, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 5) PRDF features (Schutt et al 2014)

In [None]:
%%time
prdf = PartialRadialDistributionFunction(cutoff=16.0, bin_size=3.0)
prdf.fit(data['structure_obj'])
X_schutt = prdf.featurize_many(data['structure_obj'], ignore_errors=True)

Process data to remove NaN values

In [None]:
# Turn the feature list into an array and zero out the NaN entries in a
# single expression (np.nan_to_num also maps +/-inf to large finite numbers).
X_schutt = np.nan_to_num(np.array(X_schutt), copy=True)
print ("PRDF input data shape:", X_schutt.shape)

Save PRDF featurized data

In [None]:
# Write through a context manager so the file handle is flushed and closed
# as soon as the dump finishes.
with open("X_schutt.pkl", "wb") as handle:
    pickle.dump(X_schutt, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Formation enthalpy values as the predicted data

Save formation enthalpy data as y input data.

In [None]:
# Save the 'delta_e' column as the prediction target. The context manager
# flushes and closes the file handle after the dump.
with open("y.pkl", "wb") as handle:
    pickle.dump(data['delta_e'], handle, protocol=pickle.HIGHEST_PROTOCOL)