# Featurization of Elastic Tensor data 

We want to study the performance of the model proposed in [De Jong et al.'s paper](https://www.nature.com/articles/srep34256). Here, the features described in that paper are added to the elastic tensor data. The featurized data is then saved as a pickle file.

In [1]:
import numpy as np
import pandas as pd
import pymatgen as pmg

from matminer.datasets.dataframe_loader import load_elastic_tensor
from matminer.utils.conversions import str_to_composition

from matminer.featurizers.composition import ElementProperty, CohesiveEnergy
from matminer.featurizers.structure import SiteStatsFingerprint
from matminer.featurizers.site import CoordinationNumber
from matminer.featurizers.base import MultipleFeaturizer
from pymatgen import MPRester
from pymatgen.analysis.local_env import VoronoiNN

import os

# Materials Project API key, passed to CohesiveEnergy and MPRester below.
# Prefer the PMG_MAPI_KEY environment variable so the credential does not have
# to live in the notebook.
# NOTE(review): the fallback literal is a credential committed to source; it is
# kept only so existing runs behave as before. Rotate it and drop the fallback.
key = os.environ.get('PMG_MAPI_KEY', 'T6QzrvW8J07u4L2O')

Load data

In [2]:
# Load the elastic-tensor dataset through matminer's loader; `data` is the
# frame that every later cell extends with feature columns.
data = load_elastic_tensor()

Compute composition object from formula provided

In [3]:
# Build a 'composition' column from the 'formula' column; the composition
# featurizers in later cells read it through col_id='composition'.
data['composition'] = str_to_composition(data['formula'])

Drop unnecessary data

In [4]:
# Discard the columns that are not used by the featurization below.
# `columns=` replaces the positional axis argument (`, 1`), which newer pandas
# releases deprecated and pandas 2.0 no longer accepts.
data = data.drop(columns=[
    'formula', 'nsites', 'space_group',
    'G_Reuss', 'G_Voigt', 'K_Reuss', 'K_Voigt',
    'compliance_tensor', 'elastic_tensor', 'elastic_tensor_original',
])

## Compute features 

#### Hölder means of the first 8 properties in Table 1 of the De Jong et al. paper
'group_number', 'atomic_mass', 'atomic_radius', 'row_number', 'boiling_temp', 'melting_temp', 'electronegativity', 'atomic_number'
$$\mu_p(x) = \left[\frac{\sum_{i=1}^{n} w_i x_i^p}{\sum_{i=1}^{n} w_i}\right]^{\frac{1}{p}}$$

For $p = 0$ the expression is taken as its limit, the weighted geometric mean.

In [5]:
# Element-property statistics over each composition: Holder means for the
# integer powers -4..4, plus the geometric and ordinary standard deviations.
ef = ElementProperty(
    data_source='pymatgen',
    features=[
        'group', 'atomic_mass', 'atomic_radius', 'row',
        'boiling_point', 'melting_point', 'X', 'Z',
    ],
    stats=['holder_mean::{}'.format(power) for power in range(-4, 5)]
          + ['geom_std_dev', 'std_dev'],
)
data = ef.featurize_dataframe(data, col_id='composition')


divide by zero encountered in double_scalars


invalid value encountered in double_scalars


divide by zero encountered in double_scalars


invalid value encountered in double_scalars


divide by zero encountered in double_scalars


invalid value encountered in double_scalars


divide by zero encountered in double_scalars


invalid value encountered in double_scalars

ElementProperty: 100%|██████████| 1181/1181 [00:01<00:00, 799.69it/s]


#### Cohesive Energy 

In [6]:
%%time
# Cohesive-energy featurizer; it is handed the Materials Project API key.
ft = CohesiveEnergy(mapi_key=key)

# ignore_errors=True is passed so that a failure on one composition does not
# abort the whole run (presumably failed rows are left empty -- confirm against
# the matminer documentation).
data = ft.featurize_dataframe(data, col_id='composition', ignore_errors=True)

CohesiveEnergy: 100%|██████████| 1181/1181 [01:01<00:00, 19.29it/s]


CPU times: user 139 ms, sys: 89.2 ms, total: 229 ms
Wall time: 1min 55s


#### Formation energy per atom, Energy above hull, band gap, density 

In [None]:
# Materials Project REST client; the next cell calls mpr.get_data() on it once
# per material_id.
mpr = MPRester(api_key=key)

In [None]:
%%time
# Fetch four scalar properties per material from the Materials Project and
# store them as new columns. Rows with no usable record keep NaN.
mp_fields = ['formation_energy_per_atom', 'e_above_hull', 'band_gap', 'density']
for field in mp_fields:
    data[field] = np.nan

# Pair every material_id with its own index label and write through `.at`.
# The previous `data[col][idx] = ...` form was chained assignment (pandas may
# write to a temporary copy) and treated the enumerate position as an index
# label, which is only right for a default RangeIndex.
for row_label, material_id in zip(data.index, data['material_id']):
    # Errors raised by the query itself still propagate, as they did before.
    records = mpr.get_data(material_id)
    if not records:
        # Nothing returned for this id: leave the NaN placeholders in place.
        continue
    entry = records[0]
    for field in mp_fields:
        # A missing or null field is skipped explicitly instead of being
        # hidden behind a bare `except: pass`.
        value = entry.get(field)
        if value is not None:
            data.at[row_label, field] = value

#### log of the volume per atom, log(V / N_atoms)

In [None]:
def compute_log_volume(x):
    """Return the natural log of the per-atom volume for one row.

    ``x`` must support ``x['volume']`` and ``x['composition']``; the
    composition object must expose ``num_atoms``.
    """
    total_volume = x['volume']
    atom_count = x['composition'].num_atoms
    volume_per_atom = total_volume / atom_count
    return np.log(volume_per_atom)

In [None]:
# Row-wise apply (axis=1): each row is passed to compute_log_volume and the
# result is stored in a new 'log volume per atom' column.
data['log volume per atom'] = data.apply(compute_log_volume, axis=1)

#### Voronoi-based average bond length, bond angles and mean AD and SD of composition features

In [None]:
# One SiteStatsFingerprint per "dejong2016" preset, combined into a single
# featurizer that is run on the 'structure' column.
ft = MultipleFeaturizer([
    SiteStatsFingerprint.from_preset(preset_name)
    for preset_name in (
        "Composition-dejong2016_AD",
        "Composition-dejong2016_SD",
        "BondLength-dejong2016",
        "BondAngle-dejong2016",
    )
])

data = ft.featurize_dataframe(data, col_id='structure')

#### Voronoi-based site coordination number

In [None]:
# Site coordination numbers from VoronoiNN (weight='area'), summarized per
# structure with Holder means for the integer powers -4..4 and two spread
# statistics.
ft = SiteStatsFingerprint(
    CoordinationNumber(nn=VoronoiNN(weight='area')),
    stats=['holder_mean::{}'.format(power) for power in range(-4, 5)]
          + ['std_dev', 'geom_std_dev'],
)

data = ft.featurize_dataframe(data, col_id='structure')

In [None]:
# Report the final shape of the featurized frame and preview its first row.
print ("FINAL SHAPE OF DATA: ", data.shape)
data.head(1)

Save data as pickle file

In [None]:
# Persist the featurized frame; the path is relative to the current working
# directory.
data.to_pickle('./dejong_featurized_data.pkl')