In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from mp_api.client import MPRester
from emmet.core.summary import HasProps
import pymatgen.core
from pymatgen.analysis.local_env import VoronoiNN
from matminer.utils.caching import get_all_nearest_neighbors
from matminer.featurizers.utils.stats import PropertyStats
from matminer.featurizers.structure.order import DensityFeatures, ChemicalOrdering, StructuralComplexity
from matminer.featurizers.structure.bonding import BondFractions, GlobalInstabilityIndex, StructuralHeterogeneity
from matminer.featurizers.structure.matrix import SineCoulombMatrix
from matminer.featurizers.structure.misc import EwaldEnergy, XRDPowderPattern
from matminer.featurizers.structure.rdf import RadialDistributionFunction, PartialRadialDistributionFunction
from matminer.featurizers.structure.composite import JarvisCFID

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
data = pd.read_csv('data.csv')
data

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,material_id,elements,formula_pretty_reduced,formula,num_elements,chemsys,spacegroup_symbol,spacegroup_number,...,980,990,0.2,0.5,0.6,1e-05,0.0001,0.001,0.01,σ(RT)(S cm-1)
0,0,0,mp-8892,"['Li', 'F', 'In']",LiInF4,Li4 In4 F16,3,F-In-Li,Pbcn,60,...,9.653824,9.657305,9.927689,9.927689,9.927689,0.993939,0.981707,0.962025,0.788732,
1,1,1,mp-772968,"['Li', 'Tl', 'P', 'H', 'O']",LiTlPHO3,Li4 Tl4 P4 H4 O12,5,H-Li-O-P-Tl,C2,5,...,9.865593,9.873043,11.140249,14.066884,14.066884,1.000000,0.925000,0.725000,0.555556,
2,2,2,mp-756951,"['Li', 'Sb', 'O']",LiSbO3,Li4 Sb4 O12,3,Li-O-Sb,C2/m,12,...,7.551016,7.553322,5.543978,5.543978,6.903721,1.000000,0.993865,0.906832,0.560606,
3,3,3,mp-754856,"['Li', 'Y', 'Zr', 'S']",Li7Y7ZrS16,Li7 Y7 Zr1 S16,4,Li-S-Y-Zr,P2/m,10,...,5.753538,5.754215,4.319241,5.047983,5.047983,0.987952,0.962963,0.921569,0.735294,
4,4,4,mp-24610,"['H', 'Li', 'O', 'P']",LiP(HO2)2,Li4 P4 H8 O16,4,H-Li-O-P,Pna2_1,33,...,20.776278,20.872825,29.153401,32.155099,32.155099,1.000000,0.988506,0.941860,0.537500,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
868,868,868,mp-9919,"['Li', 'Zn', 'Sb']",LiZnSb,Li2 Zn2 Sb2,3,Li-Sb-Zn,P6_3mc,186,...,5.701444,5.702351,4.292991,7.108637,7.108637,0.987578,0.968750,0.876623,0.664286,
869,869,869,mp-14704,"['Li', 'B', 'O', 'Y']",Li6Y(BO3)3,Li24 Y4 B12 O36,4,B-Li-O-Y,P2_1/c,14,...,8.689684,8.696529,5.979421,11.272210,11.272210,1.000000,0.982143,0.812500,0.742574,
870,870,870,mp-557756,"['Li', 'Zn', 'P', 'O']",Li4Zn(PO4)2,Li16 Zn4 P8 O32,4,Li-O-P-Zn,P2_1/c,14,...,10.007699,10.013265,8.758697,8.758697,10.855064,1.000000,0.967480,0.811475,0.700935,1.000000e-10
871,871,871,mp-966801,"['Li', 'Ca', 'B', 'O']",LiCaBO3,Li8 Ca8 B8 O24,4,B-Ca-Li-O,Ibca,73,...,8.888549,8.891707,7.424524,10.040524,10.040524,0.989011,0.933333,0.771084,0.735294,


In [6]:
df = data[['material_id', 'formula_pretty_reduced']]

In [7]:
mat_id = df['material_id'].tolist() 

In [8]:
len(mat_id)

873

In [9]:
MP_API_KEY = #type your key

with MPRester(MP_API_KEY) as mpr:
    oxi_docs = mpr.provenance.get_data_by_id(material_ids=mat_id,
                                                     fields=['material_id','structure','formula_pretty','possible_species'])

Retrieving OxidationStateDoc documents:   0%|          | 0/867 [00:00<?, ?it/s]

In [10]:
oxi_mat_ids = [doc.material_id for doc in oxi_docs]

missing_id = set(mat_id)-set(oxi_mat_ids)

print(missing_id)

{'mp-867699', 'mp-867712', 'mp-613442', 'mp-561623', 'mp-867677', 'mp-867700'}


In [11]:
mat_id_updt = mat_id
for mat in missing_id:
    mat_id_updt.remove(mat)
    
len(mat_id_updt)

867

In [12]:
df_s = pd.DataFrame()

for i, doc in enumerate(oxi_docs):
    df_s.loc[i, 'material_id'] = doc.material_id
    df_s.loc[i, 'formula_pretty_reduced'] = doc.formula_pretty 
    
df_s

Unnamed: 0,material_id,formula_pretty_reduced
0,mp-10103,LiYbAlF6
1,mp-10517,LiScP2O7
2,mp-10555,Ba2LiReN4
3,mp-12263,LiAuF4
4,mp-14476,LiRhO2
...,...,...
862,mp-8430,KLiS
863,mp-8754,NaLiTe
864,mp-8870,LiMgAsO4
865,mp-9018,LiCdPO4


In [13]:
no_oxi = ['Li(BH)5', 'K2LiBO3', 'K4LiIO6', 'LiSiB6', 'LiB6C', 'LiH6BrO7', 'LiAlB14']
structures = []

for i, doc in enumerate(tqdm(oxi_docs)):
    s = doc.structure
    if doc.formula_pretty in no_oxi:
        s.add_oxidation_state_by_guess()
    structures.append(s)

100%|██████████| 867/867 [00:00<00:00, 2158.13it/s]


In [10]:
densityf = DensityFeatures()
densityf.feature_labels()

[]


['density', 'vpa', 'packing fraction']

In [15]:
s = structures[0]

densityf.featurize(s)

[5.047785181171902, 11.731326068744151, 0.4408156729864949]

In [16]:
for i, s in enumerate(tqdm(structures)):
    features = densityf.featurize(s)
    for j, label in enumerate(densityf.feature_labels()):
        df_s.loc[i, label] = features[j]
        
df_s

100%|██████████| 867/867 [00:02<00:00, 317.62it/s]


Unnamed: 0,material_id,formula_pretty_reduced,density,vpa,packing fraction
0,mp-10103,LiYbAlF6,5.047785,11.731326,0.440816
1,mp-10517,LiScP2O7,2.606895,13.077787,0.310300
2,mp-10555,Ba2LiReN4,6.095159,17.838710,0.777361
3,mp-12263,LiAuF4,6.143331,12.609524,0.332691
4,mp-14476,LiRhO2,6.201424,9.495387,0.655202
...,...,...,...,...,...
862,mp-8430,KLiS,2.009905,21.509345,0.954021
863,mp-8754,NaLiTe,3.580874,24.350290,0.666564
864,mp-8870,LiMgAsO4,3.569512,11.308706,0.466101
865,mp-9018,LiCdPO4,4.199445,12.106782,0.426874


In [2]:
chemordering = ChemicalOrdering()
complexity = StructuralComplexity()
heterogeneity = StructuralHeterogeneity()
coulomb = SineCoulombMatrix()
ewald = EwaldEnergy()
xrd = XRDPowderPattern()
rdf = RadialDistributionFunction()
prdf = PartialRadialDistributionFunction(include_elems=['Li',])

In [9]:
featurizers = [chemordering, complexity, heterogeneity, coulomb, ewald, xrd, rdf, prdf]

for i, f in enumerate(featurizers):
    print(i)
    print(f.citations())

0
['@article{Ward2017,author = {Ward, Logan and Liu, Ruoqian and Krishna, Amar and Hegde, Vinay I. and Agrawal, Ankit and Choudhary, Alok and Wolverton, Chris},doi = {10.1103/PhysRevB.96.024104},journal = {Physical Review B},pages = {024104},title = {{Including crystal structure attributes in machine learning models of formation energies via Voronoi tessellations}},url = {http://link.aps.org/doi/10.1103/PhysRevB.96.024104},volume = {96},year = {2017}}']
1
['@article{complexity2013,author = {Krivovichev, S. V.},title = {Structural complexity of minerals: information storage and processing in the mineral world},journal = {Mineral. Mag.},volume = {77},number = {3},pages = {275-326},year = {2013},month = {04},issn = {0026-461X},doi = {10.1180/minmag.2013.077.3.05},url = {https://doi.org/10.1180/minmag.2013.077.3.05}}']
2
['@article{Ward2017,author = {Ward, Logan and Liu, Ruoqian and Krishna, Amar and Hegde, Vinay I. and Agrawal, Ankit and Choudhary, Alok and Wolverton, Chris},doi = {10.110

In [19]:
for f in featurizers:
    print(f)
    if f == chemordering or f == heterogeneity:
        for i, s in enumerate(tqdm(structures)):
            s.perturb(0.001)
            features = f.featurize(s)
            for j, label in enumerate(f.feature_labels()):
                df_s.loc[i, label] = features[j] 
    elif f == coulomb or f == rdf:
        f.fit(structures)
        for i, s in enumerate(tqdm(structures)):
            features = f.featurize(s)
            for j, label in enumerate(f.feature_labels()):
                df_s.loc[i, label] = features[j]
    else: 
        for i, s in enumerate(tqdm(structures)):
            features = f.featurize(s)
            for j, label in enumerate(f.feature_labels()):
                df_s.loc[i, label] = features[j]

df_s

ChemicalOrdering()


100%|██████████| 867/867 [22:09<00:00,  1.53s/it] 


StructuralComplexity()


100%|██████████| 867/867 [00:06<00:00, 138.98it/s]


StructuralHeterogeneity()


100%|██████████| 867/867 [15:06<00:00,  1.05s/it] 


SineCoulombMatrix()


100%|██████████| 867/867 [00:20<00:00, 42.99it/s]


EwaldEnergy()


100%|██████████| 867/867 [00:13<00:00, 64.50it/s] 


XRDPowderPattern(pattern_length=128)


100%|██████████| 867/867 [03:18<00:00,  4.38it/s]


RadialDistributionFunction()


100%|██████████| 867/867 [05:49<00:00,  2.48it/s]


PartialRadialDistributionFunction(exclude_elems=[], include_elems=['Li'])


  0%|          | 0/867 [00:00<?, ?it/s]


[0;31m---------------------------------------------------------------------------[0m
[0;31mException[0m                                 Traceback (most recent call last)
[0;32m<ipython-input-19-1d4392e780d4>[0m in [0;36m<module>[0;34m[0m
[1;32m     17[0m     [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m     18[0m         [0;32mfor[0m [0mi[0m[0;34m,[0m [0ms[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mtqdm[0m[0;34m([0m[0mstructures[0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 19[0;31m             [0mfeatures[0m [0;34m=[0m [0mf[0m[0;34m.[0m[0mfeaturize[0m[0;34m([0m[0ms[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     20[0m             [0;32mfor[0m [0mj[0m[0;34m,[0m [0mlabel[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mf[0m[0;34m.[0m[0mfeature_labels[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m     21[0m                 [0mdf_s[

In [None]:
df_s

In [None]:
df_ordered = pd.DataFrame(columns=df_s.columns)

for i, mat in enumerate(tqdm(mat_id_updt)):
    row = df_s.loc[df_s['material_id'] == mat].to_dict()
    for col in df_ordered.columns:
        val = list(row[col].values())[0]
        df_ordered.loc[i, col] = val
    
df_ordered

In [None]:
df_ordered.to_csv('structure_feature2_mac.csv')