In [1]:
import numpy as np
import os
import pandas as pd

import gzip
import json

from matminer.datasets.convenience_loaders import load_dielectric_constant, load_elastic_tensor

from ElMD import ElMD
from ElMD import elmd

In [2]:
def getFile(data_path):
    '''
    Load a gzip-compressed JSON file and return the parsed object.
    Arguments:
    data_path: path to the .json.gz file
    '''
    # Read the whole decompressed payload as bytes, then decode it explicitly
    # as UTF-8 before handing the text to the JSON parser.
    with gzip.open(data_path, "rb") as handle:
        raw_bytes = handle.read()
        return json.loads(raw_bytes.decode('utf-8'))

In [3]:
## MatBench datasets
## Download the .json.gz files from MatBench (https://matbench.materialsproject.org/)
## and place them under ./matbench/ so that the relative paths below resolve.

# Each call returns the parsed JSON object; later cells read its 'data' list.
dielec = getFile("./matbench/matbench_dielectric.json.gz")
perov = getFile("./matbench/matbench_perovskites.json.gz")
gvrh = getFile("./matbench/matbench_log_gvrh.json.gz")
kvrh = getFile("./matbench/matbench_log_kvrh.json.gz")

In [5]:
# Number of entries in each MatBench dataset, in the order perov, gvrh, kvrh, dielec.
tuple(len(dataset['data']) for dataset in (perov, gvrh, kvrh, dielec))

(18928, 10987, 10987, 4764)

In [6]:
## Get Matminer datasets
## (loaded with the convenience loaders imported from matminer.datasets.convenience_loaders)
mat_dielec = load_dielectric_constant()
mat_elas = load_elastic_tensor()

In [7]:
# Row counts of the two Matminer datasets, in the order mat_dielec, mat_elas.
tuple(map(len, (mat_dielec, mat_elas)))

(1056, 1181)

In [8]:
from pymatgen.core.structure import Structure
import pandas as pd

In [9]:
def calcMagpie_matbench(f, targetname):
    '''
    Calculate Magpie features for datasets from Matbench.

    Every entry of f['data'] is read as [structure_dict, target]. The structure
    is rebuilt with Structure.from_dict, reduced to its reduced formula, and the
    formula is featurized with ElMD(formula, metric="magpie").feature_vector.
    Only one row is kept per reduced formula; formulas that cannot be
    featurized are dropped.

    Arguments:
    f: data; a mapping whose 'data' key holds the list of entries.
    targetname: which target to keep when the same formula occurs more than once:
        "high"   - keep the largest target
        "low"    - keep the smallest target
        "random" - keep the target of the first occurrence (no shuffling is done)

    Returns:
    A pandas DataFrame with a 'composition' column, a 'target' column and one
    integer-named column (1, 2, ...) per feature. The number of kept formulas
    is printed before returning.

    Raises:
    ValueError: if targetname is not "random", "high" or "low".
    '''
    # For each mode: should a repeated formula whose target is `new` be ignored
    # in favour of the target `kept` that is already stored?
    skip_rules = {
        "random": lambda new, kept: True,
        "high": lambda new, kept: new <= kept,
        "low": lambda new, kept: new >= kept,
    }
    if targetname not in skip_rules:
        raise ValueError(
            "targetname must be one of 'random', 'high' or 'low', got %r" % (targetname,)
        )
    should_skip = skip_rules[targetname]

    rows = {}                 # formula -> [target, feature_1, feature_2, ...]
    unfeaturizable = set()    # formulas for which ElMD raised; never retried

    for entry in f['data']:
        struct = Structure.from_dict(entry[0])
        targ = entry[1]
        comp = struct.composition.reduced_formula

        if comp in unfeaturizable:
            continue

        if comp in rows:
            # The features are computed from the formula alone, so a repeated
            # formula can only change the stored target, never the features.
            if not should_skip(targ, rows[comp][0]):
                rows[comp][0] = targ
            continue

        try:
            features = ElMD(comp, metric="magpie").feature_vector
        except Exception:
            # Best effort: drop formulas that cannot be featurized, but let
            # KeyboardInterrupt / SystemExit propagate.
            unfeaturizable.add(comp)
            continue

        rows[comp] = [targ, *features]

    print(len(rows))
    new_df = pd.DataFrame.from_dict(rows, orient='index').reset_index()
    new_df = new_df.rename(columns={"index":'composition', 0:'target'})
    return new_df

In [10]:
dielec_feature = calcMagpie_matbench(dielec, "high")   ## keeps the highest target per formula; prints 3992 (kept formulas)

3992


In [11]:
# Preview the featurized table: 'composition', 'target', then the numbered feature columns.
dielec_feature

Unnamed: 0,composition,target,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,19,20,21
0,KS,1.752064,45.500000,35.581650,362.445000,8.500000,3.500000,154.000000,1.700000,1.500000,...,3.500000,0.500000,1.000000,0.000000,0.0,1.500000,49.446771,1.101000,0.000000,149.500000
1,K3VO4,1.652859,39.875000,31.916612,461.690000,7.125000,3.250000,145.375000,1.903750,1.500000,...,3.375000,0.500000,0.750000,0.875000,0.0,2.125000,41.593958,0.000000,0.000000,147.625000
2,Rb2ZrO3,1.867858,58.833333,52.652333,788.810000,9.500000,3.500000,128.000000,2.300000,1.833333,...,4.500000,0.166667,1.000000,2.666667,0.0,3.833333,27.404583,0.000000,0.000000,108.833333
3,MnOF,2.676887,77.333333,29.978616,542.433333,13.333333,2.666667,87.333333,2.990000,2.000000,...,6.666667,0.000000,1.000000,1.666667,0.0,2.666667,9.766695,0.656667,0.000103,81.333333
4,Li2CoSiO4,1.793232,49.875000,36.712460,1215.147500,8.500000,3.125000,117.125000,1.852500,1.750000,...,6.000000,0.250000,0.750000,1.500000,0.0,2.500000,12.963958,0.096625,0.774236,184.125000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3987,Cr2O5,2.458294,76.142857,26.284171,662.000000,13.142857,2.571429,86.857143,2.931429,1.714286,...,6.000000,0.285714,1.428571,1.428571,0.0,3.142857,9.700714,0.000000,0.000000,74.000000
3988,Ca2FeWO6,2.136837,69.000000,43.160440,876.080000,12.000000,3.000000,99.800000,2.766000,2.000000,...,7.400000,0.000000,1.200000,1.400000,0.0,2.600000,12.991000,0.000000,0.422133,98.400000
3989,La4MnS6O,2.690619,44.916667,77.776212,1194.096667,7.166667,4.416667,152.750000,1.643333,2.000000,...,5.500000,0.000000,0.333333,5.500000,0.0,5.833333,20.450616,0.183500,0.000155,180.000000
3990,BaAg2GeSe4,2.811494,68.000000,90.322925,1101.182500,12.000000,4.500000,138.125000,1.917500,1.750000,...,12.000000,0.250000,2.250000,0.000000,0.0,2.500000,26.773750,0.291375,0.000000,199.125000


In [12]:
# The trailing numbers are the kept-formula counts printed by each call.
# "low" keeps the smallest target per formula; "random" keeps the first occurrence.
perov_feature = calcMagpie_matbench(perov, "low")   ## 9646
gvrh_feature = calcMagpie_matbench(gvrh, "random")  ## 9723
kvrh_feature = calcMagpie_matbench(kvrh, "random")   ## 9723

9646
9723
9723


In [13]:
def calcMagpie_matminer(df, targetname):
    '''
    Calculate Magpie features for datasets from Matminer.

    Each row's 'formula' is featurized with
    ElMD(formula, metric="magpie").feature_vector. Only one row is kept per
    formula: when a formula occurs more than once, the largest target wins.
    Formulas that cannot be featurized are dropped.

    Arguments:
    df: data; a DataFrame with a 'formula' column and a column named targetname.
    targetname: The target name we choose (name of the column used as target).

    Returns:
    A pandas DataFrame with a 'composition' column, a 'target' column and one
    integer-named column (1, 2, ...) per feature. The number of kept formulas
    is printed before returning.
    '''
    rows = {}                 # formula -> [target, feature_1, feature_2, ...]
    unfeaturizable = set()    # formulas for which ElMD raised; never retried

    # Walk the two columns in lockstep instead of materializing a row Series
    # per index with df.iloc[i].
    for comp, targ in zip(df["formula"], df[targetname]):
        if comp in unfeaturizable:
            continue

        if comp in rows:
            # The features are computed from the formula alone, so a repeated
            # formula can only raise the stored target, never change the features.
            if not (targ <= rows[comp][0]):
                rows[comp][0] = targ
            continue

        try:
            features = ElMD(comp, metric="magpie").feature_vector
        except Exception:
            # Best effort: drop formulas that cannot be featurized, but let
            # KeyboardInterrupt / SystemExit propagate.
            unfeaturizable.add(comp)
            continue

        rows[comp] = [targ, *features]

    print(len(rows))
    new_df = pd.DataFrame.from_dict(rows, orient='index').reset_index()
    new_df = new_df.rename(columns={"index":'composition', 0:'target'})
    return new_df

In [14]:
# Featurize the Matminer tables; the target columns are 'band_gap' and
# 'elastic_anisotropy', and the largest target is kept per repeated formula.
mat_dielec_feature = calcMagpie_matminer(mat_dielec, "band_gap")   
mat_elas_feature = calcMagpie_matminer(mat_elas, "elastic_anisotropy") 

964
1087


In [15]:
# Preview the featurized Matminer table: 'composition', 'target', then the numbered feature columns.
mat_dielec_feature

Unnamed: 0,composition,target,1,2,3,4,5,6,7,8,...,12,13,14,15,16,17,18,19,20,21
0,Rb2Te,1.88,32.666667,99.511867,449.193333,6.000000,5.000000,192.666667,1.246667,1.333333,...,6.000000,0.666667,0.666667,0.000000,0.0,1.333333,72.069444,0.154667,0.000000,203.333333
1,CdCl2,3.52,78.000000,86.758333,453.346667,13.666667,4.333333,130.000000,2.180000,2.000000,...,10.333333,0.000000,0.333333,0.000000,0.0,0.333333,21.162500,0.831000,0.000000,150.666667
2,MnI2,1.17,81.333333,102.915662,764.233333,13.666667,4.666667,139.000000,2.290000,2.000000,...,13.666667,0.000000,0.666667,1.666667,0.0,2.333333,32.172529,0.708000,0.000103,115.000000
3,LaN,1.12,47.500000,76.456085,628.025000,9.000000,4.000000,139.000000,2.070000,2.000000,...,4.000000,0.000000,1.500000,4.500000,0.0,6.000000,25.833125,3.218500,0.000000,194.000000
4,MnF2,2.87,79.333333,30.978284,542.000000,13.666667,2.666667,84.333333,3.170000,2.000000,...,7.000000,0.000000,0.666667,1.666667,0.0,2.333333,9.967529,1.313333,0.000103,82.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
959,K2NiF6,2.36,75.888889,30.052835,457.058889,13.666667,2.666667,88.111111,3.168889,1.888889,...,7.000000,0.111111,0.666667,0.444444,0.0,1.222222,16.887963,1.313333,0.132310,85.444444
960,LaHBr2,3.60,73.000000,55.206338,371.705000,5.500000,3.000000,97.250000,2.115000,1.500000,...,5.500000,0.500000,0.250000,2.250000,0.0,3.000000,19.901875,4.290750,0.000000,161.500000
961,Li2AgSb,0.14,59.000000,89.582300,874.045000,10.500000,4.250000,137.750000,1.752500,1.500000,...,10.500000,0.500000,1.500000,0.000000,0.0,2.000000,24.010833,0.000000,0.000000,196.500000
962,Rb3AuO,0.21,66.200000,66.086514,362.838000,12.000000,3.400000,110.800000,2.736000,1.600000,...,8.800000,0.400000,1.200000,0.000000,0.0,1.600000,26.947500,0.000000,0.000000,98.000000


In [16]:
# Write every feature table to CSV in the working directory.
# index=False is the documented way to omit the row index; the previous
# index=None only had the same effect because None is falsy.
perov_feature.to_csv("./perov_features.csv", index=False)
kvrh_feature.to_csv("./kvrh_features.csv", index=False)
gvrh_feature.to_csv("./gvrh_features.csv", index=False)
dielec_feature.to_csv("./dielectric_feature.csv", index=False)

mat_dielec_feature.to_csv("./mat_dielectric_feature.csv", index=False)
mat_elas_feature.to_csv("./mat_elast_feature.csv", index=False)