In [1]:
import re
from matminer.featurizers.composition import ElementProperty
from matminer.featurizers.structure import DensityFeatures
from pymatgen.core.composition import Composition
from pymatgen.core.structure import Structure
import pandas as pd
from matminer.datasets import load_dataset
import os
import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
import sys

In [3]:
from matminer.featurizers.base import MultipleFeaturizer

# compositional featurizers
from matminer.featurizers.composition import (
    ## composite
    ElementProperty,
    ## element
    Stoichiometry,
    ## ion
    IonProperty, ElectronAffinity,
    ## orbital
    AtomicOrbitals, ValenceOrbital
)

#structural featurizers
from matminer.featurizers.structure import (
    # bonding
    StructuralHeterogeneity,
    # order
    DensityFeatures,
)

In [4]:
dataset_path = "matbench_mp_gap_raw.data"

def load_data():
    if not os.path.exists(dataset_path):
        print("Loading data")
        data = load_dataset("matbench_mp_gap")
        print("Cleaning data")
        data = clean_data(data)
        print(data.head())
        with open(dataset_path, "wb") as f:
            print("Saving data")
            pickle.dump(data, f)
    else:
        with open(dataset_path, "rb") as f:
            data = pickle.load(f)
            
    data['composition'] = data['structure'].apply(lambda struct : struct.composition)
    data['formula'] = data['structure'].apply(lambda x: x.composition.reduced_formula)
    return data

def clean_data(data):
    data = data.dropna()
    data = data.drop_duplicates(subset=['formula'])
    data = data.reset_index(drop=True)
    return data


In [11]:
# compositional featurizers
element_featurizer = ElementProperty.from_preset("magpie", impute_nan=True)
stoich_featurizer = Stoichiometry()
ion_prop_featurizer = IonProperty(impute_nan=True)
# e_affinity_featurizer = ElectronAffinity()
atomic_orb_featurizer = AtomicOrbitals()
valence_orb_featurizer = ValenceOrbital(impute_nan=True)
# structural
struct_het_featurizer = StructuralHeterogeneity()
density_featurizer = DensityFeatures()
# chemical_featurizer = ChemicalSRO(nn=6).fit(data["structure"])


structural_featurizer = MultipleFeaturizer([
    density_featurizer,
    # struct_het_featurizer,
])

compositional_featurizer = MultipleFeaturizer([
    stoich_featurizer,
    ion_prop_featurizer,
    # e_affinity_featurizer,
    # atomic_orb_featurizer,
    valence_orb_featurizer,
    element_featurizer,
])

def featurize_data(d, size=None):

    indices = np.random.choice(d.index, size, replace=False)

    if size is not None:
        data = d.loc[indices]
    else:
        data = d
    if not os.path.exists("struct.features"):
        print("Getting structural features...")
        struct_features = structural_featurizer.featurize_dataframe(data, col_id="structure", ignore_errors=True, inplace=False)
        struct_features = struct_features.loc[:, structural_featurizer.feature_labels()]
        with open("struct.features", "wb") as f:
            pickle.dump(struct_features, f)
    else:
        with open("struct.features", "rb") as f:
            struct_features = pickle.load(f)

            if size is not None:
                struct_features = struct_features.loc[indices]


    if not os.path.exists("comp.features"):
        print("Getting compositional features...")
        comp_features = compositional_featurizer.featurize_dataframe(data, col_id="composition", ignore_errors=True, inplace=False)
        comp_features = comp_features.loc[:, compositional_featurizer.feature_labels()]
        with open("comp.features", "wb") as f:
            pickle.dump(comp_features, f)
    else:
        with open("comp.features") as f:
            comp_features = pickle.load(f)

            if size is not None:
                comp_features = comp_features.loc[indices]
    
    df_featurized = pd.concat([data, struct_features, comp_features], axis = 1)
        
    return df_featurized


In [6]:
featurized_path = "matbench_mp_gap_featurized.data"

def load_featurized(d=None, size=None):
    

    if os.path.exists(featurized_path):
        print("Loading featurized df...")
        with open(featurized_path, "rb") as f:
            df_featurized = pickle.load(f)
            return df_featurized
    else:
        data = d
        if data is None:
            data = load_data()

        df_featurized = featurize_data(data, size)

        with open(featurized_path, "wb") as f:
            pickle.dump(df_featurized, f)
            
        return df_featurized
            

In [8]:
data = load_data()

In [9]:
data.head()

Unnamed: 0,structure,gap pbe,formula,composition
0,"[[-0.00812638 0.02476014 -0.01698117] K, [-0....",1.3322,KMnO2,"(K, Mn, O)"
1,"[[0. 1.78463544 1.78463544] Cr, [1.784...",0.0,Cr3Ni,"(Cr, Ni)"
2,"[[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...",0.0,CsRbAs,"(Cs, Rb, As)"
3,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",0.4113,SiSn,"(Si, Sn)"
4,"[[0. 2.655 2.655] Ca, [2.655 0. 2.655] C...",0.3514,Ca3P2,"(Ca, P)"


In [12]:
df_featurized = load_featurized(d=data, size=2500)

Getting compositional features...


MultipleFeaturizer:   0%|          | 0/2500 [00:00<?, ?it/s]

MultipleFeaturizer:  42%|████▏     | 1059/2500 [01:55<02:37,  9.17it/s]


KeyboardInterrupt: 

In [None]:
df_featurized

Unnamed: 0,structure,gap pbe,formula,composition,density,vpa,packing fraction,0-norm,2-norm,3-norm,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
9399,[[7.53704279e+00 4.04661400e-03 2.73521324e-03...,0.0,Cu4H10SO12,"(Cu, H, S, O)",3.046597,9.857706,0.213875,4,0.598352,0.521587,...,0.0,0.0,0.0,0.0,12.0,225.0,213.0,113.111111,93.069959,12.0
3299,"[[2.11442656 2.11442656 8.41484204] Ba, [4.228...",0.0,Ba4In2O7,"(Ba, In, O)",5.694467,19.984856,0.785401,3,0.638971,0.573772,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,98.307692,92.946746,12.0
50923,"[[ 3.95355535 -4.00325957 -4.00325957] Ta, [-0...",2.5102,TaTl3(SeS)2,"(Ta, Tl, Se, S)",6.668227,31.630467,0.474549,4,0.53033,0.441294,...,0.0,0.0,0.0,0.0,14.0,229.0,215.0,122.375,80.375,194.0
58228,"[[3.43133549 3.43133549 3.43133549] Dy, [0. 0....",0.0,Dy2CuOs,"(Dy, Cu, Os)",11.894332,20.200381,0.797106,3,0.612372,0.538609,...,0.0,0.0,0.0,0.0,194.0,225.0,31.0,201.75,11.625,194.0
56313,"[[3.20459519 0. 0. ] Er, [0. ...",0.0,ErGa3Os,"(Er, Ga, Os)",10.72215,17.551662,0.675268,3,0.663325,0.614463,...,0.0,0.0,0.0,0.0,64.0,194.0,130.0,116.0,62.4,64.0
3775,"[[ 1.45377051 3.20853614 -1.27841923] Cu, [1....",0.0,Cu3As4,"(Cu, As)",6.356695,18.297982,0.440333,2,0.714286,0.642563,...,0.0,0.0,0.0,0.0,166.0,225.0,59.0,191.285714,28.897959,166.0
20883,"[[0.91760361 0.79540381 7.57992721] Rb, [4.810...",1.9515,RbUC3O8,"(Rb, U, C, O)",3.847137,16.186916,0.419899,4,0.666173,0.62679,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,74.615385,78.852071,12.0
44036,"[[ 4.14728371 -2.3944353 0.50349105] Cr, [ 4...",0.0,Cr3Ni(SO4)6,"(Cr, Ni, S, O)",2.965658,13.02738,0.206885,4,0.733527,0.710011,...,0.595395,0.017512,0.033993,0.0,12.0,229.0,217.0,47.647059,50.32526,12.0
75725,"[[0. 0. 7.22181027] Nb, [0. ...",2.8553,NbAgF6,"(Nb, Ag, F)",4.138737,15.786206,0.26185,3,0.770552,0.752308,...,0.0,0.0,0.0,0.0,15.0,229.0,214.0,68.0,79.5,15.0
69362,"[[-2.9750674 5.16570023 11.21410894] Ba, [-0...",3.161,Ba3CaSb2O9,"(Ba, Ca, Sb, O)",5.75787,16.141903,0.755803,4,0.649786,0.609718,...,0.0,0.0,0.0,0.0,12.0,229.0,217.0,90.133333,93.76,12.0


In [11]:
print(df_featurized.shape)
display(df_featurized.head())
features = list(df_featurized.select_dtypes(include=[np.number]))
features.remove('gap pbe')
print(features)

(78164, 139)


Unnamed: 0,structure,gap pbe,formula,composition,density,vpa,packing fraction,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,"[[-0.00812638 0.02476014 -0.01698117] K, [-0....",1.3322,KMnO2,"(K, Mn, O)",3.781313,13.836878,1.046223,8.0,25.0,17.0,...,0.00031,7.8e-05,0.000116,0.0,12.0,229.0,217.0,117.5,105.5,12.0
1,"[[0. 1.78463544 1.78463544] Cr, [1.784...",0.0,Cr3Ni,"(Cr, Ni)",7.839811,11.367855,0.984973,24.0,28.0,4.0,...,0.595395,0.148849,0.223273,0.0,225.0,229.0,4.0,228.0,1.5,229.0
2,"[[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...",0.0,CsRbAs,"(Cs, Rb, As)",3.119812,52.035996,0.86065,33.0,55.0,22.0,...,0.0,0.0,0.0,0.0,166.0,229.0,63.0,208.0,28.0,166.0
3,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",0.4113,SiSn,"(Si, Sn)",4.361248,27.946089,0.328227,14.0,50.0,36.0,...,0.0,0.0,0.0,0.0,141.0,227.0,86.0,184.0,43.0,141.0
4,"[[0. 2.655 2.655] Ca, [2.655 0. 2.655] C...",0.3514,Ca3P2,"(Ca, P)",2.020551,29.944258,0.545444,15.0,20.0,5.0,...,0.0,0.0,0.0,0.0,2.0,225.0,223.0,135.8,107.04,225.0


['density', 'vpa', 'packing fraction', 'MagpieData minimum Number', 'MagpieData maximum Number', 'MagpieData range Number', 'MagpieData mean Number', 'MagpieData avg_dev Number', 'MagpieData mode Number', 'MagpieData minimum MendeleevNumber', 'MagpieData maximum MendeleevNumber', 'MagpieData range MendeleevNumber', 'MagpieData mean MendeleevNumber', 'MagpieData avg_dev MendeleevNumber', 'MagpieData mode MendeleevNumber', 'MagpieData minimum AtomicWeight', 'MagpieData maximum AtomicWeight', 'MagpieData range AtomicWeight', 'MagpieData mean AtomicWeight', 'MagpieData avg_dev AtomicWeight', 'MagpieData mode AtomicWeight', 'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT', 'MagpieData range MeltingT', 'MagpieData mean MeltingT', 'MagpieData avg_dev MeltingT', 'MagpieData mode MeltingT', 'MagpieData minimum Column', 'MagpieData maximum Column', 'MagpieData range Column', 'MagpieData mean Column', 'MagpieData avg_dev Column', 'MagpieData mode Column', 'MagpieData minimum Row', 'Ma

In [26]:
[[i, row.structure, row.composition] for i, row in df_featurized.iterrows()]

[[0, 1, 1],
 [1, 1, 1],
 [2, 1, 1],
 [3, 1, 1],
 [4, 1, 1],
 [5, 1, 1],
 [6, 1, 1],
 [7, 1, 1],
 [8, 1, 1],
 [9, 1, 1],
 [10, 1, 1],
 [11, 1, 1],
 [12, 1, 1],
 [13, 1, 1],
 [14, 1, 1],
 [15, 1, 1],
 [16, 1, 1],
 [17, 1, 1],
 [18, 1, 1],
 [19, 1, 1],
 [20, 1, 1],
 [21, 1, 1],
 [22, 1, 1],
 [23, 1, 1],
 [24, 1, 1],
 [25, 1, 1],
 [26, 1, 1],
 [27, 1, 1],
 [28, 1, 1],
 [29, 1, 1],
 [30, 1, 1],
 [31, 1, 1],
 [32, 1, 1],
 [33, 1, 1],
 [34, 1, 1],
 [35, 1, 1],
 [36, 1, 1],
 [37, 1, 1],
 [38, 1, 1],
 [39, 1, 1],
 [40, 1, 1],
 [41, 1, 1],
 [42, 1, 1],
 [43, 1, 1],
 [44, 1, 1],
 [45, 1, 1],
 [46, 1, 1],
 [47, 1, 1],
 [48, 1, 1],
 [49, 1, 1],
 [50, 1, 1],
 [51, 1, 1],
 [52, 1, 1],
 [53, 1, 1],
 [54, 1, 1],
 [55, 1, 1],
 [56, 1, 1],
 [57, 1, 1],
 [58, 1, 1],
 [59, 1, 1],
 [60, 1, 1],
 [61, 1, 1],
 [62, 1, 1],
 [63, 1, 1],
 [64, 1, 1],
 [65, 1, 1],
 [66, 1, 1],
 [67, 1, 1],
 [68, 1, 1],
 [69, 1, 1],
 [70, 1, 1],
 [71, 1, 1],
 [72, 1, 1],
 [73, 1, 1],
 [74, 1, 1],
 [75, 1, 1],
 [76, 1, 1],
 [77, 1, 

In [23]:
from matminer.featurizers.structure import StructuralHeterogeneity
from matminer.featurizers.composition import IonProperty

test = IonProperty().featurize(df_featurized.composition[0])
print(test, IonProperty().feature_labels())

[True, np.float64(0.8202341305415322), np.float64(0.18414808339314626)] ['compound possible', 'max ionic char', 'avg ionic char']


In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
           

In [34]:
#drop features that dont vary
const_cols = [col for col in features if df_featurized[col].nunique() == 1] 
df_featurized.drop(const_cols)
# standardize the non-target numeric features
scaler = StandardScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(df_featurized[features]))
df_scaled = df_featurized.copy()
df_scaled[features] = features_scaled


In [None]:
# format the spearman correlation matrix to better display the labels
spearman_corr = df_scaled[features].corr(method='spearman')
old_index = spearman_corr.index
labels = []
prev_feature = None

for name in spearman_corr.columns:
    if "Magpie" in name:
        stat, feature = name.split(" ")[1:]
        if feature != prev_feature:  # Display feature name only for the first occurrence
            labels.append(f'{feature} - {stat}')
            prev_feature = feature
        else:
            labels.append(f'{stat}') 
    else:
        labels.append(name)
        
spearman_corr.columns = labels
spearman_corr.index = labels

plt.figure(figsize=(40, 40)) 
sns.heatmap(spearman_corr, 
            annot=False, 
            fmt=".2f", 
            cmap='coolwarm', 
            square=True, 
            linewidths=.5)
plt.title('Correlation Heatmap')
plt.show()

spearman_corr.index = old_index
spearman_corr.columns = old_index