In [1]:
import re
from matminer.featurizers.composition import ElementProperty
from matminer.featurizers.structure import DensityFeatures
from pymatgen.core.composition import Composition
from pymatgen.core.structure import Structure
import pandas as pd
from matminer.datasets import load_dataset
import os
import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import subprocess
import sys

In [3]:
dataset_path = "matbench_mp_gap_raw.data"

def load_data():
    """Make sure the cleaned matbench_mp_gap pickle exists at ``dataset_path``.

    When the file is already present nothing happens. Otherwise the dataset
    is fetched with ``load_dataset``, passed through ``clean_data``, previewed
    on stdout and pickled to ``dataset_path``. The function always returns
    ``None``; callers read the pickle themselves.
    """
    # Cache hit: the pickle is already on disk, nothing to do.
    if os.path.exists(dataset_path):
        return

    print("Loading data")
    raw = load_dataset("matbench_mp_gap")
    print("Cleaning data")
    cleaned = clean_data(raw)
    print(cleaned.head())
    with open(dataset_path, "wb") as out_file:
        print("Saving data")
        pickle.dump(cleaned, out_file)
            

def clean_data(data):
    """Drop incomplete rows, add a reduced-formula column and de-duplicate on it.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``structure`` column whose values expose
        ``.composition.reduced_formula``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) with rows containing missing
        values removed, a ``formula`` column added, only the first row kept
        for each distinct formula, and a fresh 0..n-1 index.
    """
    complete = data.dropna()
    # assign() builds a new frame instead of writing a column into the
    # dropna() result, which avoids pandas' SettingWithCopyWarning.
    with_formula = complete.assign(
        formula=complete['structure'].apply(lambda s: s.composition.reduced_formula)
    )
    # drop_duplicates keeps the first occurrence of each formula.
    return with_formula.drop_duplicates(subset=['formula']).reset_index(drop=True)


In [4]:
featurized_path = "matbench_mp_gap_featurized.data"

def load_featurized():
    """Return the pickled featurized frame, generating it first if necessary.

    If ``featurized_path`` does not exist, the raw dataset is prepared via
    ``load_data()`` and ``main.py`` is run with the current interpreter
    (presumably it writes ``featurized_path`` -- verify against main.py).

    Returns
    -------
    object or None
        The object unpickled from ``featurized_path``, or ``None`` when the
        file still does not exist after the generation attempt.
    """
    if not os.path.exists(featurized_path):
        load_data()
        result = subprocess.run([sys.executable, 'main.py'], capture_output=True, text=True)
        # Output is captured, so a failing script would otherwise vanish
        # silently; report it but keep the best-effort "return None" contract.
        if result.returncode != 0:
            print(f"main.py exited with code {result.returncode}", file=sys.stderr)
            if result.stderr:
                print(result.stderr, file=sys.stderr)

    if not os.path.exists(featurized_path):
        return None

    print("Loading featurized df...")
    with open(featurized_path, "rb") as f:
        # NOTE: pickle.load executes arbitrary code; only use on files this
        # project produced itself.
        return pickle.load(f)

In [5]:
# Ensure the cleaned raw dataset pickle exists on disk (fetches and cleans it on first run).
load_data()

Loading data
Fetching matbench_mp_gap.json.gz from https://ml.materialsproject.org/projects/matbench_mp_gap.json.gz to /home/markam/miniconda3/envs/materials/lib/python3.11/site-packages/matminer/datasets/matbench_mp_gap.json.gz


Fetching https://ml.materialsproject.org/projects/matbench_mp_gap.json.gz in MB: 137.070592MB [00:00, 477.91MB/s]                                        


Cleaning data
                                           structure  gap pbe formula
0  [[-0.00812638  0.02476014 -0.01698117] K, [-0....   1.3322   KMnO2
1  [[0.         1.78463544 1.78463544] Cr, [1.784...   0.0000   Cr3Ni
2  [[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...   0.0000  CsRbAs
3  [[0. 0. 0.] Si, [ 4.55195829  4.55195829 -4.55...   0.4113    SiSn
4  [[0.    2.655 2.655] Ca, [2.655 0.    2.655] C...   0.3514   Ca3P2
Saving data


8

In [11]:
# NOTE(review): `dd` is not assigned in any cell visible in this notebook, so this
# cell likely fails on a fresh Restart & Run All -- define it above or remove the cell.
display((dd))

Unnamed: 0,structure,gap pbe,formula,composition,density,vpa,packing fraction,mean absolute deviation in relative bond length,max relative bond length,min relative bond length,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,"[[-0.00812638 0.02476014 -0.01698117] K, [-0....",1.3322,KMnO2,"(K, Mn, O)",3.781313,13.836878,1.046223,6.082437e-02,1.121693,0.903900,...,0.000310,0.000078,0.000116,0.0,12.0,229.0,217.0,117.500000,105.500000,12.0
1,"[[0. 1.78463544 1.78463544] Cr, [1.784...",0.0000,Cr3Ni,"(Cr, Ni)",7.839811,11.367855,0.984973,0.000000e+00,1.000000,1.000000,...,0.595395,0.148849,0.223273,0.0,225.0,229.0,4.0,228.000000,1.500000,229.0
2,"[[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...",0.0000,CsRbAs,"(Cs, Rb, As)",3.119812,52.035996,0.860650,3.733800e-02,1.028004,0.943993,...,0.000000,0.000000,0.000000,0.0,166.0,229.0,63.0,208.000000,28.000000,166.0
3,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",0.4113,SiSn,"(Si, Sn)",4.361248,27.946089,0.328227,1.110223e-16,1.000000,1.000000,...,0.000000,0.000000,0.000000,0.0,141.0,227.0,86.0,184.000000,43.000000,141.0
4,"[[0. 2.655 2.655] Ca, [2.655 0. 2.655] C...",0.3514,Ca3P2,"(Ca, P)",2.020551,29.944258,0.545444,8.415758e-02,1.116672,0.789606,...,0.000000,0.000000,0.000000,0.0,2.0,225.0,223.0,135.800000,107.040000,225.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251,"[[1.95267605 1.95267605 0. ] Al, [1.952...",0.0000,Al3V,"(Al, V)",3.676780,14.890888,0.585084,8.326673e-17,1.000000,1.000000,...,0.000000,0.000000,0.000000,0.0,225.0,229.0,4.0,226.000000,1.500000,225.0
252,"[[-1.53522488 0.88617083 -1.18670672] W, [-1....",0.0000,W2C,"(W, C)",16.510714,12.728931,0.577392,8.848239e-02,1.066362,0.867276,...,0.000000,0.000000,0.000000,0.0,194.0,229.0,35.0,217.333333,15.555556,229.0
253,"[[ 3.00761756 3.00761756 -3.00761756] Li, [0....",0.0000,LiMnAs,"(Li, Mn, As)",4.174836,18.137465,0.563010,3.725710e-02,1.027943,0.944114,...,0.000310,0.000103,0.000138,0.0,166.0,229.0,63.0,204.000000,25.333333,166.0
254,"[[0. 0. 0.] Ta, [1.53419936 1.53419936 1.53419...",0.0000,TaCo,"(Ta, Co)",13.788268,14.444595,0.798778,5.551115e-17,1.000000,1.000000,...,1.548471,0.774236,0.774236,0.0,194.0,229.0,35.0,211.500000,17.500000,194.0


In [7]:
# Load the cached featurized frame; load_featurized() returns None if the file is unavailable.
df_featurized = load_featurized()

Loading featurized df...


In [8]:
# Show the featurized frame through the notebook's rich repr.
# NOTE(review): consider df_featurized.head() to keep the saved output small.
df_featurized

Unnamed: 0,structure,gap pbe,formula,composition,structure.1,gap pbe.1,formula.1,composition.1,density,vpa,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,"[[-0.00812638 0.02476014 -0.01698117] K, [-0....",1.3322,KMnO2,"(K, Mn, O)","[[-0.00812638 0.02476014 -0.01698117] K, [-0....",1.3322,KMnO2,"(K, Mn, O)",3.781313,13.836878,...,0.00031,7.8e-05,0.000116,0.0,12.0,229.0,217.0,117.5,105.5,12.0
1,"[[0. 1.78463544 1.78463544] Cr, [1.784...",0.0,Cr3Ni,"(Cr, Ni)","[[0. 1.78463544 1.78463544] Cr, [1.784...",0.0,Cr3Ni,"(Cr, Ni)",7.839811,11.367855,...,0.595395,0.148849,0.223273,0.0,225.0,229.0,4.0,228.0,1.5,229.0
2,"[[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...",0.0,CsRbAs,"(Cs, Rb, As)","[[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...",0.0,CsRbAs,"(Cs, Rb, As)",3.119812,52.035996,...,0.0,0.0,0.0,0.0,166.0,229.0,63.0,208.0,28.0,166.0
3,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",0.4113,SiSn,"(Si, Sn)","[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",0.4113,SiSn,"(Si, Sn)",4.361248,27.946089,...,0.0,0.0,0.0,0.0,141.0,227.0,86.0,184.0,43.0,141.0
4,"[[0. 2.655 2.655] Ca, [2.655 0. 2.655] C...",0.3514,Ca3P2,"(Ca, P)","[[0. 2.655 2.655] Ca, [2.655 0. 2.655] C...",0.3514,Ca3P2,"(Ca, P)",2.020551,29.944258,...,0.0,0.0,0.0,0.0,2.0,225.0,223.0,135.8,107.04,225.0
5,"[[4.84254968 2.09688542 5.99234976] Li, [4.842...",0.0,Li3Co2(GeO4)3,"(Li, Co, Ge, O)","[[4.84254968 2.09688542 5.99234976] Li, [4.842...",0.0,Li3Co2(GeO4)3,"(Li, Co, Ge, O)",4.366641,10.431076,...,1.548471,0.154847,0.278725,0.0,12.0,229.0,217.0,94.7,99.24,12.0
6,"[[0. 0. 0.] In, [2.50951266 1.77449342 4.34660...",0.0,InSb,"(In, Sb)","[[0. 0. 0.] In, [2.50951266 1.77449342 4.34660...",0.0,InSb,"(In, Sb)",6.765321,29.033879,...,0.0,0.0,0.0,0.0,139.0,166.0,27.0,152.5,13.5,139.0
7,"[[-1.49720213 4.39586266 1.33097652] Hf, [3....",0.0,HfMg6Sb,"(Hf, Mg, Sb)","[[-1.49720213 4.39586266 1.33097652] Hf, [3....",0.0,HfMg6Sb,"(Hf, Mg, Sb)",4.099643,22.585298,...,0.0,0.0,0.0,0.0,166.0,194.0,28.0,190.5,6.125,194.0
8,"[[-4.80723014 -2.77545543 -0.5064036 ] Zn, [-4...",3.5902,ZnSiO3,"(Zn, Si, O)","[[-4.80723014 -2.77545543 -0.5064036 ] Zn, [-4...",3.5902,ZnSiO3,"(Zn, Si, O)",5.027982,9.345863,...,0.0,0.0,0.0,0.0,12.0,227.0,215.0,91.4,95.28,12.0
9,"[[-1.51036602 4.42914341 1.22400146] Mg, [3....",0.0,Mg6CdCo,"(Mg, Cd, Co)","[[-1.51036602 4.42914341 1.22400146] Mg, [3....",0.0,Mg6CdCo,"(Mg, Cd, Co)",3.310506,19.886691,...,1.548471,0.193559,0.338728,0.0,194.0,194.0,0.0,194.0,0.0,194.0


In [11]:
print(df_featurized.shape)
display(df_featurized.head())
# Candidate features: every numeric column except the first 'gap pbe' entry
# (the prediction target). .index() raises ValueError if the target is absent.
numeric_cols = list(df_featurized.select_dtypes(include=[np.number]))
target_pos = numeric_cols.index('gap pbe')
features = numeric_cols[:target_pos] + numeric_cols[target_pos + 1:]
print(features)

(78164, 139)


Unnamed: 0,structure,gap pbe,formula,composition,density,vpa,packing fraction,MagpieData minimum Number,MagpieData maximum Number,MagpieData range Number,...,MagpieData range GSmagmom,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,MagpieData minimum SpaceGroupNumber,MagpieData maximum SpaceGroupNumber,MagpieData range SpaceGroupNumber,MagpieData mean SpaceGroupNumber,MagpieData avg_dev SpaceGroupNumber,MagpieData mode SpaceGroupNumber
0,"[[-0.00812638 0.02476014 -0.01698117] K, [-0....",1.3322,KMnO2,"(K, Mn, O)",3.781313,13.836878,1.046223,8.0,25.0,17.0,...,0.00031,7.8e-05,0.000116,0.0,12.0,229.0,217.0,117.5,105.5,12.0
1,"[[0. 1.78463544 1.78463544] Cr, [1.784...",0.0,Cr3Ni,"(Cr, Ni)",7.839811,11.367855,0.984973,24.0,28.0,4.0,...,0.595395,0.148849,0.223273,0.0,225.0,229.0,4.0,228.0,1.5,229.0
2,"[[-2.13764909 -2.12540569 -2.14704542] Cs, [-6...",0.0,CsRbAs,"(Cs, Rb, As)",3.119812,52.035996,0.86065,33.0,55.0,22.0,...,0.0,0.0,0.0,0.0,166.0,229.0,63.0,208.0,28.0,166.0
3,"[[0. 0. 0.] Si, [ 4.55195829 4.55195829 -4.55...",0.4113,SiSn,"(Si, Sn)",4.361248,27.946089,0.328227,14.0,50.0,36.0,...,0.0,0.0,0.0,0.0,141.0,227.0,86.0,184.0,43.0,141.0
4,"[[0. 2.655 2.655] Ca, [2.655 0. 2.655] C...",0.3514,Ca3P2,"(Ca, P)",2.020551,29.944258,0.545444,15.0,20.0,5.0,...,0.0,0.0,0.0,0.0,2.0,225.0,223.0,135.8,107.04,225.0


['density', 'vpa', 'packing fraction', 'MagpieData minimum Number', 'MagpieData maximum Number', 'MagpieData range Number', 'MagpieData mean Number', 'MagpieData avg_dev Number', 'MagpieData mode Number', 'MagpieData minimum MendeleevNumber', 'MagpieData maximum MendeleevNumber', 'MagpieData range MendeleevNumber', 'MagpieData mean MendeleevNumber', 'MagpieData avg_dev MendeleevNumber', 'MagpieData mode MendeleevNumber', 'MagpieData minimum AtomicWeight', 'MagpieData maximum AtomicWeight', 'MagpieData range AtomicWeight', 'MagpieData mean AtomicWeight', 'MagpieData avg_dev AtomicWeight', 'MagpieData mode AtomicWeight', 'MagpieData minimum MeltingT', 'MagpieData maximum MeltingT', 'MagpieData range MeltingT', 'MagpieData mean MeltingT', 'MagpieData avg_dev MeltingT', 'MagpieData mode MeltingT', 'MagpieData minimum Column', 'MagpieData maximum Column', 'MagpieData range Column', 'MagpieData mean Column', 'MagpieData avg_dev Column', 'MagpieData mode Column', 'MagpieData minimum Row', 'Ma

In [26]:
# Debug view: one [index label, structure, composition] triple per row.
[
    [label, record.structure, record.composition]
    for label, record in df_featurized.iterrows()
]

[[0, 1, 1],
 [1, 1, 1],
 [2, 1, 1],
 [3, 1, 1],
 [4, 1, 1],
 [5, 1, 1],
 [6, 1, 1],
 [7, 1, 1],
 [8, 1, 1],
 [9, 1, 1],
 [10, 1, 1],
 [11, 1, 1],
 [12, 1, 1],
 [13, 1, 1],
 [14, 1, 1],
 [15, 1, 1],
 [16, 1, 1],
 [17, 1, 1],
 [18, 1, 1],
 [19, 1, 1],
 [20, 1, 1],
 [21, 1, 1],
 [22, 1, 1],
 [23, 1, 1],
 [24, 1, 1],
 [25, 1, 1],
 [26, 1, 1],
 [27, 1, 1],
 [28, 1, 1],
 [29, 1, 1],
 [30, 1, 1],
 [31, 1, 1],
 [32, 1, 1],
 [33, 1, 1],
 [34, 1, 1],
 [35, 1, 1],
 [36, 1, 1],
 [37, 1, 1],
 [38, 1, 1],
 [39, 1, 1],
 [40, 1, 1],
 [41, 1, 1],
 [42, 1, 1],
 [43, 1, 1],
 [44, 1, 1],
 [45, 1, 1],
 [46, 1, 1],
 [47, 1, 1],
 [48, 1, 1],
 [49, 1, 1],
 [50, 1, 1],
 [51, 1, 1],
 [52, 1, 1],
 [53, 1, 1],
 [54, 1, 1],
 [55, 1, 1],
 [56, 1, 1],
 [57, 1, 1],
 [58, 1, 1],
 [59, 1, 1],
 [60, 1, 1],
 [61, 1, 1],
 [62, 1, 1],
 [63, 1, 1],
 [64, 1, 1],
 [65, 1, 1],
 [66, 1, 1],
 [67, 1, 1],
 [68, 1, 1],
 [69, 1, 1],
 [70, 1, 1],
 [71, 1, 1],
 [72, 1, 1],
 [73, 1, 1],
 [74, 1, 1],
 [75, 1, 1],
 [76, 1, 1],
 [77, 1, 

In [23]:
from matminer.featurizers.structure import StructuralHeterogeneity
from matminer.featurizers.composition import IonProperty

# Quick probe of IonProperty on the first composition. Build the featurizer
# once and reuse it for both the values and their labels instead of
# constructing a second instance just to read the labels.
ion_featurizer = IonProperty()
test = ion_featurizer.featurize(df_featurized.composition[0])
print(test, ion_featurizer.feature_labels())

[True, np.float64(0.8202341305415322), np.float64(0.18414808339314626)] ['compound possible', 'max ionic char', 'avg ionic char']


In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
                    from the data source will be replaced by the average of that value
                    over the available elements.
                    This avoids NaNs after featurization that are often replaced by
                    dataset-dependent averages.
In a future release, impute_nan will be set to True by default.
                    This means that features that are missing or are NaNs for elements
           

In [34]:
# Drop features that don't vary. nunique() ignores NaN, so `<= 1` also
# catches all-NaN columns, which carry no information either.
const_cols = [col for col in features if df_featurized[col].nunique() <= 1]
# drop() defaults to row labels and returns a new frame; target the columns
# explicitly and keep the result, and prune the feature list to match.
df_featurized = df_featurized.drop(columns=const_cols)
features = [col for col in features if col not in const_cols]

# Standardize the non-target numeric features.
scaler = StandardScaler()
# Carry over the source index and column names so the assignment below
# aligns row-for-row even when df_featurized does not have a 0..n-1 index.
features_scaled = pd.DataFrame(
    scaler.fit_transform(df_featurized[features]),
    index=df_featurized.index,
    columns=features,
)
df_scaled = df_featurized.copy()
df_scaled[features] = features_scaled


In [None]:
# Spearman rank correlation between all candidate features, plotted with
# shortened tick labels so the Magpie statistics stay readable.
spearman_corr = df_scaled[features].corr(method='spearman')
old_index = spearman_corr.index

# Magpie columns are named "MagpieData <stat> <feature>". The first column of
# each feature group is labelled "<feature> - <stat>", the following ones
# just "<stat>". Any other column keeps its full name.
labels = []
prev_feature = None
for name in spearman_corr.columns:
    if "Magpie" not in name:
        labels.append(name)
        continue
    stat, feature = name.split(" ")[1:]
    if feature == prev_feature:
        labels.append(f'{stat}')
    else:
        labels.append(f'{feature} - {stat}')
        prev_feature = feature

# Temporarily relabel both axes for plotting.
spearman_corr.index = labels
spearman_corr.columns = labels

plt.figure(figsize=(40, 40))
sns.heatmap(
    spearman_corr,
    annot=False,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    linewidths=.5,
)
plt.title('Correlation Heatmap')
plt.show()

# Put the original column names back so later cells can index by them.
spearman_corr.columns = old_index
spearman_corr.index = old_index