In [2]:
from pymatgen.core import Composition
from matminer.featurizers.composition import ElementProperty

# Build a featurizer from the "magpie" preset (a predefined set of elemental properties).
featurizer = ElementProperty.from_preset("magpie")

# Featurize a single-element composition (H) and a two-element composition (H2O).
feature_vector_H, feature_vector_H2O = (
    featurizer.featurize(Composition(formula)) for formula in ("H", "H2O")
)

# Print the length of each feature vector, one per line.
for vector in (feature_vector_H, feature_vector_H2O):
    print(len(vector))


132
132



The sklearn.neighbors.unsupervised module is  deprecated in version 0.22 and will be removed in version 0.24. The corresponding classes / functions should instead be imported from sklearn.neighbors. Anything that cannot be imported from sklearn.neighbors is now part of the private API.



The feature vector contains statistics of elemental properties (e.g. atomic mass or electronegativity) taken over the elements in the composition. For each property, the Magpie preset reports several statistics (minimum, maximum, range, mean, average deviation, mode). Since H consists of only a single element, spread statistics such as the range or the average deviation are automatically zero, which is not the case for H2O.

In [3]:
# Compare the ElementProperty presets (magpie included) by the length of the
# feature vector each one produces for the same composition, H2O.
presets = ["magpie", "deml", "matminer", "matscholar_el", "megnet_el"]
water = Composition("H2O")  # built once; the same composition is featurized by every preset
for pre in presets:
    featurizer = ElementProperty.from_preset(pre)
    feature_vector_H2O = featurizer.featurize(water)
    n_features = len(feature_vector_H2O)
    print(f"preset {pre}, elements in the feature vector: {n_features}")
    



preset magpie, elements in the feature vector: 132
preset deml, elements in the feature vector: 80
preset matminer, elements in the feature vector: 65
preset matscholar_el, elements in the feature vector: 1000
preset megnet_el, elements in the feature vector: 80


Different presets contain different information about the investigated composition. Here are some of the properties that are part of each preset:
- magpie --> General materials properties
    - Atomic number 
    - atomic mass
    - electronegativity
    - periodic table position
    - thermal/ electric properties (melting point, conductivity, etc.)
    - crystallographic feature (e.g. spacegroup)
- deml (data-driven effective model for materials) --> Energy/phase stability
    - atomic radius
    - ionization energy 
    - electron affinity
- matminer --> Interpretable models
    - subset of the magpie data (and excludes some crystallographic data)


--> Using a preset makes the featurization easier and consistent across different structures, so that, for example, there is no conflict between different electronegativity scales (e.g. Pauling vs. Allen). This is crucial if the feature vectors are later used to train a model for new predictions.

In [4]:
from matminer.datasets import load_dataset
import pandas as pd
import pprint


# Load the experimental band-gap dataset; this cell uses its 'composition'
# and 'gap expt' columns.
bandgaps = load_dataset('matbench_expt_gap')

# Keep only entries with a strictly positive experimental band gap.
# The explicit .copy() makes this an independent frame, so the column
# assignment below does not write into a slice of `bandgaps`
# (avoids pandas' SettingWithCopyWarning and ambiguous view/copy semantics).
bandgaps_filtered = bandgaps[bandgaps["gap expt"] > 0].copy()
featurizer = ElementProperty.from_preset('magpie')  # load the magpie preset
# Convert formula strings to Composition objects
bandgaps_filtered['composition'] = bandgaps_filtered['composition'].apply(Composition)

# Featurize all compositions of the dataframe and append the features to it
filtered_df = featurizer.featurize_dataframe(
    bandgaps_filtered,
    col_id='composition',  # DataFrame column containing the Composition objects
    ignore_errors=True,    # Skip problematic entries
    return_errors=True     # Track any failures (adds a non-numeric error column)
)

# Pearson correlation matrix of everything after the first column.
# Non-numeric columns (e.g. the error column requested via return_errors=True)
# are excluded explicitly instead of relying on pandas silently dropping them,
# which newer pandas versions no longer do.
numeric_df = filtered_df.iloc[:, 1:].select_dtypes(include="number")
corr_matrix = numeric_df.corr(method='pearson')

# Collect every pair (upper triangle only, so each pair appears once and the
# diagonal is skipped) whose absolute correlation exceeds the threshold.
CORR_THRESHOLD = 0.95
corr_columns = corr_matrix.columns
corr_values = corr_matrix.to_numpy()  # plain array: much cheaper than repeated .iloc lookups
high_corr_pairs = []
for i in range(len(corr_columns)):  # iterate through rows
    for j in range(i + 1, len(corr_columns)):  # iterate through columns right of the diagonal
        # NaN correlations compare False here and are therefore skipped
        if abs(corr_values[i, j]) > CORR_THRESHOLD:
            # append the two feature names and their correlation
            high_corr_pairs.append([
                corr_columns[i],
                corr_columns[j],
                corr_values[i, j]
            ])
pprint.pprint(high_corr_pairs)


ElementProperty:   0%|          | 0/2154 [00:00<?, ?it/s]

[['MagpieData minimum Number',
  'MagpieData minimum AtomicWeight',
  0.997741955547722],
 ['MagpieData minimum Number', 'MagpieData minimum Row', 0.9510486060928031],
 ['MagpieData maximum Number',
  'MagpieData maximum AtomicWeight',
  0.9992666504576497],
 ['MagpieData range Number',
  'MagpieData range AtomicWeight',
  0.9967739753164336],
 ['MagpieData mean Number', 'MagpieData mean AtomicWeight', 0.9986266807924157],
 ['MagpieData mean Number', 'MagpieData mean Row', 0.9587451718427101],
 ['MagpieData avg_dev Number',
  'MagpieData avg_dev AtomicWeight',
  0.9974832107882347],
 ['MagpieData mode Number', 'MagpieData mode AtomicWeight', 0.9986114270946439],
 ['MagpieData mode Number', 'MagpieData mode Row', 0.9687152282226607],
 ['MagpieData minimum MendeleevNumber',
  'MagpieData range MendeleevNumber',
  -0.9919791728140207],
 ['MagpieData minimum MendeleevNumber',
  'MagpieData minimum Column',
  0.953111285120005],
 ['MagpieData mode AtomicWeight', 'MagpieData mode Row', 0.956

In [5]:
from matminer.featurizers.structure import SineCoulombMatrix


# Load the phonon dataset and pick out the column of structures.
phonon_data = load_dataset("phonon_dielectric_mp")
structures = phonon_data["structure"]

# Inspect the container type and the type of a single entry.
print(type(structures))
print(type(structures.iloc[0]))

# Fit the flattened Sine Coulomb Matrix featurizer on all structures.
scm = SineCoulombMatrix(flatten=True)
scm.fit(structures)

# Featurize every structure and attach the resulting features to the frame.
scm_options = {
    "col_id": "structure",   # column holding the pymatgen Structure objects
    "ignore_errors": True,   # skip structures that fail featurization
    "return_errors": True,   # keep the error information alongside the features
}
df_scm = scm.featurize_dataframe(phonon_data, **scm_options)



<class 'pandas.core.series.Series'>
<class 'pymatgen.core.structure.Structure'>


SineCoulombMatrix:   0%|          | 0/1296 [00:00<?, ?it/s]


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part



Elemental features are computed only from the chemical composition; the arrangement of the atoms in space is not considered. The SCM features, however, take both the composition and the positions of the atoms in space into account. Elemental features are therefore the same for all structures with the same formula, whereas the SCM features can distinguish between them (e.g. when they have different crystal structures).

steps:
- for each structure in the loaded data set the SCM is computed
- then the eigenvalues for the structures are calculated
- the eigenvalues are appended as features to the phonon data frame

In [10]:
from matminer.featurizers.structure import SineCoulombMatrix
from matminer.datasets import load_dataset

# Reload the phonon dataset and fit a flattened Sine Coulomb Matrix featurizer on it.
phonon_data = load_dataset("phonon_dielectric_mp")
scm = SineCoulombMatrix(flatten=True)
scm.fit(phonon_data["structure"])

# Featurize every structure and rebind `phonon_data` to the featurized frame,
# so later cells see the SCM features as additional columns.
phonon_data = scm.featurize_dataframe(
    phonon_data,
    col_id="structure",   # column holding the Structure objects
    ignore_errors=True,   # skip structures that fail featurization
    return_errors=True,   # keep the error information alongside the features
)



SineCoulombMatrix:   0%|          | 0/1296 [00:00<?, ?it/s]


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part



In [28]:
from automatminer.featurization import AutoFeaturizer

# `phonon_data` is reused from the earlier cell; it is not reloaded here.

# Set up the AutoFeaturizer with the "express" preset.
preset_name = "express"
autofeaturizer = AutoFeaturizer(preset=preset_name)

# Fit on the phonon data and transform it in one step; no target column is given.
phonon_data_feat = autofeaturizer.fit_transform(
    phonon_data,
    target=None,
)


Length of header or names does not match length of data. This leads to a loss of data with index_col=False.



2025-06-16 20:19:00 INFO     AutoFeaturizer: Starting fitting.
2025-06-16 20:19:00 INFO     AutoFeaturizer: Adding compositions from structures.
2025-06-16 20:19:00 INFO     AutoFeaturizer: Guessing oxidation states of structures if they were not present in input.



The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.



StructureToOxidStructure:   0%|          | 0/1296 [00:00<?, ?it/s]

StructureToComposition:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:02 INFO     AutoFeaturizer: Guessing oxidation states of compositions, as they were not present in input.


CompositionToOxidComposition:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:02 INFO     AutoFeaturizer: Will remove YangSolidSolution because it's fraction passing the precheck for this dataset (0.2800925925925926) was less than the minimum (0.9)
2025-06-16 20:19:02 INFO     AutoFeaturizer: Will remove Miedema because it's fraction passing the precheck for this dataset (0.2800925925925926) was less than the minimum (0.9)
2025-06-16 20:19:02 INFO     AutoFeaturizer: Guessing oxidation states of structures if they were not present in input.


StructureToOxidStructure:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:03 INFO     AutoFeaturizer: Will remove GlobalInstabilityIndex because it's fraction passing the precheck for this dataset (0.8179012345679012) was less than the minimum (0.9)
2025-06-16 20:19:03 INFO     AutoFeaturizer: Featurizer type bandstructure not in the dataframe to be fitted. Skipping...
2025-06-16 20:19:03 INFO     AutoFeaturizer: Featurizer type dos not in the dataframe to be fitted. Skipping...
2025-06-16 20:19:03 INFO     AutoFeaturizer: Finished fitting.
2025-06-16 20:19:03 INFO     AutoFeaturizer: Starting transforming.
2025-06-16 20:19:03 INFO     AutoFeaturizer: Featurizing with ElementProperty.


ElementProperty:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:04 INFO     AutoFeaturizer: Featurizing with OxidationStates.


OxidationStates:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:04 INFO     AutoFeaturizer: Featurizing with ElectronAffinity.


ElectronAffinity:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:04 INFO     AutoFeaturizer: Featurizing with IonProperty.


IonProperty:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:04 INFO     AutoFeaturizer: Featurizing with DensityFeatures.


DensityFeatures:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:05 INFO     AutoFeaturizer: Featurizing with GlobalSymmetryFeatures.


GlobalSymmetryFeatures:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:07 INFO     AutoFeaturizer: Featurizing with EwaldEnergy.


EwaldEnergy:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:11 INFO     AutoFeaturizer: Featurizing with SineCoulombMatrix.


SineCoulombMatrix:   0%|          | 0/1296 [00:00<?, ?it/s]


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part


Casting complex values to real discards the imaginary part



2025-06-16 20:19:12 INFO     AutoFeaturizer: Featurizing with StructuralComplexity.


StructuralComplexity:   0%|          | 0/1296 [00:00<?, ?it/s]

2025-06-16 20:19:19 INFO     AutoFeaturizer: Featurizer type bandstructure not in the dataframe. Skipping...
2025-06-16 20:19:19 INFO     AutoFeaturizer: Featurizer type dos not in the dataframe. Skipping...
2025-06-16 20:19:19 INFO     AutoFeaturizer: Finished transforming.


In [46]:
print("Shape of featurized dataframe:", phonon_data_feat.shape)

# Drop columns with more than 1% NaN.
nan_fraction = phonon_data_feat.isna().mean()  # Series: fraction of NaN per column
cols_to_keep = nan_fraction[nan_fraction <= 0.01].index
phonon_data_feat_clean = phonon_data_feat[cols_to_keep]  # select the columns

# Replace the remaining NaN values with the mean of their column.
# numeric_only=True restricts the reduction to numeric columns explicitly;
# calling .mean() on a frame that also holds non-numeric columns is deprecated
# ("Dropping of nuisance columns ...") and raises TypeError in newer pandas.
# Non-numeric columns have no mean and are left untouched by fillna.
column_means = phonon_data_feat_clean.mean(numeric_only=True)
phonon_data_feat_clean = phonon_data_feat_clean.fillna(column_means)
print("Shape of fully cleaned dataframe:", phonon_data_feat_clean.shape)
print("Number of samples: ", phonon_data_feat_clean.shape[0])
print("Number of features: ", phonon_data_feat_clean.shape[1])





Shape of featurized dataframe: (1296, 203)
Shape of fully cleaned dataframe: (1296, 202)
Number of samples:  1296
Number of features:  202



Dropping of nuisance columns in DataFrame reductions (with 'numeric_only=None') is deprecated; in a future version this will raise TypeError.  Select only valid columns before calling the reduction.

