# Featurization in Materials Science

## Composition-based Featurization

In [None]:
from elementembeddings.core import Embedding

# Load the element representation registered under the name "magpie".
magpie = Embedding.load_data("magpie")

# Stop here if nothing was returned; everything below relies on `magpie`.
assert magpie is not None

# Report the vector length, the elements covered and the feature labels.
magpie_dim = magpie.dim
print(f"The magpie representation has embeddings of dimension {magpie_dim}")

magpie_elements = magpie.element_list
print(f"The magpie representation contains these elements: \n {magpie_elements}")

magpie_features = magpie.feature_labels
print(f"The magpie representation contains these features: \n {magpie_features}")

These embeddings can be used to visualize trends/groupings in the periodic table.

In [None]:
from elementembeddings.plotter import dimension_plotter
import matplotlib.pyplot as plt

# A single square set of axes to hold the two-dimensional projection.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6))

# Reduce the magpie vectors to two PCA components and draw them on `ax`.
pca_options = dict(reducer="pca", n_components=2, adjusttext=True)
dimension_plotter(embedding=magpie, ax=ax, **pca_options)  # type: ignore
ax.set_title("Magpie PCA")

# Remove the legend from the axes and rebuild it at figure level from the
# same handles/labels, anchored at (1.25, 0.5) so it sits beside the plot.
ax.legend().remove()
handles, labels = ax.get_legend_handles_labels()
fig.legend(
    handles,
    labels,
    loc="center right",
    bbox_to_anchor=(1.25, 0.5),
    ncol=1,
)

fig.tight_layout()
fig.show()

Now, let us use the embedding to create a feature vector for a composition.

In [None]:
from elementembeddings.composition import CompositionalEmbedding
import numpy as np

# Feature vector of the compound, using the "mean" statistic.
Fe2O3_magpie = CompositionalEmbedding("Fe2O3", "magpie")
Fe2O3_feature_vec = Fe2O3_magpie.feature_vector(stats="mean")
print(Fe2O3_feature_vec)

# Feature vectors of the two constituent elements on their own.
Fe = CompositionalEmbedding("Fe", "magpie")
Fe_feature_vec = Fe.feature_vector()

O = CompositionalEmbedding("O", "magpie")
O_feature_vec = O.feature_vector()

# Rebuild the compound vector by hand: Fe2O3 has 2 Fe and 3 O out of 5 atoms,
# so weight each elemental vector by its count and divide by the total.
Fe2O3_mean = (2 * Fe_feature_vec + 3 * O_feature_vec) / 5.0

# Prints True when the hand-built vector matches the library's result
# element-wise (within numpy's default tolerances).
print(np.allclose(Fe2O3_feature_vec, Fe2O3_mean))

The embedding can also be used to get a distance (or similarity) in feature space between two compositions.

In [None]:
# Distance in magpie feature space from Fe2O3 to each comparison formula,
# printed in the order listed.
for other_formula in ("Fe3O4", "ZrN"):
    print(Fe2O3_magpie.distance(other_formula))

## Atomistic Featurization

We will use DScribe to create atomistic descriptors. It utilizes the Atomic Simulation Environment (ASE), which is similar to Pymatgen. We will cover this in more detail in a later lecture.

In [None]:
from dscribe.descriptors import CoulombMatrix
from ase.build import molecule
from ase.visualize import view

# Coulomb matrix (CM) descriptor sized for systems of up to 6 atoms.
cm = CoulombMatrix(n_atoms_max=6)

# Build a methanol molecule with ASE and open it in ASE's viewer.
methanol = molecule("CH3OH")
view(methanol)

# Descriptor for a single system.
cm_methanol = cm.create(methanol)
print(cm_methanol)

# Descriptors for several systems in one call: pass a list of structures.
samples = [molecule(formula) for formula in ("H2O", "NO2", "CO2")]
coulomb_matrices = cm.create(samples)

Let us explore the different sorting options (the `permutation` setting) for the Coulomb matrix.

In [None]:
# --- permutation="none": no sorting ------------------------------------------
cm = CoulombMatrix(permutation="none", n_atoms_max=6)
cm_methanol_noperm = cm.create(methanol)

# Show the element symbols first so the unsorted matrix can be read against
# the atom order of the structure.
print(methanol.get_chemical_symbols())
print("in order of appearance", cm_methanol_noperm)

# --- permutation="sorted_l2": sort by Euclidean (L2) norm --------------------
cm = CoulombMatrix(permutation="sorted_l2", n_atoms_max=6)
cm_methanol_l2 = cm.create(methanol)
print("default: sorted by L2-norm", cm_methanol_l2)

# --- permutation="random" ----------------------------------------------------
# `sigma` and `seed` are passed alongside this mode; see the DScribe
# documentation for their exact meaning.
random_options = {"permutation": "random", "sigma": 70, "seed": None}
cm = CoulombMatrix(n_atoms_max=6, **random_options)
cm_methanol_random = cm.create(methanol)
print("randomly sorted", cm_methanol_random)

# --- permutation="eigenspectrum": output labelled as eigenvalues -------------
cm = CoulombMatrix(permutation="eigenspectrum", n_atoms_max=6)
cm_methanol_eig = cm.create(methanol)
print("eigenvalues", cm_methanol_eig)

For molecules with fewer atoms than `n_atoms_max`, zeros are added as padding so that the output matches the size set by `n_atoms_max`.

In [None]:
# Ask for room for 10 atoms although methanol (CH3OH) only has 6, so the
# descriptor has to be padded up to the requested size.
max_atoms = 10
cm = CoulombMatrix(n_atoms_max=max_atoms)
cm_methanol = cm.create(methanol)
print("zero-padded", cm_methanol)

# Print the shape of the padded output to see the effect of `n_atoms_max`.
padded_shape = cm_methanol.shape  # type: ignore
print(padded_shape)

Let us verify that the Coulomb matrix satisfies invariance to translations, rotations, and permutations.

In [None]:
# Descriptor used for all three checks below. If the Coulomb matrix is
# invariant, the three printed outputs should agree with one another.
cm = CoulombMatrix(permutation="sorted_l2", n_atoms_max=6)

# 1) Translation: shift `methanol` by the vector (5, 7, 9).
#    No copy is made here, so the rotation below acts on the shifted molecule.
shift = (5, 7, 9)
methanol.translate(shift)
cm_methanol = cm.create(methanol)
print(cm_methanol)

# 2) Rotation: turn the molecule by 90 about the z axis, centred on the origin.
origin = (0, 0, 0)
methanol.rotate(90, "z", center=origin)
cm_methanol = cm.create(methanol)
print(cm_methanol)

# 3) Permutation: reverse the order in which the atoms are listed.
upside_down_methanol = methanol[::-1]
cm_methanol = cm.create(upside_down_methanol)
print(cm_methanol)