In [None]:
from data import DATA_3DCD, DATA_MP

In [None]:
import seaborn as sns
from matplotlib import colors

# Two-colour scheme used by the plots below: the first entry is drawn for
# magic == 0 and the second for magic == 1 (same mapping as the bar chart).
c = ["mediumblue", "red"]

# Global seaborn theme; the "bright" palette selected here is immediately
# replaced by the two-colour palette built from `c`.
sns.set(style="white", palette="bright", color_codes=True)
sns.set_palette(sns.color_palette(c))

# Discrete colormap with the same two colours (not used in the cells visible here).
my_c = colors.ListedColormap(["mediumblue", "red"])

In [None]:
from constants import PG_LIST_ORDER

In [None]:
from dask.distributed import Client

# Create a dask.distributed client with default arguments.
# NOTE(review): presumably this becomes the scheduler used by the `compute`
# calls further down — confirm against the dask.distributed documentation.
client = Client()
# Bare expression so the notebook displays the client's repr as cell output.
client

The `geometric.npz` file already exists for the example data, so it can simply be loaded (see the *Generate plots* cells below) to create the figures. Otherwise, run the whole notebook.

## 3DCD data analysis

### Load data

In [None]:
import ase.io
import numpy as np

# Full-dataset read (currently disabled):
# frames = ase.io.read(DATA_3DCD.structures, index=":")
# NOTE(review): DEBUG subsampling is still active. The slice "::100" keeps only
# every 100th structure, so every descriptor and plot below is based on that
# subset. Switch back to the line above for a full run.
frames = ase.io.read(DATA_3DCD.structures, index="::100")  # DEBUG

In [None]:
from tqdm.auto import tqdm

# Call `wrap` on every structure, with a progress bar. The return value is not
# used: the cells below rely on `frames` having been modified in place.
for frame in tqdm(frames):
    frame.wrap(eps=1e-10)

### Compute geometric descriptors

In [None]:
# This cell should only create either delayed objects or perform extremely cheap operations.
from ase import Atoms
from math import pi
from pymatgen.analysis.local_env import CrystalNN
from tqdm.notebook import tqdm
from utils import get_pymatgen, point_group, get_r as get_radii
import dask.array as da
import dask.bag as db


def compute_cn(frame):
    """Return the mean coordination number over all sites of ``frame``.

    The structure is converted with ``get_pymatgen`` and each site index is
    passed to ``CrystalNN.get_cn``; the per-site values are then averaged.
    """
    analyzer = CrystalNN(
        weighted_cn=False,
        distance_cutoffs=None,
        x_diff_weight=0.0,
        porous_adjustment=False,
    )
    structure = get_pymatgen(frame)
    n_sites = len(frame)
    # Pre-fill with NaN so that a slot that was never written would show up in the mean.
    per_site_cn = np.full((n_sites,), np.nan)
    for site in range(n_sites):
        per_site_cn[site] = analyzer.get_cn(structure, site)
    return per_site_cn.mean()


def compute_point_group(frame):
    """Return the point group of ``frame``.

    The space group is determined with ASE (``symprec=1e-05``) and its number is
    translated by the project helper ``point_group``.
    """
    # Import the submodule explicitly: `import ase.io` alone does not guarantee
    # that `ase.spacegroup` has been loaded.
    from ase.spacegroup import get_spacegroup

    sg = get_spacegroup(frame, symprec=1e-05)
    return point_group(sg.no)


def get_num_species(frame):
    """Return the number of distinct chemical symbols in ``frame``.

    ``frame`` must provide ``get_chemical_symbols()``.
    """
    # The debug `print` and the unconditional `assert 0` were removed: with them
    # the function raised AssertionError on every call and never returned.
    elements = frame.get_chemical_symbols()
    return len(np.unique(elements))


def compute_x(radii):
    """Return the share of smallest-radius entries among the extreme radii.

    Counts how many entries equal the minimum radius and how many equal the
    maximum radius, and returns ``n_min / (n_min + n_max)``. When all radii are
    equal, both counts coincide and the result is 0.5.
    """
    smallest, largest = np.min(radii), np.max(radii)
    count_small = np.count_nonzero(radii == smallest)
    count_large = np.count_nonzero(radii == largest)
    return count_small / (count_small + count_large)


# Spread the structures over a dask bag with roughly 10 frames per partition.
# `max(1, ...)` keeps the partition count valid when fewer than 10 frames are
# loaded (e.g. with the DEBUG subsampling), where `len(frames) // 10` is 0.
db_frames = db.from_sequence(frames, npartitions=max(1, len(frames) // 10))

# NOTE(review): unlike the remaining lines, these two compute eagerly
# (`.compute()`), which contradicts the "only delayed objects" note on this cell.
natoms = da.from_array(db_frames.map(len).compute())
# "Magic" flag: 1 when the number of atoms is a multiple of 4, otherwise 0.
magic = np.array((natoms % 4 == 0).compute(), dtype=int)

# Lazy per-structure descriptors; nothing below is evaluated until `compute`.
average_num_nearest_neighbors = db_frames.map(compute_cn)
elements = db_frames.map(lambda frame: frame.get_chemical_symbols())
num_species = elements.map(np.unique).map(len)
radii = elements.map(get_radii)
# Ratio of the smallest to the largest radius in the structure.
alpha = radii.map(lambda r: np.min(r) / np.max(r))
std_ratio = radii.map(np.std)
x = radii.map(compute_x)
point_groups = db_frames.map(compute_point_group)
# Sum of sphere volumes 4/3*pi*r^3 over all atoms. The division by 100 rescales
# the radii (presumably pm -> Angstrom; TODO confirm against `get_r`).
volume_atom = radii.map(lambda r: np.sum((np.pi * 4 / 3) * (np.array(r) / 100) ** 3))
volume = db_frames.map(lambda frame: frame.get_volume())

In [None]:
from dask import compute

# Evaluate all descriptors with a single `compute` call. The average
# coordination number (`average_num_nearest_neighbors`) is not requested here,
# so no `cn` entry is written to the archive.
computed = compute(
    magic,
    num_species,
    natoms,
    volume_atom,
    volume,
    alpha,
    x,
    std_ratio,
    point_groups,
)
(
    c_magic,
    c_num_species,
    c_natoms,
    c_volume_atom,
    c_volume,
    c_alpha,
    c_x,
    c_std_ratio,
    c_point_groups,
) = computed

# Packing fraction: summed atomic sphere volume divided by the cell volume.
packing_fraction = np.array(c_volume_atom) / np.array(c_volume)

# Store everything in one .npz archive; the plotting cell reads it back by key.
np.savez(
    DATA_3DCD.geo,
    magic=c_magic,
    num_species=c_num_species,
    natoms=c_natoms,
    packing=packing_fraction,
    alpha=c_alpha,
    x=c_x,
    std_ratio=c_std_ratio,
    pg_list=c_point_groups,
)

### Generate plots

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from utils import inh_symm

# Read the descriptors back from disk so this cell also works with a cached archive.
npzfile = np.load(DATA_3DCD.geo, allow_pickle=True)

# Only used below to choose the number of bins of the N_species histogram.
num_species = npzfile["num_species"]

# One row per structure; column labels double as (LaTeX) axis labels.
dict_ = {
    r"$\alpha$": npzfile["alpha"],
    r"$x$": npzfile["x"],
    r"$\sigma_{radii}$": npzfile["std_ratio"],
    r"$N_{atoms,cell}$": npzfile["natoms"].ravel(),
    r"$PF$": npzfile["packing"],
    "magic": npzfile["magic"].ravel(),
    r"$N_{species}$": npzfile["num_species"],
    "point_group": npzfile["pg_list"].ravel(),
}
df = pd.DataFrame(dict_)

# --- Histograms: magic vs. non-magic distributions of four descriptors ---

fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 5))

# Options common to all four panels; each hue group is normalised separately.
hist_kwargs = dict(
    data=df,
    hue="magic",
    stat="probability",
    legend=False,
    common_norm=False,
    multiple="dodge",
    shrink=0.8,
)

# Panel 1: number of species (no KDE, default axis label and limits kept).
sns.histplot(
    ax=ax1, x=r"$N_{species}$", bins=int(max(num_species)), kde=False, **hist_kwargs
)

# Panels 2-4: x, alpha and packing fraction, each with a KDE overlay.
sns.histplot(ax=ax2, x=r"$x$", binwidth=0.10, kde=True, **hist_kwargs)
sns.histplot(ax=ax3, x=r"$\alpha$", binwidth=0.1, kde=True, **hist_kwargs)
sns.histplot(ax=ax4, x=r"$PF$", kde=True, **hist_kwargs)

# These three panels drop their y label and are clipped to [0, 1].
for ax in (ax2, ax3, ax4):
    ax.set_ylabel(None)
    ax.set_xlim(0, 1)

sns.despine(left=True, bottom=True)
plt.show()
plt.close()

# --- Inherited symmetries per point group, magic vs. non-magic ---

m = df.loc[df["magic"] == 1]
nm = df.loc[df["magic"] == 0]

# Count inherited symmetries for each subset and order the rows as PG_LIST_ORDER.
# `inh_symm_whole` is computed for the full dataset but is not plotted below.
inh_symm_m = inh_symm(m["point_group"]).reindex(PG_LIST_ORDER)
inh_symm_whole = inh_symm(df["point_group"]).reindex(PG_LIST_ORDER)
inh_symm_nm = inh_symm(nm["point_group"]).reindex(PG_LIST_ORDER)

# Normalise each table by its own row for point group "1".
inh_symm_m = inh_symm_m / inh_symm_m.loc["1"]
inh_symm_nm = inh_symm_nm / inh_symm_nm.loc["1"]

# Merge both subsets into one two-column table for a grouped bar chart.
inh_symm_m.columns = ["magic"]
inh_symm_m["non-magic"] = inh_symm_nm["point_group"]

# NOTE(review): `fig` receives whatever `DataFrame.plot.barh` returns, which per
# the pandas docs is an Axes rather than a Figure.
fig = inh_symm_m.plot.barh(
    figsize=(8, 9), color={"magic": "red", "non-magic": "mediumblue"}
)
plt.ylabel("point group")
plt.xlabel("proportion of inherited symmetries")
sns.despine(left=True, bottom=True)

## MP data analysis 

In [None]:
import ase.io
import numpy as np

# Full-dataset read (currently disabled):
# frames = ase.io.read(DATA_MP.structures, index=":")
# NOTE(review): DEBUG subsampling is still active. The slice "::100" keeps only
# every 100th structure, so every descriptor and plot below is based on that
# subset. Switch back to the line above for a full run.
# This rebinds `frames`, replacing the 3DCD structures loaded earlier.
frames = ase.io.read(DATA_MP.structures, index="::100")  # DEBUG

In [None]:
from tqdm.auto import tqdm

# Call `wrap` on every structure, with a progress bar. The return value is not
# used: the cells below rely on `frames` having been modified in place.
for frame in tqdm(frames):
    frame.wrap(eps=1e-10)

### Compute geometric descriptors

In [None]:
# This cell should only create either delayed objects or perform extremely cheap operations.
from ase import Atoms
from math import pi
from pymatgen.analysis.local_env import CrystalNN
from tqdm.notebook import tqdm
from utils import get_pymatgen, point_group, get_r as get_radii
import dask.array as da
import dask.bag as db


def compute_cn(frame):
    """Return the mean coordination number over all sites of ``frame``.

    The structure is converted with ``get_pymatgen`` and each site index is
    passed to ``CrystalNN.get_cn``; the per-site values are then averaged.
    """
    analyzer = CrystalNN(
        weighted_cn=False,
        distance_cutoffs=None,
        x_diff_weight=0.0,
        porous_adjustment=False,
    )
    structure = get_pymatgen(frame)
    n_sites = len(frame)
    # Pre-fill with NaN so that a slot that was never written would show up in the mean.
    per_site_cn = np.full((n_sites,), np.nan)
    for site in range(n_sites):
        per_site_cn[site] = analyzer.get_cn(structure, site)
    return per_site_cn.mean()


def compute_point_group(frame):
    """Return the point group of ``frame``.

    The space group is determined with ASE (``symprec=1e-05``) and its number is
    translated by the project helper ``point_group``.
    """
    # Import the submodule explicitly: `import ase.io` alone does not guarantee
    # that `ase.spacegroup` has been loaded.
    from ase.spacegroup import get_spacegroup

    sg = get_spacegroup(frame, symprec=1e-05)
    return point_group(sg.no)


def get_num_species(frame):
    """Return the number of distinct chemical symbols in ``frame``.

    ``frame`` must provide ``get_chemical_symbols()``.
    """
    # The debug `print` and the unconditional `assert 0` were removed: with them
    # the function raised AssertionError on every call and never returned.
    elements = frame.get_chemical_symbols()
    return len(np.unique(elements))


def compute_x(radii):
    """Return the share of smallest-radius entries among the extreme radii.

    Counts how many entries equal the minimum radius and how many equal the
    maximum radius, and returns ``n_min / (n_min + n_max)``. When all radii are
    equal, both counts coincide and the result is 0.5.
    """
    smallest, largest = np.min(radii), np.max(radii)
    count_small = np.count_nonzero(radii == smallest)
    count_large = np.count_nonzero(radii == largest)
    return count_small / (count_small + count_large)


# NOTE(review): this cell repeats the 3DCD descriptor pipeline for the MP
# frames; consider factoring both into one function of `frames`.

# Spread the structures over a dask bag with roughly 10 frames per partition.
# `max(1, ...)` keeps the partition count valid when fewer than 10 frames are
# loaded (e.g. with the DEBUG subsampling), where `len(frames) // 10` is 0.
db_frames = db.from_sequence(frames, npartitions=max(1, len(frames) // 10))

# NOTE(review): unlike the remaining lines, these two compute eagerly
# (`.compute()`), which contradicts the "only delayed objects" note on this cell.
natoms = da.from_array(db_frames.map(len).compute())
# "Magic" flag: 1 when the number of atoms is a multiple of 4, otherwise 0.
magic = np.array((natoms % 4 == 0).compute(), dtype=int)

# Lazy per-structure descriptors; nothing below is evaluated until `compute`.
average_num_nearest_neighbors = db_frames.map(compute_cn)
elements = db_frames.map(lambda frame: frame.get_chemical_symbols())
num_species = elements.map(np.unique).map(len)
radii = elements.map(get_radii)
# Ratio of the smallest to the largest radius in the structure.
alpha = radii.map(lambda r: np.min(r) / np.max(r))
std_ratio = radii.map(np.std)
x = radii.map(compute_x)
point_groups = db_frames.map(compute_point_group)
# Sum of sphere volumes 4/3*pi*r^3 over all atoms. The division by 100 rescales
# the radii (presumably pm -> Angstrom; TODO confirm against `get_r`).
volume_atom = radii.map(lambda r: np.sum((np.pi * 4 / 3) * (np.array(r) / 100) ** 3))
volume = db_frames.map(lambda frame: frame.get_volume())

In [None]:
from dask import compute

# Evaluate all descriptors with a single `compute` call. The average
# coordination number (`average_num_nearest_neighbors`) is not requested here,
# so no `cn` entry is written to the archive.
computed = compute(
    magic,
    num_species,
    natoms,
    volume_atom,
    volume,
    alpha,
    x,
    std_ratio,
    point_groups,
)
(
    c_magic,
    c_num_species,
    c_natoms,
    c_volume_atom,
    c_volume,
    c_alpha,
    c_x,
    c_std_ratio,
    c_point_groups,
) = computed

# Packing fraction: summed atomic sphere volume divided by the cell volume.
packing_fraction = np.array(c_volume_atom) / np.array(c_volume)

# Store everything in one .npz archive; the plotting cell reads it back by key.
np.savez(
    DATA_MP.geo,
    magic=c_magic,
    num_species=c_num_species,
    natoms=c_natoms,
    packing=packing_fraction,
    alpha=c_alpha,
    x=c_x,
    std_ratio=c_std_ratio,
    pg_list=c_point_groups,
)

### Generate plots

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from utils import inh_symm

# Read the descriptors back from disk so this cell also works with a cached archive.
npzfile = np.load(DATA_MP.geo, allow_pickle=True)

# Only used below to choose the number of bins of the N_species histogram.
num_species = npzfile["num_species"]

# One row per structure; column labels double as (LaTeX) axis labels.
dict_ = {
    r"$\alpha$": npzfile["alpha"],
    r"$x$": npzfile["x"],
    r"$\sigma_{radii}$": npzfile["std_ratio"],
    r"$N_{atoms,cell}$": npzfile["natoms"].ravel(),
    r"$PF$": npzfile["packing"],
    "magic": npzfile["magic"].ravel(),
    r"$N_{species}$": npzfile["num_species"],
    "point_group": npzfile["pg_list"].ravel(),
}
df = pd.DataFrame(dict_)

# --- Histograms: magic vs. non-magic distributions of four descriptors ---

fig, (ax1, ax2, ax3, ax4) = plt.subplots(1, 4, figsize=(15, 5))

# Options common to all four panels; each hue group is normalised separately.
hist_kwargs = dict(
    data=df,
    hue="magic",
    stat="probability",
    legend=False,
    common_norm=False,
    multiple="dodge",
    shrink=0.8,
)

# Panel 1: number of species (no KDE, default axis label and limits kept).
sns.histplot(
    ax=ax1, x=r"$N_{species}$", bins=int(max(num_species)), kde=False, **hist_kwargs
)

# Panels 2-4: x, alpha and packing fraction, each with a KDE overlay.
# The packing-fraction panel uses seaborn's automatic binning.
sns.histplot(ax=ax2, x=r"$x$", binwidth=0.10, kde=True, **hist_kwargs)
sns.histplot(ax=ax3, x=r"$\alpha$", binwidth=0.1, kde=True, **hist_kwargs)
sns.histplot(ax=ax4, x=r"$PF$", kde=True, **hist_kwargs)

# These three panels drop their y label and are clipped to [0, 1].
for ax in (ax2, ax3, ax4):
    ax.set_ylabel(None)
    ax.set_xlim(0, 1)

sns.despine(left=True, bottom=True)
plt.show()
plt.close()

# --- Inherited symmetries per point group, magic vs. non-magic ---

m = df.loc[df["magic"] == 1]
nm = df.loc[df["magic"] == 0]

# Count inherited symmetries for each subset and order the rows as PG_LIST_ORDER.
# `inh_symm_whole` is computed for the full dataset but is not plotted below.
inh_symm_m = inh_symm(m["point_group"]).reindex(PG_LIST_ORDER)
inh_symm_whole = inh_symm(df["point_group"]).reindex(PG_LIST_ORDER)
inh_symm_nm = inh_symm(nm["point_group"]).reindex(PG_LIST_ORDER)

# Normalise each table by its own row for point group "1".
inh_symm_m = inh_symm_m / inh_symm_m.loc["1"]
inh_symm_nm = inh_symm_nm / inh_symm_nm.loc["1"]

# Merge both subsets into one two-column table for a grouped bar chart.
inh_symm_m.columns = ["magic"]
inh_symm_m["non-magic"] = inh_symm_nm["point_group"]

# NOTE(review): `fig` receives whatever `DataFrame.plot.barh` returns, which per
# the pandas docs is an Axes rather than a Figure.
fig = inh_symm_m.plot.barh(
    figsize=(8, 9), color={"magic": "red", "non-magic": "mediumblue"}
)
plt.ylabel("point group")
plt.xlabel("proportion of inherited symmetries")
sns.despine(left=True, bottom=True)