This notebook is just a demonstration of how the SOAP vectors are obtained starting from the .xyz files. 
The soap.npz files have already been generated and can be downloaded, so there is no real need to run this notebook. 
If you decide to run it anyway on the production data, be prepared to wait for a while, depending on the computing power and RAM available on your machine. The results are saved as `soap.npz` in the `../r4data/3DCD/` and `../r4data/MP/` folders. 

In [None]:
from data import DATA_3DCD, DATA_MP

In [None]:
%run ./modules.ipynb

In [None]:
# Early notice for the reader: when both SOAP archives are already on disk,
# running the remaining cells is optional.
if DATA_3DCD.soap.exists() and DATA_MP.soap.exists():
    print("Data already present, no need to execute this notebook.")

In [None]:
# Read every structure (index=":") of both datasets with ASE, then show the two
# dataset sizes as the cell output.
frames_3dcd, frames_mp = (
    ase.io.read(dataset.structures, index=":") for dataset in (DATA_3DCD, DATA_MP)
)
len(frames_3dcd), len(frames_mp)

In [None]:
# --- Settings for the 3DCD pass ---

# Dataset processed by the cells that follow.
frames = frames_3dcd

# How many structures are analysed; using the whole dataset takes some time.
n_frames = len(frames)

# How many of the most diverse features are selected.
n_FPS = 2000

# How many principal components to include in the analysis (min=2).
n_PC = 5

In [None]:
# Call wrap() on every structure (this modifies the frames in place) and flag
# with 1 the structures whose number of atoms is a multiple of four; every
# other entry keeps its initial 0.
magic = np.zeros(len(frames))
for position, structure in enumerate(tqdm(frames)):
    structure.wrap(eps=1e-12)
    if not len(structure) % 4:
        magic[position] = 1

In [None]:
# Distinct values found in `frame.numbers` across the whole dataset.
species = list({int(number) for frame in frames for number in frame.numbers})

# Keyword arguments shared by every SOAP calculator built for this dataset.
hypers = {
    "soap_type": "PowerSpectrum",
    "interaction_cutoff": 3.5,
    "max_radial": 4,
    "max_angular": 4,
    "gaussian_sigma_type": "Constant",
    "gaussian_sigma_constant": 0.5,
    "cutoff_smooth_width": 0.5,
    "global_species": species,
    "expansion_by_species_method": "user defined",
    "normalize": False,
}

In [None]:
# Calculator without any coefficient subselection; its features feed the
# feature-selection step below.
fps_soap = SOAP(**hypers)

# Draw n_FPS frame indices in [0, len(frames)). randint samples with
# replacement, so the same frame may be picked more than once.
# NOTE(review): no seed is set in the visible cells, so this draw changes from
# run to run unless modules.ipynb seeds NumPy — confirm.
idx_for_fps = np.random.randint(low=0, high=len(frames), size=n_FPS)

In [None]:
# Structures on which the feature selection is based.
frames_for_fps = [frames[frame_index] for frame_index in idx_for_fps]

In [None]:
# One row per sampled structure: the SOAP features of that structure averaged
# over axis 0 (presumably the atomic centres — confirm against the SOAP API).
X_temp = np.array(
    [
        np.mean(fps_soap.transform([structure]).get_features(fps_soap), axis=0)
        for structure in tqdm(frames_for_fps, desc="Compute SOAP features")
    ]
)

In [None]:
# Keep only the features whose variance over the sampled frames exceeds THRESH.
THRESH = 1e-12
feature_variance = np.var(X_temp - X_temp.mean(axis=0), axis=0)
high_var_features = np.where(feature_variance > THRESH)[0]
X_temp = X_temp[:, high_var_features]

# Scale the remaining features, then fit FPS to pick n_FPS of them.
X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(X_temp)
del X_temp  # the unscaled copy is no longer needed; free its memory
fps = FPS(n_to_select=n_FPS, progress_bar=True).fit(X_raw)
del X_raw

# Keys of the full calculator, used below to decode the selected feature indices.
u_species = np.unique(species)
sp_pairs = fps_soap.get_keys(u_species)

In [None]:
# Translate every FPS-selected feature index into its (a, b, n1, n2, l) labels.
subselection_keys = ("a", "b", "n1", "n2", "l")
index_mapping = get_power_spectrum_index_mapping(
    sp_pairs, n_max=hypers["max_radial"], l_max=hypers["max_angular"] + 1
)
coefficient_subselection = np.zeros((n_FPS, 5))
# fps.selected_idx_ indexes the variance-filtered columns, so map it back to
# indices of the full feature vector first.
selected_features = high_var_features[fps.selected_idx_]
for row, feature_index in enumerate(selected_features):
    labels = index_mapping[feature_index]
    coefficient_subselection[row] = [labels[key] for key in subselection_keys]

# Same table, column by column, in the dict-of-lists form handed to SOAP.
coefficient_subselection_dict = {
    key: coefficient_subselection[:, column].tolist()
    for column, key in enumerate(subselection_keys)
}

In [None]:
# Calculator restricted to the FPS-selected coefficients.
soap = SOAP(coefficient_subselection=coefficient_subselection_dict, **hypers)

# Draw n_frames indices in [0, len(frames)) with replacement.
# NOTE(review): n_frames equals len(frames) in the settings cell, so this is a
# bootstrap-like sample (some frames repeated, others left out) rather than the
# whole dataset — confirm this is intended.
idx = np.random.randint(low=0, high=len(frames), size=n_frames)
my_frames = [frames[frame_index] for frame_index in idx]

# Turn the flags into a column vector ordered like my_frames. Re-running this
# cell indexes the already reordered array a second time.
magic = magic.reshape(-1, 1)[idx]

# One row of n_FPS averaged SOAP features per sampled structure.
X_raw = np.zeros((len(my_frames), n_FPS))
for row, structure in enumerate(tqdm(my_frames, desc="Get SOAP features")):
    X_raw[row] = np.mean(soap.transform([structure]).get_features(soap), axis=0)

In [None]:
# Sanity check: X_raw was allocated as (len(my_frames), n_FPS).
X_raw.shape

Save the SOAP vectors to an .npz file in your repository so that you don't have to run the cells above every time (about 14 MB for n_FPS=2000, n_PC=5, n_frames=840).

In [None]:
from pathlib import Path

# Make sure the configured 3DCD target is the expected file before anything is
# written to it.
expected_soap_path = Path("../r4data/3DCD/soap.npz").resolve()
assert DATA_3DCD.soap.resolve() == expected_soap_path

In [None]:
# Write the 3DCD results only when the archive is not on disk yet, mirroring
# the guard used for the MP archive: an existing (e.g. downloaded) soap.npz is
# no longer silently replaced by a freshly re-sampled one. Delete the file
# first if you really want to regenerate it.
# np.savez returns None, so its result is not kept.
# NOTE(review): my_frames is a list of structure objects, presumably stored as
# a pickled object array; loading it back would then need allow_pickle=True —
# confirm.
if not DATA_3DCD.soap.exists():
    print("Saving the data if it doesn't already exist")
    np.savez(
        DATA_3DCD.soap,
        idx=idx,
        my_frames=my_frames,
        magic=magic,
        X_raw=X_raw,
        coeff=coefficient_subselection,
    )
else:
    print("SOAP file already exists, not overwriting it")

In [None]:
# --- Settings for the MP pass ---

# Number of most diverse features selected.
n_FPS = 2000

# Number of structures analysed; the whole dataset will take some time.
n_frames = len(frames_mp)

# Number of principal components to include in the analysis (min=2).
n_PC = 5

In [None]:
# From here on the pipeline runs on the MP structures.
frames = frames_mp

# Call wrap() on every structure (this modifies the frames in place) and flag
# with 1 the structures whose number of atoms is a multiple of four; every
# other entry keeps its initial 0.
magic = np.zeros(len(frames))
for position, structure in enumerate(tqdm(frames)):
    structure.wrap(eps=1e-12)
    if not len(structure) % 4:
        magic[position] = 1

In [None]:
# Distinct values found in `frame.numbers` across the MP dataset.
species = list(set([int(n) for frame in frames for n in frame.numbers]))

# Keyword arguments shared by both SOAP calculators of the MP pass.
hypers = dict(
    soap_type="PowerSpectrum",
    interaction_cutoff=3.5,
    max_radial=4,
    max_angular=4,
    gaussian_sigma_type="Constant",
    gaussian_sigma_constant=0.5,
    cutoff_smooth_width=0.5,
    global_species=species,
    expansion_by_species_method="user defined",
    normalize=False,
)

# Calculator without any coefficient subselection; its features feed the
# feature-selection step below.
fps_soap = SOAP(**hypers)

# Draw n_FPS frame indices with replacement (the same frame may repeat).
# NOTE(review): no seed is set in the visible cells, so this draw changes from
# run to run unless modules.ipynb seeds NumPy — confirm.
idx_for_fps = np.random.randint(0, len(frames), n_FPS)
frames_for_fps = [frames[i] for i in idx_for_fps]

# One row per sampled structure: its SOAP features averaged over axis 0.
# The loop is wrapped in tqdm, like the 3DCD pass, so this long-running step
# reports progress instead of appearing to hang.
X_temp = np.array(
    [
        np.mean(fps_soap.transform([frame]).get_features(fps_soap), axis=0)
        for frame in tqdm(frames_for_fps, desc="Compute SOAP features")
    ]
)

# Keep only the features whose variance over the sampled frames exceeds THRESH.
THRESH = 1e-12
feature_variance = np.var(X_temp - X_temp.mean(axis=0), axis=0)
high_var_features = np.where(feature_variance > THRESH)[0]
X_temp = X_temp[:, high_var_features]

# Scale the remaining features, then fit FPS to pick n_FPS of them.
X_raw = StandardFlexibleScaler(column_wise=False).fit_transform(X_temp)
del X_temp  # the unscaled copy is no longer needed; free its memory
fps = FPS(n_to_select=n_FPS, progress_bar=True).fit(X_raw)
del X_raw

# Keys of the full calculator, used below to decode the selected feature indices.
u_species = np.unique(species)
sp_pairs = fps_soap.get_keys(u_species)

# Translate every FPS-selected feature index into its (a, b, n1, n2, l) labels.
subselection_keys = ("a", "b", "n1", "n2", "l")
index_mapping = get_power_spectrum_index_mapping(
    sp_pairs, n_max=hypers["max_radial"], l_max=hypers["max_angular"] + 1
)
coefficient_subselection = np.zeros((n_FPS, 5))
# fps.selected_idx_ indexes the variance-filtered columns, so map it back to
# indices of the full feature vector first.
selected_features = high_var_features[fps.selected_idx_]
for row, feature_index in enumerate(selected_features):
    labels = index_mapping[feature_index]
    coefficient_subselection[row] = [labels[key] for key in subselection_keys]

# Same table, column by column, in the dict-of-lists form handed to SOAP.
coefficient_subselection_dict = {
    key: coefficient_subselection[:, column].tolist()
    for column, key in enumerate(subselection_keys)
}

# Calculator restricted to the FPS-selected coefficients.
soap = SOAP(**hypers, coefficient_subselection=coefficient_subselection_dict)

# Draw n_frames indices in [0, len(frames)) with replacement.
# NOTE(review): n_frames equals len(frames_mp) in the settings cell, so this is
# a bootstrap-like sample (some frames repeated, others left out) rather than
# the whole dataset — confirm this is intended.
idx = np.random.randint(0, len(frames), n_frames)
my_frames = [frames[i] for i in idx]

# Turn the flags into a column vector ordered like my_frames. Re-running this
# cell indexes the already reordered array a second time.
magic = magic.reshape(-1, 1)[idx]

# One row of n_FPS averaged SOAP features per sampled structure. The loop is
# wrapped in tqdm, like the 3DCD pass, so progress is visible.
X_raw = np.zeros((len(my_frames), n_FPS))
for fi, frame in enumerate(tqdm(my_frames, desc="Get SOAP features")):
    X_raw[fi] = np.mean(soap.transform([frame]).get_features(soap), axis=0)

In [None]:
# Write the MP results only when the archive is not on disk yet. The target is
# DATA_MP.soap, the same path the guard checks, instead of a hard-coded string
# that could drift away from the configured location.
# np.savez returns None, so its result is not kept.
# NOTE(review): my_frames is a list of structure objects, presumably stored as
# a pickled object array; loading it back would then need allow_pickle=True —
# confirm.
if not DATA_MP.soap.exists():
    print("Saving the data if it doesn't already exist")
    np.savez(
        DATA_MP.soap,
        idx=idx,
        my_frames=my_frames,
        magic=magic,
        X_raw=X_raw,
        coeff=coefficient_subselection,
    )
else:
    print("SOAP file already exists, not overwriting it")