These produce a final .json file that can be used as input to chemiscope to reproduce the final figures of the manuscript.
The file can already be found in the downloaded folder.
However, if you run the notebook yourself, a new chemiscope file will be generated, along with new PCovR figures.

In [None]:
from data import DATA_MC3D, DATA_MP
import pickle
from IPython.display import clear_output
from sklearn.model_selection import learning_curve

In [None]:
%run ./modules.ipynb

# Two-colour scheme shared by the matplotlib colormap and the seaborn palette.
c = ["mediumblue", "red"]
my_c = colors.ListedColormap(list(c))

# Apply the global seaborn theme first, then override its palette with `c`.
sns.set(style="white", palette="muted", color_codes=True)
sns.set_palette(sns.color_palette(c))

mixing = 0.5  # beta parameter for PCovR
n_PC = 5  # number of principal components

In [None]:
import sys


def sizeof_fmt(num, suffix="B"):
    """Format a byte count as a human-readable string with binary prefixes.

    Adapted from Fred Cirera, https://stackoverflow.com/a/1094933/1870254.

    The value is divided by 1024 until its magnitude drops below 1024 and is
    then rendered with one decimal place, e.g. ``1536`` -> ``"1.5 KiB"``.
    Values that are still >= 1024 after the "Zi" prefix are reported in "Yi".

    Parameters
    ----------
    num : int or float
        Size to format (may be negative).
    suffix : str
        Unit symbol appended after the binary prefix.

    Returns
    -------
    str
        The formatted size.
    """
    for prefix in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
        if abs(num) >= 1024.0:
            # Too large for this prefix: scale down and try the next one.
            num /= 1024.0
            continue
        return f"{num:3.1f} {prefix}{suffix}"
    return f"{num:.1f} Yi{suffix}"


# Report the ten largest objects in the current namespace, biggest first.
# sys.getsizeof is shallow: it does not follow references into containers.
for name, size in sorted(
    ((var, sys.getsizeof(obj)) for var, obj in locals().items()),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]:
    print(f"{name:>30}: {sizeof_fmt(size):>8}")

In [None]:
# MC3D dataset: SOAP archive, structures and the integer "magic" labels.
npzfile_MC3D = np.load(DATA_MC3D.soap, allow_pickle=True)
my_frames_MC3D = ase.io.read(DATA_MC3D.structures, index=":")

# Column-vector form of the labels ...
magic_MC3D = np.array(npzfile_MC3D["magic"], dtype=int).reshape(-1, 1)
# ... and an independent flat copy used as the classification target.
y_magic_MC3D = magic_MC3D.flatten()

In [None]:
# MP dataset: SOAP archive, structures and the integer "magic" labels.
npzfile_mp = np.load(DATA_MP.soap, allow_pickle=True)
my_frames_mp = ase.io.read(DATA_MP.structures, index=":")

# Column-vector form of the labels ...
magic_mp = np.array(npzfile_mp["magic"], dtype=int).reshape(-1, 1)
# ... and an independent flat copy used as the classification target.
y_magic_mp = magic_mp.flatten()

In [None]:
# Pool the structures of both datasets, MC3D first and MP second, into two
# separately constructed arrays (a working set and an "original" set).
# NOTE(review): np.array may treat sequence-like frame objects as nested
# sequences -- confirm the resulting shape/dtype is what downstream code expects.
my_frames = np.array(list(my_frames_MC3D) + list(my_frames_mp))
my_orig_frames = np.array(list(my_frames_MC3D) + list(my_frames_mp))

# The per-dataset lists are no longer needed.
del my_frames_MC3D, my_frames_mp

In [None]:
# Join the flat label vectors in the same MC3D-then-MP order as the frames.
y_magic = np.concatenate([y_magic_MC3D, y_magic_mp])

# Drop the per-dataset label arrays now that the pooled vector exists.
del y_magic_MC3D, y_magic_mp, magic_MC3D, magic_mp

In [None]:
# Reuse the stored train/test indices when they exist; otherwise draw a new
# 90/10 split and store it for later runs.
if os.path.exists("train_indices_all.npy"):
    i_train, i_test = np.load("train_indices_all.npy"), np.load("test_indices_all.npy")
    y_train, y_test = y_magic[i_train], y_magic[i_test]
else:
    print("generating")
    # Size the index range from the labels: the feature matrix `X` is only
    # loaded in a later cell, so referencing it here raised NameError on a
    # fresh run. train_test_split requires both arrays to have equal length,
    # so this yields the same index range.
    i_train, i_test, y_train, y_test = train_test_split(
        np.arange(y_magic.shape[0]), y_magic, train_size=0.9
    )
    np.save("train_indices_all.npy", i_train)
    np.save("test_indices_all.npy", i_test)

In [None]:
r = 4.0  # value embedded in the file names below (presumably the SOAP cutoff -- confirm)
X_raw = np.load(f"soaps_{r}.npy")

# NOTE: unpickling executes arbitrary code; only load .sav files you trust.
# Context managers make sure the file handles are closed after loading.
with open(f"x_scaler_blanked_{r}.sav", "rb") as scaler_file:
    x_scaler = pickle.load(scaler_file)

# Scaled features for the full dataset and for each side of the split.
X = x_scaler.transform(X_raw)
X_train, X_test = X_raw[i_train], X_raw[i_test]
del X_raw  # release the unscaled matrix before further allocations
X_train = x_scaler.transform(X_train)
X_test = x_scaler.transform(X_test)

with open(f"random_forest_all_blanked_{r}.sav", "rb") as clf_file:
    clf = pickle.load(clf_file)

In [None]:
# Score of the loaded classifier on the held-out test set.
print("Accuracy on test set:", clf.score(X_test, y_test.ravel()), sep="")

In [None]:
# Stack the raw "X_raw" features of both archives row-wise (MC3D first, then
# MP) without keeping the per-dataset arrays around.
X_raw_tagged = np.vstack((npzfile_MC3D["X_raw"], npzfile_mp["X_raw"]))

# Fit a scaler (column_wise=False) on the stacked features.
x_tagged_scaler = StandardFlexibleScaler(column_wise=False)
x_tagged_scaler.fit(X_raw_tagged)

In [None]:
# Scale the stacked features, then take the train and test rows using the
# same index arrays as for the other feature set.
X_tagged = x_tagged_scaler.transform(X_raw_tagged)
X_tagged_train, X_tagged_test = X_tagged[i_train], X_tagged[i_test]

# Only the two subsets are kept.
del X_tagged, X_raw_tagged

In [None]:
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.svm import LinearSVC as SVC
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import SGDClassifier as SGDC
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.ensemble import AdaBoostClassifier as ABC
from sklearn.naive_bayes import GaussianNB as GNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

In [None]:
# Registry of classifiers to benchmark. Each entry maps a display label to
# the unfitted model and the base name of the file its fitted copy is
# pickled under. Insertion order is preserved.
estimators = {
    label: {"model": model, "save_name": save_name}
    for label, model, save_name in (
        (
            "Random Forest",
            RandomForestClassifier(
                n_estimators=100, random_state=2, n_jobs=4, verbose=2
            ),
            "random_forest_all",
        ),
        ("Decision Tree", DTC(max_features=80, random_state=2), "dtc_all"),
        ("Linear SVM", SVC(random_state=2, verbose=2), "svc_all"),
        (
            "Cross-Validated Logistic Regression",
            LogisticRegressionCV(cv=2, random_state=2, n_jobs=4, verbose=2),
            "cvlr_all",
        ),
        (
            "Stochastic Gradient Descent Classifier",
            SGDC(max_iter=100, random_state=2, n_jobs=4, verbose=2),
            "sgdc_all",
        ),
        ("QDA", QDA(), "qda_all"),
        ("Gaussian Naive Bayes", GNB(), "gnb"),
        ("MLP Classifier", MLP(random_state=2, verbose=2), "mlp"),
    )
}

In [None]:
# Opening of the LaTeX table that summarises the classifiers.
# The tabular spec declares four columns to match the four cells of every
# row (label, species-invariant score, species-tagged score, parameters);
# with only three declared, LaTeX fails with "Extra alignment tab".
# The score columns are labelled as accuracy because they hold the value of
# each classifier's .score(), in line with the table caption.
# Adjust the column widths if the table overflows the page.
class_table = (
    "\\begin{table}[htbp!]"
    + "\n\\centering\n\\begin{tabular}{|m{2.0cm}|m{2.0cm}|m{2.0cm}|m{10cm}|}"
    + "\n\\hline\nClassifier & Test Set Accuracy & Test Set Accuracy &Classifier parameters\\\\"
    + "\n\\hline\n & (Species-Invariant) & (Species-Tagged) &\\\\"
    + "\n\\hline"
    + "\n\\hline"
)

def _fit_or_load(label, model, path, X_fit, y_fit):
    """Return the classifier pickled at `path`, fitting and saving it first if absent.

    Parameters
    ----------
    label : str
        Display name used in the progress message.
    model : estimator
        Unfitted (or refittable) estimator exposing ``fit``.
    path : str
        Pickle file used as the on-disk cache.
    X_fit, y_fit : array-like
        Training features and labels used when the cache file is missing.

    Returns
    -------
    estimator
        The estimator unpickled from `path`.
    """
    if not os.path.exists(path):
        print(f"Fitting {label}")
        # Fit before opening the file so a failed or interrupted fit does not
        # leave an empty cache file behind.
        fitted = model.fit(X_fit, y_fit)
        with open(path, "wb") as handle:
            pickle.dump(fitted, handle)
        print(path)
    # NOTE: unpickling executes arbitrary code; only load files you trust.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# For every registered classifier: obtain the model for each feature set
# (cached on disk), record its test-set score, and append one table row.
for e, v in estimators.items():
    save_name = v["save_name"]

    # Scores already stored on the entry are reused on re-runs.
    if "blanked_error" not in v:
        model = _fit_or_load(e, v["model"], save_name + "_XX.sav", X_train, y_train)
        v["blanked_error"] = model.score(X_test, y_test)
    if "tagged_error" not in v:
        model = _fit_or_load(
            e, v["model"], save_name + ".sav", X_tagged_train, y_train
        )
        v["tagged_error"] = model.score(X_tagged_test, y_test)

    # Render the estimator's parameters; underscores are replaced by spaces
    # in parameter names and in string values.
    params = ", ".join(
        f'\\texttt{{{key.replace("_", " ")}}}: {value.replace("_", " ") if isinstance(value, str) else value}'
        for key, value in v["model"].get_params().items()
    )
    class_table += f'\n{e} & {round(v.get("blanked_error", -1),3)} & {round(v.get("tagged_error", -1),3)} &\\{{{params}\\}}\\\\\n\\hline'

# Close the tabular environment and append the caption, label and end of
# the table float.
class_table += "\n".join(
    [
        "\\end{tabular}",
        "\\caption{Accuracy on test set achieved by different classifiers.}  ",
        "\\label{table:classif}",
        "\\end{table}",
    ]
)

In [None]:
# Print the assembled LaTeX source of the classifier table.
print(class_table)