# Practice notebook

Index

[CaMgO dataset](#camgo)


In [23]:
import os
import copy
import json
import itertools
import shutil as sh
from pathlib import Path
import numpy as np
import pandas as pd
import time
from datetime import datetime

from CRYSTALpytools.crystal_io import Crystal_output, Crystal_input, Crystal_density, Crystal_gui
from CRYSTALpytools.convert import cry_gui2pmg, cry_out2pmg
from CRYSTALpytools.utils import view_pmg

from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.io.cif import CifWriter
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer, PointGroupAnalyzer

from ase.visualize import view

from dscribe.descriptors import CoulombMatrix
from dscribe.descriptors import SOAP
from dscribe.descriptors import MBTR

from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15,15)

# <a id='camgo'>CaMgO dataset</a>

### Structures

- structures is a list of all the structures as pymatgen objects

In [18]:
# Element written onto every substituted site.
new_atom = 'Mg'

# Read the CRYSTAL configuration-counting output and run its configuration analysis;
# the `atom_type1` / `atom_type2` attributes used below are read from this object.
#cry_output = Crystal_output('data/classification/ml/cao_CONFCNT.out')
cry_output = Crystal_output().read_cry_output('data/crystal/cao_confcount.out')
cry_output.get_config_analysis()

# Parent structure: read the .gui file, then convert it to a pymatgen object.
#original_structure = cry_gui2pmg('data/classification/ml/cao_CONFCNT.gui')
original_structure_gui = Crystal_gui().read_cry_gui('data/crystal/cao_confcount.gui')
original_structure = cry_gui2pmg(original_structure_gui)

structures_mco = []
ca_atoms = []
mg_atoms = []
# NOTE(review): the sites replaced by `new_atom` and the entries stored in `ca_atoms`
# both come from `atom_type1`, while `mg_atoms` comes from `atom_type2` -- confirm that
# this labelling matches what get_config_analysis() reports.
for config_idx, substitutions in enumerate(cry_output.atom_type1):
    substituted = original_structure.copy()
    for site_label in substitutions:
        # Labels are shifted down by one before indexing the structure
        # (presumably 1-based labels -> 0-based site indices; verify).
        substituted.replace(site_label - 1, new_atom)
    structures_mco.append(substituted)

    type1_sites = np.array(substitutions) - 1
    type2_sites = np.array(cry_output.atom_type2[config_idx]) - 1
    ca_atoms.append(type1_sites.tolist())
    mg_atoms.append(type2_sites.tolist())

# Independent working copy, so `structures_mco` is left untouched by later cells.
structures = copy.deepcopy(structures_mco)

### Single point energies & band gap

- `energies_sp` is a list of single point energies (non-optimised geometry)
- `gap_sp` is a list of band gap values for the non-optimised geometry

In [19]:
# Collect the single point energy and band gap of every configuration.
#
# A configuration whose output file yields no final energy is skipped, so
# `energies_sp` / `gap_sp` can be shorter than `structures`. `sp_indices`
# records the index into `structures` of each retained entry, so the values
# can still be matched to the right structure (and descriptor) afterwards.
energies_sp = []
gap_sp = []
sp_indices = []
for i in range(len(structures)):
    crystal_output = Crystal_output().read_cry_output('./data/crystal/sp/CaMgO_sp_%s.out'%str(i))
    # Query the energy once and reuse it instead of parsing the output twice.
    final_energy = crystal_output.get_final_energy()
    # Identity test: `None` is a singleton and `!=` can be overridden by the operand.
    if final_energy is not None:
        energies_sp.append(final_energy)
        gap_sp.append(crystal_output.get_band_gap())
        sp_indices.append(i)

## Descriptors

In [20]:
# Descriptor configuration
#
# NOTE(review): `atomic_numbers` and `cm` are not referenced by any cell visible in
# this section (the Coulomb-matrix cell builds its own `cm_ds`) -- confirm they are
# still needed.
atomic_numbers = [1, 8]

# Hyper-parameters forwarded to the SOAP descriptor below.
rcut = 6.0
nmax = 8
lmax = 6

# Coulomb-matrix descriptor
cm = CoulombMatrix(n_atoms_max=6)

# SOAP descriptor
# NOTE(review): SOAP and MBTR are both created with periodic=False -- confirm this is
# intended for the structures they are applied to.
soap = SOAP(
    species=["Ca", "Mg", "O"],
    r_cut=rcut,
    n_max=nmax,
    l_max=lmax,
    periodic=False,
)

# MBTR descriptor: k1, k2 and k3 terms, each on a 100-point grid with sigma 0.1;
# k2 and k3 use the same exponential weighting settings.
mbtr = MBTR(
    species=["Ca", "O", "Mg"],
    k1=dict(
        geometry=dict(function="atomic_number"),
        grid=dict(min=0, max=8, n=100, sigma=0.1),
    ),
    k2=dict(
        geometry=dict(function="inverse_distance"),
        grid=dict(min=0, max=1, n=100, sigma=0.1),
        weighting=dict(function="exp", scale=0.5, threshold=1e-3),
    ),
    k3=dict(
        geometry=dict(function="cosine"),
        grid=dict(min=-1, max=1, n=100, sigma=0.1),
        weighting=dict(function="exp", scale=0.5, threshold=1e-3),
    ),
    periodic=False,
    normalization="l2_each",
)

In [21]:
# Convert every pymatgen structure with AseAtomsAdaptor.get_atoms; the converted
# objects are what the DScribe descriptor cells below receive.
# NOTE: despite its name, `NotAse_struct` holds the output of the ASE adaptor.
ase_adaptor = AseAtomsAdaptor()
NotAse_struct = [ase_adaptor.get_atoms(structure) for structure in structures]

In [24]:
# Coulomb-matrix descriptor with DScribe, computed one structure at a time
cm_ds = CoulombMatrix(n_atoms_max=56,permutation="eigenspectrum")
cm_dscribe_list = []
start = datetime.now()
for atoms in NotAse_struct:
    # Keep only the real part of the descriptor output.
    dscribe_matrix = np.real(cm_ds.create([atoms]))
    cm_dscribe_list.append(dscribe_matrix)
    now = datetime.now()
    n_done = len(cm_dscribe_list)
    # Progress report every 200 structures.
    if n_done % 200 == 0:
        print("matrices read:", n_done,", time:", (now - start))

print('Number of matrices read: ', len(cm_dscribe_list))
print("--- %s time taken ---" % (datetime.now()- start))

matrices read: 200 , time: 0:00:00.050199
matrices read: 400 , time: 0:00:00.081432
matrices read: 600 , time: 0:00:00.109489
matrices read: 800 , time: 0:00:00.136968
matrices read: 1000 , time: 0:00:00.167572
matrices read: 1200 , time: 0:00:00.195467
matrices read: 1400 , time: 0:00:00.222756
matrices read: 1600 , time: 0:00:00.250474
matrices read: 1800 , time: 0:00:00.279025
matrices read: 2000 , time: 0:00:00.310541
matrices read: 2200 , time: 0:00:00.339295
matrices read: 2400 , time: 0:00:00.377436
matrices read: 2600 , time: 0:00:00.405539
matrices read: 2800 , time: 0:00:00.433332
matrices read: 3000 , time: 0:00:00.461434
matrices read: 3200 , time: 0:00:00.489988
matrices read: 3400 , time: 0:00:00.518328
matrices read: 3600 , time: 0:00:00.546419
matrices read: 3800 , time: 0:00:00.581447
matrices read: 4000 , time: 0:00:00.609910
Number of matrices read:  4023
--- 0:00:00.613200 time taken ---


In [26]:
# Coulomb-matrix descriptor with matminer, computed one structure at a time
# NOTE(review): `sf` is not imported in the import cell visible in this notebook --
# presumably `from matminer.featurizers import structure as sf`; confirm and add the
# import to the top cell so the notebook survives Restart & Run All.
cm_mm = sf.CoulombMatrix(flatten=True)
cm_matminer_list = []
start = datetime.now()
for structure in structures:
    # The featurizer is fitted on the single structure right before featurizing it.
    matminer_matrix = cm_mm.fit([structure])
    featurized_structure = matminer_matrix.featurize(structure)
    cm_matminer_list.append(featurized_structure)
    now = datetime.now()
    n_done = len(cm_matminer_list)
    # Progress report every 200 structures.
    if n_done % 200 == 0:
        print("matrices read:", n_done,", time:", (now - start))

print('Number of matrices read: ', len(cm_matminer_list))
print("--- %s time taken ---" % ((datetime.now() - start)))

matrices read: 200 , time: 0:00:18.980661
matrices read: 400 , time: 0:00:38.250775
matrices read: 600 , time: 0:00:56.780742
matrices read: 800 , time: 0:01:14.876733
matrices read: 1000 , time: 0:01:32.994759
matrices read: 1200 , time: 0:01:51.016947
matrices read: 1400 , time: 0:02:09.048188
matrices read: 1600 , time: 0:02:27.076852
matrices read: 1800 , time: 0:02:45.187393
matrices read: 2000 , time: 0:03:03.191558
matrices read: 2200 , time: 0:03:21.180393
matrices read: 2400 , time: 0:03:39.196877
matrices read: 2600 , time: 0:03:57.173002
matrices read: 2800 , time: 0:04:15.193776
matrices read: 3000 , time: 0:04:33.173697
matrices read: 3200 , time: 0:04:51.160521
matrices read: 3400 , time: 0:05:09.139729
matrices read: 3600 , time: 0:05:27.145687
matrices read: 3800 , time: 0:06:19.340803
matrices read: 4000 , time: 0:07:23.543777
Number of matrices read:  4023
--- 0:07:25.750529 time taken ---


In [27]:
# SOAP descriptor, computed one structure at a time with the `soap` object
# configured in the descriptor set-up cell
soap_dscribe_list = []
start = datetime.now()
for atoms in NotAse_struct:
    # Keep only the real part of the descriptor output.
    soap_matrix = np.real(soap.create([atoms]))
    soap_dscribe_list.append(soap_matrix)
    now = datetime.now()
    n_done = len(soap_dscribe_list)
    # Progress report every 200 structures.
    if n_done % 200 == 0:
        print("matrices read:", n_done,", time:", (now - start))

print('Number of matrices read: ', len(soap_dscribe_list))
print("--- %s time taken ---" % ((datetime.now() - start)))

matrices read: 200 , time: 0:00:00.754888
matrices read: 400 , time: 0:00:01.757691
matrices read: 600 , time: 0:00:02.486868
matrices read: 800 , time: 0:00:03.348624
matrices read: 1000 , time: 0:00:04.293102
matrices read: 1200 , time: 0:00:05.422214
matrices read: 1400 , time: 0:00:06.117106
matrices read: 1600 , time: 0:00:06.648000
matrices read: 1800 , time: 0:00:07.164531
matrices read: 2000 , time: 0:00:07.689149
matrices read: 2200 , time: 0:00:08.203683
matrices read: 2400 , time: 0:00:08.760536
matrices read: 2600 , time: 0:00:09.339357
matrices read: 2800 , time: 0:00:09.948847
matrices read: 3000 , time: 0:00:10.490252
matrices read: 3200 , time: 0:00:11.095575
matrices read: 3400 , time: 0:00:11.969868
matrices read: 3600 , time: 0:00:13.464492
matrices read: 3800 , time: 0:00:14.340422
matrices read: 4000 , time: 0:00:15.034460
Number of matrices read:  4023
--- 0:00:15.117159 time taken ---


In [None]:
# MBTR descriptor, computed one structure at a time with the `mbtr` object
# configured in the descriptor set-up cell
mbtr_dscribe_list = []
start = datetime.now()
for atoms in NotAse_struct:
    # Take the real part of the MBTR output itself and store that. Reading
    # `soap_matrix` here would fill the list with the last SOAP result left
    # over from the SOAP cell instead of the MBTR descriptors.
    mbtr_matrix = np.real(mbtr.create([atoms]))
    mbtr_dscribe_list.append(mbtr_matrix)
    now = datetime.now()
    n_done = len(mbtr_dscribe_list)
    # Progress report every 200 structures.
    if n_done % 200 == 0:
        print("matrices read:", n_done,", time:", (now - start))
print('Number of matrices read: ', len(mbtr_dscribe_list))
print("--- %s time taken ---" % ((datetime.now() - start)))

matrices read: 200 , time: 0:00:16.962957
matrices read: 400 , time: 0:00:32.206095
matrices read: 600 , time: 0:00:47.616529
matrices read: 800 , time: 0:01:02.940795
matrices read: 1000 , time: 0:01:18.494297
matrices read: 1200 , time: 0:01:34.577250
matrices read: 1400 , time: 0:01:49.982984
matrices read: 1600 , time: 0:02:05.156014
matrices read: 1800 , time: 0:02:20.340043
matrices read: 2000 , time: 0:02:35.625311
matrices read: 2200 , time: 0:02:50.776213
matrices read: 2400 , time: 0:03:05.897166
matrices read: 2600 , time: 0:03:20.939322
matrices read: 2800 , time: 0:03:36.021859
matrices read: 3000 , time: 0:03:51.075341
matrices read: 3200 , time: 0:04:06.129055
matrices read: 3400 , time: 0:04:21.259476
matrices read: 3600 , time: 0:04:36.353780
matrices read: 3800 , time: 0:04:51.398104
