# Machine learning for solid solutions (Li2TiS3)

This notebook is divided into these sections:
- [LTS dataset](#lts)
    - write CRYSTAL input files
    - read CRYSTAL output files
- [Descriptors](#descriptors)
- [Machine learning](#ml)
    - linear regression
- [Protocol](#protocol)
    - [symmetry analysis](#symmetry)

In [2]:
import os
import copy
import json
import itertools
import shutil as sh
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime

from CRYSTALpytools.crystal_io import Crystal_output, Crystal_input, Crystal_density, Crystal_gui
from CRYSTALpytools.convert import cry_gui2pmg, cry_out2pmg
from CRYSTALpytools.utils import view_pmg

from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.io.cif import CifWriter
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer, PointGroupAnalyzer

from ase.visualize import view

#from dscribe.descriptors import CoulombMatrix

from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (15,15)

# <a id='lts'>LTS dataset - pymatgen</a>

In [3]:
# Species written onto every substituted site
new_atom = 'Li'

# Parse the confcount output and run its configuration analysis, which
# provides the per-configuration index lists `atom_type1` / `atom_type2`
# read in the loop below.
cry_output = Crystal_output().read_cry_output('data/crystal/lts/lts_confcount.out')
cry_output.get_config_analysis()

# Parent structure (before substitution): .gui file -> pymatgen structure
original_structure_gui = Crystal_gui().read_cry_gui('data/crystal/lts/lts_confcount.gui')
original_structure = cry_gui2pmg(original_structure_gui)

structures_lts = []
li_atoms = []
ti_atoms = []
for config_idx, substituted_sites in enumerate(cry_output.atom_type2):
    # Work on a copy so the parent structure is never modified.
    new_structure = original_structure.copy()
    # Indices are shifted by -1 before use; presumably the CRYSTAL output
    # numbers atoms from 1 while pymatgen sites are 0-based -- TODO confirm.
    for site_label in substituted_sites:
        new_structure.replace(site_label - 1, new_atom)
    structures_lts.append(new_structure)
    # Keep the shifted index lists of both atom types for this configuration.
    ti_atoms.append((np.array(cry_output.atom_type1[config_idx]) - 1).tolist())
    li_atoms.append((np.array(cry_output.atom_type2[config_idx]) - 1).tolist())

# Independent working copy used by the rest of the notebook
structures = copy.deepcopy(structures_lts)

## Convert the LTS dataset to ASE

In [4]:
# Convert every pymatgen structure to an ASE Atoms object, keeping the
# same ordering as `structures`.
ase_structures = [
    AseAtomsAdaptor().get_atoms(pmg_structure) for pmg_structure in structures
]

# <a id='descriptors'>Descriptors</a>

## Coulomb Matrix

$$M_{ij}^{\mathrm{Coulomb}} = \begin{cases} 0.5\,Z_i^{2.4} & \text{for } i = j \\ \dfrac{Z_i Z_j}{R_{ij}} & \text{for } i \neq j \end{cases}$$

(taken from the Dscribe website)

The diagonal elements are the interaction of an atom with itself and are a polynomial fit of the atomic energies to the nuclear charge $Z_i$. 

The off-diagonal elements represent the Coulomb repulsion between nuclei $i$ and $j$.

In [5]:
# Coulomb-matrix descriptor (DScribe), one "eigenspectrum" vector per structure
from dscribe.descriptors import CoulombMatrix

# n_atoms_max=54 fixes the descriptor size (the saved array has 54 columns)
cm_ds = CoulombMatrix(n_atoms_max=54, permutation="eigenspectrum")

start = datetime.now()
cm_dscribe = []
for n_done, ase_struct in enumerate(ase_structures, start=1):
    cm_dscribe.append(cm_ds.create(ase_struct))
    # Progress report every 200 structures
    if n_done % 200 == 0:
        print("matrices read:", len(cm_dscribe), ", time:", (datetime.now() - start))

# Stack the per-structure vectors into a single array
cm_dscribe = np.array(cm_dscribe)
print('Number of matrices read: ', len(cm_dscribe))
print("--- %s time taken ---" % (datetime.now() - start))

matrices read: 200 , time: 0:00:00.025561
matrices read: 400 , time: 0:00:00.051472
matrices read: 600 , time: 0:00:00.077386
matrices read: 800 , time: 0:00:00.103255
matrices read: 1000 , time: 0:00:00.129109
matrices read: 1200 , time: 0:00:00.154999
matrices read: 1400 , time: 0:00:00.180728
matrices read: 1600 , time: 0:00:00.206362
matrices read: 1800 , time: 0:00:00.231342
matrices read: 2000 , time: 0:00:00.256384
matrices read: 2200 , time: 0:00:00.281816
matrices read: 2400 , time: 0:00:00.307712
matrices read: 2600 , time: 0:00:00.333564
matrices read: 2800 , time: 0:00:00.359458
matrices read: 3000 , time: 0:00:00.385120
matrices read: 3200 , time: 0:00:00.410757
matrices read: 3400 , time: 0:00:00.436378
matrices read: 3600 , time: 0:00:00.461966
matrices read: 3800 , time: 0:00:00.487627
matrices read: 4000 , time: 0:00:00.513173
Number of matrices read:  4023
--- 0:00:00.520786 time taken ---


### Inspect the descriptor

#### Full matrix

In [6]:
# Full (unpermuted, unflattened) Coulomb matrix of the first structure.
# NOTE(review): this rebinds `cm_ds`, replacing the descriptor object that
# the eigenspectrum loop created.
cm_ds = CoulombMatrix(
    n_atoms_max=54,
    permutation='none',
    flatten=False,
)
cm_ds.create(ase_structures[0])

array([[  6.98330508,   2.50278676,   2.50278676, ...,  18.87719988,
          6.29239996,  10.89875643],
       [  2.50278676,   6.98330508,   1.25139338, ...,  10.89875643,
          4.33072666,   6.29239996],
       [  2.50278676,   1.25139338,   6.98330508, ...,  10.89875643,
         10.89875643,  18.87719988],
       ...,
       [ 18.87719988,  10.89875643,  10.89875643, ..., 388.02344103,
         35.59518946,  71.19037891],
       [  6.29239996,   4.33072666,  10.89875643, ...,  35.59518946,
        388.02344103,  71.19037891],
       [ 10.89875643,   6.29239996,  18.87719988, ...,  71.19037891,
         71.19037891, 388.02344103]])

#### Eigenvalues only

In [7]:
# Eigenvalue-only representation of the first structure; `cm_ds` is rebound
# to the 'eigenspectrum' permutation, the same setting used for the dataset.
cm_ds = CoulombMatrix(
    n_atoms_max=54,
    permutation='eigenspectrum',
)
cm_ds.create(ase_structures[0])

array([2679.77619447, 1133.67577892,  913.07374493,  841.08011951,
        712.6289803 ,  708.830714  ,  692.25295309,  656.98047438,
        655.93789699,  611.12050477,  467.43255314,  434.36093126,
        405.95650918,  373.37691024,  360.90885612,  360.73370676,
        330.59963438,  323.17143181,  321.91922861,  319.70671666,
        317.12026943,  310.00661992,  304.11487923,  301.61723033,
        299.21583906,  298.12478161,  296.60093965,  292.83209275,
        292.52858515,  292.14396424,  291.40954568,  283.9316412 ,
        283.8511157 ,  283.12161825,  282.57751345,  280.79923438,
         12.8666636 ,    7.39275912,    7.03464999,    5.57408163,
          5.28041001,    4.54978872,    4.52989384,    4.34655433,
          4.22686951,    4.14798147,    3.80471049,    3.74741204,
          3.72709043,    3.52626779,    3.47022584,    3.37089434,
          3.33626234,    3.28758742])

In [10]:
# Persist the Coulomb-matrix descriptors (CME) so they need not be recomputed
np.save(
    file='./data/descriptors/cm_dscribe.npy',
    arr=cm_dscribe,
    allow_pickle=True,
)

In [21]:
# Reload the Coulomb-matrix descriptors (CME) from disk and show their shape.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load files produced by this notebook, and consider allow_pickle=False
# if the stored array is purely numeric.
cm_dscribe = np.load(
    file='./data/descriptors/cm_dscribe.npy',
    allow_pickle=True,
)
cm_dscribe.shape

(4023, 54)

### Test the parameters

There are not really any parameters that need to be set for the CME, as the only one is the maximum number of atoms.

# SOAP

$P_{nn'l}^{Z_1Z_2} = \pi \sqrt{\frac{8}{2l+1}}\sum_{m} \left(c_{nlm}^{Z_1}\right)^{*} c_{n'lm}^{Z_2}$

where $n$ and $n'$ index the radial basis functions (up to $n_{max}$) and $l$ is the angular degree of the spherical harmonics (up to $l_{max}$).
<br>
By default, DScribe uses spherical Gaussian-type orbitals as radial basis functions.

In [13]:
# SOAP descriptor set-up
from dscribe.descriptors import SOAP

# Hyper-parameters forwarded to SOAP as r_cut, n_max and l_max respectively
rcut, nmax, lmax = 6.0, 8, 6

soap = SOAP(
    species=["Li", "Ti", "S"],
    periodic=True,
    r_cut=rcut,
    n_max=nmax,
    l_max=lmax,
)

In [17]:
# SOAP descriptor of every structure, collected in a plain Python list
soap_dscribe = []
start = datetime.now()
for ase_struct in ase_structures:
    soap_dscribe.append(soap.create(ase_struct))
    n_done = len(soap_dscribe)
    # Progress report: once early (after 20 structures), then every 200
    if n_done == 20 or n_done % 200 == 0:
        print("matrices read:", n_done, ", time:", (datetime.now() - start))

print('Number of matrices read: ', len(soap_dscribe))
print("--- %s time taken ---" % ((datetime.now() - start)))

matrices read: 20 , time: 0:00:00.311403
matrices read: 200 , time: 0:00:02.815383
matrices read: 400 , time: 0:00:05.536582
matrices read: 600 , time: 0:00:08.269389
matrices read: 800 , time: 0:00:10.971945
matrices read: 1000 , time: 0:00:13.685848
matrices read: 1200 , time: 0:00:16.391911
matrices read: 1400 , time: 0:00:19.191675
matrices read: 1600 , time: 0:00:21.916543
matrices read: 1800 , time: 0:00:24.629409
matrices read: 2000 , time: 0:00:27.375925
matrices read: 2200 , time: 0:00:30.110905
matrices read: 2400 , time: 0:00:32.816456
matrices read: 2600 , time: 0:00:35.615587
matrices read: 2800 , time: 0:00:38.381084
matrices read: 3000 , time: 0:00:41.158453
matrices read: 3200 , time: 0:00:43.958775
matrices read: 3400 , time: 0:00:46.748881
matrices read: 3600 , time: 0:00:49.490178
matrices read: 3800 , time: 0:00:52.329624
matrices read: 4000 , time: 0:00:55.200733
Number of matrices read:  4023
--- 0:00:55.548953 time taken ---


In [18]:
# Persist the SOAP descriptors; np.save converts the list into one array
np.save(
    file='./data/descriptors/soap_dscribe.npy',
    arr=soap_dscribe,
    allow_pickle=True,
)

In [22]:
# Reload the SOAP descriptors from disk and show their shape.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load files produced by this notebook.
soap_dscribe = np.load(
    file='./data/descriptors/soap_dscribe.npy',
    allow_pickle=True,
)
soap_dscribe.shape

(4023, 54, 2100)

In [None]:
# Show the current value of `nmax` (the value handed to SOAP as n_max).
# NOTE(review): the parameter-scan cell further down rebinds `nmax` to a
# list, so what is printed here depends on which cells have been executed.
print(nmax)

###  Inspection of the descriptor

In [None]:
# Scan the SOAP hyper-parameters r_cut and n_max (l_max kept fixed) to see
# how the descriptor values change.
# NOTE(review): this cell rebinds `rcut`, `nmax` and `soap`, which earlier
# cells define as single values / a single descriptor object.

# r_cut grid: 81 values from 2.0 to 10.0 in steps of 0.1
rcut = np.linspace(20, 100, 81)
rcut = rcut / 10

# n_max grid: 1 .. 12
nmax = list(range(1, 13))

lmax = 6

# itertools.product iterates n_max fastest, i.e. the same order as a nested
# loop with r_cut on the outside.
for r_cut_value, n_max_value in itertools.product(rcut, nmax):
    soap = SOAP(
        species=["Li", "Ti", "S"],
        periodic=True,
        r_cut=r_cut_value,
        n_max=n_max_value,
        l_max=lmax,
    )

    # All structures are passed to `create` in a single call.
    soap_matrix = soap.create(ase_structures)
    # NOTE(review): the container is rebuilt for every parameter pair, so
    # only the descriptors of the last (r_cut, n_max) combination survive
    # the loop. `now` is assigned but never read.
    soap_dscribe_ = [soap_matrix]
    now = datetime.now()

    print('rcut =', r_cut_value, 'nmax =', n_max_value, 'done')
        

## MBTR Descriptor

In [56]:
from dscribe.descriptors import MBTR

# MBTR descriptor set-up. Each k-term pairs a geometry function with the
# grid it is evaluated on; k2 and k3 additionally carry an exponential
# weighting.
mbtr = MBTR(
    species=["Li", "Ti", "S"],
    # k1: atomic_number geometry function
    k1=dict(
        geometry=dict(function="atomic_number"),
        grid=dict(min=0, max=8, n=200, sigma=0.1),
    ),
    # k2: inverse_distance geometry function
    k2=dict(
        geometry=dict(function="inverse_distance"),
        grid=dict(min=0, max=1, n=100, sigma=0.1),
        weighting=dict(function="exp", scale=0.5, threshold=1e-3),
    ),
    # k3: cosine geometry function
    k3=dict(
        geometry=dict(function="cosine"),
        grid=dict(min=-1, max=1, n=100, sigma=0.1),
        weighting=dict(function="exp", scale=0.5, threshold=1e-3),
    ),
    periodic=True,
    normalization="l2_each",
    flatten=True,
    # sparse=False (only changes the return type)
)

In [90]:
# MBTR descriptor of every structure, collected in a plain Python list.
# The recorded run of this cell took a little over one hour.
mbtr_dscribe = []
start = datetime.now()
for ase_struct in ase_structures:
    mbtr_dscribe.append(mbtr.create(ase_struct))
    n_done = len(mbtr_dscribe)
    # Progress report: once early (after 20 structures), then every 200
    if n_done == 20 or n_done % 200 == 0:
        print("matrices read:", n_done, ", time:", (datetime.now() - start))
print('Number of matrices read: ', len(mbtr_dscribe))
print("--- %s time taken ---" % ((datetime.now() - start)))

matrices read: 20 , time: 0:00:18.676465
matrices read: 200 , time: 0:03:04.618426
matrices read: 400 , time: 0:06:11.888169
matrices read: 600 , time: 0:09:21.978380
matrices read: 800 , time: 0:12:30.620042
matrices read: 1000 , time: 0:15:43.693479
matrices read: 1200 , time: 0:18:55.219530
matrices read: 1400 , time: 0:22:06.129798
matrices read: 1600 , time: 0:25:17.181301
matrices read: 1800 , time: 0:28:27.808202
matrices read: 2000 , time: 0:31:38.854499
matrices read: 2200 , time: 0:34:50.827510
matrices read: 2400 , time: 0:38:02.551797
matrices read: 2600 , time: 0:41:15.546765
matrices read: 2800 , time: 0:44:27.683920
matrices read: 3000 , time: 0:47:43.159158
matrices read: 3200 , time: 0:50:58.429130
matrices read: 3400 , time: 0:54:15.664723
matrices read: 3600 , time: 0:57:28.496517
matrices read: 3800 , time: 1:00:42.225574
matrices read: 4000 , time: 1:03:56.288877
Number of matrices read:  4023
--- 1:04:18.460093 time taken ---


In [None]:
# Persist the MBTR descriptors so the long computation need not be repeated
np.save(
    file='./data/descriptors/MBTR_dscribe.npy',
    arr=mbtr_dscribe,
    allow_pickle=True,
)

In [None]:
# Reload the MBTR descriptors from disk and show their shape.
# The array is bound to `MBTR_dscribe` (upper case), which is a different
# name from the `mbtr_dscribe` list that was saved.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects;
# only load files produced by this notebook.
MBTR_dscribe = np.load(
    file='./data/descriptors/MBTR_dscribe.npy',
    allow_pickle=True,
)
MBTR_dscribe.shape

### Inspection of the descriptor

## Coulomb Matrix from Matminer

# <a id='ml'>Machine learning</a>

In [16]:
# Spot check: convert the first pymatgen structure to ASE and display the result
AseAtomsAdaptor().get_atoms(structures[0])

ase.atoms.Atoms

# <a id='protocol'>Protocol</a>

# <a id='symmetry'>Symmetry analysis</a>

Selected structures:
- 8 - 4008
- 6 - 0
- 4 - 25
- 3 - 3291
- 2 - 2278
- 1 - 1829

In [17]:
# Right-hand values of the "Selected structures" list above, in the same order.
# Presumably these are 0-based indices into `structures` -- TODO confirm.
selected_structures = [4008, 0, 25, 3291, 2278, 1829]
