# Machine learning for solid solutions (Li2TiS3)

This notebook is divided into these sections:
- [LTS dataset](#lts)
    - write CRYSTAL input files
    - read CRYSTAL output files
- [Descriptors](#descriptors)
- [Machine learning](#ml)
    - linear regression
- [Protocol](#protocol)
    - [symmetry analysis](#symmetry)

In [1]:
import os
import copy
import json
import itertools
import shutil as sh
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime

from CRYSTALpytools.crystal_io import Crystal_output, Crystal_input, Crystal_density, Crystal_gui
from CRYSTALpytools.convert import cry_gui2pmg, cry_out2pmg
from CRYSTALpytools.utils import view_pmg

from pymatgen.io.ase import AseAtomsAdaptor
from pymatgen.io.cif import CifWriter
from pymatgen.symmetry.analyzer import SpacegroupAnalyzer, PointGroupAnalyzer

from ase.visualize import view

#from dscribe.descriptors import CoulombMatrix

from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor
from sklearn.metrics import mean_squared_error 
from sklearn.cluster import KMeans

import matplotlib.pyplot as plt
# Default figure size (width, height) applied to every matplotlib figure created below.
plt.rcParams["figure.figsize"] = (15,15)



# <a id='lts'>LTS dataset - pymatgen</a>

In [3]:
# Species written onto every substituted site
new_atom = 'Li'

# Parse the confcount output and run its configuration analysis, which
# provides the per-configuration site lists atom_type1 / atom_type2 used below
cry_output = Crystal_output().read_cry_output('data/crystal/lts/lts_confcount.out')
cry_output.get_config_analysis()

# Parent structure (before any substitution), converted to a pymatgen object
original_structure_gui = Crystal_gui().read_cry_gui('data/crystal/lts/lts_confcount.gui')
original_structure = cry_gui2pmg(original_structure_gui)

structures_lts, li_atoms, ti_atoms = [], [], []
for config_idx, li_sites_1based in enumerate(cry_output.atom_type2):
    # Shift the site labels by one so they can be used as 0-based indices
    # (presumably the CRYSTAL labels are 1-based -- confirm).
    li_sites = (np.array(li_sites_1based) - 1).tolist()
    ti_sites = (np.array(cry_output.atom_type1[config_idx]) - 1).tolist()

    # Start from a fresh copy of the parent and substitute the listed sites
    new_structure = original_structure.copy()
    for site in li_sites:
        new_structure.replace(site, new_atom)

    structures_lts.append(new_structure)
    ti_atoms.append(ti_sites)
    li_atoms.append(li_sites)

# Independent working copy, so later edits do not touch structures_lts
structures = copy.deepcopy(structures_lts)

## Read the energy and band gap

#### Single point

In [4]:
# Collect the final energy and band gap of every single-point calculation.
# NOTE(review): calculations that fail the check below are skipped (their index
# is printed), so energies_sp / gap_sp can end up shorter than `structures` and
# then no longer line up with it index-by-index.
energies_sp = []
gap_sp = []
for i in range(len(structures)):
    crystal_output = Crystal_output().read_cry_output('./data/crystal/lts/sp/output/lts_sp_%s.out'%str(i))
    # Query the energy once and reuse it for both the check and the append
    final_energy = crystal_output.get_final_energy()
    # Identity test against None; `!= None` can be overloaded by the operand type
    if final_energy is not None and crystal_output.converged:
        energies_sp.append(final_energy)
        gap_sp.append(crystal_output.get_band_gap())
    else:
        # Report the index of the calculation that was left out
        print(i)

#### Optimised geometry

In [None]:
# Intended to collect the final energy and band gap of the geometry-optimised
# calculations.
# NOTE(review): the read inside the loop is commented out (and still points at
# the single-point output path), so `crystal_output` is never rebound here. The
# loop therefore queries whatever object an earlier cell left in the kernel on
# every iteration, and raises NameError on a fresh kernel if no such cell ran.
# Restore the read with the optimisation output path before relying on
# energies_opt / gap_opt -- TODO confirm the correct path.
energies_opt = []
gap_opt = []
for i in range(len(structures)):
    #crystal_output = Crystal_output().read_cry_output('./data/crystal/lts/sp/output/lts_sp_%s.out'%str(i))
    if crystal_output.get_final_energy() != None:
        energies_opt.append(crystal_output.get_final_energy())
        gap_opt.append(crystal_output.get_band_gap())

## Convert the LTS dataset to ASE

In [5]:
# Convert every pymatgen structure through AseAtomsAdaptor, keeping the same order
ase_structures = [
    AseAtomsAdaptor().get_atoms(pmg_structure)
    for pmg_structure in structures
]

# <a id='descriptors'>Descriptors</a>

## Coulomb Matrix

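Equations

The Coulomb matrix $M$ of a structure with nuclear charges $Z_i$ and atomic positions $\mathbf{R}_i$ is

$$
M_{ij} =
\begin{cases}
0.5\, Z_i^{2.4} & i = j \\
\dfrac{Z_i Z_j}{\lVert \mathbf{R}_i - \mathbf{R}_j \rVert} & i \neq j
\end{cases}
$$

With `permutation="eigenspectrum"` the descriptor is the list of eigenvalues of $M$, sorted by decreasing absolute value.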

In [38]:
# Coulomb-matrix eigenspectrum (dscribe) for every ASE structure
from dscribe.descriptors import CoulombMatrix

cm_ds = CoulombMatrix(n_atoms_max=54, permutation="eigenspectrum")
start = datetime.now()
cm_dscribe = []
for n_done, ase_struct in enumerate(ase_structures, 1):
    cm_dscribe.append(cm_ds.create(ase_struct))
    # Progress report every 200 structures
    if n_done % 200 == 0:
        print("matrices read:", len(cm_dscribe),", time:", (datetime.now() - start))

# Turn the list of per-structure descriptors into a single numpy array
cm_dscribe = np.array(cm_dscribe)
print('Number of matrices read: ', len(cm_dscribe))
print("--- %s time taken ---" % (datetime.now()- start))

matrices read: 200 , time: 0:00:00.039308
matrices read: 400 , time: 0:00:00.072035
matrices read: 600 , time: 0:00:00.094009
matrices read: 800 , time: 0:00:00.113769
matrices read: 1000 , time: 0:00:00.133621
matrices read: 1200 , time: 0:00:00.153423
matrices read: 1400 , time: 0:00:00.173584
matrices read: 1600 , time: 0:00:00.193368
matrices read: 1800 , time: 0:00:00.213342
matrices read: 2000 , time: 0:00:00.233015
matrices read: 2200 , time: 0:00:00.252825
matrices read: 2400 , time: 0:00:00.272754
matrices read: 2600 , time: 0:00:00.292513
matrices read: 2800 , time: 0:00:00.312169
matrices read: 3000 , time: 0:00:00.331856
matrices read: 3200 , time: 0:00:00.351466
matrices read: 3400 , time: 0:00:00.371298
matrices read: 3600 , time: 0:00:00.391288
matrices read: 3800 , time: 0:00:00.411228
matrices read: 4000 , time: 0:00:00.431108
Number of matrices read:  4023
--- 0:00:00.436803 time taken ---


### Inspect the descriptor

#### Full matrix

In [53]:
# Rebind cm_ds to an unpermuted, unflattened descriptor and display the
# Coulomb matrix of the first structure as the cell output.
cm_ds = CoulombMatrix(
    n_atoms_max=54,
    permutation='none',
    flatten=False,
)
cm_ds.create(ase_structures[0])

array([[  6.98330508,   2.50278676,   2.50278676, ...,  18.87719988,
          6.29239996,  10.89875643],
       [  2.50278676,   6.98330508,   1.25139338, ...,  10.89875643,
          4.33072666,   6.29239996],
       [  2.50278676,   1.25139338,   6.98330508, ...,  10.89875643,
         10.89875643,  18.87719988],
       ...,
       [ 18.87719988,  10.89875643,  10.89875643, ..., 388.02344103,
         35.59518946,  71.19037891],
       [  6.29239996,   4.33072666,  10.89875643, ...,  35.59518946,
        388.02344103,  71.19037891],
       [ 10.89875643,   6.29239996,  18.87719988, ...,  71.19037891,
         71.19037891, 388.02344103]])

#### Eigenvalues only

In [54]:
# Rebind cm_ds to the eigenspectrum variant and display the descriptor of the
# first structure as the cell output.
cm_ds = CoulombMatrix(
    n_atoms_max=54,
    permutation='eigenspectrum',
)
cm_ds.create(ase_structures[0])

array([2679.77619447, 1133.67577892,  913.07374493,  841.08011951,
        712.6289803 ,  708.830714  ,  692.25295309,  656.98047438,
        655.93789699,  611.12050477,  467.43255314,  434.36093126,
        405.95650918,  373.37691024,  360.90885612,  360.73370676,
        330.59963438,  323.17143181,  321.91922861,  319.70671666,
        317.12026943,  310.00661992,  304.11487923,  301.61723033,
        299.21583906,  298.12478161,  296.60093965,  292.83209275,
        292.52858515,  292.14396424,  291.40954568,  283.9316412 ,
        283.8511157 ,  283.12161825,  282.57751345,  280.79923438,
         12.8666636 ,    7.39275912,    7.03464999,    5.57408163,
          5.28041001,    4.54978872,    4.52989384,    4.34655433,
          4.22686951,    4.14798147,    3.80471049,    3.74741204,
          3.72709043,    3.52626779,    3.47022584,    3.37089434,
          3.33626234,    3.28758742])

In [44]:
# Save the Coulomb-matrix descriptors (cm_dscribe) to a .npy file.
# NOTE(review): allow_pickle=True is presumably unnecessary if cm_dscribe is a
# plain numeric array -- confirm before dropping it.
np.save('./data/descriptors/cm_dscribe.npy',cm_dscribe,allow_pickle=True)

In [55]:
# Reload the saved descriptors from disk, replacing the in-memory cm_dscribe.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code -- only load .npy files from a trusted source.
cm_dscribe = np.load('./data/descriptors/cm_dscribe.npy',allow_pickle=True)

### Test the parameters

# SOAP

# <a id='ml'>Machine learning</a>

## Data normalisation

### MinMaxScaler

In [None]:
# train_test_split is used below but was never imported anywhere in the
# notebook, so the cell failed with NameError on a fresh kernel.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , MinMaxScaler 

# NOTE(review): `descriptor` and `energies` are not defined in the cells shown
# here -- bind them to the feature matrix and target values (with matching
# lengths) before running this cell.
X_train, X_test, y_train, y_test = train_test_split(descriptor, energies, random_state=1)

# Fit the scaler on the training split only, then apply that same
# transformation to the test split.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

### StandardScaler

In [None]:
# train_test_split is used below but was never imported anywhere in the
# notebook, so the cell failed with NameError on a fresh kernel.
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler , MinMaxScaler 

# NOTE(review): `descriptor` and `energies` are not defined in the cells shown
# here -- bind them to the feature matrix and target values (with matching
# lengths) before running this cell.
X_train, X_test, y_train, y_test = train_test_split(descriptor, energies, random_state=1)

# Fit the scaler on the training split only, then apply that same
# transformation to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# <a id='protocol'>Protocol</a>

# <a id='symmetry'>Symmetry analysis</a>

Selected structures:
- 8 - 4008
- 6 - 0
- 4 - 25
- 3 - 3291
- 2 - 2278
- 1 - 1829

In [17]:
# Structure indices picked for the symmetry analysis, in the same order as the
# "Selected structures" list in the markdown above.
selected_structures = [
    4008,
    0,
    25,
    3291,
    2278,
    1829,
]
