In [7]:
import numpy as np
import pandas as pd
import json
import os
import seaborn as sns
import numpy as np
'''
from pymatgen.core import Structure
from pymatgen.ext.matproj import MPRester
from pymatgen.core.composition import *
from pymatgen.analysis.chemenv.coordination_environments.coordination_geometry_finder import LocalGeometryFinder
from pymatgen.analysis.chemenv.coordination_environments.structure_environments import LightStructureEnvironments
from pymatgen.analysis.chemenv.coordination_environments.chemenv_strategies import SimplestChemenvStrategy
from scipy.spatial import ConvexHull
from pymatgen.core.periodic_table import *
'''
%matplotlib inline

### Initial database : Materials project Ternary Li-oxides (Total of 1,830 structures to start with)
<br>
A portion of these will be removed if they (1) do not contain any occupied tetrahedral site, (2) have an energy above hull (a measure of stability) greater than 50 meV/atom, or (3) have oxidation states that cannot be automatically labeled (meaning that the compound is not easy to identify as an ionic crystal).

### After removing structures that have an energy above hull higher than 50 meV/atom, we get 735 entries left.

### Next, I excluded entries where Li can sit in multiple sites OR X can sit in multiple sites OR Li and X sit in the same site.
<br>
Also, entries that DO NOT have any species that occupy a tetrahedral site are excluded.

<br>
This ends up with 113 final entries. All of these compositions can be written as Li-X-O ternary lithium oxides. This is smaller than the initial database that we started with (1,830), but it is definitely larger than the required minimum in the guideline. However, if we want to have a larger database, there are a few ways we can increase the size of our data. 
<br>
(1) Increase the E above hull constraint so that we allow higher E above hull (perhaps up to 80 meV/atom). 
<br>
(2) Exclude fewer entries in the process: This is doable, but I am not sure how this would affect the fitting.
<br> 

In [8]:
# Load the tetrahedral-site dataset (version 3) from the pickle file that sits
# next to this notebook. Only unpickle files that come from a trusted source.
df = pd.read_pickle(filepath_or_buffer="Tetrahedral-Dataset_V3.pickle")

In [11]:
# Summary statistics of the numeric columns; the quartiles listed here are the
# pandas defaults, spelled out so the reader sees which rows to expect.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,Tetrahedral_Volume,Tet_CSM,Competing_Volume,Competing_CSM,X_charge,X Ionic Radius,X Atomic Radius,X Electronegativity
count,156.0,156.0,156.0,156.0,156.0,156.0,151.0,156.0
mean,3.640856,2.685024,12.38355,3.503847,3.639178,0.796135,1.643576,1.680769
std,0.635372,4.509254,3.336511,6.921897,1.022891,0.180092,0.275875,0.296838
min,0.876182,2.7550660000000003e-17,1.636495e-07,4e-05,1.0,0.0,0.53,1.1
25%,3.461709,0.05566409,10.15381,0.593653,3.0,0.68,1.54,1.54
50%,3.772226,0.768034,10.72919,0.789219,3.547727,0.745,1.61,1.55
75%,4.06927,3.254739,14.01431,1.969925,4.0,0.9025,1.76,1.88
max,4.99171,26.8582,25.93375,38.993163,7.0,1.29,2.47,3.16


In [12]:
# Report how many rows (one per structure entry) the loaded frame contains.
print("Number of data: ", df.shape[0])

Number of data:  156


### The dataframe is loaded into "df" from the Tetrahedral-Dataset_V3.pickle file, which is also included in this folder.
* The y vector should be made from the column "Tetrahedral_Occupancy". We would give "1" if this column value is "Li" (meaning that the tetrahedral site is occupied by lithium) and "0" if this column is not "Li" (meaning that another cation occupies the tetrahedral site in this structure).
* (1) Feature 1 : Tetrahedral volume (unit in $\unicode{x212B}^3$)
* (2) Feature 2 : Competing volume (unit in $\unicode{x212B}^3$) - Smaller coefficient is expected for this part since this is a general volume for all kinds of coordination environments (cubic, 12-coordination, etc).
* (3) Feature 3 : Electronegativity of X
* (4) Feature 4 : Ionic radius of X - Since we are focused on the competition between Li and other cation for a given tetrahedral site, we may even convert this value to the ratio of radius ($r_X/r_{Li}$). We can decide later, or we can quickly add an additional column.
* (5) Feature 5 : Atomic radius of X - This will be less relevant than the ionic radius, but is still included. This can also be converted to the ratio of radius.
* (6) Feature 6 : One-hot encoding for different parts of the periodic table, such as: "Is it a transition metal?", "Is it an alkali metal?", "Is it an alkaline earth metal?", "Is it a rare earth?"
* (7) We can add the row and column of the X element in the periodic table.

# Periodic table information (Row & group & block) is added : Column name "competing_row", "competing_group", "competing_block"
* Block means which block of periodic table the competing element lies in (s-, p-, d-, f- blocks are the possibilities)
* Block takes into account whether the element X is a transition metal, rare earth, alkali metal, etc. Also the row and group information contains this.
* Therefore, I do not add an additional column for a boolean value "is_transition_metal", "is_alkali", "is_rare_earth", etc.

In [13]:
# The pymatgen imports in the first cell are wrapped in a string literal, so
# `Element` is not defined on a fresh kernel. Import it here, where it is
# actually needed, so this cell survives Restart & Run All.
from pymatgen.core.periodic_table import Element

# Resolve each competing species symbol to a pymatgen Element once, then read
# the three periodic-table attributes from that single lookup.
competing_elements = [Element(symbol) for symbol in df['X_species']]

df['competing_row'] = [element.row for element in competing_elements]
df['competing_group'] = [element.group for element in competing_elements]
df['competing_block'] = [element.block for element in competing_elements]

In [15]:
# Persist the enriched frame first, then preview it. A notebook cell only
# renders its last expression, so `df.head()` must come last to be displayed.
df.to_pickle("Tetrahedral-Dataset_V4.pickle")
df.head()

# At this point, df is the final DataFrame, obtained through a series of data manipulations.
# All of the columns are explained again here. You don't need pymatgen installed. All of the data is included in the latest Dataframe that is pickled into "Tetrahedral-Dataset_V4.pickle". You can perform the modeling using this dataframe.
* mpid : Materials Project ID (Please refer to this https://materialsproject.org/ for more information on the database and what it is made out of)
* struct : Pymatgen (https://pymatgen.org/index.html) - Structure object expressed as dictionary
* formula : The chemical formula of this compound
* X_species : All of the entries are Li-ternary oxides (Li-X-O). X_species refer to the element X here.
* tet_li : the raw Li environment dictionary (All of the information has already been extracted, so it is not necessary for the modelling)
* tet_X : the raw X environment dictionary (All of the information has already been extracted, so it is not necessary for the modelling)
* Tetrahedral_Occupancy : Whether Li atom takes the tetrahedral site, or another element takes it (the tetrahedral-site occupying element)
* Tetrahedral_Volume : The volume of the tetrahedron
* Tet_CSM : CSM value for the tetrahedral site (Please refer to "Continuous Symmetry Measures. 5. The Classical Polyhedra", 1998, Inorganic Chemistry). A value closer to 0 means a perfectly symmetrical polyhedron; a value closer to 100 means a totally non-symmetric polyhedron
* Competing_Volume : Volume of the competing environment (Competing_Environment)
* Competing_Environment : Which environment is provided as a competition to the tetrahedral site. Here, all of them are octahedral sites, by construction.
* Competing_CSM : CSM value for the competing octahedral site
* X_charge : Charge of the X-atom by imposing +1 charge for Li, -2 charge for O. Some of them are non-integer because in reality you can have multiple valence charge for the same elements in a given structure.
    This can be used directly for the regression model, or can be grouped by rounding the numbers to the nearest integer, then modelling separately for each integer charge values.
* X Ionic Radius : The Ionic radius of X atom
* X Atomic Radius : The atomic radius of X atom
* X Electronegativity : Electronegativity value of X atom
* competing_row : The "row" value of the element X in periodic table
* competing_group : The "group" value of the element X in periodic table
* competing_block : The "block" information of the element X in periodic table


In [16]:
# Reload the final dataset (version 4) from the pickle file in this folder.
# Only unpickle files that come from a trusted source.
df = pd.read_pickle(filepath_or_buffer="Tetrahedral-Dataset_V4.pickle")

In [17]:
# Preview the first five rows of the reloaded frame (five is the pandas default).
df.head(n=5)

Unnamed: 0,mpid,struct,formula,X_species,tet_li,tet_X,Tetrahedral_Occupancy,Tetrahedral_Volume,Tet_CSM,Competing_Volume,Competing_Environment,Competing_CSM,X_charge,X Ionic Radius,X Atomic Radius,X Electronegativity,competing_row,competing_group,competing_block
0,mp-1177528,"{'@module': 'pymatgen.core.structure', '@class...",Li3Ti7O14,Ti,"{'oct': [], 'tet': [{'csm': 0.0022241712812721...","{'oct': [{'csm': 0.6215316794802629, 'vol': 10...",Li,4.201175,0.001484,10.648055,oct,0.606701,3.571429,0.745,1.76,1.54,4,4,d
1,mp-976726,"{'@module': 'pymatgen.core.structure', '@class...",LiDyO2,Dy,"{'oct': [], 'tet': [{'csm': 1.601612267430422,...","{'oct': [{'csm': 0.521012997254456, 'vol': 16....",Li,4.160156,1.601607,16.077181,oct,0.521007,3.0,1.052,2.28,1.22,8,12,f
2,mp-771290,"{'@module': 'pymatgen.core.structure', '@class...",LiMn2O4,Mn,"{'oct': [], 'tet': [{'csm': 0.1071326396070336...","{'oct': [{'csm': 0.5747060430678905, 'vol': 9....",Li,3.664453,1.060414,10.626605,oct,0.692949,3.5,0.67,1.61,1.55,4,7,d
3,mp-772147,"{'@module': 'pymatgen.core.structure', '@class...",LiNb7O12,Nb,"{'oct': [{'csm': 28.007747747947796, 'vol': 10...","{'oct': [{'csm': 0.22608504681156708, 'vol': 1...",Nb,3.078004,9.909873,10.042085,oct,28.007748,3.285714,0.86,1.98,1.6,5,5,d
4,mp-772108,"{'@module': 'pymatgen.core.structure', '@class...",LiTi11O20,Ti,"{'oct': [], 'tet': [{'csm': 11.715618783359764...","{'oct': [{'csm': 0.5485547790999911, 'vol': 10...",Li,3.380015,11.715619,10.749769,oct,0.486442,3.545455,0.745,1.76,1.54,4,4,d
