In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re

from matminer.utils.conversions import str_to_composition

In [3]:
# Load the NIST SCD dataset into the df DataFrame.
# On the first run the data are downloaded from Citrine through matminer and
# cached locally; on later runs the cached pickle is read instead.

from os import environ
from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval

first_retrieve = False  # set to True the first time, to download the dataset

if not first_retrieve:
    # NOTE: unpickling can execute arbitrary code; only load a pickle file
    # that was written by this notebook.
    df = pd.read_pickle('NIST_CeramicDataSet.pkl')
else:
    # The API key is taken from the CITRINATION_API_KEY environment variable
    api_key = environ['CITRINATION_API_KEY']
    c = CitrineDataRetrieval(api_key=api_key)
    df = c.get_dataframe(criteria={'data_set_id': '151803'})

    # Cache the downloaded dataset for later runs
    df.to_csv('NIST_CeramicDataSet.csv')
    df.to_pickle('NIST_CeramicDataSet.pkl')

In [4]:
# Shape of the dataset: (number of samples, number of features)
df.shape

(4098, 161)

In [None]:
# Preview the first 5 entries of the dataset
df.head()

In [None]:
# Look at one sample entry (row label 42), showing only its non-missing fields
df.loc[42,:].dropna()

In [None]:
# Horizontal bar chart of the 50 columns that have the most non-missing values
plt.figure(figsize=(8, 16))
(
    df.count()
    .sort_values()
    .tail(50)
    .plot.barh()
)
plt.show()

In [None]:
# Histogram of the 'Density' column: missing values are dropped first, then the
# remaining values are coerced to numbers (unparseable ones become NaN).
density = pd.to_numeric(df['Density'].dropna(), errors='coerce')
density.hist(bins=100)
plt.xlabel('Density')
plt.ylabel('# of samples')
plt.show()

# Featurization

In [None]:
# Work on a copy so that the original data in df stay untouched
feat = df.copy()

### Make chemical formula compatible with pymatgen.core.composition

In [None]:
# Initialize the composition column with the values of the chemicalFormula column
feat['composition'] = feat['chemicalFormula']

In [None]:
# Count how many formulas raise an error when they are fed to
# pymatgen.core.composition (via str_to_composition)

N_errors, N_total = 0, 0
for entry in feat['composition']:
    N_total += 1
    try:
        pd.Series([entry]).transform(str_to_composition)
    except Exception:
        # Only genuine conversion failures are counted. KeyboardInterrupt and
        # SystemExit are not subclasses of Exception, so they still propagate
        # and the cell can be interrupted.
        N_errors += 1

print('{0} errors in {1} samples'.format(N_errors, N_total))

In [None]:
# This function removes certain characters and expressions from a chemical formula
# so that it can be converted using pymatgen.core.composition

def make_chem_form_compatible(formula):
    """Remove characters and expressions from a chemical formula string.

    Every pattern below is treated as a regular expression and all of its
    matches are deleted. The patterns are applied one after another in the
    order listed, so each pattern sees the result of the previous removals.

    Args:
        formula (str): chemical formula to clean.

    Returns:
        str: the formula with all matches of all patterns removed.
    """
    # Raw strings are used so that regex escapes are not read as (invalid)
    # string escape sequences, which newer Python versions warn about.
    bad_patterns = (
        # individual characters
        r'\.', 'x', 'y', r'\+', r'\-', 'z', 'w', r'\%', r'\^',
        # certain words that show up in some formulas; 'hisker' is presumably
        # what is left of 'whisker' once the 'w' above has been removed
        'Cordierite', 'hisker', 'Sialon',
        r'\$(.*?)\$',    # LaTeX expressions
        r'\((.*?)\)',    # bracketed expressions
        r'^\d{1,2}',     # leading numbers of 1 or 2 digits
    )
    for bad_str in bad_patterns:
        formula = re.sub(bad_str, '', formula)

    return formula

In [None]:
# Clean the chemical formulas with make_chem_form_compatible
feat["composition"] = feat["composition"].transform(make_chem_form_compatible)

# Convert each cleaned formula to a composition object using
# matminer.utils.conversions.str_to_composition,
# which in turn uses pymatgen.core.composition
feat["composition"] = feat["composition"].transform(str_to_composition)

In [None]:
# Inspect the converted composition column
feat['composition']

### Parse chemical formula
#### x, y, z, and w are set to 0.1 by default (can be modified)

In [5]:
# Separate copy of the original data for the parsing-based approach
df1 = df.copy()
# NOTE(review): the row label is hard-coded; this assumes row 3892 is the
# entry whose formula is 'B-N' -- confirm if the dataset is re-downloaded.
df1.loc[3892,'chemicalFormula'] = 'BN' #fix 'B-N' to 'BN'

In [6]:
# Parse the chemicalFormula
def formula_decompose(formula):
    '''
    decompose chemical formula into (element, amount expression) pairs

    A formula that contains dots is split into dot-separated parts
    (e.g. '3Al2O3.2SiO2'), unless a dot belongs to a decimal coefficient that
    is followed by a variable (e.g. 'Mg1.5x'); in that case the formula is
    parsed as a whole. Within a part, a leading prefactor (digits and/or one
    of the variables w-z) multiplies the amount of every element that follows
    it in that part.

    return
        composition: list, [(element,num),...]
            element: string
            num: string of the form 'prefactor*(amount)'; amount can be a
                 math expression such as '1+0.5x'
    '''
    # group 1: optional prefactor -- any number of digits, then at most one
    #          variable (\d* rather than \d? so that a multi-digit prefactor
    #          such as the '12' in '12CaO' is kept whole)
    # group 2: element symbol -- the second letter is limited to a-u because
    #          w-z are read as variables; NOTE: a symbol whose second letter
    #          is v-z (e.g. 'Dy') is therefore not matched as one element
    # group 3: amount -- digits, optional '+'/'-', optional decimal part,
    #          optional trailing variable
    p = re.compile(r'(\d*[w-z]?)([A-Z][a-u]?)(\d*\+?\-?\d*\.?\d*[w-z]?)')

    #split the chemical formula if there is dots, but not for cases like Mg1.5x
    if re.search(r'\.', formula) and not re.search(r'\d+\.\d[w-z]', formula):
        parts = formula.split('.')
    else:
        parts = [formula]

    comp = []
    for part in parts:
        prefactor = '1'  # reset for every part of the formula
        for pre, elem, num in p.findall(part):
            if pre:
                prefactor = pre
            if num == '':
                num = '1'
            comp.append((elem, '{}*({})'.format(prefactor, num)))
    return comp

def formula_reconstruct(composition, x=0.1, y=0.1, z=0.1, w=0.1):
    '''
    reconstruct chemical formula from composition
    composition in form of [(element,num), (element,num),...]
        element: string
        num: string, can be math expression such as '1+0.5x'
    x, y, z, w: numbers substituted for the variables of the same name that
        appear in the num expressions

    return
        flat chemical formula: string, such as 'Ti1.5Cu0.1Au1.0'
        (every amount is formatted with one decimal place)
    '''
    # Only the four formula variables are made visible to eval(); builtins and
    # the other names of this function/module are withheld.
    # SECURITY NOTE: eval() still executes whatever expression it is given and
    # an empty __builtins__ is not a real sandbox -- only pass in expressions
    # that were derived from trusted formulas.
    variables = {'x': x, 'y': y, 'z': z, 'w': w}
    flat_list = []
    for (elem, num) in composition:
        num = re.sub(r'(\d)([w-z])', r'\1*\2', num) #convert 5x to 5*x
        amount = eval(num, {'__builtins__': {}}, variables)
        flat_list.append(elem)
        flat_list.append(format(amount, '.1f'))
    return ''.join(flat_list)
  
def formula_parser(formula):
    '''
    convert a chemical formula to its flat form by decomposing it and then
    reconstructing it with the default values of x, y, z and w
    '''
    composition = formula_decompose(formula)
    return formula_reconstruct(composition)

In [7]:
# Add the flat formula (variables replaced by their default values) of every entry
df1["flatFormula"] = df1["chemicalFormula"].map(formula_parser)
# Display only the columns that contain no missing values
df1.dropna(axis=1)

Unnamed: 0,chemicalFormula,licenses,references,Chemical Family,flatFormula
1,SiO2,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Gaseous Corrosion of Ceramics, ...",Si-O,Si1.0O2.0
2,3Al2O3.2SiO2.xZrO2.yY2O3,[{'name': 'NIST_SRD-30'}],[{'citation': 'Thermal Diffusivity/Conductivit...,Al-Si-O:ZrO,Al6.0O9.0Si2.0O4.0Zr0.1O0.2Y0.2O0.3
3,Al2O3,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Mechanical Properties of Pure, ...",Al-O,Al2.0O3.0
4,Si3N4.xCeO2.ySiC,[{'name': 'NIST_SRD-30'}],[{'citation': 'Effect of Silicon Carbide Whisk...,Si-N:CeO,Si3.0N4.0Ce0.1O0.2Si0.1C0.1
5,Si3N4,[{'name': 'NIST_SRD-30'}],[{'citation': 'Corrosion of Silicon Nitride Ce...,Si-N,Si3.0N4.0
6,ZrO2.xY2O3,[{'name': 'NIST_SRD-30'}],[{'citation': 'Diffusional Creep and Kinetic D...,Zr-O:Y,Zr1.0O2.0Y0.2O0.3
7,ZrO2.xY2O3,[{'name': 'NIST_SRD-30'}],[{'citation': 'Porosity-Dependence of Elastic ...,Zr-O:Y,Zr1.0O2.0Y0.2O0.3
8,Si3N4,[{'name': 'NIST_SRD-30'}],[{'citation': 'Microhardness Load Size Effect ...,Si-N,Si3.0N4.0
9,SiO2,[{'name': 'NIST_SRD-30'}],[{'citation': 'High Temperature-Elastic Moduli...,Si-O,Si1.0O2.0
10,Si3N4.xY2O3.ySrO,[{'name': 'NIST_SRD-30'}],[{'citation': 'Ceramic Technology Project Data...,"Si-N:SrO,Y",Si3.0N4.0Y0.2O0.3Sr0.1O0.1


In [8]:
# Convert every flat formula with str_to_composition
df1["composition"] =df1["flatFormula"].transform(str_to_composition)
# Preview the first rows, keeping only the columns that contain no missing values
df1.dropna(axis=1).head()

Unnamed: 0,chemicalFormula,licenses,references,Chemical Family,flatFormula,composition
1,SiO2,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Gaseous Corrosion of Ceramics, ...",Si-O,Si1.0O2.0,"(O, Si)"
2,3Al2O3.2SiO2.xZrO2.yY2O3,[{'name': 'NIST_SRD-30'}],[{'citation': 'Thermal Diffusivity/Conductivit...,Al-Si-O:ZrO,Al6.0O9.0Si2.0O4.0Zr0.1O0.2Y0.2O0.3,"(Zr, O, Al, Si, Y)"
3,Al2O3,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Mechanical Properties of Pure, ...",Al-O,Al2.0O3.0,"(O, Al)"
4,Si3N4.xCeO2.ySiC,[{'name': 'NIST_SRD-30'}],[{'citation': 'Effect of Silicon Carbide Whisk...,Si-N:CeO,Si3.0N4.0Ce0.1O0.2Si0.1C0.1,"(O, Ce, Si, C, N)"
5,Si3N4,[{'name': 'NIST_SRD-30'}],[{'citation': 'Corrosion of Silicon Nitride Ce...,Si-N,Si3.0N4.0,"(Si, N)"


In [10]:
# Check the composition object of the entry with row label 4
df1.loc[4,'composition']

Comp: Ce0.1 Si3.1 C0.1 N4 O0.2