In [14]:
import pandas as pd
import re

from matminer.utils.conversions import str_to_composition
from matminer.featurizers.composition import ElementProperty

In [15]:
df  = pd.read_pickle('../deliver/NIST_CeramicDataSet.pkl')
df.head()

Unnamed: 0,chemicalFormula,licenses,names,preparation,references,Axis Length,Axis Length-conditions,Axis Length-units,Bulk Modulus,Bulk Modulus-conditions,...,Thermal Expansion-conditions,Thermal Expansion-units,Thermal Shock Resistance,Thermal Shock Resistance-conditions,Thermal Shock Resistance-units,Weibull Modulus,Weibull Modulus-conditions,Weibull Strength,Weibull Strength-conditions,Weibull Strength-units
1,SiO2,[{'name': 'NIST_SRD-30'}],[Silica],,"[{'citation': 'Gaseous Corrosion of Ceramics, ...",,,,,,...,,,,,,,,,,
2,3Al2O3.2SiO2.xZrO2.yY2O3,[{'name': 'NIST_SRD-30'}],[Mullite composite],[{'name': 'Hot Pressing'}],[{'citation': 'Thermal Diffusivity/Conductivit...,,,,,,...,,,,,,,,,,
3,Al2O3,[{'name': 'NIST_SRD-30'}],[Alumina],[{'name': 'Hot Pressing'}],"[{'citation': 'Mechanical Properties of Pure, ...",,,,,,...,,,,,,,,,,
4,Si3N4.xCeO2.ySiC,[{'name': 'NIST_SRD-30'}],[Silicon nitride composite],[{'name': 'Hot Pressing'}],[{'citation': 'Effect of Silicon Carbide Whisk...,,,,,,...,,,,,,,,,,
5,Si3N4,[{'name': 'NIST_SRD-30'}],[Silicon nitride],[{'name': 'Hot Isostatic Pressing'}],[{'citation': 'Corrosion of Silicon Nitride Ce...,,,,,,...,,,,,,,,,,


### Composition with Chih-Hao's method

In [16]:
df1 = df.copy()
df1.loc[3892,'chemicalFormula'] = 'BN' #fix 'B-N' to 'BN'

In [17]:
# Parse the chemicalFormula
def formula_decompose(formula):
    '''
    decompose chemical formula 
    return
        composition: list, [(element,num),...]
            element: string
            num: string, can be math expression such as '1+0.5x'
    '''

    comp = []
    p = re.compile(r'(\d?[w-z]?)([A-Z][a-u]?)(\d*\+?\-?\d*\.?\d*[w-z]?)')

    #split the chemical formula if there is dots, but not for cases like Mg1.5x
    if re.search(r'\.', formula) and not re.search(r'\d+\.\d[w-z]', formula): 
        formula = formula.split('.')
        for item in formula:
            prefactor = '1'
            for i in re.findall(p, item):
                pre, elem, num = i
                if pre:
                    prefactor = pre
                if num == '':
                    num = '1'
                num = prefactor + '*({})'.format(num)
                comp.append((elem, num))
    else:
        prefactor = '1'
        for i in re.findall(p, formula):
            pre, elem, num = i
            if pre:
                prefactor = pre
            if num == '':
                num = '1'
            num = prefactor + '*({})'.format(num)
            comp.append((elem, num))
    return comp 

def formula_reconstruct(composition, x=0.1, y=0.1, z=0.1, w=0.1):
    '''
    reconstruct chemical formula from composition
    composition in form of [(element,num), (element,num),...]
        element: string
        num: string, can be math expression such as '1+0.5x'

    return 
        flat chemcial formula: string, such as 'Ti1.5Cu0.1Au1.0'
    '''
    flat_list = []
    for (elem, num) in composition:
        num = re.sub(r'(\d)([w-z])', r'\1*\2', num) #convert 5x to 5*x
        flat_list.append(elem)
        flat_list.append(format(eval(num), '.1f'))
    return ''.join(flat_list)
  
def formula_parser(formula):
    return formula_reconstruct(formula_decompose(formula))

In [18]:
df1["flatFormula"] = df1["chemicalFormula"].map(formula_parser)
df1.dropna(axis=1).head()

Unnamed: 0,chemicalFormula,licenses,references,Chemical Family,flatFormula
1,SiO2,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Gaseous Corrosion of Ceramics, ...",Si-O,Si1.0O2.0
2,3Al2O3.2SiO2.xZrO2.yY2O3,[{'name': 'NIST_SRD-30'}],[{'citation': 'Thermal Diffusivity/Conductivit...,Al-Si-O:ZrO,Al6.0O9.0Si2.0O4.0Zr0.1O0.2Y0.2O0.3
3,Al2O3,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Mechanical Properties of Pure, ...",Al-O,Al2.0O3.0
4,Si3N4.xCeO2.ySiC,[{'name': 'NIST_SRD-30'}],[{'citation': 'Effect of Silicon Carbide Whisk...,Si-N:CeO,Si3.0N4.0Ce0.1O0.2Si0.1C0.1
5,Si3N4,[{'name': 'NIST_SRD-30'}],[{'citation': 'Corrosion of Silicon Nitride Ce...,Si-N,Si3.0N4.0


In [19]:
df1["composition"] =df1["flatFormula"].transform(str_to_composition)
df1.dropna(axis=1).head()

Unnamed: 0,chemicalFormula,licenses,references,Chemical Family,flatFormula,composition
1,SiO2,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Gaseous Corrosion of Ceramics, ...",Si-O,Si1.0O2.0,"(Si, O)"
2,3Al2O3.2SiO2.xZrO2.yY2O3,[{'name': 'NIST_SRD-30'}],[{'citation': 'Thermal Diffusivity/Conductivit...,Al-Si-O:ZrO,Al6.0O9.0Si2.0O4.0Zr0.1O0.2Y0.2O0.3,"(Al, O, Si, Zr, Y)"
3,Al2O3,[{'name': 'NIST_SRD-30'}],"[{'citation': 'Mechanical Properties of Pure, ...",Al-O,Al2.0O3.0,"(Al, O)"
4,Si3N4.xCeO2.ySiC,[{'name': 'NIST_SRD-30'}],[{'citation': 'Effect of Silicon Carbide Whisk...,Si-N:CeO,Si3.0N4.0Ce0.1O0.2Si0.1C0.1,"(Si, N, Ce, O, C)"
5,Si3N4,[{'name': 'NIST_SRD-30'}],[{'citation': 'Corrosion of Silicon Nitride Ce...,Si-N,Si3.0N4.0,"(Si, N)"


In [22]:
df1.shape

(4098, 163)

In [20]:
df1_feat = df1.copy()

In [21]:
# Add features with matminer
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df1_feat = ep_feat.featurize_dataframe(df1_feat, col_id="composition", ignore_errors=True)

In [23]:
df1_feat.shape

(4098, 295)

In [24]:
# List of the new columns
list(set(df1_feat.columns) ^ set(df1))

['range GSvolume_pa',
 'range NdUnfilled',
 'mean Electronegativity',
 'mean NValence',
 'mode NValence',
 'mode SpaceGroupNumber',
 'mean NpUnfilled',
 'avg_dev Column',
 'avg_dev NsUnfilled',
 'mean CovalentRadius',
 'mean AtomicWeight',
 'mode NdValence',
 'maximum NfValence',
 'avg_dev NdValence',
 'maximum NfUnfilled',
 'mode CovalentRadius',
 'maximum GSvolume_pa',
 'mean NfValence',
 'mode NfUnfilled',
 'maximum Row',
 'avg_dev NfValence',
 'mean Column',
 'minimum NsUnfilled',
 'mean GSmagmom',
 'minimum MeltingT',
 'minimum NfUnfilled',
 'range SpaceGroupNumber',
 'minimum NValence',
 'avg_dev NValence',
 'mean NsUnfilled',
 'mean NfUnfilled',
 'maximum NValence',
 'maximum NpUnfilled',
 'avg_dev NUnfilled',
 'maximum Electronegativity',
 'minimum GSbandgap',
 'avg_dev NpUnfilled',
 'range AtomicWeight',
 'avg_dev MeltingT',
 'range NValence',
 'range Column',
 'mode MendeleevNumber',
 'mean SpaceGroupNumber',
 'range NdValence',
 'maximum AtomicWeight',
 'minimum NpValence',


In [25]:
df1_feat['avg_dev MeltingT'].head()

1    725.422222
2    516.580380
3    421.761600
4    823.174827
5    795.404082
Name: avg_dev MeltingT, dtype: float64