In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re

from matminer.utils.conversions import str_to_composition

In [None]:
# Retrieve the NIST SCD dataset from Citrine using matminer.
# The result is stored in `df`, which the following cells use as a pandas DataFrame.

from matminer.data_retrieval.retrieve_Citrine import CitrineDataRetrieval
from os import environ

# The API key is read from the CITRINATION_API_KEY environment variable, so it
# does not have to be written into the notebook. Set that variable before
# starting the kernel; a KeyError is raised here if it is missing.
api_key = environ['CITRINATION_API_KEY']
c = CitrineDataRetrieval(api_key=api_key)

# Select the dataset by its id; '151803' is presumably the id of the NIST SCD
# dataset mentioned above -- confirm on Citrination.
df = c.get_dataframe(criteria={'data_set_id': '151803'})

In [None]:
# Dataset dimensions as (number of rows, number of columns),
# i.e. (number of samples, number of features)
df.shape

In [None]:
# Preview the first 5 rows (the default of head()) to get a feel for the columns
df.head()

In [None]:
# Inspect a single sample entry: the row whose index label is 42,
# showing only the fields that are not missing for that row
df.loc[42,:].dropna()

In [None]:
# Plot a horizontal bar chart of the 50 best-populated features,
# i.e. the 50 columns with the highest number of non-missing values.
plt.figure(figsize=(8,16))
# count() gives the number of non-missing values per column; after the
# ascending sort, the last 50 entries are the columns with the most data.
df.count().sort_values().tail(50).plot.barh()
plt.xlabel('# of samples with a value')
plt.ylabel('Feature')
plt.show()

In [None]:
# Histogram of the 'Density' column: drop the missing entries, then coerce the
# remaining values to numbers (values that cannot be parsed become NaN).
density = pd.to_numeric(df['Density'].dropna(), errors='coerce')
density.hist(bins=100)
plt.ylabel('# of samples')
plt.xlabel('Density')
plt.show()

# Featurization

In [None]:
# Work on a copy so that the featurization steps below do not modify the original data in df
feat = df.copy()

### Make chemical formula compatible with pymatgen.core.composition

In [None]:
# Initialize the 'composition' column with the raw chemical formula strings;
# the cells below clean these strings up and convert them
feat['composition'] = feat['chemicalFormula']

In [None]:
# Check how many raw formulas cause an error when they are fed to
# pymatgen.core.composition (via str_to_composition).

N_errors = 0
for entry in feat['composition']:
    try:
        # Wrap the entry in a one-element Series so the check goes through the
        # same Series.transform call that is used for the actual conversion.
        pd.Series([entry]).transform(str_to_composition)
    except Exception:
        # Any conversion failure counts as an error. Catching Exception instead
        # of using a bare except leaves KeyboardInterrupt/SystemExit alone, so
        # the loop can still be interrupted from the notebook.
        N_errors += 1
N_total = len(feat['composition'])

print('{0} errors in {1} samples'.format(N_errors, N_total))

In [None]:
def make_chem_form_compatible(formula):
    """Remove characters and expressions from a chemical formula string.

    The goal is a string that can be converted using
    pymatgen.core.composition. The removals are applied one after another,
    in the order listed in ``bad_patterns`` below.

    Known limitations (the removal is purely textual):
      * a lowercase 'x', 'y', 'z' or 'w' is removed wherever it occurs, so an
        element symbol such as 'Dy' is reduced to 'D';
      * removing '.' merges decimal amounts, e.g. '0.5' becomes '05'.

    Parameters
    ----------
    formula : str
        The chemical formula to clean up.

    Returns
    -------
    str
        The formula with all matching characters/expressions removed.
    """
    # Raw strings keep the regex backslashes intact without relying on Python
    # passing unknown escape sequences through (which triggers a warning).
    bad_patterns = (
        r'\.', 'x', 'y', r'\+', r'\-', 'z', 'w', r'\%', r'\^',   # individual characters
        'Cordierite', 'hisker', 'Sialon',   # certain words that show up in some formulas
        r'\$(.*?)\$',    # LaTeX expressions
        r'\((.*?)\)',    # bracketed expressions
        r'^\d{1,2}',     # leading numbers of 1 or 2 digits
    )

    for bad_str in bad_patterns:
        formula = re.sub(bad_str, '', formula)

    return formula

In [None]:
# Clean up every chemical formula string using the function defined above.
# NOTE: this cell overwrites the 'composition' column in place. Once it has run,
# the column no longer holds strings, so re-run the cell that initializes
# feat['composition'] before running this cell again.
feat["composition"] = feat["composition"].transform(make_chem_form_compatible)

# Convert each cleaned formula string to a composition object using
# matminer.utils.conversions.str_to_composition,
# which in turn uses pymatgen.core.composition
feat["composition"] = feat["composition"].transform(str_to_composition)

In [None]:
# Display the resulting 'composition' column
feat['composition']