In [218]:
from os import environ
from citrination_client import CitrinationClient
from citrination_client import *

In [219]:
client = CitrinationClient(environ['CITRINATION_API_KEY'], 'https://citrination.com')
dataset_id = '151803'

In [220]:
value_query = FieldQuery(extract_as="Fracture Toughness", extract_all=True)
property_query = PropertyQuery(name=FieldQuery(filter=[Filter(equal="Fracture Toughness")]), value=value_query)
formula_query = ChemicalFieldQuery(extract_as="formula")
system_query = PifSystemQuery(chemical_formula=formula_query, properties=property_query)
dataset_query = DatasetQuery(id=[Filter(equal=dataset_id)])
data_query = DataQuery(dataset=dataset_query, system=system_query)
pif_query = PifSystemReturningQuery(size=5000, random_results=True, query=data_query)

search_result = client.search.pif_search(pif_query)

print("We found {} records".format(len(search_result.hits)))
print([x.extracted for x in search_result.hits[0:2]])

We found 396 records
[{'Fracture Toughness': ['4.6', '11.1'], 'formula': 'Al2O3.xZrO2'}, {'Fracture Toughness': ['3.8'], 'formula': 'Si3N4'}]


In [221]:
from pypif import pif
import csv

rows = []
pif_records = [x.system for x in search_result.hits]
for system in pif_records:
    if "x" not in system.chemical_formula and "." not in system.chemical_formula:
        for prop in system.properties:
            if prop.name == "Fracture Toughness":
                for cond in prop.conditions:
                    if cond.name == "Temperature":
                        if len(prop.scalars) == len(cond.scalars):
                            for prop_sca, cond_sca in zip(prop.scalars, cond.scalars):
                                if prop_sca.value and cond_sca.value:
                                    if cond_sca.value.isdigit():
                                        try:
                                            float(prop_sca.value)
                                            row = [system.chemical_formula, float(prop_sca.value), cond_sca.value]
                                            rows.append(row)
                                        except ValueError as e:
                                            print(e)


with open('fracture_toughness.csv', 'w') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Formula', 'Fracture Toughness', "Temperature"])
    writer.writerows(rows)
    

In [222]:
from matminer.utils.conversions import str_to_composition
from matminer.featurizers.composition import ElementProperty
import pandas as pd

df = pd.read_csv('fracture_toughness.csv')

df["composition"] = df["Formula"].transform(str_to_composition)
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df_feat = ep_feat.featurize_dataframe(df, col_id="composition", ignore_errors=False)


In [223]:
df_feat = df_feat.drop(['Formula'], axis=1)
df_feat = df_feat.drop(['composition'], axis=1)

In [224]:
import numpy as np
labels = np.array(df_feat['Fracture Toughness'])
features = df_feat.drop('Fracture Toughness', axis = 1)
features = np.array(features)
feature_list = list(df_feat.columns)

print(features)

[[  25.            7.           14.         ...  208.14285714
    16.16326531  194.        ]
 [1000.            7.           14.         ...  208.14285714
    16.16326531  194.        ]
 [1200.            7.           14.         ...  208.14285714
    16.16326531  194.        ]
 ...
 [1200.            6.            6.         ...  194.
     0.          194.        ]
 [1160.            6.            6.         ...  194.
     0.          194.        ]
 [1190.            6.            6.         ...  194.
     0.          194.        ]]


In [225]:
from sklearn.model_selection import train_test_split
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)


Training Features Shape: (425, 133)
Training Labels Shape: (425,)
Testing Features Shape: (142, 133)
Testing Labels Shape: (142,)


In [226]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 100 decision trees
rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [227]:
# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2))
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Mean Absolute Error: 0.78
Accuracy: 75.09 %.


In [228]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];

Variable: Fracture Toughness   Importance: 0.35
Variable: avg_dev GSbandgap    Importance: 0.23
Variable: range GSbandgap      Importance: 0.12
Variable: mean NValence        Importance: 0.08
Variable: range MeltingT       Importance: 0.03
Variable: mode NfValence       Importance: 0.02
Variable: minimum MeltingT     Importance: 0.01
Variable: maximum Column       Importance: 0.01
Variable: range Electronegativity Importance: 0.01
Variable: maximum NValence     Importance: 0.01
Variable: mean NpUnfilled      Importance: 0.01
Variable: maximum GSvolume_pa  Importance: 0.01
Variable: range SpaceGroupNumber Importance: 0.01
Variable: Temperature          Importance: 0.0
Variable: minimum Number       Importance: 0.0
Variable: maximum Number       Importance: 0.0
Variable: range Number         Importance: 0.0
Variable: mean Number          Importance: 0.0
Variable: avg_dev Number       Importance: 0.0
Variable: mode Number          Importance: 0.0
Variable: minimum MendeleevNumber Importan