# 1 - Parse CSV / work up training data

GOAL: Convert training data to a format that is amenable to a machine learning model. This will also work on test_data.csv.
1. Load CSV in Pandas
2. stabilityVec is read in as a string; convert it to a list and break out (explode) the dataframe so that each row represents a unique compound and its stability.
3. After the breakout, the ordering of the rows implicitly defines the composition (row 1 = 100% A, 0% B; row 6 = 50% A, 50% B; row 11 = 0% A, 100% B). Let's define this explicitly with a FORMULA column.
4. The feature columns given are not weighted by composition, so it may be quicker to use our FORMULA column to generate weighted features in magpie.

output files:
1. "XXXX_exploded.csv" = breakout on stabilityVec
2. "XXXX_exploded_no_features.csv" = removed non-weighted features
3. "XXXX_exploded_magpie_featurized.csv" = added magpie features
4. "XXXX_exploded_magpie_featurized_predictions.csv" = added predictions column (rf = random forest, logreg = logistic regression)
5. "test_data_with_predictions.csv" = re-formatted initial test_data.csv with StabilityVec

In [1]:
# import relevant packages
import pandas as pd
import numpy as np
from ast import literal_eval
from pymatgen import Composition

# This placeholder vector is added to the test data so that df.explode()
# generates the correct number of rows (11 per input record).
def return_placeholder_vec(row):
    """Return a new list of eleven '0' strings; `row` is accepted but ignored."""
    return ['0'] * 11

In [2]:
# 1. Load the CSV with pandas. Switch between training and test data by
#    (un)commenting the matching path below.

input_file_path = 'data/training_data.csv'
# input_file_path = 'data/test_data.csv'


df = pd.read_csv(input_file_path)
print("original: ", len(df))

# Choose the column to explode on and make sure each cell holds a real list.
if 'test' in input_file_path:
    # Test data: attach the placeholder vector to every row.
    explode_column = 'sv_placeholder'
    df[explode_column] = df.apply(return_placeholder_vec, axis=1)
else:
    # Training data: stabilityVec comes out of the CSV as a string, so parse
    # it into a Python list first.
    explode_column = 'stabilityVec'
    df[explode_column] = df[explode_column].apply(literal_eval)

# Explode on the list column: one output row per list element.
df = df.explode(explode_column)
print("exploded: ", len(df))

original:  2572
exploded:  28292


In [3]:
# Calculate the weight columns for A and B (used later to build a composition
# column). Every original record was exploded into `n_steps` consecutive rows,
# so within each group the fraction of B steps 0.0, 0.1, ..., 1.0 while the
# fraction of A runs the opposite way (1.0 down to 0.0).
n_steps = 11
num_rows = len(df) // n_steps  # number of original (pre-explode) records

# round() strips the floating-point noise that linspace can leave behind.
fractions = [round(frac, 1) for frac in np.linspace(0, 1, n_steps)]
weights = fractions * num_rows

# If len(df) is not a multiple of n_steps, the assignments below raise a
# length-mismatch error from pandas instead of silently misaligning rows.
df['weight_element_A'] = fractions[::-1] * num_rows
df['weight_element_B'] = weights

In [4]:
# Add a composition column (useful for magpie featurization).
def calc_formula(row):
    """Build the mixed formula string for one row.

    Concatenates formulaA with its weight and formulaB with its weight,
    passes the result through pymatgen's Composition, and returns that
    object's `formula` attribute.
    """
    part_a = row['formulaA'] + str(row['weight_element_A'])
    part_b = row['formulaB'] + str(row['weight_element_B'])
    return Composition(part_a + part_b).formula

df['FORMULA'] = df.apply(calc_formula, axis=1)

  % self.symbol)
  % self.symbol)
  % self.symbol)


In [5]:
# Write the exploded dataframe (including the weight and FORMULA columns added
# above) next to the input file, as "<input name>_exploded.csv".
outfile_path = input_file_path.replace('.csv', '_exploded.csv')
df.to_csv(outfile_path)

In [6]:
# Since the features for A and B are not weighted by composition, it may be easier to use features generated in magpie.
# Let's filter on composition and generate new (properly weighted) features in magpie.
# Only the FORMULA and stabilityVec columns are requested; every other column is dropped from df here.
df = df.filter(['FORMULA', 'stabilityVec'], axis=1)
# Optional: uncomment to keep a single row per distinct FORMULA.
# df = df.drop_duplicates(subset='FORMULA')
# Written as "<input name>_exploded_no_features.csv".
outfile_path = input_file_path.replace('.csv', '_exploded_no_features.csv')
df.to_csv(outfile_path)

In [7]:
# Calculate composition-weighted features with matminer.
# For info on featurization see:
# https://github.com/hackingmaterials/matminer_examples/blob/master/matminer_examples/machine_learning-nb/bulk_modulus.ipynb
from matminer.featurizers.conversions import StrToComposition
from matminer.featurizers.composition import ElementProperty
# Convert the FORMULA strings; the result is consumed below through the "composition" column.
df = StrToComposition().featurize_dataframe(df, "FORMULA")
# Element-property featurizer configured with the "magpie" preset.
ep_feat = ElementProperty.from_preset(preset_name="magpie")
df = ep_feat.featurize_dataframe(df, col_id="composition")             # input the "composition" column to the featurizer







In [8]:
# Check for rows that didn't completely featurize.
# Rows with noble gases do not have electronegativity values, so we must drop
# either those rows or the affected features; here we drop the features.
# `axis` is passed by keyword: pandas >= 2.0 no longer accepts it positionally
# for DataFrame.any().
nan_rows = df[df.isnull().any(axis=1)]
print(len(nan_rows))
# Keep a copy of the incomplete rows for inspection.
nan_rows.to_csv('nan_rows.csv')
# Drops every column that contains at least one NaN, whatever the cause.
df = df.dropna(axis=1)

1843


In [9]:
# Write the featurized dataframe (after the NaN-containing columns were
# dropped) as "<input name>_exploded_magpie_featurized.csv".
outfile_path = input_file_path.replace('.csv', '_exploded_magpie_featurized.csv')
df.to_csv(outfile_path)