In [2]:
from matbench.bench import MatbenchBenchmark
import os
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import roc_auc_score  
from lgdcnn.fusion_lstm_dcnn import LGDCNN
from lgdcnn.train import Model
from lgdcnn.utils.get_compute_device import get_compute_device

# Reproducibility: seed every random number generator used by this notebook
# with the same fixed value.
RNG_SEED = 42
np.random.seed(RNG_SEED)
torch.manual_seed(RNG_SEED)

# Run-wide settings shared by the training and evaluation helpers below.
model_name = "L-G-DCNN-matbench"
compute_device = get_compute_device(prefer_last=False)

In [3]:
# %%
def get_model(data_dir, model_name, mat_prop, i, classification=False, batch_size=None,
              transfer=None, verbose=True):
    """Train an L-G-DCNN model on one material property and save the network.

    Parameters
    ----------
    data_dir : str
        Directory that contains ``{mat_prop}/train.csv`` and
        ``{mat_prop}/val.csv``.
    model_name : str
        Name handed to ``Model.save_network`` once training has finished.
    mat_prop : str
        Property / dataset name; selects the sub-directory of ``data_dir``.
    i : int
        Fold identifier. The model is created with the name
        ``f'{mat_prop}{i}'``.
    classification : bool, optional
        When True, ``model.classification`` is set to True before the data is
        loaded.
    batch_size : int, optional
        Batch size passed to both data loaders exactly as given. When None
        (the default) it is derived from the number of training rows:
        ``2**round(log2(n_rows) - 4)``, clamped to ``[2**7, 2**12]``, and half
        of that value is passed to the loaders.
    transfer : str, optional
        When given, weights are loaded from ``f'{transfer}.pth'`` before
        training and the model is renamed to ``mat_prop``.
    verbose : bool, optional
        Forwarded to ``Model``.

    Returns
    -------
    Model
        The fitted model wrapper.
    """
    # Build the network on the selected device and wrap it in the trainer.
    model = Model(LGDCNN(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}{i}', verbose=verbose)

    # Optionally start from pretrained weights instead of a fresh network.
    if transfer is not None:
        model.load_network(f'{transfer}.pth')
        model.model_name = f'{mat_prop}'

    # Flag classification tasks before any data is loaded.
    # NOTE(review): the original comment says this makes the model apply
    # BCEWithLogitsLoss; confirm against lgdcnn.train.Model.
    if classification:
        model.classification = True

    # Data files to learn from.
    train_data = f'{data_dir}/{mat_prop}/train.csv'
    val_data = f'{data_dir}/{mat_prop}/val.csv'

    if batch_size is None:
        # No explicit request: scale the batch size with the dataset size,
        # keep it within [2**7, 2**12], then use half of it for the loaders.
        data_size = pd.read_csv(train_data).shape[0]
        batch_size = 2**round(np.log2(data_size)-4)
        if batch_size < 2**7:
            batch_size = 2**7
        if batch_size > 2**12:
            batch_size = 2**12
        loader_batch_size = batch_size//2
    else:
        # An explicit batch size used to be silently overwritten by the
        # heuristic above; it is now respected as given.
        loader_batch_size = batch_size

    # Load the train and validation data before fitting the network.
    model.load_data(train_data, batch_size=loader_batch_size, train=True)
    print(f'training with batchsize {model.batch_size} '
          f'(2**{np.log2(model.batch_size):0.3f})')
    model.load_data(val_data, batch_size=loader_batch_size)

    # Fixed training length; no loss curve is plotted.
    model.fit(epochs=300, losscurve=False)

    # Persist the trained network under the requested name.
    model.save_network(model_name)
    return model

def load_model(data_dir, model_name, mat_prop, i, classification, file_name, verbose=True):
    """Rebuild a saved network and attach the dataset it should predict on.

    The model is created with the name ``f'{mat_prop}{i}'``, its weights are
    restored via ``load_network(model_name, f'{mat_prop}{i}.pth')``, and the
    CSV at ``{data_dir}/{mat_prop}/{file_name}`` is loaded with a batch size
    of ``2**9``. Returns the prepared ``Model``.
    """
    fold_tag = f'{mat_prop}{i}'
    network = LGDCNN(compute_device=compute_device).to(compute_device)
    restored = Model(network, model_name=fold_tag, verbose=verbose)
    restored.load_network(model_name, f'{fold_tag}.pth')

    # Flag classification tasks before the prediction data is loaded.
    if classification:
        restored.classification = True

    # Attach the data to predict on; callers read it back through
    # restored.data_loader.
    prediction_csv = f'{data_dir}/{mat_prop}/{file_name}'
    restored.load_data(prediction_csv, batch_size=2**9)
    return restored

def get_results(model):
    """Predict on the data loader already attached to ``model``.

    Returns a ``(model, output)`` tuple, where ``output`` is whatever
    ``model.predict(model.data_loader)`` returns.
    """
    return model, model.predict(model.data_loader)

def to_csv(output, save_name):
    """Write one prediction run to ``matbench_predictions/<save_name>``.

    ``output`` is unpacked as ``(actual, predicted, formulae, uncertainty)``.
    The CSV gets the columns ``formula``, ``actual``, ``predicted`` and
    ``uncertainty``, with the row index written under the label ``Index``.
    The output directory is created when it does not exist yet.
    """
    actual, predicted, formulas, sigma = output

    # One row per field, labelled up front, then transposed so that every
    # field becomes a column.
    table = pd.DataFrame(
        [formulas, actual, predicted, sigma],
        index=['formula', 'actual', 'predicted', 'uncertainty'],
    ).T

    save_path = 'matbench_predictions/'
    os.makedirs(save_path, exist_ok=True)
    table.to_csv(f'{save_path}/{save_name}', index_label='Index')
    

def save_results(data_dir, model_name, mat_prop, fold, classification, file_name, ):
    """Predict with a saved model on one data file, report a metric, save a CSV.

    Parameters
    ----------
    data_dir : str
        Directory that contains ``{mat_prop}/{file_name}``.
    model_name : str
        Name forwarded to ``load_model`` to locate the saved network.
    mat_prop : str
        Property / dataset name.
    fold : int
        Fold identifier; selects the saved network and is appended to the
        output file name.
    classification : bool
        Forwarded to ``load_model``.
    file_name : str
        CSV to predict on (for example ``'test.csv'``). It is used both to
        load the data and to name the prediction file.

    Returns
    -------
    tuple
        ``(model, output)``, where ``output[0]`` holds the actual values and
        ``output[1]`` the predictions.
    """
    # Load the file the caller asked for. Previously 'test.csv' was
    # hard-coded here, so any other file_name produced a CSV named after one
    # file but filled with predictions for another.
    model = load_model(data_dir, model_name, mat_prop, fold, classification,
                       file_name=file_name)
    model, output = get_results(model)

    # Report the metric that matches the task type.
    if model.classification:
        auc = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {auc:0.3f}')
    else:
        mae = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {mae:0.3g}')

    # Save the predictions to a CSV named after property, data file and fold.
    fname = f'{mat_prop}_{file_name.replace(".csv", "")}_output{fold}.csv'
    to_csv(output, fname)
    return model, output

In [4]:

# condense_formula takes a material and returns the chemical formula in the correct format for LGDCNN
def condense_formula(mat):
    """Return the formula string for ``mat``.

    Strings are returned unchanged. Any other object must expose a
    ``formula`` attribute, which is returned with all spaces removed.
    """
    return mat if isinstance(mat, str) else mat.formula.replace(' ', '')

# change_input runs condense_formula on every input it is given (training or test data)
def change_input(train_inputs):
    """Return a list with ``condense_formula`` applied to every input, in order."""
    return [condense_formula(entry) for entry in train_inputs]

#make_df creates a data frame containing the train inputs and outputs for LGDCNN
def make_df(train_inputs, train_outputs):
    """Build a two-column frame: ``formula`` (inputs) and ``target`` (outputs)."""
    return pd.DataFrame(dict(formula=train_inputs, target=train_outputs))

#make_df_test creates a data frame containing the test inputs for LGDCNN
def make_df_test(test_inputs, test_outputs):
    """Build the test frame: ``formula`` (inputs) and ``target`` (outputs)."""
    columns = {'formula': test_inputs, 'target': test_outputs}
    return pd.DataFrame(columns)

#split_train_val splits the training data into two sets: training and validation
def split_train_val(df):
    """Shuffle ``df`` and split it into training and validation frames.

    The rows are shuffled with a fixed seed, 10% of the shuffled rows are
    sampled (same seed) as the validation set, and every remaining row forms
    the training set. Both shapes are printed. Returns
    ``(train_df, val_df)``.
    """
    shuffled = df.sample(frac=1.0, random_state=7)
    held_out = shuffled.sample(frac=0.1, random_state=7)

    # Keep every shuffled row whose index label was not drawn for validation.
    remaining = shuffled.loc[~shuffled.index.isin(held_out.index)]

    print(remaining.shape, held_out.shape)
    return remaining, held_out

In [None]:
# Matbench tasks to benchmark.
subset = ["matbench_jdft2d", "matbench_steels",
          "matbench_perovskites", "matbench_expt_gap",
          "matbench_phonons", "matbench_dielectric",
          "matbench_log_gvrh", "matbench_log_kvrh",
          "matbench_mp_gap", "matbench_mp_e_form"]

mb = MatbenchBenchmark(autoload=False, subset=subset)

# Scratch directory for the per-task CSV files. They are rewritten for every
# fold, so each fold trains on and predicts from freshly written files.
data_dir = 'data/matbench_temp'
os.makedirs(data_dir, exist_ok=True)

# NOTE(review): not referenced by any cell visible here; kept in case other
# cells rely on it.
results_dict = {}

for task in mb.tasks:
    task.load()
    mat_prop = task.dataset_name
    os.makedirs(f'{data_dir}/{mat_prop}', exist_ok=True)
    for fold in task.folds:
        train_inputs, train_outputs = task.get_train_and_val_data(fold)
        test_inputs, test_outputs = task.get_test_data(fold, include_target=True)

        # Prepare the input data for LGDCNN.
        inputs = change_input(train_inputs)
        df = make_df(inputs, train_outputs)

        # Create the training and validation sets.
        train_df, val_df = split_train_val(df)
        train_df.to_csv(f'{data_dir}/{mat_prop}/train.csv')
        val_df.to_csv(f'{data_dir}/{mat_prop}/val.csv')

        # Get and prepare the testing data.
        test_inputs_formula = change_input(test_inputs)
        test_df = make_df_test(test_inputs_formula, test_outputs)
        test_df.to_csv(f'{data_dir}/{mat_prop}/test.csv')

        # Train LGDCNN on this fold (every task is run as regression).
        model = get_model(data_dir, model_name, mat_prop, fold,
                          classification=False, verbose=True)

        # Reload the saved network and predict on the test file.
        model_test, output = save_results(data_dir, model_name, mat_prop, fold,
                                          classification=False,
                                          file_name='test.csv')

        # Record the predictions (second element of the output) for this fold.
        predictions = output[1]
        task.record(fold, predictions)

# Save the whole benchmark once, under a fixed name. The file used to be
# named after `mat_prop`, i.e. whichever task the loop happened to finish on,
# although it contains every task; the compression step at the end of the
# notebook reads 'results.json'.
mb.to_file("results.json")

In [None]:
import gzip
import shutil

def compress_json(input_file, output_file):
    """Write a gzip-compressed copy of ``input_file`` to ``output_file``.

    The source is read as raw bytes and streamed into the gzip file; the
    source file itself is left untouched.
    """
    with open(input_file, 'rb') as raw, gzip.open(output_file, 'wb') as packed:
        shutil.copyfileobj(raw, packed)

# Gzip the benchmark results; 'results.json' must already exist in the
# current working directory, otherwise opening it fails.
compress_json('results.json', 'results.json.gz')