In [2]:

import sys
import os
# To get the absolute path of the directory where the lgdcnn module is located
lgdcnn_dir = r"D:\deep\LGDCNN"
# Add the directory where the lgdcnn module is located to the module search path
sys.path.append(lgdcnn_dir)

import numpy as np
import pandas as pd
import torch
from lgdcnn.fusion_lstm_dcnn import LGDCNN
# from lgdcnn.fusion_lstm_dcnn_v1 import LGDCNN
# from crabnet.model_application import Model
from lgdcnn.train import Model
from lgdcnn.utils.get_compute_device import get_compute_device

compute_device = get_compute_device(prefer_last=True)
RNG_SEED = 42
torch.manual_seed(RNG_SEED)  
np.random.seed(RNG_SEED)
from sklearn.metrics import roc_auc_score

lgdcnn_dir = r"D:\deep\LGDCNN"

  from .autonotebook import tqdm as notebook_tqdm


## train


In [3]:
def get_model(model_name, mat_prop, classification=False, batch_size=None,
              transfer=None, verbose=True):
    # Get the TorchedCrabNet architecture loaded
    model = Model(LGDCNN(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}', verbose=verbose)

    # Train network starting at pretrained weights
    if transfer is not None:
        model.load_network(f'{transfer}.pth')
        model.model_name = f'{mat_prop}'

    # Apply BCEWithLogitsLoss to model output if binary classification is True
    if classification:
        model.classification = True

    # Get the datafiles you will learn from
    train_data = os.path.join(lgdcnn_dir,"data", "benchmark_data", mat_prop, 'train.csv') 
    val_data = os.path.join(lgdcnn_dir,"data", "benchmark_data", mat_prop, 'val.csv')

    # Load the train and validation data before fitting the network
    data_size = pd.read_csv(train_data).shape[0]
    batch_size = 2**round(np.log2(data_size)-4)
    if batch_size < 2**7:
        batch_size = 2**7
    if batch_size > 2**12:
        batch_size = 2**12
    # batch_size = 2**7
    model.load_data(train_data, batch_size=batch_size, train=True)
    print(f'training with batchsize {model.batch_size} '
          f'(2**{np.log2(model.batch_size):0.3f})')
    model.load_data(val_data, batch_size=batch_size)

    # Set the number of epochs, decide if you want a loss curve to be plotted
    model.fit(epochs=300, losscurve=False)

    # Save the network (saved as f"{model_name}.pth")
    model.save_network(model_name)
    return model


def to_csv(output, save_name):
    # parse output and save to csv
    act, pred, formulae, uncertainty = output
    df = pd.DataFrame([formulae, act, pred, uncertainty]).T
    df.columns = ['formula', 'actual', 'predicted', 'uncertainty']
    save_path = os.path.join(lgdcnn_dir,"results", "Benchmark") 
    os.makedirs(save_path, exist_ok=True)
    df.to_csv(f'{save_path}/{save_name}', index_label='Index')


def load_model(lgdcnn_dir, model_name, mat_prop, classification, file_name, verbose=True):
    # Load up a saved network.
    model = Model(LGDCNN(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}', verbose=verbose)
    model.load_network(model_name, f'{mat_prop}.pth') # multi_lstm_attention_residual_dpcnn_V8_512

    # Check if classifcation task
    if classification:
        model.classification = True

    # Load the data you want to predict with
    data = os.path.join(lgdcnn_dir,"data","benchmark_data",mat_prop,file_name)
    # data is reloaded to model.data_loader
    model.load_data(data, batch_size=2**9, train=False)
    return model


def get_results(model):
    output = model.predict(model.data_loader)  # predict the data saved here
    return model, output


def save_results( lgdcnn_dir, model_name, mat_prop, classification, file_name, verbose=True):
    model = load_model( lgdcnn_dir, model_name, mat_prop, classification, file_name, verbose=verbose)
    model, output = get_results(model)

    # Get appropriate metrics for saving to csv
    if model.classification:
        auc = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {auc:0.4f}')
    else:
        mae = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {mae:0.4g}')
        
     # save predictions to a csv
    fname = f'{mat_prop}_{file_name.replace(".csv", "")}_output.csv'
    to_csv(output, fname)
    return model, mae


if __name__ == '__main__':
    model_name = "L-G-DCNN-TEST"
    # Get data to benchmark on
    # data_dir = 'data/benchmark_data'
    benchmark_data_dir = os.path.join(lgdcnn_dir,"data","benchmark_data")
    mat_props = os.listdir(benchmark_data_dir)
    classification_list = []
    print(f'training: {mat_props}')
    for mat_prop in mat_props:
        classification = False
        if mat_prop in classification_list:
            classification = True
        print(f'property: {mat_prop}')
        model = get_model(model_name, mat_prop, classification, verbose=True)
        print('=====================================================')
        print('calculating test mae')
        model_test, t_mae = save_results(lgdcnn_dir,model_name, mat_prop, classification,
                                         'test.csv', verbose=False)
        print('calculating val mae')
        model_val, v_mae = save_results(lgdcnn_dir, model_name, mat_prop, classification,
                                        'val.csv', verbose=False)
        print('=====================================================')


training: ['aflow__ael_bulk_modulus_vrh', 'aflow__ael_debye_temperature', 'aflow__ael_shear_modulus_vrh', 'aflow__agl_thermal_conductivity_300K', 'aflow__agl_thermal_expansion_300K', 'aflow__Egap', 'aflow__energy_atom', 'CritExam__Ed', 'CritExam__Ef', 'mp_bulk_modulus', 'mp_elastic_anisotropy', 'mp_e_hull', 'mp_mu_b', 'mp_shear_modulus', 'OQMD_Bandgap', 'OQMD_Energy_per_atom', 'OQMD_Formation_Enthalpy', 'OQMD_Volume_per_atom']
property: aflow__ael_bulk_modulus_vrh
Running on compute device: cuda:0
Model size: 8635911 parameters



Generating EDM: 100%|██████████| 3428/3428 [00:00<00:00, 193834.67formulae/s]


loading data with up to 3 elements in the formula
training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 732/732 [00:00<00:00, 244192.36formulae/s]

loading data with up to 3 elements in the formula
stepping every 140 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler





Epoch: 0/300 --- train mae: 53 val mae: 54
Epoch: 19/300 --- train mae: 10.9 val mae: 13.8
Epoch: 39/300 --- train mae: 7.39 val mae: 11.1
Epoch: 59/300 --- train mae: 5.17 val mae: 10.4
Epoch: 79/300 --- train mae: 3.65 val mae: 10.3
Epoch: 99/300 --- train mae: 2.69 val mae: 10.2
Epoch: 119/300 --- train mae: 2.15 val mae: 10.1
Epoch: 139/300 --- train mae: 1.75 val mae: 9.99
Epoch: 159/300 --- train mae: 1.48 val mae: 9.87
Epoch: 179/300 --- train mae: 1.25 val mae: 9.82
Epoch: 199/300 --- train mae: 1.14 val mae: 9.86
Epoch: 219/300 --- train mae: 0.956 val mae: 9.83
Epoch: 239/300 --- train mae: 0.911 val mae: 9.84
Epoch: 259/300 --- train mae: 0.836 val mae: 9.85
Epoch: 279/300 --- train mae: 0.808 val mae: 9.84
Epoch: 299/300 --- train mae: 0.72 val mae: 9.79
Saving network (aflow__ael_bulk_modulus_vrh) to D:\deep\LGDCNN\models\Benchmark\L-G-DCNN-TEST\aflow__ael_bulk_modulus_vrh.pth
calculating test mae
loading data with up to 3 elements in the formula

aflow__ael_bulk_modulus_v

Generating EDM: 100%|██████████| 3428/3428 [00:00<00:00, 215076.43formulae/s]

loading data with up to 3 elements in the formula





training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 732/732 [00:00<00:00, 197607.68formulae/s]


loading data with up to 3 elements in the formula
stepping every 140 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 125 val mae: 127
Epoch: 19/300 --- train mae: 37.1 val mae: 44.1
Epoch: 39/300 --- train mae: 25.5 val mae: 38.9
Epoch: 59/300 --- train mae: 16.9 val mae: 37.9
Epoch: 79/300 --- train mae: 11.1 val mae: 36.4
Epoch: 99/300 --- train mae: 7.23 val mae: 35.9
Epoch: 119/300 --- train mae: 5.14 val mae: 35.2
Epoch: 139/300 --- train mae: 4.19 val mae: 34.8
Epoch: 159/300 --- train mae: 3.4 val mae: 34.8
Epoch: 179/300 --- train mae: 3.15 val mae: 34.8
Epoch: 199/300 --- train mae: 2.97 val mae: 34.6
Epoch: 219/300 --- train mae: 2.65 val mae: 34.5
Epoch: 239/300 --- train mae: 2.62 val mae: 34.6
Epoch: 259/300 --- train mae: 2.26 val mae: 34.1
Epoch: 279/300 --- train mae: 2.28 val mae: 33.8
Epoch: 299/300 --- train mae: 2.06 val mae: 34.1
Saving network (aflow__ael_debye_temperature) to D:\deep\LGDCNN\models

Generating EDM: 100%|██████████| 3428/3428 [00:00<00:00, 198455.13formulae/s]

loading data with up to 3 elements in the formula





training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 732/732 [00:00<00:00, 243901.38formulae/s]


loading data with up to 3 elements in the formula
stepping every 140 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 29.6 val mae: 30
Epoch: 19/300 --- train mae: 10.3 val mae: 12.3
Epoch: 39/300 --- train mae: 6.89 val mae: 10.9
Epoch: 59/300 --- train mae: 4.55 val mae: 10.7
Epoch: 79/300 --- train mae: 2.98 val mae: 10.5
Epoch: 99/300 --- train mae: 1.96 val mae: 10.4
Epoch: 119/300 --- train mae: 1.41 val mae: 10
Epoch: 139/300 --- train mae: 1.16 val mae: 10
Epoch: 159/300 --- train mae: 0.938 val mae: 9.85
Epoch 179 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 179/300 --- train mae: 0.84 val mae: 9.93
Epoch: 199/300 --- train mae: 0.779 val mae: 9.84
Epoch: 219/300 --- train mae: 0.671 val mae: 9.8
Epoch: 239/300 --- train mae: 0.65 val mae: 9.84
Epoch: 259/300 --- train mae: 0.604 val mae: 9.75
Epoch: 279/300 --- train mae: 0.53 val mae: 9.64
Epoch: 299/300 --- train mae: 0.494 val mae: 9.59
Saving

Generating EDM: 100%|██████████| 3422/3422 [00:00<00:00, 203737.63formulae/s]

loading data with up to 3 elements in the formula





training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 733/733 [00:00<00:00, 244370.47formulae/s]


loading data with up to 3 elements in the formula
stepping every 140 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 5.44 val mae: 4.9
Epoch: 19/300 --- train mae: 2.19 val mae: 2.53
Epoch: 39/300 --- train mae: 1.51 val mae: 2.29
Epoch: 59/300 --- train mae: 0.925 val mae: 2.18
Epoch: 79/300 --- train mae: 0.624 val mae: 2.16
Epoch: 99/300 --- train mae: 0.449 val mae: 2.16
Epoch 119 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 119/300 --- train mae: 0.352 val mae: 2.18
Epoch: 139/300 --- train mae: 0.287 val mae: 2.16
Epoch: 159/300 --- train mae: 0.246 val mae: 2.16
Epoch: 179/300 --- train mae: 0.226 val mae: 2.17
Epoch: 199/300 --- train mae: 0.2 val mae: 2.15
Epoch 219 failed to improve.
Discarded: 2/5 weight updates ♻🗑️
Epoch: 219/300 --- train mae: 0.18 val mae: 2.17
Epoch: 239/300 --- train mae: 0.176 val mae: 2.15
Epoch: 259/300 --- train mae: 0.16 val mae: 2.16
Epoch 279 failed to improve.
Disc

Generating EDM: 100%|██████████| 3421/3421 [00:00<00:00, 220220.92formulae/s]


loading data with up to 3 elements in the formula
training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 733/733 [00:00<00:00, 244312.21formulae/s]

loading data with up to 3 elements in the formula
stepping every 140 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler





Epoch: 0/300 --- train mae: 2.48e-05 val mae: 2.42e-05
Epoch: 19/300 --- train mae: 5.04e-06 val mae: 5.58e-06
Epoch: 39/300 --- train mae: 3.33e-06 val mae: 4.48e-06
Epoch: 59/300 --- train mae: 2.48e-06 val mae: 4.2e-06
Epoch: 79/300 --- train mae: 1.89e-06 val mae: 4.07e-06
Epoch: 99/300 --- train mae: 1.54e-06 val mae: 4.04e-06
Epoch: 119/300 --- train mae: 1.35e-06 val mae: 4.02e-06
Epoch: 139/300 --- train mae: 1.21e-06 val mae: 4e-06
Epoch: 159/300 --- train mae: 1.07e-06 val mae: 3.99e-06
Epoch: 179/300 --- train mae: 1e-06 val mae: 4.03e-06
Epoch: 199/300 --- train mae: 9.09e-07 val mae: 3.98e-06
Epoch: 219/300 --- train mae: 8.73e-07 val mae: 3.97e-06
Epoch: 239/300 --- train mae: 8e-07 val mae: 3.98e-06
Epoch: 259/300 --- train mae: 7.57e-07 val mae: 3.92e-06
Epoch 279 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 279/300 --- train mae: 7.14e-07 val mae: 3.96e-06
Epoch: 299/300 --- train mae: 7.29e-07 val mae: 3.97e-06
Saving network (aflow__agl_thermal_expansi

Generating EDM: 100%|██████████| 19330/19330 [00:00<00:00, 84128.24formulae/s]


loading data with up to 8 elements in the formula
training with batchsize 1024 (2**10.000)


Generating EDM: 100%|██████████| 4125/4125 [00:00<00:00, 123074.93formulae/s]


loading data with up to 8 elements in the formula
stepping every 190 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 1.37 val mae: 1.35
Epoch: 19/300 --- train mae: 0.452 val mae: 0.47
Epoch: 39/300 --- train mae: 0.326 val mae: 0.381
Epoch: 59/300 --- train mae: 0.247 val mae: 0.339
Epoch: 79/300 --- train mae: 0.18 val mae: 0.327
Epoch: 99/300 --- train mae: 0.126 val mae: 0.321
Epoch: 119/300 --- train mae: 0.0893 val mae: 0.318
Epoch: 139/300 --- train mae: 0.0657 val mae: 0.319
Epoch: 159/300 --- train mae: 0.0504 val mae: 0.32
Epoch: 179/300 --- train mae: 0.0405 val mae: 0.318
Epoch: 199/300 --- train mae: 0.034 val mae: 0.317
Epoch 219 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 219/300 --- train mae: 0.0303 val mae: 0.32
Epoch: 239/300 --- train mae: 0.0268 val mae: 0.318
Epoch: 259/300 --- train mae: 0.0246 val mae: 0.318
Epoch: 279/300 --- train mae: 0.0232 val mae: 0.314
Epoch: 299/300 --- tr

Generating EDM: 100%|██████████| 19344/19344 [00:00<00:00, 181680.73formulae/s]


loading data with up to 8 elements in the formula
training with batchsize 1024 (2**10.000)


Generating EDM: 100%|██████████| 4118/4118 [00:00<00:00, 188655.26formulae/s]

loading data with up to 8 elements in the formula
stepping every 190 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler





Epoch: 0/300 --- train mae: 1.38 val mae: 1.36
Epoch: 19/300 --- train mae: 0.179 val mae: 0.183
Epoch: 39/300 --- train mae: 0.112 val mae: 0.126
Epoch: 59/300 --- train mae: 0.0894 val mae: 0.111
Epoch: 79/300 --- train mae: 0.0756 val mae: 0.105
Epoch: 99/300 --- train mae: 0.0664 val mae: 0.103
Epoch: 119/300 --- train mae: 0.0574 val mae: 0.101
Epoch: 139/300 --- train mae: 0.0493 val mae: 0.101
Epoch: 159/300 --- train mae: 0.0425 val mae: 0.101
Epoch: 179/300 --- train mae: 0.0369 val mae: 0.101
Epoch: 199/300 --- train mae: 0.033 val mae: 0.1
Epoch: 219/300 --- train mae: 0.0302 val mae: 0.101
Epoch: 239/300 --- train mae: 0.0282 val mae: 0.1
Epoch: 259/300 --- train mae: 0.0266 val mae: 0.0999
Epoch: 279/300 --- train mae: 0.0251 val mae: 0.0996
Epoch: 299/300 --- train mae: 0.024 val mae: 0.1
Saving network (aflow__energy_atom) to D:\deep\LGDCNN\models\Benchmark\L-G-DCNN-TEST\aflow__energy_atom.pth
calculating test mae
loading data with up to 7 elements in the formula

aflow_

Generating EDM: 100%|██████████| 59509/59509 [00:00<00:00, 167730.78formulae/s]


loading data with up to 9 elements in the formula
training with batchsize 4096 (2**12.000)


Generating EDM: 100%|██████████| 12752/12752 [00:00<00:00, 175368.91formulae/s]


loading data with up to 9 elements in the formula
stepping every 150 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 0.112 val mae: 0.113
Epoch: 19/300 --- train mae: 0.0877 val mae: 0.0895
Epoch: 39/300 --- train mae: 0.0638 val mae: 0.0756
Epoch: 59/300 --- train mae: 0.0475 val mae: 0.0699
Epoch: 79/300 --- train mae: 0.0365 val mae: 0.0681
Epoch: 99/300 --- train mae: 0.0289 val mae: 0.0673
Epoch: 119/300 --- train mae: 0.0235 val mae: 0.0664
Epoch: 139/300 --- train mae: 0.0198 val mae: 0.066
Epoch: 159/300 --- train mae: 0.0167 val mae: 0.0656
Epoch: 179/300 --- train mae: 0.0146 val mae: 0.0654
Epoch: 199/300 --- train mae: 0.013 val mae: 0.0651
Epoch: 219/300 --- train mae: 0.0116 val mae: 0.0649
Epoch: 239/300 --- train mae: 0.0106 val mae: 0.0648
Epoch: 259/300 --- train mae: 0.00965 val mae: 0.0646
Epoch: 279/300 --- train mae: 0.00883 val mae: 0.0642
Epoch: 299/300 --- train mae: 0.00813 val mae: 0.064
Savin

Generating EDM: 100%|██████████| 59509/59509 [00:00<00:00, 172790.99formulae/s]


loading data with up to 9 elements in the formula
training with batchsize 4096 (2**12.000)


Generating EDM: 100%|██████████| 12752/12752 [00:00<00:00, 175574.43formulae/s]


loading data with up to 9 elements in the formula
stepping every 150 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 0.935 val mae: 0.939
Epoch: 19/300 --- train mae: 0.16 val mae: 0.159
Epoch: 39/300 --- train mae: 0.0962 val mae: 0.101
Epoch: 59/300 --- train mae: 0.0715 val mae: 0.0813
Epoch: 79/300 --- train mae: 0.058 val mae: 0.0725
Epoch: 99/300 --- train mae: 0.0486 val mae: 0.0674
Epoch: 119/300 --- train mae: 0.0415 val mae: 0.0649
Epoch: 139/300 --- train mae: 0.0358 val mae: 0.0634
Epoch: 159/300 --- train mae: 0.0311 val mae: 0.0623
Epoch: 179/300 --- train mae: 0.0266 val mae: 0.0614
Epoch: 199/300 --- train mae: 0.0229 val mae: 0.0608
Epoch: 219/300 --- train mae: 0.0201 val mae: 0.0604
Epoch: 239/300 --- train mae: 0.018 val mae: 0.0602
Epoch: 259/300 --- train mae: 0.016 val mae: 0.0599
Epoch: 279/300 --- train mae: 0.0145 val mae: 0.0598
Epoch: 299/300 --- train mae: 0.0133 val mae: 0.0596
Saving netwo

Generating EDM: 100%|██████████| 4414/4414 [00:00<00:00, 214112.41formulae/s]


loading data with up to 6 elements in the formula
training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 946/946 [00:00<00:00, 204526.37formulae/s]


loading data with up to 6 elements in the formula
stepping every 180 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 58.6 val mae: 57.5
Epoch: 19/300 --- train mae: 13.6 val mae: 15.2
Epoch: 39/300 --- train mae: 9.09 val mae: 12.9
Epoch: 59/300 --- train mae: 6.35 val mae: 12.1
Epoch: 79/300 --- train mae: 4.67 val mae: 11.7
Epoch: 99/300 --- train mae: 3.73 val mae: 11.7
Epoch 119 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 119/300 --- train mae: 2.99 val mae: 11.8
Epoch: 139/300 --- train mae: 2.56 val mae: 11.7
Epoch: 159/300 --- train mae: 2.16 val mae: 11.7
Epoch: 179/300 --- train mae: 1.83 val mae: 11.6
Epoch: 199/300 --- train mae: 1.66 val mae: 11.8
Epoch: 219/300 --- train mae: 1.59 val mae: 11.6
Epoch: 239/300 --- train mae: 1.5 val mae: 11.6
Epoch: 259/300 --- train mae: 1.41 val mae: 11.5
Epoch: 279/300 --- train mae: 1.32 val mae: 11.5
Epoch: 299/300 --- train mae: 1.25 val mae: 11.6
Savin

Generating EDM: 100%|██████████| 4431/4431 [00:00<00:00, 97818.67formulae/s]


loading data with up to 6 elements in the formula
training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 950/950 [00:00<00:00, 344240.93formulae/s]


loading data with up to 6 elements in the formula
stepping every 180 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 6.68 val mae: 6.72
Epoch: 19/300 --- train mae: 6.46 val mae: 6.51
Epoch 39 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 39/300 --- train mae: 6.07 val mae: 6.61
Epoch: 59/300 --- train mae: 5.14 val mae: 6.55
Epoch: 79/300 --- train mae: 4.59 val mae: 6.6
Epoch 99 failed to improve.
Discarded: 2/5 weight updates ♻🗑️
Epoch: 99/300 --- train mae: 4.25 val mae: 6.59
Epoch: 119/300 --- train mae: 3.96 val mae: 6.55
Epoch: 139/300 --- train mae: 3.6 val mae: 6.53
Epoch: 159/300 --- train mae: 3.37 val mae: 6.53
Epoch: 179/300 --- train mae: 3.17 val mae: 6.55
Epoch 199 failed to improve.
Discarded: 3/5 weight updates ♻🗑️
Epoch: 199/300 --- train mae: 3.05 val mae: 6.59
Epoch: 219/300 --- train mae: 2.95 val mae: 6.54
Epoch: 239/300 --- train mae: 2.85 val mae: 6.51
Epoch: 259/300 --- train mae:

Generating EDM: 100%|██████████| 39663/39663 [00:00<00:00, 173515.86formulae/s]


loading data with up to 8 elements in the formula
training with batchsize 2048 (2**11.000)


Generating EDM: 100%|██████████| 8499/8499 [00:00<00:00, 219703.85formulae/s]


loading data with up to 8 elements in the formula
stepping every 200 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 0.19 val mae: 0.202
Epoch: 19/300 --- train mae: 0.113 val mae: 0.124
Epoch: 39/300 --- train mae: 0.0895 val mae: 0.103
Epoch: 59/300 --- train mae: 0.0715 val mae: 0.0973
Epoch: 79/300 --- train mae: 0.0614 val mae: 0.096
Epoch: 99/300 --- train mae: 0.0529 val mae: 0.0932
Epoch 119 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 119/300 --- train mae: 0.049 val mae: 0.0946
Epoch: 139/300 --- train mae: 0.0445 val mae: 0.0924
Epoch: 159/300 --- train mae: 0.0406 val mae: 0.0917
Epoch: 179/300 --- train mae: 0.0333 val mae: 0.09
Epoch 199 failed to improve.
Discarded: 2/5 weight updates ♻🗑️
Epoch: 199/300 --- train mae: 0.0307 val mae: 0.0914
Epoch: 219/300 --- train mae: 0.027 val mae: 0.0901
Epoch: 239/300 --- train mae: 0.0283 val mae: 0.0896
Epoch: 259/300 --- train mae: 0.024 val mae: 0.

Generating EDM: 100%|██████████| 39663/39663 [00:00<00:00, 172837.49formulae/s]


loading data with up to 8 elements in the formula
training with batchsize 2048 (2**11.000)


Generating EDM: 100%|██████████| 8499/8499 [00:00<00:00, 184745.53formulae/s]


loading data with up to 8 elements in the formula
stepping every 200 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 5.82 val mae: 5.74
Epoch: 19/300 --- train mae: 2.83 val mae: 2.84
Epoch: 39/300 --- train mae: 2.25 val mae: 2.41
Epoch: 59/300 --- train mae: 1.93 val mae: 2.25
Epoch: 79/300 --- train mae: 1.61 val mae: 2.15
Epoch: 99/300 --- train mae: 1.4 val mae: 2.11
Epoch: 119/300 --- train mae: 1.23 val mae: 2.09
Epoch: 139/300 --- train mae: 1.1 val mae: 2.09
Epoch: 159/300 --- train mae: 0.98 val mae: 2.08
Epoch: 179/300 --- train mae: 0.893 val mae: 2.09
Epoch: 199/300 --- train mae: 0.816 val mae: 2.1
Epoch: 219/300 --- train mae: 0.755 val mae: 2.1
Epoch: 239/300 --- train mae: 0.708 val mae: 2.09
Epoch: 259/300 --- train mae: 0.665 val mae: 2.09
Epoch: 279/300 --- train mae: 0.627 val mae: 2.09
Epoch: 299/300 --- train mae: 0.599 val mae: 2.1
Saving network (mp_mu_b) to D:\deep\LGDCNN\models\Benchmark\L-G-D

Generating EDM: 100%|██████████| 4328/4328 [00:00<00:00, 183309.41formulae/s]

loading data with up to 6 elements in the formula





training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 928/928 [00:00<00:00, 231781.94formulae/s]


loading data with up to 6 elements in the formula
stepping every 170 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 30.7 val mae: 30.1
Epoch: 19/300 --- train mae: 11.4 val mae: 13.7
Epoch: 39/300 --- train mae: 7.52 val mae: 12.2
Epoch: 59/300 --- train mae: 4.66 val mae: 12.1
Epoch: 79/300 --- train mae: 3.15 val mae: 12
Epoch: 99/300 --- train mae: 2.2 val mae: 11.9
Epoch: 119/300 --- train mae: 1.72 val mae: 11.8
Epoch: 139/300 --- train mae: 1.36 val mae: 11.8
Epoch: 159/300 --- train mae: 1.15 val mae: 11.7
Epoch: 179/300 --- train mae: 0.96 val mae: 11.7
Epoch: 199/300 --- train mae: 0.848 val mae: 11.7
Epoch: 219/300 --- train mae: 0.778 val mae: 11.6
Epoch: 239/300 --- train mae: 0.716 val mae: 11.6
Epoch: 259/300 --- train mae: 0.674 val mae: 11.5
Epoch: 279/300 --- train mae: 0.619 val mae: 11.6
Epoch: 299/300 --- train mae: 0.57 val mae: 11.5
Saving network (mp_shear_modulus) to D:\deep\LGDCNN\models\Benchm

Generating EDM: 100%|██████████| 239125/239125 [00:01<00:00, 152831.47formulae/s]


loading data with up to 7 elements in the formula
training with batchsize 4096 (2**12.000)


Generating EDM: 100%|██████████| 51241/51241 [00:00<00:00, 170838.52formulae/s]


loading data with up to 7 elements in the formula
stepping every 590 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 0.145 val mae: 0.147
Epoch: 19/300 --- train mae: 0.0786 val mae: 0.0826
Epoch: 39/300 --- train mae: 0.0608 val mae: 0.0662
Epoch: 59/300 --- train mae: 0.0511 val mae: 0.0602
Epoch: 79/300 --- train mae: 0.0457 val mae: 0.0567
Epoch: 99/300 --- train mae: 0.0411 val mae: 0.0551
Epoch: 119/300 --- train mae: 0.0393 val mae: 0.0549
Epoch: 139/300 --- train mae: 0.0374 val mae: 0.0535
Epoch: 159/300 --- train mae: 0.0339 val mae: 0.0517
Epoch: 179/300 --- train mae: 0.0322 val mae: 0.0515
Epoch: 199/300 --- train mae: 0.0301 val mae: 0.0505
Epoch: 219/300 --- train mae: 0.0277 val mae: 0.049
Epoch 239 failed to improve.
Discarded: 1/5 weight updates ♻🗑️
Epoch: 239/300 --- train mae: 0.028 val mae: 0.0503
Epoch: 259/300 --- train mae: 0.025 val mae: 0.0489
Epoch: 279/300 --- train mae: 0.0235 val mae: 0.048

Generating EDM: 100%|██████████| 239190/239190 [00:03<00:00, 76012.31formulae/s]


loading data with up to 7 elements in the formula
training with batchsize 4096 (2**12.000)


Generating EDM: 100%|██████████| 51255/51255 [00:00<00:00, 106172.51formulae/s]


loading data with up to 7 elements in the formula
stepping every 590 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 1.54 val mae: 1.54
Epoch: 19/300 --- train mae: 0.0804 val mae: 0.0823
Epoch: 39/300 --- train mae: 0.0541 val mae: 0.0592
Epoch: 59/300 --- train mae: 0.0439 val mae: 0.0515
Epoch: 79/300 --- train mae: 0.038 val mae: 0.0476


### application for Element contribution to property prediction as a function of composition

In [None]:
# application 没有get_model就是没有train的过程

def load_model(mat_prop, classification, file_name, verbose=True):
    # Load up a saved network.
    model = Model(LGDCNN(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}', verbose=verbose)
    model.load_network(f'{mat_prop}.pth') # multi_lstm_attention_residual_dpcnn_V8_512

    # Check if classifcation task
    if classification:
        model.classification = True

    # mat_prop1 = 'MP_e_form'
    mat_prop1 = 'Mp_gap'
    # mat_prop1 = 'MP'
    # mat_prop1 = 'MP_e_above_hull'
    # mat_prop1 = 'MP_magnetism'
    # mat_prop ='MP_dielectric'
    # mat_prop1 = 'MP_Bulk_Modulus'
    # mat_prop1 = 'MP_Shear_Modulus'
    
    # mat_prop1 = 'OQMD_Formation_Enthalpy'
    # Load the data you want to predict with
    data = f'data/application/{mat_prop1}/{file_name}'
    # data is reloaded to model.data_loader
    model.load_data(data, batch_size=2**9, train=False)
    return model

def to_csv(output, save_name):
    # parse output and save to csv
    act, pred, formulae, uncertainty = output
    print(len(formulae))
    df = pd.DataFrame([formulae,act, pred,uncertainty]).T
    df.columns = [ 'formula','actual', 'predicted', 'uncertainty']
    save_path = 'data/application/prediction'
    # save_path = 'publication_predictions/onehot_matbench__predictions'
    # save_path = 'publication_predictions/random_200_matbench__predictions'
    os.makedirs(save_path, exist_ok=True)
    df.to_csv(f'{save_path}/{save_name}', index_label='Index')


def get_results(model):
    output = model.predict(model.data_loader)  # predict the data saved here
    return model, output


def save_results(mat_prop, classification, file_name, verbose=True):
    model = load_model(mat_prop, classification, file_name, verbose=verbose)
    model, output = get_results(model)

    # Get appropriate metrics for saving to csv
    if model.classification:
        auc = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {auc:0.3f}')
    else:
        mae = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {mae:0.3g}')

    # save predictions to a csv
    fname = f'{mat_prop}_{file_name.replace(".csv", "")}_crabnet.csv'
    to_csv(output, fname)
    return model, mae


if __name__ == '__main__':
    # data_dir = 'data/benchmark_data'
    # mat_props = os.listdir(data_dir)
    classification_list = []
    
    # mat_props =  [ 'matbench_mp_gap4', 'mp_bulk_modulus', 'mp_elastic_anisotropy', 'mp_e_hull', 
    # 'mp_mu_b', 'mp_shear_modulus','matbench_mp_e_form0']
    # mat_props = ['OQMD_Formation_Enthalpy']
    

    mat_props = ['matbench_mp_gap3']
    # mat_props = ['MP_Formation_Enthalpy']
    print(f'training: {mat_props}')
    for mat_prop in mat_props:
        classification = False
        if mat_prop in classification_list:
            classification = True
        print(f'property: {mat_prop}')
        # model = get_model(mat_prop, classification, verbose=True)

        print('=====================================================')
        # print('calculating train mae')
        # model_train, mae_train = save_results( mat_prop, classification,
                                        #   'train.csv', verbose=False)
        print('=====================================================')
        print('calculating test mae')
        model_test, t_mae = save_results(mat_prop, classification,
                                        'cu_O_mp_gap.csv', verbose=False)
        # print('calculating val mae')
        # model_val, v_mae = save_results(mat_prop, classification,
                                        # 'val.csv', verbose=False)
        print('=====================================================')

training: ['matbench_mp_gap3']
property: matbench_mp_gap3
calculating test mae
loading data with up to 2 elements in the formula

matbench_mp_gap3 mae: 0.226
63


## test LGDCNN

In [None]:
lgdcnn_dir = r"D:\deep\LGDCNN"

def load_model(model_name, mat_prop, classification, file_name, verbose=True):
    # Load up a saved network.
    model = Model(LGDCNN(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}', verbose=verbose)
    model.load_network(model_name, f'{mat_prop}.pth') 

    # Check if classifcation task
    if classification:
        model.classification = True

    # Load the data you want to predict with
    data = os.path.join(lgdcnn_dir,"data","benchmark_data",mat_prop,file_name)
    # data is reloaded to model.data_loader
    model.load_data(data, batch_size=2**9, train=False)
    return model


def get_results(model):
    output = model.predict(model.data_loader)  # predict the data saved here
    return model, output


def save_results(model_name, mat_prop, classification, file_name, verbose=True):
    model = load_model(model_name, mat_prop, classification, file_name, verbose=verbose)
    model, output = get_results(model)

    # Get appropriate metrics for saving to csv
    if model.classification:
        auc = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {auc:0.4f}')
    else:
        mae = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {mae:0.4g}')

    return model, mae


if __name__ == '__main__':
    # choose model
    model_name = "L-G-DCNN-v1"
    # To construct the path of the benchmark_data folder
    benchmark_data_dir = os.path.join(lgdcnn_dir,"data","benchmark_data")
    mat_props = os.listdir(benchmark_data_dir)
    classification_list = []
    print(f'training: {mat_props}')
    for mat_prop in mat_props:
        classification = False
        if mat_prop in classification_list:
            classification = True
        print(f'property: {mat_prop}')
        print('=====================================================')
        print('calculating test mae')
        model_test, t_mae = save_results(model_name, mat_prop, classification,
                                         'test.csv', verbose=False)

training: ['aflow__ael_bulk_modulus_vrh', 'aflow__ael_debye_temperature', 'aflow__ael_shear_modulus_vrh', 'aflow__agl_thermal_conductivity_300K', 'aflow__agl_thermal_expansion_300K', 'aflow__Egap', 'aflow__energy_atom', 'CritExam__Ed', 'CritExam__Ef', 'mp_bulk_modulus', 'mp_elastic_anisotropy', 'mp_e_hull', 'mp_mu_b', 'mp_shear_modulus', 'OQMD_Bandgap', 'OQMD_Energy_per_atom', 'OQMD_Formation_Enthalpy', 'OQMD_Volume_per_atom']
property: aflow__ael_bulk_modulus_vrh
calculating test mae
L-G-DCNN-v1
loading data with up to 3 elements in the formula

aflow__ael_bulk_modulus_vrh mae: 8.367
property: aflow__ael_debye_temperature
calculating test mae
L-G-DCNN-v1
loading data with up to 3 elements in the formula

aflow__ael_debye_temperature mae: 33.08
property: aflow__ael_shear_modulus_vrh
calculating test mae
L-G-DCNN-v1
loading data with up to 3 elements in the formula

aflow__ael_shear_modulus_vrh mae: 9.17
property: aflow__agl_thermal_conductivity_300K
calculating test mae
L-G-DCNN-v1
loa

## test

In [None]:
def load_model(mat_prop, classification, file_name, verbose=True):
    # Load up a saved network.
    model = Model(CrabNet(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}', verbose=verbose)
    model.load_network(f'{mat_prop}.pth') # multi_lstm_attention_residual_dpcnn_V8_512

    # Check if classifcation task
    if classification:
        model.classification = True

    # Load the data you want to predict with
    data = f'data/benchmark_data/{mat_prop}/{file_name}'
    # data = f'data/benchmark_data_eleg/{mat_prop}/{file_name}'
    # data is reloaded to model.data_loader
    model.load_data(data, batch_size=2**9, train=False)
    return model


def get_results(model):
    output = model.predict(model.data_loader)  # predict the data saved here
    return model, output


def save_results(mat_prop, classification, file_name, verbose=True):
    model = load_model(mat_prop, classification, file_name, verbose=verbose)
    model, output = get_results(model)

    # Get appropriate metrics for saving to csv
    if model.classification:
        auc = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {auc:0.3f}')
    else:
        mae = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {mae:0.3g}')

    # save predictions to a csv
    fname = f'{mat_prop}_{file_name.replace(".csv", "")}_output.csv'
    # to_csv(output, fname)
    return model, mae


if __name__ == '__main__':
    # Get data to benchmark on
    data_dir = 'data/benchmark_data'
    mat_props = os.listdir(data_dir)
    classification_list = []
    print(f'training: {mat_props}')
    # mat_props =  [ 'CritExam__Ef','aflow__Egap','aflow__energy_atom','OQMD_Bandgap', 'OQMD_Energy_per_atom']
    # mat_props = [ 'OQMD_Energy_per_atom']
    for mat_prop in mat_props:
        classification = False
        if mat_prop in classification_list:
            classification = True
        print(f'property: {mat_prop}')
        # model = get_model(mat_prop, classification, verbose=True)

        print('=====================================================')
        # print('calculating train mae')
        # model_train, mae_train = save_results( mat_prop, classification,
                                        #   'train.csv', verbose=False)
        print('=====================================================')
        print('calculating test mae')
        model_test, t_mae = save_results(mat_prop, classification,
                                         'test.csv', verbose=False)
        # print('calculating val mae')
        # model_val, v_mae = save_results(mat_prop, classification,
                                        # 'val.csv', verbose=False)
        print('=====================================================')

## 检查oqmd_delta_e_train 是否包含NULL

In [None]:
import pandas as pd

In [None]:
df = pd.read_csv(r'D:\deep\CrabNet\data\application\oqmd_delta_e_train.csv')
df.isnull().sum()

formula    0
target     0
dtype: int64

## 获取mp数据库中duo元相图的形成能数据代码

In [None]:
from pymatgen.ext.matproj import MPRester
# from pymatgen.analysis.phase_diagram import PhaseDiagram, PDPlotter
import pandas as pdd
import os

In [None]:
# a = MPRester('kQD0riCq7tpsdbWK')
# entries = a.get_entries_in_chemsys(['Li', 'Mo', 'O','P'])

#With entries, you can do many sophisticated analyses, like creating phase diagrams.
# pd = PhaseDiagram(entries)

#Let's show all phases, including unstable ones
# plotter = PDPlotter(pd, show_unstable=0.2,)
# plotter.show()

In [None]:
a = MPRester('kQD0riCq7tpsdbWK')
entries = a.get_entries_in_chemsys(['Ga', 'O'])

mat_id = [i.entry_id for i in entries]
com = [i.composition.reduced_formula for i in entries]
all_data = []
for i in mat_id:
    data = a.query(criteria={"task_id": i}, properties=["formation_energy_per_atom","magnetism.total_magnetization_normalized_formula_units","e_above_hull","elasticity.K_VRH","elasticity.G_VRH","band_gap","diel.n"])
    all_data.append(data)

e_formation = [i[0]['formation_energy_per_atom'] for i in all_data]
total_magnetization = [i[0]['magnetism.total_magnetization_normalized_formula_units'] for i in all_data]
e_above_hull = [i[0]['e_above_hull'] for i in all_data]
Bulk_Modulus = [i[0]['elasticity.K_VRH'] for i in all_data]
Shear_Modulus = [i[0]['elasticity.G_VRH'] for i in all_data]
band_gap = [i[0]['band_gap'] for i in all_data]
dielectric = [i[0]['diel.n'] for i in all_data]

df_all = pdd.DataFrame(dict(zip(['material_id', 'formula', 'formation_energy_per_atom', 'magnetism.total_magnetization_normalized_formula_units','e_above_hull','elasticity.K_VRH','elasticity.G_VRH','band_gap','diel.n'],
                               [mat_id, com, e_formation, total_magnetization, e_above_hull, Bulk_Modulus, Shear_Modulus, band_gap, dielectric])))

data_dir = rf'D:\deep\CrabNet\data\application\MP'
seed_ = 'Ga_O_all.csv'  
os.makedirs(data_dir, exist_ok=True)
df_all.to_csv(rf'{data_dir}/{seed_}', index=False)

In [None]:
#This initializes the REST adaptor. You may need to put your own API key in as an arg.
from pymatgen.ext.matproj import MPRester
from pymatgen.analysis.phase_diagram import PhaseDiagram, PDPlotter
import pandas as pdd
import os
a = MPRester(api_key='kQD0riCq7tpsdbWK') # y9WVXfllm2gQdZ6D4TCsE9w9gWX5VL8r   kQD0riCq7tpsdbWK

entries = a.get_entries_in_chemsys(['Al', 'O'])

#With entries, you can do many sophisticated analyses, like creating phase diagrams.
# pd = PhaseDiagram(entries)

#Let's show all phases, including unstable ones
# plotter = PDPlotter(pd, show_unstable=0.2,)
# plotter.show()

In [None]:
# diel.n
com = [i.composition.reduced_formula for i in entries]

diel_m = [a.query(criteria={"task_id": i.entry_id}, properties=["diel.n"])[0]['diel.n'] for i in entries]
df_shear = pdd.DataFrame(dict(zip(['formula','target'],[ com, diel_m])))

data_dir1 = rf'D:\deep\CrabNet\data\application\MP_dielectric'
seed_1 = 'Mn_O_mp_dielectric.csv'  
os.makedirs(data_dir1, exist_ok=True)
df_shear.to_csv(rf'{data_dir1}/{seed_1}', index=False)

In [None]:
com = [i.composition.reduced_formula for i in entries]

# mag = [a.query(criteria={"task_id": i.entry_id}, properties=["elasticity.K_VRH"])[0]['elasticity.K_VRH'] for i in entries]
# df_ti = pdd.DataFrame(dict(zip(['formula','target'],[ com, mag])))

shear_m = [a.query(criteria={"task_id": i.entry_id}, properties=["elasticity.G_VRH"])[0]['elasticity.G_VRH'] for i in entries]
df_shear = pdd.DataFrame(dict(zip(['formula','target'],[ com, shear_m])))

# data_dir = rf'D:\deep\CrabNet\data\application\MP_Bulk_Modulus'
# seed_ = 'Al_O_mp_bulk_modulus.csv'  
# os.makedirs(data_dir, exist_ok=True)
# df_ti.to_csv(rf'{data_dir}/{seed_}', index=False)

data_dir1 = rf'D:\deep\CrabNet\data\application\MP_Shear_Modulus'
seed_1 = 'Si_O_mp_shear_modulus.csv'  
os.makedirs(data_dir1, exist_ok=True)
df_shear.to_csv(rf'{data_dir1}/{seed_1}', index=False)

In [None]:
a.get_data( entries[20].entry_id, data_type="vasp", prop="e_above_hull")

In [None]:
data = a.query(criteria={"task_id": entries[24].entry_id}, properties=["diel.n"])
data

[{'diel.n': 1.4659358785431238}]

In [None]:
a.get_database_version()

'2020_09_08'

In [None]:
entries[0].energy_per_atom

-8.50449542

In [None]:
# delta_e = [pd.get_form_energy_per_atom(i) for i in entries]
# com = [i.composition.reduced_formula for i in entries]
# e_each_atom = [i.energy_per_atom for i in entries]
# mat_id = [i.entry_id for i in entries]

# df_ti = pdd.DataFrame(dict(zip(['material_id','formula','form_energy_per_atom', 'energy_per_atom'],
# [mat_id, com, delta_e, e_each_atom])))

# data_dir = rf'D:\deep\CrabNet\data\application\MP_e_form'
# seed_ = 'Mn_O_all.csv'  
# os.makedirs(data_dir, exist_ok=True)
# df_ti.to_csv(rf'{data_dir}/{seed_}', index=False)

## 得到 MP数据库中几个关键数据 并保存为.csv

In [None]:
from pymatgen.ext.matproj import MPRester
a = MPRester(api_key='kQD0riCq7tpsdbWK')

In [None]:
mat_id = [i.entry_id for i in entries]
com = [i.composition.reduced_formula for i in entries]
all_data = []
for i in mat_id:
    data = a.query(criteria={"task_id": i}, properties=["formation_energy_per_atom", "magnetism.total_magnetization_normalized_formula_units","e_above_hull","elasticity.K_VRH","elasticity.G_VRH","band_gap","diel.n"])
    all_data.append(data)


magnetization_unit = [i[0]['magnetism.total_magnetization'] for i in all_data]
e_above_hull = [i[0]['e_above_hull'] for i in all_data]
Bulk_Modulus = [i[0]['elasticity.K_VRH'] for i in all_data]
Shear_Modulus = [i[0]['elasticity.G_VRH'] for i in all_data]
band_gap = [i[0]['band_gap'] for i in all_data]
dielectric = [i[0]['diel.n'] for i in all_data]

df_all = pdd.DataFrame(dict(zip(['material_id','formula','magnetism.total_magnetization_normalized_formula_units','e_above_hull','elasticity.K_VRH','elasticity.G_VRH','band_gap','diel.n'],
                               [mat_id, com, magnetization_unit, e_above_hull, Bulk_Modulus, Shear_Modulus, band_gap, dielectric])))

data_dir = rf'D:\deep\CrabNet\data\application\MP'
seed_ = 'Li_B_O_all.csv'  
os.makedirs(data_dir, exist_ok=True)
df_all.to_csv(rf'{data_dir}/{seed_}', index=False)

In [None]:
all_data[0]

[{'magnetism.total_magnetization': 2.184898,
  'e_above_hull': 0.6572110503448272,
  'elasticity.K_VRH': None,
  'elasticity.G_VRH': None,
  'band_gap': 0.0,
  'diel.n': None}]

In [None]:
# "G_VRH":Shear Modulus, "K_VRH":Bulk Modulus
data = m.query(criteria={"task_id": entries[0].entry_id}, properties=["total_magnetization"])
print(data)

[{'total_magnetization': 1.092449}]


In [None]:
data = m.query(criteria={"task_id": entries[0].entry_id}, properties=["diel.n"])
print(data)

[{'diel.n': None}]


In [None]:
entries[0].entry_id

'mp-1057139'

## train val test 拆分数据集代码

#### OQMD band gap train val test 数据集拆分为10份

In [None]:
from sklearn.model_selection import ShuffleSplit
import pandas as pd
import os

In [None]:
# 读取所有的OQMD数据包括 val train and test
X_d_test = pd.read_csv(r'D:\deep\CrabNet\data\benchmark_data\OQMD_Bandgap\test.csv')
X_d_train = pd.read_csv(r'D:\deep\CrabNet\data\benchmark_data\OQMD_Bandgap\train.csv')
X_d_val = pd.read_csv(r'D:\deep\CrabNet\data\benchmark_data\OQMD_Bandgap\val.csv')
data_dir = f'data/OQMD/' 
seed_f_val = 'OQMD_Band_gap_all.csv'

X_d_test.to_csv(f'{data_dir}/{seed_f_val}',index=False)
X_d_train.to_csv(f'{data_dir}/{seed_f_val}', index=False, header=False, mode='a+')
X_d_val.to_csv(f'{data_dir}/{seed_f_val}', index=False, header=False, mode='a+')

In [None]:
# df = pd.read_csv(r'D:\deep\CrabNet\data\OQMD\OQMD_Band_gap_all.csv')
df = pd.read_csv(r'D:\deep\CrabNet\data\OQMD\0005\gap_0005.csv')
df_gap = df['target'].values
df_formula = df['formula'].values

rs = ShuffleSplit(n_splits=1, test_size=0.1)
for train_index, test_index in rs.split(df_gap):
    delta_e_train = df_gap[train_index]
    delta_e_test = df_gap[test_index]

    com_train = df_formula[train_index]
    com_test = df_formula[test_index]

df_test = pd.DataFrame(dict(zip(['formula', 'target'],[com_test, delta_e_test])))
data_dir = f'data/OQMD/0005' 
name = 'test.csv'
os.makedirs(data_dir, exist_ok=True)
df_test.to_csv(f'{data_dir}/{name}', index=False)

In [None]:
rs1 = ShuffleSplit(n_splits=1, test_size=0.111)

for train_index, val_index in rs1.split(delta_e_train):
    train = delta_e_train[train_index]
    val = delta_e_train[val_index]

    composition_train = com_train[train_index]
    composition_val = com_train[val_index]

df_train = pd.DataFrame(dict(zip(['formula', 'target'],[composition_train, train])))
name1 = 'train.csv'
os.makedirs(data_dir, exist_ok=True)
df_train.to_csv(f'{data_dir}/{name1}', index=False)

df_val = pd.DataFrame(dict(zip(['formula', 'target'],[composition_val, val])))
name2 = 'val.csv'
os.makedirs(data_dir, exist_ok=True)
df_val.to_csv(f'{data_dir}/{name2}', index=False)

In [None]:
# df = pd.read_csv(r'D:\deep\CrabNet\data\OQMD\OQMD_Band_gap_all.csv')
df = pd.read_csv(r'D:\deep\CrabNet\data\application\transfer\mp-non-metals.csv')
df_gap = df['target'].values
df_formula = df['formula'].values

rs = ShuffleSplit(n_splits=1, test_size=0.2)
for train_index, test_index in rs.split(df_gap):
    delta_e_train = df_gap[train_index]
    delta_e_test = df_gap[test_index]

    com_train = df_formula[train_index]
    com_test = df_formula[test_index]

df_test = pd.DataFrame(dict(zip(['formula', 'target'],[com_test, delta_e_test])))
data_dir = f'data/application/transfer' 
name = 'test.csv'
os.makedirs(data_dir, exist_ok=True)
df_test.to_csv(f'{data_dir}/{name}', index=False)  

In [None]:
rs1 = ShuffleSplit(n_splits=1, test_size=0.1)

for train_index, val_index in rs1.split(delta_e_train):
    train = delta_e_train[train_index]
    val = delta_e_train[val_index]

    composition_train = com_train[train_index]
    composition_val = com_train[val_index]

df_train = pd.DataFrame(dict(zip(['formula', 'target'],[composition_train, train])))
name1 = 'train.csv'
os.makedirs(data_dir, exist_ok=True)
df_train.to_csv(f'{data_dir}/{name1}', index=False)

df_val = pd.DataFrame(dict(zip(['formula', 'target'],[composition_val, val])))
name2 = 'val.csv'
os.makedirs(data_dir, exist_ok=True)
df_val.to_csv(f'{data_dir}/{name2}', index=False)

### oqmd_all-22Mar18

In [None]:
X_df = pd.read_csv(r'D:\deep\CrabNet\data\application\oqmd_all-22Mar18.csv')

In [None]:
def delta_e_convert_to_float(data):
    return float(data.split('  ')[-2].strip())

def com_convert_to_float(data):
    return data.split('  ')[0].strip()

In [None]:
df_select_delta_e = X_df['comp energy_pa volume_pa magmom_pa bandgap delta_e stability'].apply(lambda row: delta_e_convert_to_float(row))
df_select_com = X_df['comp energy_pa volume_pa magmom_pa bandgap delta_e stability'].apply(lambda row: com_convert_to_float(row))

In [None]:
rs = ShuffleSplit(n_splits=1, test_size=0.15)

for train_index, test_index in rs.split(df_select_delta_e):
    delta_e_train = df_select_delta_e[train_index]
    delta_e_test = df_select_delta_e[test_index]

    com_train = df_select_com[train_index]
    com_test = df_select_com[test_index]

delta_e_col = X_df.columns.values.tolist()[0].split(' ')[-2]
com_col = X_df.columns.values.tolist()[0].split(' ')[0]

df_test = pd.DataFrame(dict(zip([com_col, delta_e_col],[com_test, delta_e_test])))
seed_f_test = 'oqmd_delta_e_test.csv'
data_dir = f'data/application' 
os.makedirs(data_dir, exist_ok=True)
df_test = df_test.rename(columns={com_col:'formula',delta_e_col:'target'})
df_test.to_csv(f'{data_dir}/{seed_f_test}', index=False)

(620196,)

In [None]:
rs1 = ShuffleSplit(n_splits=1, test_size=0.15)

delta_e_train = delta_e_train.reset_index(drop=True)
com_train = com_train.reset_index(drop=True)
for train_index, val_index in rs1.split(delta_e_train):
    train = delta_e_train[train_index]
    val = delta_e_train[val_index]

    composition_train = com_train[train_index]
    composition_val = com_train[val_index]

df_train = pd.DataFrame(dict(zip([com_col, delta_e_col],[composition_train, train])))
seed_f_train = 'oqmd_delta_e_train.csv'
os.makedirs(data_dir, exist_ok=True)
df_train = df_train.rename(columns={com_col:'formula',delta_e_col:'target'})
df_train.to_csv(f'{data_dir}/{seed_f_train}', index=False)

df_val = pd.DataFrame(dict(zip([com_col, delta_e_col],[composition_val, val])))
seed_f_val = 'oqmd_delta_e_val.csv'
os.makedirs(data_dir, exist_ok=True)
df_val = df_val.rename(columns={com_col:'formula',delta_e_col:'target'})
df_val.to_csv(f'{data_dir}/{seed_f_val}', index=False)

In [None]:
composition_val.shape

(79075,)

In [None]:
composition_train.shape

(448091,)

In [None]:
com_test.shape

(93030,)

## test

In [None]:
import pandas as pd
from collections import OrderedDict
from tqdm import tqdm
from utils.composition import  _element_composition
import numpy as np

In [None]:

mat_prop = 'OQMD_Formation_Enthalpy'
file_name = 'Ti_O_application_test.csv'
path = f'data/application/{mat_prop}/{file_name}'
if isinstance(path, str):
        df = pd.read_csv(path, keep_default_na=False, na_values=[''])
else:
        df = path

In [None]:
# if 'formula' not in df.columns.values.tolist():
#         df['formula'] = df['cif_id'].str.split('_ICSD').str[0]

df['count'] = [2 for _ in df['delta_e']]

In [None]:
df = df.rename(columns={"X_O": "O", "X_Ti": "Ti"})
all_symbols = ['H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne', 'Na',
                   'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca', 'Sc',
                   'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn', 'Ga',
                   'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr', 'Nb',
                   'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn', 'Sb',
                   'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd', 'Pm',
                   'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb', 'Lu',
                   'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg', 'Tl',
                   'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th', 'Pa',
                   'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm', 'Md',
                   'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds', 'Rg',
                   'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og']
data_type_np = np.float64
verbose=True

k = ['O','Ti']
dic_compostion = [dict(zip(k,i)) for i in df[['O','Ti']].applymap(lambda x: '%.15f'%x).values.tolist()]

list_ohm = [OrderedDict(form)
                for form in dic_compostion]

n_elements = 16
elem_num = np.zeros(shape=(len(list_ohm), n_elements), dtype=data_type_np)
elem_frac = np.zeros(shape=(len(list_ohm), n_elements), dtype=data_type_np)

for i, comp in enumerate(tqdm(list_ohm,
                                  desc="Generating EDM",
                                  unit="formulae",
                                  disable=not verbose)):
        # print(comp)

        for j, (elem, count) in enumerate(list_ohm[i].items()):
            # print(type(elem), type(count))
            if j == n_elements:
                # Truncate EDM representation to n_elements
                break
            try:
                # edm_array[i, j, all_symbols.index(elem) + 1] = count
                elem_frac[i, j] = float(count)
                elem_num[i, j] = all_symbols.index(elem) + 1
            except ValueError:
                print(f'skipping composition {comp}')
if n_elements == 16:
        n_elements = np.max(np.sum(elem_frac > 0, axis=1, keepdims=True))
        elem_num = elem_num[:, :n_elements]
        elem_frac = elem_frac[:, :n_elements]

elem_num = elem_num.reshape(elem_num.shape[0], elem_num.shape[1], 1)
elem_frac = elem_frac.reshape(elem_frac.shape[0], elem_frac.shape[1], 1)
out = np.concatenate((elem_num, elem_frac), axis=1)

In [None]:
round(float(count),2)

0.96

In [None]:
elem_num[0]

array([[ 8.],
       [22.]])

In [None]:
elem_frac[0]

array([[0.5],
       [0.5]])