In [1]:

import sys
import os
# To get the absolute path of the directory where the lgdcnn module is located
lgdcnn_dir = r"D:\deep\LGDCNN"
# Add the directory where the lgdcnn module is located to the module search path
sys.path.append(lgdcnn_dir)

import numpy as np
import pandas as pd
import torch
from lgdcnn.fusion_lstm_dcnn import LGDCNN
# from lgdcnn.fusion_lstm_dcnn_v1 import LGDCNN
# from crabnet.model_application import Model
from lgdcnn.train import Model
from lgdcnn.utils.get_compute_device import get_compute_device

from sklearn.metrics import roc_auc_score

# Device handed to every network built in this notebook.
compute_device = get_compute_device(prefer_last=True)

# Seed torch and numpy once, up front, so stochastic steps are repeatable.
RNG_SEED = 42
torch.manual_seed(RNG_SEED)
np.random.seed(RNG_SEED)

# `lgdcnn_dir` is already defined at the top of this cell (before the
# sys.path tweak); the redundant second assignment was dropped.

  from .autonotebook import tqdm as notebook_tqdm


## train


In [2]:
def get_model(Fork,model_name, mat_prop, classification=False, batch_size=None,
              transfer=None, verbose=True):
    """Train a network on one benchmark property and save its weights.

    Parameters
    ----------
    Fork : callable
        Network class/factory, called as ``Fork(compute_device=compute_device)``.
    model_name : str
        Name passed to ``model.save_network`` once training has finished.
    mat_prop : str
        Benchmark property. Selects ``data/benchmark_data/<mat_prop>/`` below
        ``lgdcnn_dir`` and is used as the ``Model`` name.
    classification : bool
        When true, ``model.classification`` is switched on.
    batch_size : int or None
        Batch size for both loaders. When ``None`` (default) a power of two is
        derived from the number of training rows and clamped to
        ``[2**7, 2**12]``. Previously an explicit value was silently ignored.
    transfer : str or None
        When given, ``<transfer>.pth`` is loaded before training starts.
    verbose : bool
        Forwarded to ``Model``.

    Returns
    -------
    The fitted ``Model`` wrapper.
    """
    # Wrap the network in the project's training helper.
    model = Model(Fork(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}', verbose=verbose)

    # Optionally start from previously saved weights.
    if transfer is not None:
        model.load_network(f'{transfer}.pth')
        model.model_name = f'{mat_prop}'

    if classification:
        model.classification = True

    # Locate the train/validation tables for this property.
    train_data = os.path.join(lgdcnn_dir,"data", "benchmark_data", mat_prop, 'train.csv')
    val_data = os.path.join(lgdcnn_dir,"data", "benchmark_data", mat_prop, 'val.csv')

    # Only derive a batch size when the caller did not choose one.
    if batch_size is None:
        data_size = pd.read_csv(train_data).shape[0]
        batch_size = 2**round(np.log2(data_size)-4)
        batch_size = min(max(batch_size, 2**7), 2**12)

    model.load_data(train_data, batch_size=batch_size, train=True)
    print(f'training with batchsize {model.batch_size} '
          f'(2**{np.log2(model.batch_size):0.3f})')
    model.load_data(val_data, batch_size=batch_size)

    # Train without plotting a loss curve.
    model.fit(epochs=300, losscurve=False)

    # Persist the trained weights under `model_name`.
    model.save_network(model_name)
    return model


def to_csv(output, save_name):
    """Write a prediction tuple to ``<lgdcnn_dir>/results/Benchmark/<save_name>``.

    ``output`` is unpacked as ``(actual, predicted, formulae, uncertainty)``;
    the CSV columns are written in the order formula, actual, predicted,
    uncertainty, with the row index labelled ``Index``.
    """
    act, pred, formulae, uncertainty = output

    # One row per sequence, then transpose so each sequence becomes a column.
    table = pd.DataFrame([formulae, act, pred, uncertainty]).transpose()
    table.columns = ['formula', 'actual', 'predicted', 'uncertainty']

    out_dir = os.path.join(lgdcnn_dir,"results", "Benchmark")
    os.makedirs(out_dir, exist_ok=True)
    table.to_csv('{}/{}'.format(out_dir, save_name), index_label='Index')


def load_model(Fork, lgdcnn_dir, model_name, mat_prop, classification, file_name, verbose=True):
    """Rebuild a network, load saved weights and attach a dataset for prediction.

    The weights are requested via ``model.load_network(model_name,
    '<mat_prop>.pth')`` and the data file is
    ``<lgdcnn_dir>/data/benchmark_data/<mat_prop>/<file_name>``.
    Returns the ``Model`` wrapper with its data loader populated.
    """
    network = Fork(compute_device=compute_device).to(compute_device)
    model = Model(network, model_name=f'{mat_prop}', verbose=verbose)
    model.load_network(model_name, f'{mat_prop}.pth')

    # Flag classification tasks on the wrapper.
    if classification:
        model.classification = True

    # Attach the dataset to predict on (not a training set).
    data_path = os.path.join(lgdcnn_dir,"data","benchmark_data",mat_prop,file_name)
    model.load_data(data_path, batch_size=2**9, train=False)
    return model


def get_results(model):
    """Run ``model.predict`` on the model's own data loader.

    Returns the model together with whatever ``predict`` returned.
    """
    loader = model.data_loader
    predictions = model.predict(loader)
    return model, predictions


def save_results(Fork,lgdcnn_dir, model_name, mat_prop, classification, file_name, verbose=True):
    """Predict on one data file, print the metric and save the predictions.

    Parameters mirror ``load_model``. The predictions are written through
    ``to_csv`` as ``<mat_prop>_<file stem>_output.csv``.

    Returns
    -------
    (model, metric)
        ``metric`` is the mean absolute error for regression, or the ROC AUC
        when ``model.classification`` is set. The classification branch used to
        fail because it returned the never-assigned ``mae``.
    """
    model = load_model(Fork, lgdcnn_dir, model_name, mat_prop, classification, file_name, verbose=verbose)
    model, output = get_results(model)

    # output[0] holds the actual values, output[1] the predictions
    # (the same order in which to_csv unpacks the tuple).
    if model.classification:
        metric = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {metric:0.4f}')
    else:
        metric = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {metric:0.4g}')

    # Save predictions to a csv.
    fname = f'{mat_prop}_{file_name.replace(".csv", "")}_output.csv'
    to_csv(output, fname)
    return model, metric


if __name__ == '__main__':
    model_name = "L-G-DCNN-TEST"
    Fork = LGDCNN

    # Every sub-folder of data/benchmark_data is treated as one property.
    benchmark_data_dir = os.path.join(lgdcnn_dir,"data","benchmark_data")
    mat_props = os.listdir(benchmark_data_dir)

    # Properties that should be trained as classification tasks (none so far).
    classification_list = []
    print(f'training: {mat_props}')
    for mat_prop in mat_props:
        classification = mat_prop in classification_list
        print(f'property: {mat_prop}')
        model = get_model(Fork,model_name, mat_prop, classification, verbose=True)
        print('=====================================================')
        print('calculating test mae')
        # save_results takes the network class first; it was missing from both
        # calls, which shifted every argument and left `file_name` unfilled.
        model_test, t_mae = save_results(Fork, lgdcnn_dir, model_name, mat_prop, classification,
                                         'test.csv', verbose=False)
        print('calculating val mae')
        model_val, v_mae = save_results(Fork, lgdcnn_dir, model_name, mat_prop, classification,
                                        'val.csv', verbose=False)
        print('=====================================================')


training: ['aflow__ael_bulk_modulus_vrh', 'aflow__ael_debye_temperature', 'aflow__ael_shear_modulus_vrh', 'aflow__agl_thermal_conductivity_300K', 'aflow__agl_thermal_expansion_300K', 'aflow__Egap', 'aflow__energy_atom', 'CritExam__Ed', 'CritExam__Ef', 'mp_bulk_modulus', 'mp_elastic_anisotropy', 'mp_e_hull', 'mp_mu_b', 'mp_shear_modulus', 'OQMD_Bandgap', 'OQMD_Energy_per_atom', 'OQMD_Formation_Enthalpy', 'OQMD_Volume_per_atom']
property: aflow__ael_bulk_modulus_vrh


Generating EDM: 100%|██████████| 3428/3428 [00:00<00:00, 219613.17formulae/s]


training with batchsize 256 (2**8.000)


Generating EDM: 100%|██████████| 732/732 [00:00<00:00, 243978.90formulae/s]


stepping every 140 training passes, cycling lr every 10 epochs
checkin at 20 epochs to match lr scheduler
Epoch: 0/300 --- train mae: 53 val mae: 54
Epoch: 19/300 --- train mae: 10.9 val mae: 13.8


### Application: element contributions to property prediction as a function of composition

In [None]:
# Application: there is no get_model here, i.e. no training step.

def load_model(mat_prop, classification, file_name, verbose=True, data_subdir='Mp_gap'):
    """Load saved weights and attach an application dataset for prediction.

    Parameters
    ----------
    mat_prop : str
        Name of the trained network; ``<mat_prop>.pth`` is loaded.
    classification : bool
        When true, ``model.classification`` is switched on.
    file_name : str
        CSV file inside ``data/application/<data_subdir>/``.
    verbose : bool
        Forwarded to ``Model``.
    data_subdir : str
        Folder below ``data/application`` that holds ``file_name``. Defaults to
        ``'Mp_gap'``, the value that used to be hard-coded. Folders that were
        previously selected by editing the function: ``'MP_e_form'``, ``'MP'``,
        ``'MP_e_above_hull'``, ``'MP_magnetism'``, ``'MP_dielectric'``,
        ``'MP_Bulk_Modulus'``, ``'MP_Shear_Modulus'``,
        ``'OQMD_Formation_Enthalpy'``.

    Returns
    -------
    The ``Model`` wrapper with its data loader populated.
    """
    # Load up a saved network.
    model = Model(LGDCNN(compute_device=compute_device).to(compute_device),
                  model_name=f'{mat_prop}', verbose=verbose)
    model.load_network(f'{mat_prop}.pth')

    # Check if classification task.
    if classification:
        model.classification = True

    # Path is relative to the current working directory.
    data = f'data/application/{data_subdir}/{file_name}'
    # The data is (re)loaded into model.data_loader.
    model.load_data(data, batch_size=2**9, train=False)
    return model

def to_csv(output, save_name):
    """Write a prediction tuple to ``data/application/prediction/<save_name>``.

    ``output`` is unpacked as ``(actual, predicted, formulae, uncertainty)``.
    The number of formulae is printed, then a CSV with the columns formula,
    actual, predicted, uncertainty (row index labelled ``Index``) is written.
    """
    act, pred, formulae, uncertainty = output
    print(len(formulae))

    # One row per sequence, transposed so that each sequence becomes a column.
    table = pd.DataFrame([formulae, act, pred, uncertainty]).transpose()
    table.columns = ['formula', 'actual', 'predicted', 'uncertainty']

    # Earlier runs targeted 'publication_predictions/onehot_matbench__predictions'
    # or 'publication_predictions/random_200_matbench__predictions' instead.
    out_dir = 'data/application/prediction'
    os.makedirs(out_dir, exist_ok=True)
    table.to_csv('{}/{}'.format(out_dir, save_name), index_label='Index')


def get_results(model):
    """Predict on the data already attached to ``model``.

    Returns ``(model, output)`` where ``output`` is the value returned by
    ``model.predict(model.data_loader)``.
    """
    predicted = model.predict(model.data_loader)
    result = (model, predicted)
    return result


def save_results(mat_prop, classification, file_name, verbose=True):
    """Predict on one application file, print the metric and save predictions.

    The predictions are written through ``to_csv`` as
    ``<mat_prop>_<file stem>_crabnet.csv``.

    Returns
    -------
    (model, metric)
        ``metric`` is the mean absolute error for regression, or the ROC AUC
        when ``model.classification`` is set. The classification branch used to
        fail because it returned the never-assigned ``mae``.
    """
    model = load_model(mat_prop, classification, file_name, verbose=verbose)
    model, output = get_results(model)

    # output[0] holds the actual values, output[1] the predictions
    # (the same order in which to_csv unpacks the tuple).
    if model.classification:
        metric = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {metric:0.3f}')
    else:
        metric = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {metric:0.3g}')

    # Save predictions to a csv.
    fname = f'{mat_prop}_{file_name.replace(".csv", "")}_crabnet.csv'
    to_csv(output, fname)
    return model, metric


if __name__ == '__main__':
    # Networks listed here are evaluated as classifiers (none at the moment).
    classification_list = []

    # Saved networks to apply. Lists used in earlier runs:
    #   ['matbench_mp_gap4', 'mp_bulk_modulus', 'mp_elastic_anisotropy', 'mp_e_hull',
    #    'mp_mu_b', 'mp_shear_modulus', 'matbench_mp_e_form0']
    #   ['OQMD_Formation_Enthalpy']
    #   ['MP_Formation_Enthalpy']
    #   os.listdir('data/benchmark_data')
    mat_props = ['matbench_mp_gap3']
    print(f'training: {mat_props}')

    for mat_prop in mat_props:
        classification = mat_prop in classification_list
        print(f'property: {mat_prop}')

        # No training and no train/val evaluation here: only the application
        # file below is scored with the already-saved network.
        print('=====================================================')
        print('=====================================================')
        print('calculating test mae')
        model_test, t_mae = save_results(
            mat_prop, classification, 'cu_O_mp_gap.csv', verbose=False)
        print('=====================================================')

training: ['matbench_mp_gap3']
property: matbench_mp_gap3
calculating test mae
loading data with up to 2 elements in the formula

matbench_mp_gap3 mae: 0.226
63


## test LGDCNN

In [None]:
# Root folder of the local LGDCNN checkout; the benchmark data paths used in
# this cell are built from it.
lgdcnn_dir = r"D:\deep\LGDCNN"

def load_model(model_name, mat_prop, classification, file_name, verbose=True):
    """Build an LGDCNN, load saved weights and attach a benchmark file.

    Weights are requested via ``model.load_network(model_name,
    '<mat_prop>.pth')``; the data file is
    ``<lgdcnn_dir>/data/benchmark_data/<mat_prop>/<file_name>``.
    Returns the ``Model`` wrapper with its data loader populated.
    """
    network = LGDCNN(compute_device=compute_device).to(compute_device)
    model = Model(network, model_name=f'{mat_prop}', verbose=verbose)
    model.load_network(model_name, f'{mat_prop}.pth')

    # Flag classification tasks on the wrapper.
    if classification:
        model.classification = True

    # Attach the dataset to predict on (not a training set).
    data_path = os.path.join(lgdcnn_dir,"data","benchmark_data",mat_prop,file_name)
    model.load_data(data_path, batch_size=2**9, train=False)
    return model


def get_results(model):
    """Return ``(model, predictions)`` for the data attached to ``model``."""
    return model, model.predict(model.data_loader)


def save_results(model_name, mat_prop, classification, file_name, verbose=True):
    """Predict on one benchmark file and print the resulting metric.

    Nothing is written to disk by this variant.

    Returns
    -------
    (model, metric)
        ``metric`` is the mean absolute error for regression, or the ROC AUC
        when ``model.classification`` is set. The classification branch used to
        fail because it returned the never-assigned ``mae``.
    """
    model = load_model(model_name, mat_prop, classification, file_name, verbose=verbose)
    model, output = get_results(model)

    # output[0] and output[1] are passed to roc_auc_score as (y_true, y_score).
    if model.classification:
        metric = roc_auc_score(output[0], output[1])
        print(f'\n{mat_prop} ROC AUC: {metric:0.4f}')
    else:
        metric = np.abs(output[0] - output[1]).mean()
        print(f'\n{mat_prop} mae: {metric:0.4g}')

    return model, metric


if __name__ == '__main__':
    # Saved model to evaluate.
    model_name = "L-G-DCNN-v1"

    # Each sub-folder of data/benchmark_data is one property to score.
    benchmark_data_dir = os.path.join(lgdcnn_dir,"data","benchmark_data")
    mat_props = os.listdir(benchmark_data_dir)

    # Properties evaluated as classification tasks (none at the moment).
    classification_list = []
    print(f'training: {mat_props}')

    for mat_prop in mat_props:
        classification = mat_prop in classification_list
        print(f'property: {mat_prop}')
        print('=====================================================')
        print('calculating test mae')
        model_test, t_mae = save_results(
            model_name, mat_prop, classification, 'test.csv', verbose=False)

training: ['aflow__ael_bulk_modulus_vrh', 'aflow__ael_debye_temperature', 'aflow__ael_shear_modulus_vrh', 'aflow__agl_thermal_conductivity_300K', 'aflow__agl_thermal_expansion_300K', 'aflow__Egap', 'aflow__energy_atom', 'CritExam__Ed', 'CritExam__Ef', 'mp_bulk_modulus', 'mp_elastic_anisotropy', 'mp_e_hull', 'mp_mu_b', 'mp_shear_modulus', 'OQMD_Bandgap', 'OQMD_Energy_per_atom', 'OQMD_Formation_Enthalpy', 'OQMD_Volume_per_atom']
property: aflow__ael_bulk_modulus_vrh
calculating test mae
L-G-DCNN-v1
loading data with up to 3 elements in the formula

aflow__ael_bulk_modulus_vrh mae: 8.367
property: aflow__ael_debye_temperature
calculating test mae
L-G-DCNN-v1
loading data with up to 3 elements in the formula

aflow__ael_debye_temperature mae: 33.08
property: aflow__ael_shear_modulus_vrh
calculating test mae
L-G-DCNN-v1
loading data with up to 3 elements in the formula

aflow__ael_shear_modulus_vrh mae: 9.17
property: aflow__agl_thermal_conductivity_300K
calculating test mae
L-G-DCNN-v1
loa

## 获取mp数据库中多元相图的形成能数据代码

In [None]:
from pymatgen.ext.matproj import MPRester
# from pymatgen.analysis.phase_diagram import PhaseDiagram, PDPlotter
import pandas as pdd
import os

In [None]:
# a = MPRester(os.environ.get('PMG_MAPI_KEY'))  # API key redacted; supply it via the environment
# entries = a.get_entries_in_chemsys(['Li', 'Mo', 'O','P'])

#With entries, you can do many sophisticated analyses, like creating phase diagrams.
# pd = PhaseDiagram(entries)

#Let's show all phases, including unstable ones
# plotter = PDPlotter(pd, show_unstable=0.2,)
# plotter.show()

In [None]:
# Materials Project client. The API key is no longer embedded in the notebook:
# it is taken from the PMG_MAPI_KEY environment variable, and when that is
# unset (None) MPRester falls back to pymatgen's own PMG_MAPI_KEY setting.
a = MPRester(os.environ.get('PMG_MAPI_KEY'))
entries = a.get_entries_in_chemsys(['Ga', 'O'])

mat_id = [entry.entry_id for entry in entries]
com = [entry.composition.reduced_formula for entry in entries]

# One query per material; every response is kept as returned by the client.
all_data = []
for task_id in mat_id:
    data = a.query(criteria={"task_id": task_id}, properties=["formation_energy_per_atom","magnetism.total_magnetization_normalized_formula_units","e_above_hull","elasticity.K_VRH","elasticity.G_VRH","band_gap","diel.n"])
    all_data.append(data)

# Pull each property out of the first record of every response.
e_formation = [rec[0]['formation_energy_per_atom'] for rec in all_data]
total_magnetization = [rec[0]['magnetism.total_magnetization_normalized_formula_units'] for rec in all_data]
e_above_hull = [rec[0]['e_above_hull'] for rec in all_data]
Bulk_Modulus = [rec[0]['elasticity.K_VRH'] for rec in all_data]
Shear_Modulus = [rec[0]['elasticity.G_VRH'] for rec in all_data]
band_gap = [rec[0]['band_gap'] for rec in all_data]
dielectric = [rec[0]['diel.n'] for rec in all_data]

df_all = pdd.DataFrame({
    'material_id': mat_id,
    'formula': com,
    'formation_energy_per_atom': e_formation,
    'magnetism.total_magnetization_normalized_formula_units': total_magnetization,
    'e_above_hull': e_above_hull,
    'elasticity.K_VRH': Bulk_Modulus,
    'elasticity.G_VRH': Shear_Modulus,
    'band_gap': band_gap,
    'diel.n': dielectric,
})

data_dir = r'D:\deep\CrabNet\data\application\MP'
seed_ = 'Ga_O_all.csv'
os.makedirs(data_dir, exist_ok=True)
df_all.to_csv(f'{data_dir}/{seed_}', index=False)

In [None]:
#This initializes the REST adaptor. You may need to put your own API key in as an arg.
from pymatgen.ext.matproj import MPRester
from pymatgen.analysis.phase_diagram import PhaseDiagram, PDPlotter
import pandas as pdd
import os
# The API key is read from the PMG_MAPI_KEY environment variable instead of
# being written into the notebook; with None, MPRester falls back to pymatgen's
# own PMG_MAPI_KEY setting.
a = MPRester(api_key=os.environ.get('PMG_MAPI_KEY'))

# Entries of the Al-O chemical system; later cells read `entries`.
entries = a.get_entries_in_chemsys(['Al', 'O'])

#With entries, you can do many sophisticated analyses, like creating phase diagrams.
# pd = PhaseDiagram(entries)

#Let's show all phases, including unstable ones
# plotter = PDPlotter(pd, show_unstable=0.2,)
# plotter.show()

In [None]:
# Export "diel.n" for every entry currently held in `entries`.
# NOTE(review): the output file is named Mn_O_* and the frame is called
# `df_shear`, although this cell handles dielectric data for whatever chemical
# system `entries` was last fetched for — confirm the intended system.
com = [entry.composition.reduced_formula for entry in entries]

# One query per entry; keep the "diel.n" value of the first record.
diel_m = [
    a.query(criteria={"task_id": entry.entry_id}, properties=["diel.n"])[0]['diel.n']
    for entry in entries
]
df_shear = pdd.DataFrame({'formula': com, 'target': diel_m})

data_dir1 = r'D:\deep\CrabNet\data\application\MP_dielectric'
seed_1 = 'Mn_O_mp_dielectric.csv'
os.makedirs(data_dir1, exist_ok=True)
df_shear.to_csv(f'{data_dir1}/{seed_1}', index=False)

In [None]:
# Export the shear modulus ("elasticity.G_VRH") for every entry in `entries`.
# A bulk-modulus variant of this cell (property "elasticity.K_VRH", written to
# D:\deep\CrabNet\data\application\MP_Bulk_Modulus\Al_O_mp_bulk_modulus.csv)
# is currently disabled.
# NOTE(review): the output file is named Si_O_*; confirm that `entries` holds
# the matching chemical system before running.
com = [entry.composition.reduced_formula for entry in entries]

# One query per entry; keep the "elasticity.G_VRH" value of the first record.
shear_m = [
    a.query(criteria={"task_id": entry.entry_id}, properties=["elasticity.G_VRH"])[0]['elasticity.G_VRH']
    for entry in entries
]
df_shear = pdd.DataFrame({'formula': com, 'target': shear_m})

data_dir1 = r'D:\deep\CrabNet\data\application\MP_Shear_Modulus'
seed_1 = 'Si_O_mp_shear_modulus.csv'
os.makedirs(data_dir1, exist_ok=True)
df_shear.to_csv(f'{data_dir1}/{seed_1}', index=False)

In [None]:
# Look up the "e_above_hull" property (data_type "vasp") for the entry at
# index 20 of `entries`.
a.get_data( entries[20].entry_id, data_type="vasp", prop="e_above_hull")

In [None]:
# Query "diel.n" for the entry at index 24 of `entries` and display the raw
# response.
data = a.query(criteria={"task_id": entries[24].entry_id}, properties=["diel.n"])
data

[{'diel.n': 1.4659358785431238}]

In [None]:
# Display the database version reported by the client.
a.get_database_version()

'2020_09_08'

In [None]:
# Display the energy per atom stored on the first entry.
entries[0].energy_per_atom

-8.50449542

In [None]:
# delta_e = [pd.get_form_energy_per_atom(i) for i in entries]
# com = [i.composition.reduced_formula for i in entries]
# e_each_atom = [i.energy_per_atom for i in entries]
# mat_id = [i.entry_id for i in entries]

# df_ti = pdd.DataFrame(dict(zip(['material_id','formula','form_energy_per_atom', 'energy_per_atom'],
# [mat_id, com, delta_e, e_each_atom])))

# data_dir = rf'D:\deep\CrabNet\data\application\MP_e_form'
# seed_ = 'Mn_O_all.csv'  
# os.makedirs(data_dir, exist_ok=True)
# df_ti.to_csv(rf'{data_dir}/{seed_}', index=False)

## 得到 MP数据库中几个关键数据 并保存为.csv

In [None]:
from pymatgen.ext.matproj import MPRester
import os

# The API key is read from the PMG_MAPI_KEY environment variable instead of
# being written into the notebook; with None, MPRester falls back to pymatgen's
# own PMG_MAPI_KEY setting.
a = MPRester(api_key=os.environ.get('PMG_MAPI_KEY'))

In [None]:
# Collect several Materials Project properties for every entry in `entries`
# and save them as one CSV.
# NOTE(review): `entries` comes from an earlier cell; confirm it holds the
# Li-B-O system before writing Li_B_O_all.csv.
mat_id = [entry.entry_id for entry in entries]
com = [entry.composition.reduced_formula for entry in entries]

# One query per material; every response is kept as returned by the client.
# "formation_energy_per_atom" is requested but not written to the CSV below.
all_data = []
for task_id in mat_id:
    data = a.query(criteria={"task_id": task_id}, properties=["formation_energy_per_atom", "magnetism.total_magnetization_normalized_formula_units","e_above_hull","elasticity.K_VRH","elasticity.G_VRH","band_gap","diel.n"])
    all_data.append(data)

# Read the magnetization under the key that was actually requested above (and
# that labels the CSV column); the previous key 'magnetism.total_magnetization'
# is not part of the query.
magnetization_unit = [rec[0]['magnetism.total_magnetization_normalized_formula_units'] for rec in all_data]
e_above_hull = [rec[0]['e_above_hull'] for rec in all_data]
Bulk_Modulus = [rec[0]['elasticity.K_VRH'] for rec in all_data]
Shear_Modulus = [rec[0]['elasticity.G_VRH'] for rec in all_data]
band_gap = [rec[0]['band_gap'] for rec in all_data]
dielectric = [rec[0]['diel.n'] for rec in all_data]

df_all = pdd.DataFrame({
    'material_id': mat_id,
    'formula': com,
    'magnetism.total_magnetization_normalized_formula_units': magnetization_unit,
    'e_above_hull': e_above_hull,
    'elasticity.K_VRH': Bulk_Modulus,
    'elasticity.G_VRH': Shear_Modulus,
    'band_gap': band_gap,
    'diel.n': dielectric,
})

data_dir = r'D:\deep\CrabNet\data\application\MP'
seed_ = 'Li_B_O_all.csv'
os.makedirs(data_dir, exist_ok=True)
df_all.to_csv(f'{data_dir}/{seed_}', index=False)

In [None]:
# Inspect the raw query response collected for the first material.
all_data[0]

[{'magnetism.total_magnetization': 2.184898,
  'e_above_hull': 0.6572110503448272,
  'elasticity.K_VRH': None,
  'elasticity.G_VRH': None,
  'band_gap': 0.0,
  'diel.n': None}]

In [None]:
# "G_VRH":Shear Modulus, "K_VRH":Bulk Modulus
# Query "total_magnetization" for the first entry. The client in this notebook
# is bound to `a`; the previously used name `m` is not defined anywhere.
data = a.query(criteria={"task_id": entries[0].entry_id}, properties=["total_magnetization"])
print(data)

[{'total_magnetization': 1.092449}]


In [None]:
# Query "diel.n" for the first entry. The client in this notebook is bound to
# `a`; the previously used name `m` is not defined anywhere.
data = a.query(criteria={"task_id": entries[0].entry_id}, properties=["diel.n"])
print(data)

[{'diel.n': None}]


In [None]:
# Display the identifier of the first entry.
entries[0].entry_id

'mp-1057139'

## train val test 拆分数据集代码

#### OQMD band gap train val test 数据集拆分为10份

In [None]:
from sklearn.model_selection import ShuffleSplit
import pandas as pd
import os

In [None]:
# Read all OQMD band-gap tables (test, train and val) and merge them into one CSV.
X_d_test = pd.read_csv(r'D:\deep\CrabNet\data\benchmark_data\OQMD_Bandgap\test.csv')
X_d_train = pd.read_csv(r'D:\deep\CrabNet\data\benchmark_data\OQMD_Bandgap\train.csv')
X_d_val = pd.read_csv(r'D:\deep\CrabNet\data\benchmark_data\OQMD_Bandgap\val.csv')
data_dir = f'data/OQMD/'
seed_f_val = 'OQMD_Band_gap_all.csv'

# Create the output folder first, as the other export cells do; without it the
# first write fails on a fresh checkout.
os.makedirs(data_dir, exist_ok=True)
# os.path.join copes with the trailing slash in data_dir (no doubled separator).
combined_path = os.path.join(data_dir, seed_f_val)

# The first write creates/overwrites the file with a header; the other two
# append their rows without a header. This relies on the three tables sharing
# the same column order.
X_d_test.to_csv(combined_path, index=False)
X_d_train.to_csv(combined_path, index=False, header=False, mode='a')
X_d_val.to_csv(combined_path, index=False, header=False, mode='a')

In [None]:
# Source table with a `formula` and a `target` column.
# Alternative input: r'D:\deep\CrabNet\data\OQMD\OQMD_Band_gap_all.csv'
df = pd.read_csv(r'D:\deep\CrabNet\data\OQMD\0005\gap_0005.csv')
df_gap = df['target'].values
df_formula = df['formula'].values

# Hold out 10% of the rows as the test set. random_state pins the shuffle so
# re-running the cell reproduces the same split (42 matches RNG_SEED in the
# setup cell).
rs = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(rs.split(df_gap))

delta_e_train = df_gap[train_index]
delta_e_test = df_gap[test_index]
com_train = df_formula[train_index]
com_test = df_formula[test_index]

df_test = pd.DataFrame({'formula': com_test, 'target': delta_e_test})
data_dir = 'data/OQMD/0005'
name = 'test.csv'
os.makedirs(data_dir, exist_ok=True)
df_test.to_csv(f'{data_dir}/{name}', index=False)

In [None]:
# Split the remaining rows (`delta_e_train` / `com_train` from the previous
# cell) into train and validation parts. random_state pins the shuffle so the
# split is reproducible (42 matches RNG_SEED in the setup cell).
# NOTE(review): 0.111 of the remainder is presumably meant to equal ~10% of the
# full table, given the 10% test hold-out — confirm.
rs1 = ShuffleSplit(n_splits=1, test_size=0.111, random_state=42)
# n_splits=1, so the generator yields exactly one (train, val) index pair.
train_index, val_index = next(rs1.split(delta_e_train))

train = delta_e_train[train_index]
val = delta_e_train[val_index]
composition_train = com_train[train_index]
composition_val = com_train[val_index]

# `data_dir` is the folder chosen by the previous cell.
os.makedirs(data_dir, exist_ok=True)

df_train = pd.DataFrame({'formula': composition_train, 'target': train})
name1 = 'train.csv'
df_train.to_csv(f'{data_dir}/{name1}', index=False)

df_val = pd.DataFrame({'formula': composition_val, 'target': val})
name2 = 'val.csv'
df_val.to_csv(f'{data_dir}/{name2}', index=False)

In [None]:
# Source table with a `formula` and a `target` column.
# Alternative input: r'D:\deep\CrabNet\data\OQMD\OQMD_Band_gap_all.csv'
df = pd.read_csv(r'D:\deep\CrabNet\data\application\transfer\mp-non-metals.csv')
df_gap = df['target'].values
df_formula = df['formula'].values

# Hold out 20% of the rows as the test set. random_state pins the shuffle so
# re-running the cell reproduces the same split (42 matches RNG_SEED in the
# setup cell).
rs = ShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(rs.split(df_gap))

delta_e_train = df_gap[train_index]
delta_e_test = df_gap[test_index]
com_train = df_formula[train_index]
com_test = df_formula[test_index]

df_test = pd.DataFrame({'formula': com_test, 'target': delta_e_test})
data_dir = 'data/application/transfer'
name = 'test.csv'
os.makedirs(data_dir, exist_ok=True)
df_test.to_csv(f'{data_dir}/{name}', index=False)

In [None]:
# Split the remaining rows (`delta_e_train` / `com_train` from the previous
# cell) into train and validation parts, 10% going to validation.
# random_state pins the shuffle so the split is reproducible (42 matches
# RNG_SEED in the setup cell).
rs1 = ShuffleSplit(n_splits=1, test_size=0.1, random_state=42)
# n_splits=1, so the generator yields exactly one (train, val) index pair.
train_index, val_index = next(rs1.split(delta_e_train))

train = delta_e_train[train_index]
val = delta_e_train[val_index]
composition_train = com_train[train_index]
composition_val = com_train[val_index]

# `data_dir` is the folder chosen by the previous cell.
os.makedirs(data_dir, exist_ok=True)

df_train = pd.DataFrame({'formula': composition_train, 'target': train})
name1 = 'train.csv'
df_train.to_csv(f'{data_dir}/{name1}', index=False)

df_val = pd.DataFrame({'formula': composition_val, 'target': val})
name2 = 'val.csv'
df_val.to_csv(f'{data_dir}/{name2}', index=False)

### oqmd_all-22Mar18

In [None]:
# Load the OQMD export. Later cells read a single combined column named
# 'comp energy_pa volume_pa magmom_pa bandgap delta_e stability' from it.
X_df = pd.read_csv(r'D:\deep\CrabNet\data\application\oqmd_all-22Mar18.csv')

In [None]:
def delta_e_convert_to_float(data):
    """Return the second-to-last field of ``data`` as a float.

    Fields are separated by exactly two spaces; surrounding whitespace of the
    selected field is stripped before conversion.
    """
    fields = data.split('  ')
    second_to_last = fields[-2]
    return float(second_to_last.strip())

def com_convert_to_float(data):
    """Return the first field of ``data`` as a stripped string.

    Fields are separated by two spaces. Despite the function's name, no
    numeric conversion takes place.
    """
    # Everything before the first double-space separator (the whole string
    # when there is none).
    first_field, _, _ = data.partition('  ')
    return first_field.strip()

In [None]:
# The export stores every field in one combined text column; extract the
# delta_e value and the composition from each row of it.
combined_column = X_df['comp energy_pa volume_pa magmom_pa bandgap delta_e stability']
df_select_delta_e = combined_column.apply(delta_e_convert_to_float)
df_select_com = combined_column.apply(com_convert_to_float)

In [None]:
# Hold out 15% of the rows as the test set. random_state pins the shuffle so
# re-running the cell reproduces the same split (42 matches RNG_SEED in the
# setup cell).
rs = ShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
# n_splits=1, so the generator yields exactly one (train, test) index pair.
train_index, test_index = next(rs.split(df_select_delta_e))

# ShuffleSplit yields positions, so select with .iloc rather than relying on
# the Series labels happening to equal the positions.
delta_e_train = df_select_delta_e.iloc[train_index]
delta_e_test = df_select_delta_e.iloc[test_index]
com_train = df_select_com.iloc[train_index]
com_test = df_select_com.iloc[test_index]

# Column names are taken from the combined header: the first token and the
# second-to-last token when split on single spaces.
delta_e_col = X_df.columns.values.tolist()[0].split(' ')[-2]
com_col = X_df.columns.values.tolist()[0].split(' ')[0]

df_test = pd.DataFrame({com_col: com_test, delta_e_col: delta_e_test})
seed_f_test = 'oqmd_delta_e_test.csv'
data_dir = 'data/application'
os.makedirs(data_dir, exist_ok=True)
df_test = df_test.rename(columns={com_col:'formula',delta_e_col:'target'})
df_test.to_csv(f'{data_dir}/{seed_f_test}', index=False)

(620196,)

In [None]:
# Split the remaining rows (`delta_e_train` / `com_train` from the previous
# cell) into train and validation parts, 15% going to validation.
# random_state pins the shuffle so the split is reproducible (42 matches
# RNG_SEED in the setup cell).
rs1 = ShuffleSplit(n_splits=1, test_size=0.15, random_state=42)
# n_splits=1, so the generator yields exactly one (train, val) index pair.
train_index, val_index = next(rs1.split(delta_e_train))

# Positional selection: the Series keep the labels of the full table, so .iloc
# is used and the former in-place reset_index workaround is not needed.
train = delta_e_train.iloc[train_index]
val = delta_e_train.iloc[val_index]
composition_train = com_train.iloc[train_index]
composition_val = com_train.iloc[val_index]

# `data_dir`, `com_col` and `delta_e_col` are defined by the previous cell.
os.makedirs(data_dir, exist_ok=True)

df_train = pd.DataFrame({com_col: composition_train, delta_e_col: train})
seed_f_train = 'oqmd_delta_e_train.csv'
df_train = df_train.rename(columns={com_col:'formula',delta_e_col:'target'})
df_train.to_csv(f'{data_dir}/{seed_f_train}', index=False)

df_val = pd.DataFrame({com_col: composition_val, delta_e_col: val})
seed_f_val = 'oqmd_delta_e_val.csv'
df_val = df_val.rename(columns={com_col:'formula',delta_e_col:'target'})
df_val.to_csv(f'{data_dir}/{seed_f_val}', index=False)