# Joint fill algorithm with MEGNet

In [1]:
from megnet.models import MEGNetModel
from megnet.data.crystal import CrystalGraph
import numpy as np

In [2]:
from modnet.preprocessing import MODData
from modnet.models import MODNetModel
import pandas as pd
import numpy as np
import os
import copy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, LambdaCallback
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model

In [3]:
# confirm TensorFlow sees the GPU
from tensorflow.python.client import device_lib
str(device_lib.list_local_devices())

'[name: "/device:CPU:0"\ndevice_type: "CPU"\nmemory_limit: 268435456\nlocality {\n}\nincarnation: 14895609751761660406\n, name: "/device:XLA_CPU:0"\ndevice_type: "XLA_CPU"\nmemory_limit: 17179869184\nlocality {\n}\nincarnation: 13389522220463934305\nphysical_device_desc: "device: XLA_CPU device"\n]'

In [4]:
from sklearn.model_selection import KFold
from modnet.preprocessing import MODData

def shuffle_MD(data,random_state=10):
    data = copy.deepcopy(data)
    ids = data.df_targets.sample(frac=1,random_state=random_state).index
    data.df_featurized = data.df_featurized.loc[ids]
    data.df_targets = data.df_targets.loc[ids]
    data.df_structure = data.df_structure.loc[ids]
    
    return data

def MDKsplit(data,n_splits=5,random_state=10):
    data = shuffle_MD(data,random_state=random_state)
    ids = np.array(data.structure_ids)
    kf = KFold(n_splits=n_splits,shuffle=True,random_state=random_state)
    folds = []
    for train_idx, val_idx in kf.split(ids):
        data_train = MODData(data.df_structure.iloc[train_idx]['structure'].values,data.df_targets.iloc[train_idx].values,target_names=data.df_targets.columns,structure_ids=ids[train_idx])
        data_train.df_featurized = data.df_featurized.iloc[train_idx]
        #data_train.optimal_features = data.optimal_features
        
        data_val = MODData(data.df_structure.iloc[val_idx]['structure'].values,data.df_targets.iloc[val_idx].values,target_names=data.df_targets.columns,structure_ids=ids[val_idx])
        data_val.df_featurized = data.df_featurized.iloc[val_idx]
        #data_val.optimal_features = data.optimal_features

        folds.append((data_train,data_val))
        
    return folds

def MD_append(md,lmd):
    md = copy.deepcopy(md)
    for m in lmd:
        md.df_structure.append(m.df_structure)
        md.df_targets.append(m.df_targets)
        md.df_featurized.append(m.df_featurized)
    return md

In [5]:
md_exp = MODData.load('exp_gap_all')
md_exp.df_targets.columns = ['exp_gap']
md_pbe = MODData.load('pbe_gap_struct.zip')
md_pbe.df_targets.columns = ['pbe_gap']
md_joint = MODData.load('exp_pbe_joint')
#md_hse = MODData.load('moddatas/hse_gap.zip')
#md_hse.df_targets.columns = ['gap']


If you use the ChemEnv tool for your research, please consider citing the following reference(s) :
David Waroquiers, Xavier Gonze, Gian-Marco Rignanese, Cathrin Welker-Nieuwoudt, Frank Rosowski,
Michael Goebel, Stephan Schenk, Peter Degelmann, Rute Andre, Robert Glaum, and Geoffroy Hautier,
"Statistical analysis of coordination environments in oxides",
Chem. Mater., 2017, 29 (19), pp 8346-8360,
DOI: 10.1021/acs.chemmater.7b02766



INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa8cc64d070> object, created with modnet version <=0.1.7
INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa8cc64d280> object, created with modnet version 0.1.8~develop
INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa8d78561c0> object, created with modnet version <=0.1.7


In [7]:
df_exp = md_exp.df_targets

icsds_n =  [int(x.split('-')[1]) for x in list(df_exp.index)]

from pymatgen.ext.matproj import MPRester

with MPRester(api_key='DaZc4G3gfZrVogm6') as mpr:
    res1 = mpr.query({"icsd_ids": {"$in": icsds_n}}, properties=["material_id","icsd_ids"],chunk_size=2000)
    
mapping = {}

for el in res1:
    for icsd in el['icsd_ids']:
        mapping['icsd-'+str(icsd)] = el['material_id']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2405.0), HTML(value='')))

In [8]:
df_exp['mp_id'] = df_exp.index.map(mapping)

In [9]:
df_exp.index = df_exp['mp_id']

In [11]:
df_exp = df_exp.drop("mp_id", axis=1)

In [12]:
df_exp.head()

Unnamed: 0_level_0,exp_gap
mp_id,Unnamed: 1_level_1
mp-10192,1.3
mp-9814,0.355
,3.402809
,0.117444
mp-18531,2.26


In [13]:
md_exp.df_targets.index = md_exp.df_targets['mp_id']

In [14]:
md_exp.df_targets = md_exp.df_targets.drop("mp_id", axis=1)

In [15]:
md_exp.df_targets = md_exp.df_targets.reset_index().dropna().set_index('mp_id')

In [16]:
md_exp.df_targets

Unnamed: 0_level_0,exp_gap
mp_id,Unnamed: 1_level_1
mp-10192,1.300000
mp-9814,0.355000
mp-18531,2.260000
mp-1197022,1.800000
mp-20012,0.220632
...,...
mp-12735,0.000000
mp-568113,0.000000
mp-12060,0.000000
mp-715,0.000000


In [17]:
md_exp.df_featurized

Unnamed: 0,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,AtomicPackingEfficiency|mean simul. packing efficiency,AtomicPackingEfficiency|mean abs simul. packing efficiency,AtomicPackingEfficiency|dist from 1 clusters |APE| < 0.010,...,BondFractions|N3- - Nb1.5+ bond frac.,BondFractions|N3- - Ta3.6+ bond frac.,BondFractions|Nb1.5+ - Nb1.5+ bond frac.,BondFractions|P5+ - V3+ bond frac.,BondFractions|Pb0+ - Yb0+ bond frac.,BondFractions|Rh0+ - Zn0+ bond frac.,BondFractions|Se2- - U3.333+ bond frac.,BondFractions|Sm0+ - Tl0+ bond frac.,BondFractions|Ta3.6+ - Ta3.6+ bond frac.,BondFractions|U3.333+ - U3.333+ bond frac.
icsd-44914,2.0,15,-0.206080,1.0,71,-0.155112,0.050968,0.024133,0.024133,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
icsd-629000,3.0,29,-0.202272,3.0,29,-0.202272,0.000000,-0.010896,0.031070,0.019642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
icsd-670493,2.0,8,-0.338381,1.0,30,-0.222725,0.115656,0.008654,0.008772,0.033672,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
icsd-196824,2.0,51,-0.185623,2.0,51,-0.185623,0.000000,0.002025,0.017082,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
icsd-81658,2.0,52,-0.226594,1.0,48,-0.204228,0.022366,-0.027937,0.032937,0.044896,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
icsd-604337,3.0,46,-0.160771,3.0,46,-0.160771,0.000000,0.001944,0.024436,0.039386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
icsd-623417,3.0,27,-0.322368,1.0,27,-0.204497,0.117871,-0.024002,0.024002,0.094281,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
icsd-104609,1.0,77,-0.195511,1.0,77,-0.195511,0.000000,-0.023321,0.023321,0.029463,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
icsd-197202,3.0,41,-0.125252,3.0,41,-0.125252,0.000000,-0.015541,0.015541,0.125708,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
icsds_n =  [int(x.split('-')[1]) for x in list(md_exp.df_featurized.index)]

from pymatgen.ext.matproj import MPRester

with MPRester(api_key='DaZc4G3gfZrVogm6') as mpr:
    res1 = mpr.query({"icsd_ids": {"$in": icsds_n}}, properties=["material_id","icsd_ids"],chunk_size=2000)
    
mapping = {}

for el in res1:
    for icsd in el['icsd_ids']:
        mapping['icsd-'+str(icsd)] = el['material_id']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2405.0), HTML(value='')))

In [19]:
md_exp.df_featurized['mp_id'] = md_exp.df_featurized.index.map(mapping)

In [20]:
md_exp.df_featurized.index = md_exp.df_featurized['mp_id']

In [21]:
md_exp.df_featurized = md_exp.df_featurized.drop("mp_id", axis=1)

In [22]:
md_exp.df_featurized = md_exp.df_featurized.reset_index().dropna().set_index('mp_id')

In [23]:
md_exp.df_featurized

Unnamed: 0_level_0,AtomicOrbitals|HOMO_character,AtomicOrbitals|HOMO_element,AtomicOrbitals|HOMO_energy,AtomicOrbitals|LUMO_character,AtomicOrbitals|LUMO_element,AtomicOrbitals|LUMO_energy,AtomicOrbitals|gap_AO,AtomicPackingEfficiency|mean simul. packing efficiency,AtomicPackingEfficiency|mean abs simul. packing efficiency,AtomicPackingEfficiency|dist from 1 clusters |APE| < 0.010,...,BondFractions|N3- - Nb1.5+ bond frac.,BondFractions|N3- - Ta3.6+ bond frac.,BondFractions|Nb1.5+ - Nb1.5+ bond frac.,BondFractions|P5+ - V3+ bond frac.,BondFractions|Pb0+ - Yb0+ bond frac.,BondFractions|Rh0+ - Zn0+ bond frac.,BondFractions|Se2- - U3.333+ bond frac.,BondFractions|Sm0+ - Tl0+ bond frac.,BondFractions|Ta3.6+ - Ta3.6+ bond frac.,BondFractions|U3.333+ - U3.333+ bond frac.
mp_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mp-10192,2.0,15,-0.206080,1.0,71,-0.155112,0.050968,0.024133,0.024133,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mp-9814,3.0,29,-0.202272,3.0,29,-0.202272,0.000000,-0.010896,0.031070,0.019642,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mp-18531,2.0,52,-0.226594,1.0,48,-0.204228,0.022366,-0.027937,0.032937,0.044896,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mp-1197022,2.0,8,-0.338381,2.0,8,-0.338381,0.000000,0.025113,0.029266,0.046045,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mp-20012,2.0,51,-0.185623,2.0,51,-0.185623,0.000000,0.023994,0.023994,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mp-12735,1.0,47,-0.157407,1.0,47,-0.157407,0.000000,0.033243,0.033243,0.047140,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mp-568113,2.0,33,-0.197497,2.0,33,-0.197497,0.000000,0.010370,0.010370,0.037684,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mp-12060,3.0,46,-0.160771,3.0,46,-0.160771,0.000000,0.001944,0.024436,0.039386,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mp-715,1.0,77,-0.195511,1.0,77,-0.195511,0.000000,-0.023321,0.023321,0.029463,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
md_exp.df_structure

Unnamed: 0_level_0,structure
id,Unnamed: 1_level_1
icsd-44914,"[[0. 0. 0.] Lu3+, [4.44887131e-16 2.76650000e+..."
icsd-629000,"[[0. 0. 5.6075] Cu+, [2.82450000e+00 2..."
icsd-670493,"[[3.467925 3.467925 1.155975] Zn0+, [3.467925 ..."
icsd-196824,"[[0.805 5.635 2.415] Sb0+, [5.635 2.415 0.805]..."
icsd-81658,[[9.09152140e-16 5.65350000e+00 3.46177034e-16...
...,...
icsd-604337,"[[0. 0. 0.] Nd0+, [2.1095 2.1095 5.139 ] Nd0+,..."
icsd-623417,"[[4.13648 0.955 3.18714] Co0+, [1.62648 0.95..."
icsd-104609,"[[1.83975 1.83975 1.83975] Zr0+, [0. 0. 0.] Zr..."
icsd-197202,"[[3.98 2.29831364 2.725 ] Nb0+, [3.9..."


In [25]:
icsds_n =  [int(x.split('-')[1]) for x in list(md_exp.df_structure.index)]

from pymatgen.ext.matproj import MPRester

with MPRester(api_key='DaZc4G3gfZrVogm6') as mpr:
    res1 = mpr.query({"icsd_ids": {"$in": icsds_n}}, properties=["material_id","icsd_ids"],chunk_size=2000)
    
mapping = {}

for el in res1:
    for icsd in el['icsd_ids']:
        mapping['icsd-'+str(icsd)] = el['material_id']

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2405.0), HTML(value='')))

In [26]:
md_exp.df_structure['mp_id'] = md_exp.df_structure.index.map(mapping)

In [27]:
md_exp.df_structure.index = md_exp.df_structure['mp_id']

In [28]:
md_exp.df_structure = md_exp.df_structure.drop("mp_id", axis=1)

In [29]:
md_exp.df_structure = md_exp.df_structure.reset_index().dropna().set_index('mp_id')

In [30]:
md_exp.df_structure

Unnamed: 0_level_0,structure
mp_id,Unnamed: 1_level_1
mp-10192,"[[0. 0. 0.] Lu3+, [4.44887131e-16 2.76650000e+..."
mp-9814,"[[0. 0. 5.6075] Cu+, [2.82450000e+00 2..."
mp-18531,[[9.09152140e-16 5.65350000e+00 3.46177034e-16...
mp-1197022,"[[4.7664464 2.90875 0.072656 ] Cr6+, [1.5584..."
mp-20012,"[[0. 0. 0.] In3+, [5.20951332e-16 3.23950000e+..."
...,...
mp-12735,[[6.66133815e-16 3.22654570e+00 4.07472300e+00...
mp-568113,"[[0. 0. 0.] Ba0+, [2.0265 2.0265 6.385 ] Ba0+,..."
mp-12060,"[[0. 0. 0.] Nd0+, [2.1095 2.1095 5.139 ] Nd0+,..."
mp-715,"[[1.83975 1.83975 1.83975] Zr0+, [0. 0. 0.] Zr..."


In [31]:
from megnet.data.graph import GaussianDistance

k = 5
random_state = 202010
folds = MDKsplit(md_exp,n_splits=k,random_state=random_state)
maes_ph1 = np.ones(5)
maes = np.ones(5)
for i,f in enumerate(folds):
    train = f[0]
    test = f[1]
    fpath = 'train_{}_{}'.format(random_state,i+1)
    if os.path.exists(fpath):
        train = MODData.load(fpath)
        train.df_targets.columns = ['exp_gap']
    else:
        train.feature_selection(n=-1)
        train.save(fpath)
        
    # assure no overlap
    assert len(set(train.df_targets.index).intersection(set(test.df_targets.index))) == 0
    
    
    # 1 train on exp only
    
    rlr = ReduceLROnPlateau(monitor="loss", factor=0.5, patience=20, verbose=0, mode="auto", min_delta=0)
    es = EarlyStopping(monitor="loss", min_delta=0.001, patience=300, verbose=0, mode="auto", baseline=None,
                           restore_best_weights=True)
    
    model = MODNetModel([[['exp_gap']]],{'exp_gap':1}) #,num_neurons=[[350],[64],[8],[8]], n_feat=350,act='elu')
    model.fit_preset(train,verbose=0)
    # model.fit(train, val_fraction = 0.0, lr=0.005, epochs = 1000, batch_size = 64, loss='mae', callbacks=[rlr,es], verbose = 0)
    
    pred = model.predict(test)
    true = test.df_targets
    error = pred-true
    error = error.drop(pred.index[((pred['exp_gap']).abs()>20)]) # drop unrealistic values: happens extremely rarely
    mae = np.abs(error['exp_gap'].values).mean()
    print('mae_ph1')
    print(mae)
    maes_ph1[i] = mae
    
    # 2 predict on pbe dataset
    
    df_pred = model.predict(md_pbe)
    
    df_pred.loc[df_pred['exp_gap']>10,'exp_gap'] = 10
    df_pred.loc[df_pred['exp_gap']<0,'exp_gap'] = 0
    print(df_pred['exp_gap'].nlargest(5))
    print(df_pred['exp_gap'].nsmallest(5))
    
    md_pbe_filled = copy.deepcopy(md_pbe)
    
    md_pbe_filled.df_targets['exp_gap'] = df_pred['exp_gap']
    md_pbe_filled.optimal_features = train.optimal_features
    
    inter_idx = list(set(md_pbe_filled.df_targets.index).intersection(set(train.df_targets.index.map(mapping))))
    train.df_targets = md_joint.df_targets.loc[inter_idx]
    train.df_featurized = md_joint.df_featurized.loc[inter_idx]
    train.df_structure = md_pbe.df_structure.loc[inter_idx]
    print(len(train.df_targets))
    
    md_pbe_filled.df_targets.loc[inter_idx[:-200],'exp_gap'] = train.df_targets.loc[inter_idx[:-200],'exp_gap']
    
    ## adding train data with high weight (=copying multiple times)
    print('adding train data with high weight')
    df_temp_targets = pd.DataFrame([])
    df_temp_featurized = pd.DataFrame([])
    df_temp_structure = pd.DataFrame([])
    for j in range(15):
        print(j)
        df_temp_targets = df_temp_targets.append(train.df_targets.iloc[:-200])
        df_temp_featurized = df_temp_featurized.append(train.df_featurized.iloc[:-200])
        df_temp_structure = df_temp_structure.append(train.df_structure.iloc[:-200])
        
    md_pbe_filled.df_targets = md_pbe_filled.df_targets.append(
        df_temp_targets.append(train.df_targets.iloc[200:]))
    md_pbe_filled.df_featurized = md_pbe_filled.df_featurized.append(
        df_temp_featurized.append(train.df_featurized.iloc[200:]))
    md_pbe_filled.df_structure = md_pbe_filled.df_structure.append(
        df_temp_structure.append(train.df_structure.iloc[200:]))
    print('done creating joint df')
    print(md_pbe_filled.df_targets.sample(n=15))
    
    # 3 joint training
    
    print(md_pbe_filled.df_targets.columns)
    
    
    #### MEGNET #####
    
    nfeat_bond = 100
    r_cutoff = 5
    gaussian_centers = np.linspace(0, r_cutoff + 1, nfeat_bond)
    gaussian_width = 0.5
    graph_converter = CrystalGraph(cutoff=r_cutoff)
    model = MEGNetModel(graph_converter=graph_converter, centers=gaussian_centers, width=gaussian_width)
    
    genome = model.model.layers[-4].output
    out = Dense(96,activation='relu')(genome)
    out = Dense(96,activation='relu')(out)

    outs = []
    for i in range(2):
        t = Dense(4,activation='relu')(out)
        outs.append(Dense(1,activation='elu')(t))
    out = Concatenate()(outs)

    model.model = Model(inputs=model.model.input, outputs=out)
    model.model.compile(Adam(lr=0.001), 'mae', metrics=['mae'])
    
    ###
    test.df_targets['pbe_gap'] = 0
    
    #Pretrained model (we are using transfer learning)
    model_form = MEGNetModel.from_file('band_gap_regression.hdf5')
    embedding_layer = [i for i in model_form.layers if i.name.startswith('embedding')][0]
    embedding = embedding_layer.get_weights()[0]
    print('Embedding matrix dimension is ', embedding.shape)
    # New model with length 16 atom feature
    cg = CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 100), 0.5), cutoff=5)
    model = MEGNetModel(100, 2, nvocal=95, embedding_dim=16, graph_converter=cg)
    # find the embedding layer  index in all the model layers
    embedding_layer_index = [i for i, j in enumerate(model.layers) if j.name.startswith('atom_embedding')][0]
    # Set the weights to our previous embedding
    model.layers[embedding_layer_index].set_weights([embedding])
    # Freeze the weights
    model.layers[embedding_layer_index].trainable = False
    
    model.train(md_pbe_filled.df_structure['structure'],
                md_pbe_filled.df_targets.values,
                validation_structures=test.df_structure['structure'],
                validation_targets=test.df_targets.values, epochs=20,batch_size=64)
    
    
    #### test error ####
    
    y_pred = []
    y_true = []
    for mpid,s in test.df_structure['structure'].iteritems():
        y_pred.append(model.predict_structure(s)[0])
        y_true.append(test.df_targets.loc[mpid,'exp_gap'])

    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    
    error = y_pred-y_true
    #error = error.drop(pred.index[((pred['exp_gap']).abs()>20)]) # drop unrealistic values: happens extremely rarely
    mae = np.abs(error).mean()
    print('mae')
    print(mae)
    maes[i] = mae

INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.







INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa86663ef40> object, created with modnet version 0.1.8~develop
INFO:root:Training preset #1/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.345
INFO:root:Training preset #2/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.358
INFO:root:Training preset #3/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.326
INFO:root:Training preset #4/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.316
INFO:root:Training preset #5/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.349
INFO:root:Training preset #6/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.338
INFO:root:Training preset #7/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.302
INFO:root:Training preset #8/1

mae_ph1
0.19019426501779324
mp-1029115    10.0
mp-1028938    10.0
mp-608100     10.0
mp-569304     10.0
mp-1030239    10.0
Name: exp_gap, dtype: float32
mp-647999    0.0
mp-19019     0.0
mp-685999    0.0
mp-676402    0.0
mp-510688    0.0
Name: exp_gap, dtype: float32
1358
adding train data with high weight
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
done creating joint df
            pbe_gap   exp_gap
mp-720294    4.1735  4.369019
mp-1358      0.0000  0.000000
mp-29904     1.9810  0.000000
mp-31089     0.0000  0.000000
mp-5532      1.9142  3.410000
mp-616604    0.0994  0.000000
mp-568557    2.4442  0.130743
mp-561084    4.0823  3.213043
mp-28900     0.7681  1.637729
mp-768907    3.0473  0.000000
mp-773742    0.0000  0.000000
mp-999059    0.0000  0.000000
mp-562607    0.0000  5.941882
mp-1018715   0.0000  0.000000
mp-767317    1.3372  0.000000
Index(['pbe_gap', 'exp_gap'], dtype='object')
Embedding matrix dimension is  (95, 16)
Epoch 1/20


INFO:megnet.callbacks:
Epoch 00001: val_mae improved from inf to 0.77324, saving model to callback/val_mae_00001_0.773242.hdf5


Epoch 2/20
Epoch 3/20

INFO:megnet.callbacks:
Epoch 00003: val_mae improved from 0.77324 to 0.76332, saving model to callback/val_mae_00003_0.763325.hdf5


Epoch 4/20

INFO:megnet.callbacks:
Epoch 00004: val_mae improved from 0.76332 to 0.76150, saving model to callback/val_mae_00004_0.761503.hdf5


Epoch 5/20

INFO:megnet.callbacks:
Epoch 00005: val_mae improved from 0.76150 to 0.71807, saving model to callback/val_mae_00005_0.718072.hdf5


Epoch 6/20

INFO:megnet.callbacks:
Epoch 00006: val_mae improved from 0.71807 to 0.71076, saving model to callback/val_mae_00006_0.710759.hdf5


Epoch 7/20
Epoch 8/20
Epoch 9/20

INFO:megnet.callbacks:
Epoch 00009: val_mae improved from 0.71076 to 0.70506, saving model to callback/val_mae_00009_0.705058.hdf5


Epoch 10/20
Epoch 11/20

INFO:megnet.callbacks:
Epoch 00011: val_mae improved from 0.70506 to 0.69187, saving model to callback/val_mae_00011_0.691872.hdf5


Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20

INFO:megnet.callbacks:
Epoch 00018: val_mae improved from 0.69187 to 0.68932, saving model to callback/val_mae_00018_0.689324.hdf5


Epoch 19/20
Epoch 20/20

INFO:megnet.callbacks:
Epoch 00020: val_mae improved from 0.68932 to 0.68825, saving model to callback/val_mae_00020_0.688248.hdf5


mae
0.41024996569035765


INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa86660e3d0> object, created with modnet version 0.1.8~develop
INFO:root:Training preset #1/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.383
INFO:root:Training preset #2/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.419
INFO:root:Training preset #3/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.336
INFO:root:Training preset #4/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.351
INFO:root:Training preset #5/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.377
INFO:root:Training preset #6/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.412
INFO:root:Training preset #7/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.332
INFO:root:Training preset #8/1

mae_ph1
0.14633261752513635
mp-1029115    10.0
mp-1028938    10.0
mp-569304     10.0
mp-1030239    10.0
mp-764288     10.0
Name: exp_gap, dtype: float32
mp-1027594    0.0
mp-21609      0.0
mp-675545     0.0
mp-1027269    0.0
mp-758251     0.0
Name: exp_gap, dtype: float32
1367
adding train data with high weight
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
done creating joint df
            pbe_gap   exp_gap
mp-850934    0.0000  0.000042
mp-1023943   0.5230  0.000042
mp-569715    0.6481  1.200000
mp-1020108   2.8374  4.705926
mp-27369     3.7552  3.976209
mp-22331     0.0000  0.000000
mp-7794      0.0000  2.546209
mp-20566     0.0000  0.000000
mp-979       0.0000  0.000042
mp-1028535   0.0000  0.000042
mp-16548     0.0000  0.000000
mp-604496    0.0000  1.889380
mp-643432    4.4390  6.107689
mp-560125    3.3526  4.475445
mp-20905     0.0000  2.000000
Index(['pbe_gap', 'exp_gap'], dtype='object')
Embedding matrix dimension is  (95, 16)
Epoch 1/20


INFO:megnet.callbacks:
Epoch 00001: val_mae improved from inf to 0.73199, saving model to callback/val_mae_00001_0.731995.hdf5


Epoch 2/20
Epoch 3/20

INFO:megnet.callbacks:
Epoch 00003: val_mae improved from 0.73199 to 0.70804, saving model to callback/val_mae_00003_0.708043.hdf5


Epoch 4/20
Epoch 5/20
Epoch 6/20

INFO:megnet.callbacks:
Epoch 00006: val_mae improved from 0.70804 to 0.70770, saving model to callback/val_mae_00006_0.707704.hdf5


Epoch 7/20

INFO:megnet.callbacks:
Epoch 00007: val_mae improved from 0.70770 to 0.70573, saving model to callback/val_mae_00007_0.705729.hdf5


Epoch 8/20

INFO:megnet.callbacks:
Epoch 00008: val_mae improved from 0.70573 to 0.70506, saving model to callback/val_mae_00008_0.705063.hdf5


Epoch 9/20

INFO:megnet.callbacks:
Epoch 00009: val_mae improved from 0.70506 to 0.68645, saving model to callback/val_mae_00009_0.686450.hdf5


Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

INFO:megnet.callbacks:
Epoch 00013: val_mae improved from 0.68645 to 0.68467, saving model to callback/val_mae_00013_0.684669.hdf5


Epoch 14/20

INFO:megnet.callbacks:
Epoch 00014: val_mae improved from 0.68467 to 0.67253, saving model to callback/val_mae_00014_0.672533.hdf5


Epoch 15/20
Epoch 16/20

INFO:megnet.callbacks:
Epoch 00016: val_mae improved from 0.67253 to 0.67247, saving model to callback/val_mae_00016_0.672468.hdf5


Epoch 17/20
Epoch 18/20
Epoch 19/20

INFO:megnet.callbacks:
Epoch 00019: val_mae improved from 0.67247 to 0.67026, saving model to callback/val_mae_00019_0.670261.hdf5


Epoch 20/20

INFO:megnet.callbacks:
Epoch 00020: val_mae improved from 0.67026 to 0.66887, saving model to callback/val_mae_00020_0.668867.hdf5


mae
0.43443995526487716


INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa7d3497c40> object, created with modnet version 0.1.8~develop
INFO:root:Training preset #1/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.357
INFO:root:Training preset #2/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.349
INFO:root:Training preset #3/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.405
INFO:root:Training preset #4/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.346
INFO:root:Training preset #5/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.428
INFO:root:Training preset #6/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.424
INFO:root:Training preset #7/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.332
INFO:root:Training preset #8/1

mae_ph1
0.13555155116308834
mp-28264     10.0
mp-30942     10.0
mp-28263     10.0
mp-685168    10.0
mp-24199     10.0
Name: exp_gap, dtype: float32
mp-1029115    0.0
mp-1028938    0.0
mp-1030239    0.0
mp-764630     0.0
mp-865782     0.0
Name: exp_gap, dtype: float32
1357
adding train data with high weight
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
done creating joint df
            pbe_gap   exp_gap
mp-1006127   0.0000  0.006457
mp-777551    2.6788  0.006457
mp-764526    0.2988  5.886298
mp-573581    1.1915  1.910000
mp-568920    0.0000  0.000000
mp-13503     0.0000  0.000000
mp-775647    0.6081  3.469726
mp-23546     0.0000  0.000000
mp-997618    0.7513  2.600000
mp-15079     6.0599  5.477665
mp-866635    2.1168  1.163374
mp-542903    2.7758  2.619982
mp-24399     4.9785  0.826860
mp-31479     0.0000  0.000000
mp-757159    3.1885  3.562789
Index(['pbe_gap', 'exp_gap'], dtype='object')
Embedding matrix dimension is  (95, 16)
Epoch 1/20


INFO:megnet.callbacks:
Epoch 00001: val_mae improved from inf to 0.71614, saving model to callback/val_mae_00001_0.716144.hdf5


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

INFO:megnet.callbacks:
Epoch 00005: val_mae improved from 0.71614 to 0.66973, saving model to callback/val_mae_00005_0.669728.hdf5


Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20

INFO:megnet.callbacks:
Epoch 00013: val_mae improved from 0.66973 to 0.66529, saving model to callback/val_mae_00013_0.665287.hdf5


Epoch 14/20

INFO:megnet.callbacks:
Epoch 00014: val_mae improved from 0.66529 to 0.65021, saving model to callback/val_mae_00014_0.650209.hdf5


Epoch 15/20
Epoch 16/20

INFO:megnet.callbacks:
Epoch 00016: val_mae improved from 0.65021 to 0.64885, saving model to callback/val_mae_00016_0.648845.hdf5


Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
mae
0.4237004508755587


INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa7cb87a160> object, created with modnet version 0.1.8~develop
INFO:root:Training preset #1/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.414
INFO:root:Training preset #2/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.404
INFO:root:Training preset #3/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.361
INFO:root:Training preset #4/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.397
INFO:root:Training preset #5/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.402
INFO:root:Training preset #6/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.389
INFO:root:Training preset #7/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.369
INFO:root:Training preset #8/1

mae_ph1
0.09433496500775179
mp-685999     10.0
mp-1029115    10.0
mp-1027594    10.0
mp-1029255    10.0
mp-1027015    10.0
Name: exp_gap, dtype: float32
mp-569304     0.0
mp-1014298    0.0
mp-722316     0.0
mp-1018648    0.0
mp-168        0.0
Name: exp_gap, dtype: float32
1360
adding train data with high weight
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
done creating joint df
           pbe_gap    exp_gap
mp-762904   1.2293  10.000000
mp-2866     0.0000   0.003398
mp-763041   0.0000   0.003398
mp-849653   0.0136   0.003398
mp-557121   1.6433   2.750000
mp-570997   0.0000   0.003398
mp-1586     0.0000   0.000000
mp-4924     0.0000   0.003398
mp-23281    2.1446   2.348000
mp-972176   0.0000   0.000000
mp-13026    0.0000   0.000000
mp-770886   1.9602  10.000000
mp-864628   0.0000   0.003398
mp-754526   4.4270   0.003398
mp-10412    0.0000   1.485525
Index(['pbe_gap', 'exp_gap'], dtype='object')
Embedding matrix dimension is  (95, 16)
Epoch 1/20


INFO:megnet.callbacks:
Epoch 00001: val_mae improved from inf to 0.79780, saving model to callback/val_mae_00001_0.797800.hdf5


Epoch 2/20

INFO:megnet.callbacks:
Epoch 00002: val_mae improved from 0.79780 to 0.74679, saving model to callback/val_mae_00002_0.746793.hdf5


Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20

INFO:megnet.callbacks:
Epoch 00008: val_mae improved from 0.74679 to 0.72266, saving model to callback/val_mae_00008_0.722665.hdf5


Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20

INFO:megnet.callbacks:
Epoch 00016: val_mae improved from 0.72266 to 0.71131, saving model to callback/val_mae_00016_0.711310.hdf5


Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
mae
0.433209106345109


INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fa7e5cf1d00> object, created with modnet version 0.1.8~develop
INFO:root:Training preset #1/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.421
INFO:root:Training preset #2/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.482
INFO:root:Training preset #3/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 1.233
INFO:root:Training preset #4/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.413
INFO:root:Training preset #5/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.425
INFO:root:Training preset #6/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.453
INFO:root:Training preset #7/16
INFO:root:Compiling model...
INFO:root:Fitting model...
INFO:root:Validation loss: 0.455
INFO:root:Training preset #8/1

mae_ph1
0.11186497491909839
mp-1029115    10.0
mp-1028938    10.0
mp-1030239    10.0
mp-2741       10.0
mp-2064       10.0
Name: exp_gap, dtype: float32
mp-764266    0.0
mp-19019     0.0
mp-685999    0.0
mp-676402    0.0
mp-510688    0.0
Name: exp_gap, dtype: float32
1374
adding train data with high weight
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
done creating joint df
            pbe_gap   exp_gap
mp-30445     0.0000  0.000000
mp-1017441   0.0000  0.000000
mp-27275     5.1473  7.088377
mp-1114      0.0000  0.000000
mp-631449    0.0000  0.000000
mp-11242     0.0000  0.000000
mp-778655    1.3408  3.811031
mp-1022      2.3220  2.100000
mp-763018    0.6595  0.000000
mp-607230    0.0000  0.000000
mp-757115    0.7755  2.260391
mp-862790    0.0000  0.000000
mp-556541    1.5135  1.800000
mp-23719     3.2040  0.740867
mp-606703    0.0000  2.300000
Index(['pbe_gap', 'exp_gap'], dtype='object')
Embedding matrix dimension is  (95, 16)
Epoch 1/20


INFO:megnet.callbacks:
Epoch 00001: val_mae improved from inf to 0.61897, saving model to callback/val_mae_00001_0.618966.hdf5


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

INFO:megnet.callbacks:
Epoch 00005: val_mae improved from 0.61897 to 0.60093, saving model to callback/val_mae_00005_0.600934.hdf5


Epoch 6/20

INFO:megnet.callbacks:
Epoch 00006: val_mae improved from 0.60093 to 0.59994, saving model to callback/val_mae_00006_0.599943.hdf5


Epoch 7/20
Epoch 8/20

INFO:megnet.callbacks:
Epoch 00008: val_mae improved from 0.59994 to 0.59866, saving model to callback/val_mae_00008_0.598662.hdf5


Epoch 9/20
Epoch 10/20

INFO:megnet.callbacks:
Epoch 00010: val_mae improved from 0.59866 to 0.59843, saving model to callback/val_mae_00010_0.598426.hdf5


Epoch 11/20
Epoch 12/20

INFO:megnet.callbacks:
Epoch 00012: val_mae improved from 0.59843 to 0.59240, saving model to callback/val_mae_00012_0.592398.hdf5


Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20

INFO:megnet.callbacks:
Epoch 00019: val_mae improved from 0.59240 to 0.59001, saving model to callback/val_mae_00019_0.590015.hdf5


Epoch 20/20

INFO:megnet.callbacks:
Epoch 00020: val_mae improved from 0.59001 to 0.58744, saving model to callback/val_mae_00020_0.587442.hdf5


mae
0.3899257877417298


In [32]:
    md_pbe_filled = md_joint
    md_pbe_filled.df_structure = md_exp.df_structure.iloc[:len(md_joint.df_targets)]
    #### MEGNET #####
    nfeat_bond = 100
    r_cutoff = 5
    gaussian_centers = np.linspace(0, r_cutoff + 1, nfeat_bond)
    gaussian_width = 0.5
    graph_converter = CrystalGraph(cutoff=r_cutoff)
    model_form = MEGNetModel.from_file('band_gap_regression.hdf5')
    embedding_layer = [i for i in model_form.layers if i.name.startswith('embedding')][0]
    embedding = embedding_layer.get_weights()[0]
    print('Embedding matrix dimension is ', embedding.shape)
    # New model with length 16 atom feature
    cg = CrystalGraph(bond_converter=GaussianDistance(np.linspace(0, 5, 100), 0.5), cutoff=5)
    model = MEGNetModel(100, 2, nvocal=95, embedding_dim=16, graph_converter=cg)
    # find the embedding layer  index in all the model layers
    embedding_layer_index = [i for i, j in enumerate(model.layers) if j.name.startswith('atom_embedding')][0]
    # Set the weights to our previous embedding
    model.layers[embedding_layer_index].set_weights([embedding])
    # Freeze the weights
    model.layers[embedding_layer_index].trainable = False
    
    genome = model.model.layers[-4].output
    out = Dense(96,activation='relu')(genome)
    out = Dense(32,activation='relu')(out)

    outs = []
    for i in range(2):
        t = Dense(8,activation='relu')(out)
        outs.append(Dense(1,activation='elu',name=str(i)+'target')(t))
    out = Concatenate()(outs)

    model.model = Model(inputs=model.model.input, outputs=out)
    model.model.compile(Adam(lr=0.001), 'mse', metrics=['mae'])
    
    def print_test(epoch,loss):
        print(loss.keys())
    
    print_cb = LambdaCallback(on_epoch_end=print_test)
    
    model.train(md_pbe_filled.df_structure['structure'],
                md_pbe_filled.df_targets.values,
                validation_structures=md_pbe_filled.df_structure['structure'],
                validation_targets=md_pbe_filled.df_targets.values, epochs=5,batch_size=128)
    
    
    #### test error ####
    
    y_pred = []
    y_true = []
    for mpid,s in test.df_structure['structure'].iteritems():
        y_pred.append(model.predict_structure(s)[0])
        y_true.append(test.df_targets.loc[mpid,'exp_gap'])

    y_pred = np.array(y_pred)
    y_true = np.array(y_true)
    
    error = y_pred-y_true
    #error = error.drop(pred.index[((pred['exp_gap']).abs()>20)]) # drop unrealistic values: happens extremely rarely
    mae = np.abs(error).mean()
    print('mae')
    print(mae)
    maes[i] = mae

Embedding matrix dimension is  (95, 16)
Epoch 1/5

INFO:megnet.callbacks:
Epoch 00001: val_mae improved from inf to 0.89870, saving model to callback/val_mae_00001_0.898701.hdf5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
mae
1.1349503696172885


In [35]:
maes_ph1.mean()

0.13565567472657364

In [36]:
maes.mean()

1.0269900739234576

#### Conclusion

No improvement.