# Difference method

In this notebook, we train a MODNet model to predict the difference between the experimental band gap and the PBE band gap (`exp_gap - pbe_gap`), and evaluate it with 5-fold cross-validation.

In [1]:
from modnet.preprocessing import MODData
from modnet.models import MODNetModel
import numpy as np
import os
import copy
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [2]:
from sklearn.model_selection import KFold
from modnet.preprocessing import MODData

def shuffle_MD(data,random_state=10):
    """Return a deep copy of ``data`` with its rows in a random order.

    A single permutation is drawn from the index of ``data.df_targets``
    (seeded with ``random_state``) and applied to ``df_featurized``,
    ``df_targets`` and ``df_structure`` so the three frames stay row-aligned.
    The object passed in is not modified.

    Parameters
    ----------
    data : object exposing ``df_featurized``, ``df_targets`` and ``df_structure``
        DataFrames that share the same index.
    random_state : int
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    The shuffled deep copy.
    """
    shuffled = copy.deepcopy(data)
    # Draw the permutation once so every frame is reordered identically.
    order = shuffled.df_targets.sample(frac=1,random_state=random_state).index
    for attr in ('df_featurized','df_targets','df_structure'):
        setattr(shuffled, attr, getattr(shuffled, attr).loc[order])
    return shuffled

def MDKsplit(data,n_splits=5,random_state=10):
    """Split ``data`` into ``n_splits`` (train, validation) MODData pairs.

    The input is first shuffled with ``shuffle_MD`` and then partitioned with
    a shuffled ``KFold``; both use ``random_state``. For every fold a new
    ``MODData`` is built from the selected structures and target values, and
    the matching rows of ``data.df_featurized`` are attached to it. This
    function does not copy ``optimal_features`` onto the fold objects.

    Parameters
    ----------
    data : MODData
        Dataset to split; it is not modified (``shuffle_MD`` works on a copy).
    n_splits : int
        Number of folds.
    random_state : int
        Seed used for both the initial shuffle and the ``KFold`` split.

    Returns
    -------
    list of (MODData, MODData)
        One ``(train, validation)`` tuple per fold.
    """
    data = shuffle_MD(data,random_state=random_state)
    ids = np.array(data.structure_ids)

    def _subset(rows):
        # Build a MODData restricted to the given positional rows and attach
        # the already-computed features for those rows.
        part = MODData(
            data.df_structure.iloc[rows]['structure'].values,
            data.df_targets.iloc[rows].values,
            target_names=data.df_targets.columns,
            structure_ids=ids[rows],
        )
        part.df_featurized = data.df_featurized.iloc[rows]
        return part

    splitter = KFold(n_splits=n_splits,shuffle=True,random_state=random_state)
    # Tuple elements are evaluated left to right: train is built before validation.
    return [(_subset(train_rows), _subset(val_rows)) for train_rows, val_rows in splitter.split(ids)]

def MD_append(md,lmd):
    """Return a copy of ``md`` extended with the rows of every item in ``lmd``.

    The ``df_structure``, ``df_targets`` and ``df_featurized`` frames of each
    element of ``lmd`` are concatenated, in order, below the corresponding
    frame of ``md``. Neither ``md`` nor the elements of ``lmd`` are modified.

    The previous implementation called ``DataFrame.append`` and discarded the
    result; ``append`` is not in-place, so the returned object was just an
    unchanged copy of ``md``.

    Parameters
    ----------
    md : object exposing ``df_structure``, ``df_targets`` and ``df_featurized``
        Base dataset.
    lmd : iterable of objects with the same three attributes
        Datasets whose rows are appended; may be empty.

    Returns
    -------
    A deep copy of ``md`` holding the concatenated frames.
    """
    # Imported locally so the helper does not rely on a notebook-level pandas import.
    import pandas as pd

    md = copy.deepcopy(md)
    # Materialise once: lmd is traversed once per frame attribute below.
    lmd = list(lmd)
    for attr in ('df_structure','df_targets','df_featurized'):
        frames = [getattr(md, attr)] + [getattr(m, attr) for m in lmd]
        setattr(md, attr, pd.concat(frames))
    return md

In [3]:
# Load three saved MODData objects; the paths are relative to the working directory.
# NOTE(review): md_exp and md_pbe are not referenced by any later cell visible
# in this notebook — confirm whether they are still needed.
md_exp = MODData.load('exp_gap_all_mpid')
md_exp.df_targets.columns = ['exp_gap','mp_id']  # rename its two target columns
md_pbe = MODData.load('pbe_gap.zip')
md_pbe.df_targets.columns = ['gap']  # rename its single target column
# Joint dataset used for the rest of the notebook; its targets hold both
# 'exp_gap' and 'pbe_gap' for each structure.
md_joint = MODData.load('exp_pbe_joint')


If you use the ChemEnv tool for your research, please consider citing the following reference(s) :
David Waroquiers, Xavier Gonze, Gian-Marco Rignanese, Cathrin Welker-Nieuwoudt, Frank Rosowski,
Michael Goebel, Stephan Schenk, Peter Degelmann, Rute Andre, Robert Glaum, and Geoffroy Hautier,
"Statistical analysis of coordination environments in oxides",
Chem. Mater., 2017, 29 (19), pp 8346-8360,
DOI: 10.1021/acs.chemmater.7b02766



INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7fef0d4e7730> object, created with modnet version <=0.1.7
INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7feee2662af0> object, created with modnet version <=0.1.7
INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7feee2662fa0> object, created with modnet version <=0.1.7


In [4]:
md_joint.df_targets

Unnamed: 0,exp_gap,pbe_gap
mp-12699,0.00,0.0000
mp-559459,3.40,2.1704
mp-21162,0.00,0.0000
mp-1306,0.00,0.0000
mp-15252,0.53,0.1049
...,...,...
mp-556541,1.80,1.5135
mp-2602,1.04,0.0000
mp-30366,0.00,0.0000
mp-3718,1.45,0.2418


In [5]:
# Add the learning target in place: experimental gap minus PBE gap, per structure.
md_joint.df_targets['difference'] = md_joint.df_targets['exp_gap'] - md_joint.df_targets['pbe_gap']

In [10]:
md_joint.df_targets

Unnamed: 0,exp_gap,pbe_gap,difference
mp-12699,0.00,0.0000,0.0000
mp-559459,3.40,2.1704,1.2296
mp-21162,0.00,0.0000,0.0000
mp-1306,0.00,0.0000,0.0000
mp-15252,0.53,0.1049,0.4251
...,...,...,...
mp-556541,1.80,1.5135,0.2865
mp-2602,1.04,0.0000,1.0400
mp-30366,0.00,0.0000,0.0000
mp-3718,1.45,0.2418,1.2082


In [7]:
# K-fold cross-validation of a MODNet model trained on the 'difference' target.
k = 5
random_state = 202010
# Predictions whose absolute value exceeds this are excluded from the MAE.
max_abs_pred = 20
folds = MDKsplit(md_joint,n_splits=k,random_state=random_state)
# One slot per fold, sized from k rather than a literal 5. NaN marks folds that
# have not been evaluated, so an interrupted run cannot silently bias the mean.
maes = np.full(k, np.nan)
for i,f in enumerate(folds):
    train, test = f
    # Feature selection is recomputed on each fold's training split.
    train.feature_selection(n=-1, use_precomputed_cross_nmi=True)
    # assure no overlap
    assert len(set(train.df_targets.index).intersection(set(test.df_targets.index))) == 0

    # difference prediction: single target 'difference' with weight 1
    model = MODNetModel([[['difference']]],{'difference':1})
    model.fit_preset(train,verbose=0)

    pred = model.predict(test)
    # Keep only the 'difference' column so the subtraction aligns with pred.
    true = test.df_targets
    true = true.drop(columns=['exp_gap','pbe_gap'])
    error = pred-true
    # drop unrealistic values: happens extremely rarely
    error = error['difference'].drop(pred.index[((pred['difference']).abs()>max_abs_pred)])
    mae = np.abs(error).mean()
    print('mae')
    print(mae)
    maes[i] = mae
    

INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loading cross NMI from 'Features_cross' file.
INFO:root:Starting target 1/3: exp_gap ...
INFO:root:Computing mutual information between features and target...
INFO:root:Computing optimal features...
INFO:root:Selected 50/1102 features...
INFO:root:Selected 100/1102 features...
INFO:root:Selected 150/1102 features...
INFO:root:Selected 200/1102 features...
INFO:root:Selected 250/1102 features...
INFO:root:Selected 300/1102 features...
INFO:root:Selected 350/1102 feat

mae
0.32491913361014685


INFO:root:Computing optimal features...
INFO:root:Selected 50/1095 features...
INFO:root:Selected 100/1095 features...
INFO:root:Selected 150/1095 features...
INFO:root:Selected 200/1095 features...
INFO:root:Selected 250/1095 features...
INFO:root:Selected 300/1095 features...
INFO:root:Selected 350/1095 features...
INFO:root:Selected 400/1095 features...
INFO:root:Selected 450/1095 features...
INFO:root:Selected 500/1095 features...
INFO:root:Selected 550/1095 features...
INFO:root:Selected 600/1095 features...
INFO:root:Selected 650/1095 features...
INFO:root:Selected 700/1095 features...
INFO:root:Selected 750/1095 features...
INFO:root:Selected 800/1095 features...
INFO:root:Selected 850/1095 features...
INFO:root:Selected 900/1095 features...
INFO:root:Selected 950/1095 features...
INFO:root:Selected 1000/1095 features...
INFO:root:Selected 1050/1095 features...
INFO:root:Done with target 1/3: exp_gap.
INFO:root:Starting target 2/3: pbe_gap ...
INFO:root:Computing mutual informat

mae
0.31884230677695524


INFO:root:Computing optimal features...
INFO:root:Selected 50/1104 features...
INFO:root:Selected 100/1104 features...
INFO:root:Selected 150/1104 features...
INFO:root:Selected 200/1104 features...
INFO:root:Selected 250/1104 features...
INFO:root:Selected 300/1104 features...
INFO:root:Selected 350/1104 features...
INFO:root:Selected 400/1104 features...
INFO:root:Selected 450/1104 features...
INFO:root:Selected 500/1104 features...
INFO:root:Selected 550/1104 features...
INFO:root:Selected 600/1104 features...
INFO:root:Selected 650/1104 features...
INFO:root:Selected 700/1104 features...
INFO:root:Selected 750/1104 features...
INFO:root:Selected 800/1104 features...
INFO:root:Selected 850/1104 features...
INFO:root:Selected 900/1104 features...
INFO:root:Selected 950/1104 features...
INFO:root:Selected 1000/1104 features...
INFO:root:Selected 1050/1104 features...
INFO:root:Selected 1100/1104 features...
INFO:root:Done with target 1/3: exp_gap.
INFO:root:Starting target 2/3: pbe_ga

mae
0.2720722933268656


INFO:root:Starting target 1/3: exp_gap ...
INFO:root:Computing mutual information between features and target...
INFO:root:Computing optimal features...
INFO:root:Selected 50/1099 features...
INFO:root:Selected 100/1099 features...
INFO:root:Selected 150/1099 features...
INFO:root:Selected 200/1099 features...
INFO:root:Selected 250/1099 features...
INFO:root:Selected 300/1099 features...
INFO:root:Selected 350/1099 features...
INFO:root:Selected 400/1099 features...
INFO:root:Selected 450/1099 features...
INFO:root:Selected 500/1099 features...
INFO:root:Selected 550/1099 features...
INFO:root:Selected 600/1099 features...
INFO:root:Selected 650/1099 features...
INFO:root:Selected 700/1099 features...
INFO:root:Selected 750/1099 features...
INFO:root:Selected 800/1099 features...
INFO:root:Selected 850/1099 features...
INFO:root:Selected 900/1099 features...
INFO:root:Selected 950/1099 features...
INFO:root:Selected 1000/1099 features...
INFO:root:Selected 1050/1099 features...
INFO:r

mae
0.2984659686439678


INFO:root:Computing optimal features...
INFO:root:Selected 50/1094 features...
INFO:root:Selected 100/1094 features...
INFO:root:Selected 150/1094 features...
INFO:root:Selected 200/1094 features...
INFO:root:Selected 250/1094 features...
INFO:root:Selected 300/1094 features...
INFO:root:Selected 350/1094 features...
INFO:root:Selected 400/1094 features...
INFO:root:Selected 450/1094 features...
INFO:root:Selected 500/1094 features...
INFO:root:Selected 550/1094 features...
INFO:root:Selected 600/1094 features...
INFO:root:Selected 650/1094 features...
INFO:root:Selected 700/1094 features...
INFO:root:Selected 750/1094 features...
INFO:root:Selected 800/1094 features...
INFO:root:Selected 850/1094 features...
INFO:root:Selected 900/1094 features...
INFO:root:Selected 950/1094 features...
INFO:root:Selected 1000/1094 features...
INFO:root:Selected 1050/1094 features...
INFO:root:Done with target 1/3: exp_gap.
INFO:root:Starting target 2/3: pbe_gap ...
INFO:root:Computing mutual informat

mae
0.2824356685888961


In [8]:
maes

array([0.32491913, 0.31884231, 0.27207229, 0.29846597, 0.28243567])

In [9]:
maes.mean()

0.29934707418936635