# MODNet on experimental dataset 

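This notebook trains a MODNet model on the experimental band-gap data (`exp_gap`) alone and evaluates it with 5-fold cross-validation. The resulting mean absolute error (MAE) is used as the benchmark for the methods in this Ensemble Methods repository.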

In [1]:
from modnet.preprocessing import MODData
from modnet.models import MODNetModel
import numpy as np
import os
import copy

from collections import defaultdict
import itertools
import os
import pandas as pd
import matplotlib.pyplot as plt 
from IPython.display import Markdown
from matminer.datasets import load_dataset, get_all_dataset_info
from pymatgen.core import Composition

from modnet.featurizers import MODFeaturizer
from modnet.featurizers.presets import DeBreuck2020Featurizer

# Expose only the CUDA device with index "1" to this process.
# NOTE(review): the index is hard-coded, presumably for the author's machine;
# adjust it (e.g. to "0") on hosts with a different GPU layout.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"


If you use the ChemEnv tool for your research, please consider citing the following reference(s) :
David Waroquiers, Xavier Gonze, Gian-Marco Rignanese, Cathrin Welker-Nieuwoudt, Frank Rosowski,
Michael Goebel, Stephan Schenk, Peter Degelmann, Rute Andre, Robert Glaum, and Geoffroy Hautier,
"Statistical analysis of coordination environments in oxides",
Chem. Mater., 2017, 29 (19), pp 8346-8360,
DOI: 10.1021/acs.chemmater.7b02766



In [2]:
from sklearn.model_selection import KFold
from modnet.preprocessing import MODData

def shuffle_MD(data,random_state=10):
    """Return a deep copy of ``data`` whose rows are in a random order.

    The permutation is drawn once from ``df_targets`` (seeded by
    ``random_state``) and applied by label to ``df_featurized``,
    ``df_targets`` and ``df_structure`` so the three frames stay aligned.
    The object passed in is left untouched.
    """
    shuffled = copy.deepcopy(data)
    order = shuffled.df_targets.sample(frac=1, random_state=random_state).index

    # Re-index every frame with the same label order.
    for frame_name in ('df_featurized', 'df_targets', 'df_structure'):
        setattr(shuffled, frame_name, getattr(shuffled, frame_name).loc[order])

    return shuffled

def MDKsplit(data,n_splits=5,random_state=10):
    """Split ``data`` into ``n_splits`` (train, validation) pairs of MODData.

    ``data`` is first shuffled with ``shuffle_MD`` and then partitioned by a
    shuffling ``KFold``; both are seeded with ``random_state``. Each subset is
    a new ``MODData`` built from the ``'structure'`` column of
    ``df_structure`` and the matching target rows, with ``df_featurized``
    sliced from the parent object rather than recomputed here.

    Note: ``optimal_features`` is not carried over to the subsets.

    Returns:
        A list of ``(data_train, data_val)`` tuples, one per fold.
    """
    shuffled = shuffle_MD(data, random_state=random_state)
    all_ids = np.array(shuffled.structure_ids)

    def _subset(rows):
        # Build one MODData restricted to the given positional row indices.
        part = MODData(
            shuffled.df_structure.iloc[rows]['structure'].values,
            shuffled.df_targets.iloc[rows].values,
            target_names=shuffled.df_targets.columns,
            structure_ids=all_ids[rows],
        )
        part.df_featurized = shuffled.df_featurized.iloc[rows]
        return part

    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    # Tuple elements are evaluated left to right: train subset first, then validation.
    return [
        (_subset(train_rows), _subset(val_rows))
        for train_rows, val_rows in splitter.split(all_ids)
    ]

In [3]:
# NOTE(review): presumably the experimental-only MODData; `md` is not
# referenced by any later cell visible in this notebook.
md = MODData.load('exp_gap_all')
# NOTE(review): `df` is likewise unused below. Unpickling can execute
# arbitrary code, so only load this file from a trusted source.
df = pd.read_pickle('df_exp_cleaned.pkl')
# MODData whose targets include both `exp_gap` and `pbe_gap`; this is the
# object the cross-validation cell splits into folds.
md_joint = MODData.load('exp_pbe_joint')

INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7f136c0acee0> object, created with modnet version <=0.1.7
INFO:root:Loaded <modnet.preprocessing.MODData object at 0x7f12e2a1e310> object, created with modnet version <=0.1.7


In [4]:
# K-fold cross-validation of a single-target MODNet model on `exp_gap`.
k = 5
random_state = 202010
# Predictions with a larger absolute value are treated as unrealistic and are
# excluded from the fold MAE (happens extremely rarely).
max_abs_prediction = 20

folds = MDKsplit(md_joint,n_splits=k,random_state=random_state)

# Sized from `k` rather than a literal 5, and NaN-filled so that a fold which
# is never written cannot silently contribute a fake MAE of 1.0 to the mean.
maes = np.full(k, np.nan)

for i, (train, test) in enumerate(folds):
    # The joint data also carries `pbe_gap`; this benchmark uses the
    # experimental target only, so drop it from both subsets.
    train.df_targets = train.df_targets.drop(columns='pbe_gap')
    test.df_targets = test.df_targets.drop(columns='pbe_gap')

    # Feature selection is run on the training fold only.
    # NOTE(review): n=-1 presumably means "rank all features" — confirm
    # against the MODNet documentation.
    train.feature_selection(n=-1, use_precomputed_cross_nmi=True)

    # assure no overlap
    assert len(set(train.df_targets.index).intersection(set(test.df_targets.index))) == 0

    model = MODNetModel([[['exp_gap']]],{'exp_gap':1})
    model.fit_preset(train,verbose=0)

    pred = model.predict(test)
    true = test.df_targets
    # Subtraction aligns predictions and targets on their shared index.
    error = pred['exp_gap']-true['exp_gap']
    # drop unrealistic values: happens extremely rarely
    error = error.drop(pred.index[pred['exp_gap'].abs() > max_abs_prediction])
    mae = np.abs(error.values).mean()
    print(mae)
    maes[i] = mae
    

INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loaded DeBreuck2020Featurizer featurizer.
INFO:root:Loading cross NMI from 'Features_cross' file.
INFO:root:Starting target 1/1: exp_gap ...
INFO:root:Computing mutual information between features and target...
INFO:root:Computing optimal features...
INFO:root:Selected 50/1102 features...
INFO:root:Selected 100/1102 features...
INFO:root:Selected 150/1102 features...
INFO:root:Selected 200/1102 features...
INFO:root:Selected 250/1102 features...
INFO:root:Selected 300/1102 features...
INFO:root:Selected 350/1102 feat

0.4228854634957829


INFO:root:Computing optimal features...
INFO:root:Selected 50/1095 features...
INFO:root:Selected 100/1095 features...
INFO:root:Selected 150/1095 features...
INFO:root:Selected 200/1095 features...
INFO:root:Selected 250/1095 features...
INFO:root:Selected 300/1095 features...
INFO:root:Selected 350/1095 features...
INFO:root:Selected 400/1095 features...
INFO:root:Selected 450/1095 features...
INFO:root:Selected 500/1095 features...
INFO:root:Selected 550/1095 features...
INFO:root:Selected 600/1095 features...
INFO:root:Selected 650/1095 features...
INFO:root:Selected 700/1095 features...
INFO:root:Selected 750/1095 features...
INFO:root:Selected 800/1095 features...
INFO:root:Selected 850/1095 features...
INFO:root:Selected 900/1095 features...
INFO:root:Selected 950/1095 features...
INFO:root:Selected 1000/1095 features...
INFO:root:Selected 1050/1095 features...
INFO:root:Done with target 1/1: exp_gap.
INFO:root:Merging all features...
INFO:root:Done.
INFO:root:Training preset #1

0.3853935641559408


INFO:root:Computing optimal features...
INFO:root:Selected 50/1104 features...
INFO:root:Selected 100/1104 features...
INFO:root:Selected 150/1104 features...
INFO:root:Selected 200/1104 features...
INFO:root:Selected 250/1104 features...
INFO:root:Selected 300/1104 features...
INFO:root:Selected 350/1104 features...
INFO:root:Selected 400/1104 features...
INFO:root:Selected 450/1104 features...
INFO:root:Selected 500/1104 features...
INFO:root:Selected 550/1104 features...
INFO:root:Selected 600/1104 features...
INFO:root:Selected 650/1104 features...
INFO:root:Selected 700/1104 features...
INFO:root:Selected 750/1104 features...
INFO:root:Selected 800/1104 features...
INFO:root:Selected 850/1104 features...
INFO:root:Selected 900/1104 features...
INFO:root:Selected 950/1104 features...
INFO:root:Selected 1000/1104 features...
INFO:root:Selected 1050/1104 features...
INFO:root:Selected 1100/1104 features...
INFO:root:Done with target 1/1: exp_gap.
INFO:root:Merging all features...
INF

0.28246581539031224


INFO:root:Computing optimal features...
INFO:root:Selected 50/1099 features...
INFO:root:Selected 100/1099 features...
INFO:root:Selected 150/1099 features...
INFO:root:Selected 200/1099 features...
INFO:root:Selected 250/1099 features...
INFO:root:Selected 300/1099 features...
INFO:root:Selected 350/1099 features...
INFO:root:Selected 400/1099 features...
INFO:root:Selected 450/1099 features...
INFO:root:Selected 500/1099 features...
INFO:root:Selected 550/1099 features...
INFO:root:Selected 600/1099 features...
INFO:root:Selected 650/1099 features...
INFO:root:Selected 700/1099 features...
INFO:root:Selected 750/1099 features...
INFO:root:Selected 800/1099 features...
INFO:root:Selected 850/1099 features...
INFO:root:Selected 900/1099 features...
INFO:root:Selected 950/1099 features...
INFO:root:Selected 1000/1099 features...
INFO:root:Selected 1050/1099 features...
INFO:root:Done with target 1/1: exp_gap.
INFO:root:Merging all features...
INFO:root:Done.
INFO:root:Training preset #1

0.3626913465290502


INFO:root:Computing optimal features...
INFO:root:Selected 50/1094 features...
INFO:root:Selected 100/1094 features...
INFO:root:Selected 150/1094 features...
INFO:root:Selected 200/1094 features...
INFO:root:Selected 250/1094 features...
INFO:root:Selected 300/1094 features...
INFO:root:Selected 350/1094 features...
INFO:root:Selected 400/1094 features...
INFO:root:Selected 450/1094 features...
INFO:root:Selected 500/1094 features...
INFO:root:Selected 550/1094 features...
INFO:root:Selected 600/1094 features...
INFO:root:Selected 650/1094 features...
INFO:root:Selected 700/1094 features...
INFO:root:Selected 750/1094 features...
INFO:root:Selected 800/1094 features...
INFO:root:Selected 850/1094 features...
INFO:root:Selected 900/1094 features...
INFO:root:Selected 950/1094 features...
INFO:root:Selected 1000/1094 features...
INFO:root:Selected 1050/1094 features...
INFO:root:Done with target 1/1: exp_gap.
INFO:root:Merging all features...
INFO:root:Done.
INFO:root:Training preset #1

0.35961734783657856


In [5]:
# Per-fold MAE values collected by the cross-validation loop.
maes

array([0.42288546, 0.38539356, 0.28246582, 0.36269135, 0.35961735])

In [6]:
# Mean MAE over all folds: the benchmark figure of this notebook.
maes.mean()

0.3626107074815329

#### Conclusion

The mean MAE over the five cross-validation folds, approximately 0.363, is the benchmark value against which the other methods in this repository are compared.