# Reproducing results from the paper (see below)

Band gaps and other from the paper by [Ward et al.](https://www.nature.com/articles/npjcompumats201628#MOESM37)

In [47]:
%matplotlib inline
from matminer.featurizers.base import MultipleFeaturizer
from matminer.featurizers import composition as cf
from matminer.featurizers.conversions import StrToComposition
from matplotlib import pyplot as plt
from matplotlib.colors import LogNorm
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, ShuffleSplit, KFold

import matplotlib.gridspec as gridspec
from datetime import datetime
from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from sklearn.linear_model import ElasticNetCV, ElasticNet, LassoCV,Lasso, RidgeCV, Ridge, LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import matplotlib.pyplot as plt
import scipy.stats as stats
import sklearn.linear_model as linear_model
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

In [48]:
#read file
df = pd.read_csv('/home/dima/Desktop/ML/Data/A general purpose - data/bandgap.data', delimiter = ' ')

In [49]:
df

Unnamed: 0,composition,energy_pa,bandgap
0,Ne,-0.029181,11.91
1,La,-4.804203,0.0
2,Pr,-4.648720,0.0
3,Kr,-0.001324,7.487
4,V,-8.682485,0.0
...,...,...,...
52382,FeB2C6O6F8,-6.662130,5.318
52383,FeRe2H8O12,-6.308626,1.799
52384,Mn2HgC10O10,-7.268038,2.976
52385,PdAu2F8,-3.202736,0.0


There are missing values in the dataframe

In [50]:
for i in range (0,len(df['bandgap'])):
        if df['bandgap'][i] == 'None':
            df['bandgap'][i]=pd.NaT

In [51]:
df.dropna(inplace=True)
df = df.reset_index(drop=True)

Note that among 52387 compounds there are only 25212 unique

In [52]:
len(df['composition'].unique())

25212

In order to be able to make math operation with data, we change the types to float 

In [53]:
df['bandgap'] = pd.to_numeric(df['bandgap'])
df['energy_pa'] = pd.to_numeric(df['energy_pa'])

We take mean values of band gaps and drop duplicated compounds 

In [55]:
%%time
for i in range(0,len(df)):
    p=1
    for k in range(i+1,len(df)):
        if df['composition'][i]==df['composition'][k]:
            df['bandgap'][i] += df['bandgap'][k]
            df['energy_pa'][i] += df['energy_pa'][k]
            p+=1
        if k==(len(df)-1):
            df['bandgap'][i]=df['bandgap'][i]/p
            df['energy_pa'][i]=df['energy_pa'][i]/p
df.drop_duplicates('composition', keep='first', inplace=True)            

CPU times: user 7h 50min 14s, sys: 609 ms, total: 7h 50min 15s
Wall time: 7h 50min 18s


After having dropped the duplicated items, one has to reindex data

In [58]:
df.index = np.arange(len(df))

Save the data without duplicated compounds

In [62]:
df.to_csv('band_gaps_wald_without_duplicates.csv')

## Feauture generating

In [65]:
data = pd.read_csv("/home/dima/Desktop/ML/Data/Reserve/band_gaps_wald_without_duplicates.csv")

In [66]:
data.head()

Unnamed: 0.1,Unnamed: 0,composition,energy_pa,bandgap
0,0,Ne,0.549757,14.897022
1,1,La,-4.879983,0.0
2,2,Pr,-4.704287,0.0
3,3,Kr,0.042789,7.49325
4,4,V,-8.810925,0.0


In [67]:
data = StrToComposition(target_col_id='formula').featurize_dataframe(data, 'composition',  ignore_errors=True)

HBox(children=(FloatProgress(value=0.0, description='StrToComposition', max=25212.0, style=ProgressStyle(descr…




In [68]:
data

Unnamed: 0.1,Unnamed: 0,composition,energy_pa,bandgap,formula
0,0,Ne,0.549757,14.897022,(Ne)
1,1,La,-4.879983,0.000000,(La)
2,2,Pr,-4.704287,0.000000,(Pr)
3,3,Kr,0.042789,7.493250,(Kr)
4,4,V,-8.810925,0.000000,(V)
...,...,...,...,...,...
25207,25207,Al2W5O16,-8.715022,0.000000,"(Al, W, O)"
25208,25208,Co5Se4Cl2O12,-5.378454,2.550000,"(Co, Se, Cl, O)"
25209,25209,Co5Se4Br2O12,-5.334756,2.543000,"(Co, Se, Br, O)"
25210,25210,Ba2Ca3Tl2Cu4O12,-5.207109,0.000000,"(Ba, Ca, Tl, Cu, O)"


In [69]:
original_count = len(data)
data.dropna(subset=['formula'], inplace=True)
print('Removed %d/%d entries'%(original_count - len(data), original_count))

Removed 0/25212 entries


We generate exactly those features that were used in the paper  [Ward et al.](https://www.nature.com/articles/npjcompumats201628#MOESM37)

In [70]:
from matminer.featurizers.composition import ElementProperty
feature_calculators_magpie = MultipleFeaturizer([cf.Stoichiometry(), cf.ElementProperty.from_preset("magpie"),
                                          cf.ValenceOrbital(props=['avg']), cf.IonProperty(fast=True)])
feature_labels_magpie = feature_calculators_magpie.feature_labels()
df_magpie = feature_calculators_magpie.featurize_dataframe(data, col_id='formula', ignore_errors=True)

HBox(children=(FloatProgress(value=0.0, description='MultipleFeaturizer', max=25212.0, style=ProgressStyle(des…




In [77]:
df_magpie.head(290)

Unnamed: 0.1,Unnamed: 0,composition,energy_pa,bandgap,formula,0-norm,2-norm,3-norm,5-norm,7-norm,...,MagpieData mean GSmagmom,MagpieData avg_dev GSmagmom,MagpieData mode GSmagmom,avg s valence electrons,avg p valence electrons,avg d valence electrons,avg f valence electrons,compound possible,max ionic char,avg ionic char
0,0,Ne,0.549757,14.897022,(Ne),1,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,2.0,6.0,0.0,0.0,True,0.000000,0.000000
1,1,La,-4.879983,0.000000,(La),1,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,2.0,0.0,1.0,0.0,True,0.000000,0.000000
2,2,Pr,-4.704287,0.000000,(Pr),1,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,2.0,0.0,0.0,3.0,True,0.000000,0.000000
3,3,Kr,0.042789,7.493250,(Kr),1,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,2.0,6.0,10.0,0.0,True,0.000000,0.000000
4,4,V,-8.810925,0.000000,(V),1,1.000000,1.000000,1.000000,1.000000,...,0.0,0.0,0.0,2.0,0.0,3.0,0.0,True,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,285,TlCl,-3.090180,2.425125,"(Tl, Cl)",2,0.707107,0.629961,0.574349,0.552045,...,0.0,0.0,0.0,2.0,3.0,5.0,7.0,True,0.447278,0.111819
286,286,LiCl,-3.679026,6.907500,"(Li, Cl)",2,0.707107,0.629961,0.574349,0.552045,...,0.0,0.0,0.0,1.5,2.5,0.0,0.0,True,0.695200,0.173800
287,287,InAs,-3.783298,0.080500,"(In, As)",2,0.707107,0.629961,0.574349,0.552045,...,0.0,0.0,0.0,2.0,2.0,10.0,0.0,True,0.039211,0.009803
288,288,LiBr,-3.311030,5.440500,"(Li, Br)",2,0.707107,0.629961,0.574349,0.552045,...,0.0,0.0,0.0,1.5,2.5,5.0,0.0,True,0.624726,0.156182


In [73]:
dropcol_magpie = ['MagpieData minimum SpaceGroupNumber','MagpieData maximum SpaceGroupNumber','MagpieData range SpaceGroupNumber', 'MagpieData mean SpaceGroupNumber', 'MagpieData avg_dev SpaceGroupNumber', 'MagpieData mode SpaceGroupNumber']
df_magpie=df_magpie.drop(columns = dropcol_magpie)
for i in dropcol_magpie:
    feature_labels_magpie.remove(i)  

KeyError: "['MagpieData minimum SpaceGroupNumber'\n 'MagpieData maximum SpaceGroupNumber' 'MagpieData range SpaceGroupNumber'\n 'MagpieData mean SpaceGroupNumber' 'MagpieData avg_dev SpaceGroupNumber'\n 'MagpieData mode SpaceGroupNumber'] not found in axis"

In [74]:
df

Unnamed: 0,composition,energy_pa,bandgap
0,Ne,0.549757,14.897022
1,La,-4.879983,0.000000
2,Pr,-4.704287,0.000000
3,Kr,0.042789,7.493250
4,V,-8.810925,0.000000
...,...,...,...
25207,Al2W5O16,-8.715022,0.000000
25208,Co5Se4Cl2O12,-5.378454,2.550000
25209,Co5Se4Br2O12,-5.334756,2.543000
25210,Ba2Ca3Tl2Cu4O12,-5.207109,0.000000
