In [None]:
import warnings
warnings.filterwarnings("ignore")
import re
import csv
import numpy as np
import pandas as pd
from keras.models import Sequential,load_model
from keras import optimizers, losses,callbacks
from keras.layers import Dense, Dropout
from keras import backend as K
import matplotlib.pyplot as plt

# Load Data

In [None]:
# Element symbols in the fixed order used to index the composition vectors.
# Note that the list jumps straight from 'Bi' to 'Ac' and stops at 'Pu'.
element_list = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca',
    'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr',
    'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn',
    'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
    'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb',
    'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
    'Tl', 'Pb', 'Bi', 'Ac', 'Th', 'Pa', 'U', 'Np', 'Pu',
]
# Read the training and test tables from CSV files located relative to the notebook.
df_train = pd.read_csv(filepath_or_buffer='../Data/oqmd_train.csv')
df_test = pd.read_csv(filepath_or_buffer='../Data/oqmd_test.csv')

## Convert string representation of molecular formula to one-hot encoding

In [None]:
def counts2vector(pairs):
    """Turn (symbol, count) pairs into a per-element count vector.

    Parameters
    ----------
    pairs : iterable
        Items whose first entry is an element symbol present in
        ``element_list`` and whose second entry is convertible with ``int``.

    Returns
    -------
    numpy.ndarray
        Integer array with one slot per entry of ``element_list``; repeated
        symbols are summed.

    Raises
    ------
    ValueError
        If a symbol is not found in ``element_list`` (raised by ``list.index``).
    """
    totals = np.zeros(len(element_list), dtype=int)
    for pair in pairs:
        symbol, amount = pair[0], pair[1]
        slot = element_list.index(symbol)
        totals[slot] += int(amount)
    return totals

def onehot(enc):
    """Expand a per-element count vector into a flat one-hot representation.

    Every element gets 11 slots. Slot 0 is set when the count is zero;
    otherwise the slot whose position equals the count is set, so counts
    must be valid indices into those 11 slots.

    Parameters
    ----------
    enc : sequence of int
        Counts, one per entry of ``element_list``.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(element_list) * 11``.
    """
    n_slots = 11
    rep = np.zeros((len(element_list), n_slots))
    # Start with every element marked as absent.
    rep[:, 0] = 1
    for row, count in enumerate(enc):
        if count == 0:
            continue
        # Move the marker from the "absent" slot to the slot for this count.
        rep[row, 0] = 0
        rep[row, count] = 1
    return rep.flatten()

def encode(df):
    """Add composition-count and one-hot feature columns to ``df`` in place.

    Each string in ``df['comp']`` is scanned for (letters, digits) pairs.
    ``counts2vector`` turns the pairs into a per-element count vector, stored
    in a new ``'encoding'`` column, and ``onehot`` expands that vector into
    the new ``'onehot'`` column.

    The pattern is case-insensitive and requires digits after every run of
    letters, so symbols are only separated correctly when each element in the
    formula carries an explicit count.
    NOTE(review): confirm the 'comp' column always follows that format.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'comp'`` column of formula strings. Modified in place.

    Returns
    -------
    None
    """
    # Parse on the column itself so the result keeps df's index. A fresh
    # Series built from the raw values would be indexed 0..n-1 and the
    # assignment below would misalign (producing NaN) for any frame whose
    # index is not the default RangeIndex.
    counts = df['comp'].str.findall(r"([a-z]+)([0-9]+)", flags=re.I)
    df['encoding'] = counts.apply(counts2vector)
    df['onehot'] = df['encoding'].apply(onehot)

In [None]:
# encode() works in place: both frames gain 'encoding' and 'onehot' columns.
encode(df=df_train)
encode(df=df_test)

## Extract the encoding and target values

In [None]:
# Stack the per-row one-hot vectors into 2-D feature matrices (one row per sample).
X_train = np.stack(df_train['onehot'].values, axis=0)
X_test = np.stack(df_test['onehot'].values, axis=0)

# Target matrices; column order is delta_e (0), volume_pa (1), energy_pa (2).
y_train = df_train.loc[:, ['delta_e','volume_pa','energy_pa']].values
y_test = df_test.loc[:, ['delta_e','volume_pa','energy_pa']].values

# Create the layers of the neural network

In [None]:
def make_model(dropout=True, input_dim=979):
    """Build and compile the fully connected regression network.

    The stack is eight Dense layers (1024, 1024, 1024, 512, 128, 64, 32, 1).
    The hidden layers use ReLU and the single output is linear. When
    ``dropout`` is true, Dropout layers (0.7, 0.7, 0.5) are inserted after the
    second, third and fourth Dense layers. The model is compiled with mean
    absolute error loss and an Adam optimizer with default settings, and its
    summary is printed.

    Parameters
    ----------
    dropout : bool, optional
        Whether to insert the Dropout layers. Defaults to True.
    input_dim : int, optional
        Length of one input feature vector. The default of 979 equals
        89 entries in ``element_list`` times the 11 slots per element
        produced by ``onehot``.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    model = Sequential()
    model.add(Dense(1024, input_dim=input_dim, activation='relu'))
    model.add(Dense(1024, activation='relu'))

    if dropout:
        model.add(Dropout(0.7))
    model.add(Dense(1024, activation='relu'))
    if dropout:
        model.add(Dropout(0.7))
    model.add(Dense(512, activation='relu'))
    if dropout:
        model.add(Dropout(0.5))

    model.add(Dense(128, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='linear'))

    # Use the class name `Adam`: the lowercase `adam` alias is not available
    # in newer Keras releases, whereas `Adam` exists in all of them.
    model.compile(loss='mean_absolute_error', optimizer=optimizers.Adam())
    model.summary()
    return model

# Formation Enthalpy Model

## Train with Dropout

In [None]:
# First stage for delta_e: network with the Dropout layers enabled.
model = make_model(dropout=True)

In [None]:
# Cut the learning rate by 10x after 5 epochs without val_loss improvement (floor 1e-5).
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)
# Keep only the best full model (by val_loss) from the dropout stage.
checkpoint = callbacks.ModelCheckpoint(
    filepath='delta_e_intermediate.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

In [None]:
# Train on target column 0 (delta_e); 20% of the training data is held out for validation.
history = model.fit(
    x=X_train,
    y=y_train[:, 0],
    batch_size=256,
    epochs=120,
    validation_split=0.2,
    callbacks=[reduce_lr, checkpoint],
)

## Retrain without dropout

In [None]:
# Second stage: same Dense stack without the Dropout layers, initialised from
# the best checkpoint written during the dropout stage.
model = make_model(dropout=False)
model.load_weights(filepath='delta_e_intermediate.h5')

In [None]:
# Fresh callbacks for the no-dropout stage; same learning-rate schedule as before.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)
# The best full model (by val_loss) of this stage is written to the final file.
checkpoint = callbacks.ModelCheckpoint(
    filepath='delta_e_best_model.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

In [None]:
# Continue training on delta_e (target column 0) without dropout.
history = model.fit(
    x=X_train,
    y=y_train[:, 0],
    batch_size=256,
    epochs=120,
    validation_split=0.2,
    callbacks=[reduce_lr, checkpoint],
)

In [None]:
# Test-set loss for delta_e (the model is compiled with mean absolute error).
# NOTE(review): this scores the weights left in memory by the preceding fit,
# not the checkpoint saved as 'delta_e_best_model.h5' -- confirm that is intended.
model.evaluate(x=X_test, y=y_test[:, 0])

# Volume Per Atom Model

## Train with Dropout

In [None]:
# First stage for volume_pa: network with the Dropout layers enabled.
model = make_model(dropout=True)

In [None]:
# Cut the learning rate by 10x after 5 epochs without val_loss improvement (floor 1e-5).
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)
# Keep only the best full model (by val_loss) from the dropout stage.
checkpoint = callbacks.ModelCheckpoint(
    filepath='volume_pa_intermediate.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

In [None]:
# Train on target column 1 (volume_pa); 20% of the training data is held out for validation.
history = model.fit(
    x=X_train,
    y=y_train[:, 1],
    batch_size=256,
    epochs=120,
    validation_split=0.2,
    callbacks=[reduce_lr, checkpoint],
)

## Retrain without dropout

In [None]:
# Second stage: same Dense stack without the Dropout layers, initialised from
# the best checkpoint written during the dropout stage.
model = make_model(dropout=False)
model.load_weights(filepath='volume_pa_intermediate.h5')

In [None]:
# Fresh callbacks for the no-dropout stage; same learning-rate schedule as before.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)
# The best full model (by val_loss) of this stage is written to the final file.
checkpoint = callbacks.ModelCheckpoint(
    filepath='volume_pa_best_model.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

In [None]:
# Continue training on volume_pa (target column 1) without dropout.
history = model.fit(
    x=X_train,
    y=y_train[:, 1],
    batch_size=256,
    epochs=120,
    validation_split=0.2,
    callbacks=[reduce_lr, checkpoint],
)

In [None]:
# Test-set loss for volume_pa (the model is compiled with mean absolute error).
# NOTE(review): this scores the weights left in memory by the preceding fit,
# not the checkpoint saved as 'volume_pa_best_model.h5' -- confirm that is intended.
model.evaluate(x=X_test, y=y_test[:, 1])

# Energy Per Atom Model

## Train with Dropout

In [None]:
# First stage for energy_pa: network with the Dropout layers enabled.
model = make_model(dropout=True)

In [None]:
# Cut the learning rate by 10x after 5 epochs without val_loss improvement (floor 1e-5).
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)
# Keep only the best full model (by val_loss) from the dropout stage.
checkpoint = callbacks.ModelCheckpoint(
    filepath='energy_pa_intermediate.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

In [None]:
# Train on target column 2 (energy_pa); 20% of the training data is held out for validation.
history = model.fit(
    x=X_train,
    y=y_train[:, 2],
    batch_size=256,
    epochs=120,
    validation_split=0.2,
    callbacks=[reduce_lr, checkpoint],
)

## Retrain without dropout

In [None]:
# Second stage: same Dense stack without the Dropout layers, initialised from
# the best checkpoint written during the dropout stage.
model = make_model(dropout=False)
model.load_weights(filepath='energy_pa_intermediate.h5')

In [None]:
# Fresh callbacks for the no-dropout stage; same learning-rate schedule as before.
reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=5,
    min_lr=1e-5,
    verbose=1,
)
# The best full model (by val_loss) of this stage is written to the final file.
checkpoint = callbacks.ModelCheckpoint(
    filepath='energy_pa_best_model.h5',
    monitor='val_loss',
    mode='auto',
    save_best_only=True,
    save_weights_only=False,
    period=1,
    verbose=1,
)

In [None]:
# Continue training on energy_pa (target column 2) without dropout.
history = model.fit(
    x=X_train,
    y=y_train[:, 2],
    batch_size=256,
    epochs=120,
    validation_split=0.2,
    callbacks=[reduce_lr, checkpoint],
)

In [None]:
# Test-set loss for energy_pa (the model is compiled with mean absolute error).
# NOTE(review): this scores the weights left in memory by the preceding fit,
# not the checkpoint saved as 'energy_pa_best_model.h5' -- confirm that is intended.
model.evaluate(x=X_test, y=y_test[:, 2])