### Packages

In [None]:
%pip install matplotlib 
%pip install seaborn
%pip install numpy 
%pip install pandas 
%pip install pycalphad
%pip install tensorflow
%pip install scikit-learn 

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import itertools as itr
import tensorflow as tf
from tensorflow import keras
from sklearn import preprocessing
from pycalphad import Database

In [None]:
# Open the thermodynamic database and collect the unique names of the phases it declares.
db = Database('')
phase_names = list(db.phases.keys())
main_phases = np.unique(phase_names)
main_phases

### Initial work with data

In [2]:
# Column layout shared by the raw (header-less) CSV exports.
raw_columns = ['t', 'materials', 'conc', 'G', 'NP', 'Phases']
csv_paths = ('test.csv', 'test.csv')

# Read every export with the same schema and stack them into one frame.
data = pd.concat([pd.read_csv(path, sep=',', names=raw_columns) for path in csv_paths])
data.head(20)

In [3]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns of `data`.
data.describe()

Unnamed: 0,t,G
count,9445200.0,9289152.0
mean,1648.0,-121291.5
std,785.1752,82659.72
min,298.0,-451785.4
25%,973.0,-177296.7
50%,1648.0,-107891.4
75%,2323.0,-52284.21
max,2998.0,23371.54


In [None]:
# Optional step: in case there are data on phases with dependent Gibbs energy,
# append the three encoded-phase columns side by side with the main frame.
phase_columns = ['Enc_ph_1', 'Enc_ph_2', 'Enc_ph_3']
enc_phs = pd.read_csv('test.csv', sep=',', names=phase_columns)
# The index is rebuilt first so that the column-wise concat aligns rows by position.
data = pd.concat([data.reset_index(drop=True), enc_phs], axis=1)
data.tail(50)

In [None]:
# Remove duplicated rows, renumber, then discard every row that still has a missing value.
data = data.drop_duplicates().reset_index(drop=True).dropna(axis=0)
print(data.shape)
print(data.head(10))

# Element symbols; the position of a symbol in this tuple fixes its feature column later on.
metals = (
    'LI', 'BE', 'NA', 'MG', 'AL', 'K', 'CA', 'SC', 'TI', 'V', 'CR', 'MN', 'FE',
    'CO', 'NI', 'CU', 'ZN', 'GA', 'Y', 'ZR', 'NB', 'MO', 'TC',
    'RH', 'PD', 'AG', 'CD', 'IN', 'SN', 'BA', 'LA', 'CE', 'PR', 'ND', 'PM', 'SM', 'EU',
    'GD', 'TB', 'DY', 'HO', 'ER', 'HF', 'TA', 'W', 'RE', 'OS', 'IR',
    'PT', 'AU', 'HG', 'TL', 'PB', 'BI', 'C', 'SI',
)

(9283784, 9)
      t     materials                   conc            G  \
1   298  ('LI', 'BE')                 [0.05] -8525.898817   
2   298  ('LI', 'BE')  [0.07500000000000001] -8379.036561   
3   298  ('LI', 'BE')                  [0.1] -8232.174305   
4   298  ('LI', 'BE')                [0.125] -8085.312049   
5   298  ('LI', 'BE')  [0.15000000000000002] -7938.449794   
6   298  ('LI', 'BE')  [0.17500000000000002] -7791.587538   
7   298  ('LI', 'BE')                  [0.2] -7644.725283   
8   298  ('LI', 'BE')                [0.225] -7497.863027   
9   298  ('LI', 'BE')                 [0.25] -7351.000771   
10  298  ('LI', 'BE')                [0.275] -7204.138516   

                                                NP  \
1   [0.07734289226638622, 0.9226571077335307, nan]   
2    [0.1546857846722567, 0.8453142153270212, nan]   
3    [0.2320286768280203, 0.7679713231721393, nan]   
4   [0.30937156948554245, 0.6906284305195832, nan]   
5    [0.38671446130762044, 0.613285538692424,

In [None]:
# Show the rows whose G value is exactly zero.
data.loc[data['G'] == 0]

### Preprocessing data for training the neural networks

In [None]:
# Add an empty label to the phase names before fitting the encoder.
main_phases = np.append(main_phases, '')
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(main_phases)
# sorted() returns the ordered list; list.sort() sorts in place and returns None,
# so printing its result only ever showed "None".
print(sorted(label_encoder.classes_, reverse=True))

t = np.asarray(data['t'])
n_rows = len(data)
# Object arrays: one tuple (or one vector) per data row.
materials = np.zeros(shape=n_rows, dtype=tuple)
concentration = np.zeros(shape=n_rows, dtype=tuple)
phases_array = np.zeros(shape=n_rows, dtype=np.ndarray)

# "('LI', 'BE')" -> ('LI', 'BE')
for row, item in enumerate(data['materials']):
    materials[row] = tuple(item.replace('(', '').replace(')', '').replace("'", '').replace(" ", '').split(","))

# "[0.05]" -> (0.95, 0.05): the stored value belongs to the second element,
# the first element receives the remainder.
for row, item in enumerate(data['conc']):
    second_fraction = float(item.replace('[', '').replace(']', '').replace("'", ''))
    # np.round replaces np.round_, which was removed in NumPy 2.0.
    concentration[row] = (np.round(1 - second_fraction, 3), np.round(second_fraction, 3))

target_nn_classes = 0

if 'Enc_ph_1' in data.columns:
    n_classes = len(label_encoder.classes_)
    # Positional NumPy view of the encoded phases: `data` no longer has a contiguous
    # index after dropna, so label-based row access would be unsafe here.
    encoded_phases = data[['Enc_ph_1', 'Enc_ph_2', 'Enc_ph_3']].astype(int).to_numpy()
    for row, phase_codes in enumerate(encoded_phases):
        # Multi-hot vector built from the phases of THIS row only. Indexing with the
        # whole column (as before) gave every row the same vector and was quadratic.
        classes = np.zeros(shape=n_classes, dtype=int)
        classes[phase_codes] = 1
        phases_array[row] = classes
    target_nn_classes = np.asarray(phases_array)

In [5]:
# Feature matrix: column 0 holds t, columns 1.. hold one concentration slot per entry of `metals`.
n_samples = len(data['materials'])
training_nn_numeric_rows = np.zeros(shape=(n_samples, 1 + len(metals)))
target_nn_rows = np.asarray(data['G'])

for row, fractions in enumerate(concentration):
    training_nn_numeric_rows[row, 0] = t[row]
    # +1 skips the t column.
    first_col = metals.index(materials[row][0]) + 1
    second_col = metals.index(materials[row][1]) + 1
    training_nn_numeric_rows[row, first_col] = fractions[0]
    training_nn_numeric_rows[row, second_col] = fractions[1]

Validation on ternary systems data

In [None]:
# Free the parsed binary-system arrays before loading the ternary validation set.
del materials, concentration, t

data = pd.read_csv('test.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases'], engine="python", encoding='utf-8', on_bad_lines='warn')
# NOTE(review): missing G values are replaced by 0 rather than dropped — confirm this is intended.
data['G'] = data['G'].fillna(0)
data = data[0:1500000]
del data['NP'], data['Phases']
print(data)

valid_target_nn_rows = np.asarray(data['G'])

# Element symbols; the position of a symbol in this tuple fixes its feature column.
metals = (
        'LI','BE','NA','MG','AL','K','CA','SC','TI','V','CR','MN','FE',
        'CO','NI','CU','ZN','GA','Y','ZR','NB','MO','TC',
        'RH','PD','AG','CD','IN','SN','BA','LA','CE','PR','ND','PM','SM','EU',
        'GD','TB','DY','HO','ER','HF','TA','W','RE','OS','IR',
        'PT','AU','HG','TL','PB','BI', 'C', "SI")

# np.round replaces np.round_, which was removed in NumPy 2.0.
t = np.round(np.asarray(data['t']), 2)
n_rows = len(data['materials'])
materials = np.zeros(shape=n_rows, dtype=tuple)
concentration = np.zeros(shape=n_rows, dtype=tuple)

# "('MG', 'TI', 'ZN')" -> ('MG', 'TI', 'ZN')
for row, item in enumerate(data['materials']):
    materials[row] = tuple(item.replace('(','').replace(')','').replace("'",'').replace(" ",'').split(","))

# "[x2, x3]" -> (1 - x2 - x3, x2, x3), each rounded to one decimal.
for row, item in enumerate(data['conc']):
    parts = item.replace('[','').replace(']','').replace("'",'').split(',')
    x2, x3 = float(parts[0]), float(parts[1])
    concentration[row] = (np.round(1 - (x2 + x3), 1), np.round(x2, 1), np.round(x3, 1))

# Column 0 holds t, columns 1.. hold one concentration slot per entry of `metals`.
training_nn_numeric_rows_valid = np.zeros(shape=(n_rows, (1+len(metals))))

for row, fractions in enumerate(concentration):
    training_nn_numeric_rows_valid[row][0] = t[row]
    f_item_id = metals.index(materials[row][0])
    s_item_id = metals.index(materials[row][1])
    th_item_id = metals.index(materials[row][2])
    training_nn_numeric_rows_valid[row][f_item_id+1] = fractions[0]
    training_nn_numeric_rows_valid[row][s_item_id+1] = fractions[1]
    training_nn_numeric_rows_valid[row][th_item_id+1] = fractions[2]

del data

# DNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import tensorflow.keras.backend as K

def r2_score(y_true, y_pred):
    """Keras metric: coefficient of determination, 1 - SS_res / SS_tot.

    K.epsilon() is added to the denominator to avoid division by zero when
    all targets in a batch are equal.
    """
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

n_features = training_nn_numeric_rows.shape[1]
hidden_units = n_features * 2

# Three equally wide hidden layers followed by a single linear output (regression on G).
# 'relu' is the documented Keras activation identifier; the layer-class spelling 'ReLU'
# is not guaranteed to resolve as an activation string.
model = Sequential()
model.add(Dense(hidden_units, activation='relu', kernel_initializer='random_normal', bias_initializer='zeros'))
model.add(Dropout(0.1))
model.add(Dense(hidden_units, activation='relu', kernel_initializer='random_normal', bias_initializer='zeros'))
model.add(Dropout(0.1))
model.add(Dense(hidden_units, activation='relu', kernel_initializer='random_normal', bias_initializer='zeros'))
model.add(Dense(1, activation="linear"))

# Leave the batch dimension unspecified instead of pinning it to the number of training rows,
# so the model accepts any batch size at fit/predict time.
model.build(input_shape=(None, n_features))
model.summary()

lr = 0.0001
optimizer = keras.optimizers.Adam(learning_rate=lr)

model.compile(loss="mean_squared_error", metrics=['mae', r2_score], optimizer=optimizer)
hist = model.fit(x=training_nn_numeric_rows, y=target_nn_rows, epochs=15, shuffle=True, validation_split=0.2)
# NOTE(review): the CNN cell saves to the same file name and will overwrite this model.
model.save('test_model.h5')

In [None]:
# Training vs. validation loss per epoch, taken from the history of the last fit.
for history_key in ('loss', 'val_loss'):
    plt.plot(hist.history[history_key])
plt.legend(['Train', 'Validation'], loc='upper left')
plt.xlabel('Epoch')
plt.ylabel('Loss, MSE')
plt.title('DNN model loss')
plt.savefig('dnn_15ep_0.2valid.png', dpi=1000)
plt.show()

### Creating synthetic data for performance testing

In [None]:
import random as rand
from numpy.random import dirichlet as dr

n_synthetic = 1000000   # number of synthetic input rows
n_components = 12       # number of leading element columns that receive a non-zero fraction

# Same layout as the training matrix: column 0 is t, the rest are per-element fractions.
syntetic_pred_lst = np.zeros(shape=(n_synthetic, (1+len(metals))))

# Integer temperatures in [299, 3000]; numpy's upper bound is exclusive, hence 3001
# (random.randint, used previously, includes both ends).
syntetic_pred_lst[:, 0] = np.random.randint(299, 3001, size=n_synthetic)
# One flat-Dirichlet draw per row: n_components fractions that sum to 1.
# Drawn in a single vectorized call instead of a million-iteration Python loop.
syntetic_pred_lst[:, 1:1 + n_components] = dr(np.ones(n_components), size=n_synthetic)

syntetic_pred_lst


In [None]:
from keras.models import load_model

def r2_score(y_true, y_pred):
    """Keras metric: coefficient of determination, 1 - SS_res / SS_tot (epsilon-guarded)."""
    residual_ss = K.sum(K.square(y_true - y_pred))
    total_ss = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - residual_ss / (total_ss + K.epsilon())

# The saved model references the custom metric by name, so it has to be supplied on load.
dependencies = {'r2_score': r2_score}
test = load_model('test_model.h5', custom_objects=dependencies)
test.predict(syntetic_pred_lst)

### CNN

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization, LayerNormalization, Conv1D, Flatten, MaxPooling1D
import tensorflow.keras.backend as K

def r2_score(y_true, y_pred):
    """Keras metric: coefficient of determination, 1 - SS_res / SS_tot.

    K.epsilon() is added to the denominator to avoid division by zero when
    all targets in a batch are equal.
    """
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# Conv1D expects (samples, steps, channels): add a trailing channel axis of size 1.
training_nn_rows = training_nn_numeric_rows.reshape(training_nn_numeric_rows.shape[0], training_nn_numeric_rows.shape[1], 1)
print(training_nn_rows.shape)

# 'relu' is the documented Keras activation identifier; the layer-class spelling 'ReLU'
# is not guaranteed to resolve as an activation string.
model = Sequential()
model.add(Conv1D(32, 2, activation="relu", strides=1, padding="same"))
model.add(MaxPooling1D(pool_size=2, strides=1, padding='valid'))
model.add(Dropout(0.1))
model.add(Conv1D(16, 2, activation="relu", strides=1, padding="same"))
model.add(Flatten())
model.add(Dense(32, activation='relu', kernel_initializer='random_normal', bias_initializer='zeros'))
model.add(Dropout(0.1))
model.add(Dense(16, activation='relu', kernel_initializer='random_normal', bias_initializer='zeros'))
model.add(Dense(8, activation='relu', kernel_initializer='random_normal', bias_initializer='zeros'))
model.add(Dense(1, activation="linear"))

# Leave the batch dimension unspecified instead of pinning it to the number of training rows,
# so the model accepts any batch size at fit/predict time.
model.build(input_shape=(None, training_nn_rows.shape[1], 1))
model.summary()

lr = 0.0001
optimizer = keras.optimizers.Adam(learning_rate=lr)

model.compile(loss="mean_squared_error", metrics=['mae', r2_score], optimizer=optimizer)
# optimal epochs - 5
hist = model.fit(x=training_nn_rows, y=target_nn_rows, epochs=8, shuffle=True, validation_split=0.2)
# NOTE(review): same file name as the DNN cell — this overwrites the previously saved model.
model.save('test_model.h5')

In [None]:
# Not an actual chart, just an example of model overfitting at 8 epochs:
# training vs. validation loss per epoch from the history of the last fit.
for history_key in ('loss', 'val_loss'):
    plt.plot(hist.history[history_key])
plt.legend(['Train', 'Validation'], loc='upper left')
plt.xlabel('Epoch')
plt.ylabel('Loss, MSE')
plt.title('CNN model loss')
plt.savefig('cnn_15ep_0.2valid_8ep.png', dpi=1000)
plt.show()

### Model validation

### Binary

In [None]:
from keras.models import load_model
import tensorflow.keras.backend as K
from sklearn.metrics import r2_score as r2s, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

def r2_score(y_true, y_pred):
    """Keras metric: coefficient of determination, 1 - SS_res / SS_tot (epsilon-guarded)."""
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# The saved models reference the custom metric by name, so it has to be supplied on load.
dependencies = {
    'r2_score': r2_score
}
model_cnn = load_model('../models/cnn_model_5ep_0.2val_final.h5', custom_objects=dependencies)
model_dnn = load_model('../models/dnn_model_15ep_0.2val_final.h5', custom_objects=dependencies)


data = pd.concat(
    [pd.read_csv('../test.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases']),
    pd.read_csv('../test.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases'])
    ])
data = data.dropna(axis=0)
del data['NP'], data['Phases']
# Keep a single binary system for the validation plots.
data = data[data['materials'] == "('BE', 'FE')"]

valid_target_nn_rows = np.asarray(data['G'])

# Element symbols; the position of a symbol in this tuple fixes its feature column.
metals = (
        'LI','BE','NA','MG','AL','K','CA','SC','TI','V','CR','MN','FE',
        'CO','NI','CU','ZN','GA','Y','ZR','NB','MO','TC',
        'RH','PD','AG','CD','IN','SN','BA','LA','CE','PR','ND','PM','SM','EU',
        'GD','TB','DY','HO','ER','HF','TA','W','RE','OS','IR',
        'PT','AU','HG','TL','PB','BI', 'C', "SI")

t = np.asarray(data['t'])
n_rows = len(data['materials'])
materials = np.zeros(shape=n_rows, dtype=tuple)
concentration = np.zeros(shape=n_rows, dtype=tuple)

# "('BE', 'FE')" -> ('BE', 'FE')
for row, item in enumerate(data['materials']):
    materials[row] = tuple(item.replace('(','').replace(')','').replace("'",'').replace(" ",'').split(","))

# "[x]" -> (1 - x, x); np.round replaces np.round_, which was removed in NumPy 2.0.
for row, item in enumerate(data['conc']):
    second_fraction = float(item.replace('[','').replace(']','').replace("'",''))
    concentration[row] = (np.round(1 - second_fraction, 3), np.round(second_fraction, 3))


# Column 0 holds t, columns 1.. hold one concentration slot per entry of `metals`.
training_nn_numeric_rows_valid = np.zeros(shape=(n_rows, (1+len(metals))))

for row, fractions in enumerate(concentration):
    training_nn_numeric_rows_valid[row][0] = t[row]
    f_item_id = metals.index(materials[row][0])
    s_item_id = metals.index(materials[row][1])
    training_nn_numeric_rows_valid[row][f_item_id+1] = fractions[0]
    training_nn_numeric_rows_valid[row][s_item_id+1] = fractions[1]

valid_predict_dnn = model_dnn.predict(x=training_nn_numeric_rows_valid)
# NOTE(review): the CNN above was trained on input with an extra trailing channel axis;
# confirm the loaded CNN accepts this 2-D matrix as is.
valid_predict_cnn = model_cnn.predict(x=training_nn_numeric_rows_valid)

In [None]:
# CNN regression metrics on the validation rows, printed in the order R2, MSE, RMSE, MAE, MAPE.
mse_cnn = mean_squared_error(valid_target_nn_rows, valid_predict_cnn)
print(r2s(valid_target_nn_rows, valid_predict_cnn))
print(mse_cnn)
print(np.sqrt(mse_cnn))
print(mean_absolute_error(valid_target_nn_rows, valid_predict_cnn))
print(mean_absolute_percentage_error(valid_target_nn_rows, valid_predict_cnn))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")


# Predicted vs. original G against temperature for the selected binary system (DNN).
f, ax = plt.subplots(figsize=(10, 10), dpi=1000)
ax.set_yticks(range(298, 3000, 325))
# x ticks run from 0 down to the smallest G value among targets and predictions.
ax.set_xticks(range(0, int(np.minimum(np.min(valid_target_nn_rows), np.min(valid_predict_dnn))), -25000))
ax.set_ylabel('T, K', fontsize=20)
ax.set_xlabel('G, J', fontsize=20)
# The validation data is filtered to ('BE', 'FE'), so the title names that system
# (it previously read "Na-V").
f.suptitle('Be-Fe system (DNN)', fontsize=20)
sns.scatterplot(x=valid_predict_dnn.flatten(), y=t, linewidth=0, color="#F20587", marker='s', s = 100, ax=ax)
sns.scatterplot(x=valid_target_nn_rows.flatten(), y=t, linewidth=0, color="#6AFC98", ax=ax,  s = 100, marker='s')
plt.legend(loc='lower left', labels=['Predicted values', 'Original values'], fontsize=15, markerscale=2)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.savefig('Figure 11 - dnn')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")

# Predicted vs. original G against temperature for the selected binary system (CNN).
# Smallest G value among targets and predictions: lower end of the x ticks.
g_floor = int(np.minimum(np.min(valid_target_nn_rows), np.min(valid_predict_cnn)))
square_markers = dict(linewidth=0, marker='s', s=100)

f, ax = plt.subplots(figsize=(12, 10), dpi=300)
f.suptitle('Be-Fe system (CNN)', fontsize=20)
ax.set_xlabel('G, J', fontsize=20)
ax.set_ylabel('T, K', fontsize=20)
ax.set_xticks(range(0, g_floor, -25000))
ax.set_yticks(range(298, 3000, 325))
sns.scatterplot(x=valid_predict_cnn.flatten(), y=t, color="#0099DD", ax=ax, **square_markers)
sns.scatterplot(x=valid_target_nn_rows.flatten(), y=t, color="#FF9933", ax=ax, **square_markers)
plt.legend(loc='lower left', labels=['Predicted values', 'Original values'], fontsize=15, markerscale=2)
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.savefig('../images/cnn_binary')
plt.show()

In [None]:
# DNN regression metrics on the binary validation rows: R2, MSE, RMSE, MAE, MAPE.
# `valid_predict_nn_rows` is not defined at this point of a top-to-bottom run (NameError);
# the CNN metrics are printed in an earlier cell, so this one reports the DNN predictions.
# NOTE(review): confirm the DNN is the intended model here.
mse_dnn = mean_squared_error(valid_target_nn_rows, valid_predict_dnn)
print(r2s(valid_target_nn_rows, valid_predict_dnn))
print(mse_dnn)
print(np.sqrt(mse_dnn))
print(mean_absolute_error(valid_target_nn_rows, valid_predict_dnn))
print(mean_absolute_percentage_error(valid_target_nn_rows, valid_predict_dnn))

### Ternary

In [None]:
from keras.models import load_model
import tensorflow.keras.backend as K
from sklearn.metrics import r2_score as r2s, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

def r2_score(y_true, y_pred):
    """Keras metric: coefficient of determination, 1 - SS_res / SS_tot (epsilon-guarded)."""
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# The saved models reference the custom metric by name, so it has to be supplied on load.
dependencies = {
    'r2_score': r2_score
}
model_cnn = load_model('../models/cnn_model_5ep_0.2val_final.h5', custom_objects=dependencies)
model_dnn = load_model('../models/dnn_model_15ep_0.2val_final.h5', custom_objects=dependencies)

data = pd.read_csv('../test.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases'], engine="python", encoding='utf-8', on_bad_lines='warn')
data = data.dropna(axis=0)
del data['NP'], data['Phases']
# Keep a single ternary system for the validation plots.
data = data[data['materials'] == "('MG', 'TI', 'ZN')"]
print(data)

valid_target_nn_rows = np.asarray(data['G'])

# Element symbols; the position of a symbol in this tuple fixes its feature column.
metals = (
        'LI','BE','NA','MG','AL','K','CA','SC','TI','V','CR','MN','FE',
        'CO','NI','CU','ZN','GA','Y','ZR','NB','MO','TC',
        'RH','PD','AG','CD','IN','SN','BA','LA','CE','PR','ND','PM','SM','EU',
        'GD','TB','DY','HO','ER','HF','TA','W','RE','OS','IR',
        'PT','AU','HG','TL','PB','BI', 'C', "SI")

# np.round replaces np.round_, which was removed in NumPy 2.0.
t = np.round(np.asarray(data['t']), 2)
n_rows = len(data['materials'])
materials = np.zeros(shape=n_rows, dtype=tuple)
concentration = np.zeros(shape=n_rows, dtype=tuple)

# "('MG', 'TI', 'ZN')" -> ('MG', 'TI', 'ZN')
for row, item in enumerate(data['materials']):
    materials[row] = tuple(item.replace('(','').replace(')','').replace("'",'').replace(" ",'').split(","))

# "[x2, x3]" -> (1 - x2 - x3, x2, x3), each rounded to one decimal.
for row, item in enumerate(data['conc']):
    parts = item.replace('[','').replace(']','').replace("'",'').split(',')
    x2, x3 = float(parts[0]), float(parts[1])
    concentration[row] = (np.round(1 - (x2 + x3), 1), np.round(x2, 1), np.round(x3, 1))


# Column 0 holds t, columns 1.. hold one concentration slot per entry of `metals`.
training_nn_numeric_rows_valid = np.zeros(shape=(n_rows, (1+len(metals))))

for row, fractions in enumerate(concentration):
    training_nn_numeric_rows_valid[row][0] = t[row]
    f_item_id = metals.index(materials[row][0])
    s_item_id = metals.index(materials[row][1])
    th_item_id = metals.index(materials[row][2])
    training_nn_numeric_rows_valid[row][f_item_id+1] = fractions[0]
    training_nn_numeric_rows_valid[row][s_item_id+1] = fractions[1]
    training_nn_numeric_rows_valid[row][th_item_id+1] = fractions[2]

valid_predict_dnn = model_dnn.predict(x=training_nn_numeric_rows_valid)
valid_predict_cnn = model_cnn.predict(x=training_nn_numeric_rows_valid)


In [None]:
# DNN regression metrics on the validation rows, printed in the order R2, MSE, RMSE, MAE, MAPE.
mse_dnn = mean_squared_error(valid_target_nn_rows, valid_predict_dnn)
print(r2s(valid_target_nn_rows, valid_predict_dnn))
print(mse_dnn)
print(np.sqrt(mse_dnn))
print(mean_absolute_error(valid_target_nn_rows, valid_predict_dnn))
print(mean_absolute_percentage_error(valid_target_nn_rows, valid_predict_dnn))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")


# Predicted vs. original G against temperature for the selected ternary system (DNN).
f, ax = plt.subplots(figsize=(35, 25), dpi=300)
ax.set_yticks(range(298, 3000, 200))
# x ticks run from 0 down to the smallest G value among targets and predictions.
ax.set_xticks(range(0, int(np.minimum(np.min(valid_target_nn_rows), np.min(valid_predict_dnn))), -25000))
ax.set_ylabel('T, K', fontsize=40)
ax.set_xlabel('G, J', fontsize=40)
# The validation data is filtered to ('MG', 'TI', 'ZN'), so the title names that system
# (it previously read "Mg-Al-ZN").
f.suptitle('Mg-Ti-Zn system (DNN)', fontsize=45)
sns.scatterplot(x=valid_predict_dnn.flatten(), y=t, linewidth=0, color="#0099DD", marker='s', s = 100, ax=ax)
sns.scatterplot(x=valid_target_nn_rows.flatten(), y=t, linewidth=0, color="#FF9933", ax=ax,  s = 100, marker='s')
plt.legend(loc='lower left', labels=['Predicted values', 'Original values'], fontsize=40, markerscale=3)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.savefig('../images/dnn_ternary')
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")


# Predicted vs. original G against temperature for the selected ternary system (CNN).
f, ax = plt.subplots(figsize=(35, 25), dpi=600)
ax.set_yticks(range(298, 3000, 200))
# x ticks run from 0 down to the smallest G value among targets and predictions.
ax.set_xticks(range(0, int(np.minimum(np.min(valid_target_nn_rows), np.min(valid_predict_cnn))), -25000))
ax.set_ylabel('T, K', fontsize=40)
ax.set_xlabel('G, J', fontsize=40)
# The validation data is filtered to ('MG', 'TI', 'ZN'), so the title names that system
# (it previously read "Mg-Sc-Ta").
f.suptitle('Mg-Ti-Zn system (CNN)', fontsize=45)
sns.scatterplot(x=valid_predict_cnn.flatten(), y=t, linewidth=0, color="#F20587", marker='s', s = 100, ax=ax)
sns.scatterplot(x=valid_target_nn_rows.flatten(), y=t, linewidth=0, color="#6AFC98", ax=ax,  s = 100, marker='s')
plt.legend(loc='lower left', labels=['Predicted values', 'Original values'], fontsize=40, markerscale=3)
plt.xticks(fontsize=30)
plt.yticks(fontsize=30)
plt.savefig('Figure 12 - cnn')
plt.show()

In [None]:
# CNN regression metrics on the ternary validation rows: R2, MSE, RMSE, MAE, MAPE.
# `valid_predict_nn_rows` is not defined at this point of a top-to-bottom run (NameError);
# the DNN metrics are printed in an earlier cell, so this one reports the CNN predictions.
# NOTE(review): confirm the CNN is the intended model here.
mse_cnn = mean_squared_error(valid_target_nn_rows, valid_predict_cnn)
print(r2s(valid_target_nn_rows, valid_predict_cnn))
print(mse_cnn)
print(np.sqrt(mse_cnn))
print(mean_absolute_error(valid_target_nn_rows, valid_predict_cnn))
print(mean_absolute_percentage_error(valid_target_nn_rows, valid_predict_cnn))

Comparison of the DNN and CNN models on binary systems data


In [None]:
from keras.models import load_model
import tensorflow.keras.backend as K
from sklearn.metrics import r2_score as r2s, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

def r2_score(y_true, y_pred):
    """Keras metric: coefficient of determination, 1 - SS_res / SS_tot (epsilon-guarded)."""
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# The saved models reference the custom metric by name, so it has to be supplied on load.
dependencies = {
    'r2_score': r2_score
}
# NOTE(review): other cells load the same files from '../models/' — confirm the working directory.
model_cnn = load_model('models/cnn_model_5ep_0.2val_final.h5', custom_objects=dependencies)
model_dnn = load_model('models/dnn_model_15ep_0.2val_final.h5', custom_objects=dependencies)


data = pd.concat(
    [pd.read_csv('data samples/valid_t300_w0.25.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases']),
    pd.read_csv('data samples/valid_325_w0.18.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases'])])
del data['NP'], data['Phases']

valid_target_nn_rows = np.asarray(data['G'])

# Element symbols; the position of a symbol in this tuple fixes its feature column.
metals = (
        'LI','BE','NA','MG','AL','K','CA','SC','TI','V','CR','MN','FE',
        'CO','NI','CU','ZN','GA','Y','ZR','NB','MO','TC',
        'RH','PD','AG','CD','IN','SN','BA','LA','CE','PR','ND','PM','SM','EU',
        'GD','TB','DY','HO','ER','HF','TA','W','RE','OS','IR',
        'PT','AU','HG','TL','PB','BI', 'C', "SI")

t = np.asarray(data['t'])
n_rows = len(data['materials'])
materials = np.zeros(shape=n_rows, dtype=tuple)
concentration = np.zeros(shape=n_rows, dtype=tuple)

# "('LI', 'BE')" -> ('LI', 'BE')
for row, item in enumerate(data['materials']):
    materials[row] = tuple(item.replace('(','').replace(')','').replace("'",'').replace(" ",'').split(","))

# "[x]" -> (1 - x, x); np.round replaces np.round_, which was removed in NumPy 2.0.
for row, item in enumerate(data['conc']):
    second_fraction = float(item.replace('[','').replace(']','').replace("'",''))
    concentration[row] = (np.round(1 - second_fraction, 3), np.round(second_fraction, 3))


# Column 0 holds t, columns 1.. hold one concentration slot per entry of `metals`.
training_nn_numeric_rows_valid = np.zeros(shape=(n_rows, (1+len(metals))))

for row, fractions in enumerate(concentration):
    training_nn_numeric_rows_valid[row][0] = t[row]
    f_item_id = metals.index(materials[row][0])
    s_item_id = metals.index(materials[row][1])
    training_nn_numeric_rows_valid[row][f_item_id+1] = fractions[0]
    training_nn_numeric_rows_valid[row][s_item_id+1] = fractions[1]

valid_predict_dnn = model_dnn.predict(x=training_nn_numeric_rows_valid)
valid_predict_cnn = model_cnn.predict(x=training_nn_numeric_rows_valid)

In [None]:
# Number of rows for which the DNN produced a prediction.
valid_predict_dnn.shape[0]

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid", {'axes.grid' : False})

# Lower x limit shared by both panels: the smallest G predicted by either model.
# np.round replaces np.round_, which was removed in NumPy 2.0.
max_min = np.round(min(np.min(valid_predict_dnn), np.min(valid_predict_cnn)))

fig = plt.figure(figsize=(50,20), dpi=600)
ax = fig.subplots(1, 2)

# Identical axis setup for the left and right panels.
for panel in ax:
    panel.set_yticks(range(298, 3000, 300))
    panel.set_xticks(range(0, int(max_min), -20000))
    panel.set_xlim(int(max_min), 100)
    panel.tick_params(axis='both', which='both', labelsize=30)
    panel.set_ylabel('T, K', fontsize=40)
    panel.set_xlabel('G, J (1e5)', fontsize=40)
    panel.ticklabel_format(style='sci', axis='x', scilimits=(0,0))

fig.suptitle('On the left DNN, right CNN model', fontsize=60)

# 2-D histograms of predicted G against temperature, one model per panel.
sns.histplot(x=valid_predict_dnn.flatten(), y= t, color="#F20587", bins=20, ax=ax[0])
sns.histplot(x=valid_predict_cnn.flatten(), y= t, color="#6AFC98", bins=20, ax=ax[1])
sns.despine(left = False)

plt.savefig("dnn_cnn.png", dpi=600)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid", {'axes.grid' : False})

# Lower x limit shared by both panels: the smallest G among DNN predictions and original values.
# np.round replaces np.round_, which was removed in NumPy 2.0.
max_min = np.round(min(np.min(valid_predict_dnn), np.min(valid_target_nn_rows)))

fig = plt.figure(figsize=(50,20), dpi=600)
ax = fig.subplots(1, 2)

# Identical axis setup for the left and right panels.
for panel in ax:
    panel.set_yticks(range(298, 3000, 300))
    panel.set_xticks(range(0, int(max_min), -20000))
    panel.set_xlim(int(max_min), 100)
    panel.tick_params(axis='both', which='both', labelsize=30)
    panel.set_ylabel('T, K', fontsize=40)
    panel.set_xlabel('G, J (1e5)', fontsize=40)
    panel.ticklabel_format(style='sci', axis='x', scilimits=(0,0))

fig.suptitle('On the left, the original values, right DNN model', fontsize=60)

# 2-D histograms of G against temperature: original values (left), DNN predictions (right).
sns.histplot(x=valid_target_nn_rows.flatten(), y= t, color="#F20587", bins=20, ax=ax[0])
sns.histplot(x=valid_predict_dnn.flatten(), y= t, color="#2E038C", bins=20, ax=ax[1])
sns.despine(left = False)

plt.savefig("dnn.png", dpi=600)
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style("whitegrid", {'axes.grid' : False})

# Lower x limit shared by both panels: the smallest G among CNN predictions and original values.
# np.round replaces np.round_, which was removed in NumPy 2.0.
max_min = np.round(min(np.min(valid_predict_cnn), np.min(valid_target_nn_rows)))

fig = plt.figure(figsize=(50,20), dpi=600)
ax = fig.subplots(1, 2)

# Identical axis setup for the left and right panels.
for panel in ax:
    panel.set_yticks(range(298, 3000, 300))
    panel.set_xticks(range(0, int(max_min), -20000))
    panel.set_xlim(int(max_min), 100)
    panel.tick_params(axis='both', which='both', labelsize=30)
    panel.set_ylabel('T, K', fontsize=40)
    panel.set_xlabel('G, J (1e5)', fontsize=40)
    panel.ticklabel_format(style='sci', axis='x', scilimits=(0,0))

fig.suptitle('On the left, the original values, right CNN model', fontsize=60)

# 2-D histograms of G against temperature: original values (left), CNN predictions (right).
sns.histplot(x=valid_target_nn_rows.flatten(), y= t, color="#F20587", bins=20, ax=ax[0])
sns.histplot(x=valid_predict_cnn.flatten(), y= t, color="#2E038C", bins=20, ax=ax[1])
sns.despine(left = False)

plt.savefig("cnn.png", dpi=600)
plt.show()

Another validation on ternary data

In [None]:
from keras.models import load_model
import tensorflow.keras.backend as K
from sklearn.metrics import r2_score as r2s, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

def r2_score(y_true, y_pred):
    """Keras metric: coefficient of determination, 1 - SS_res / SS_tot (epsilon-guarded)."""
    SS_res =  K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

# The saved model references the custom metric by name, so it has to be supplied on load.
dependencies = {
    'r2_score': r2_score
}
model = load_model('../models/cnn_model_5ep_0.2val_final.h5', custom_objects=dependencies)

data = pd.concat(
    [pd.read_csv('test.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases'], engine="python", encoding='utf-8', on_bad_lines='warn'),
    pd.read_csv('test.csv', sep=',', names=['t','materials','conc','G','NP', 'Phases'], engine="python", encoding='utf-8', on_bad_lines='warn')])
# NOTE(review): missing G values are replaced by 0 rather than dropped — confirm this is intended,
# since those zeros enter the metrics below.
data['G'] = data['G'].fillna(0)
del data['NP'], data['Phases']
print(data)

valid_target_nn_rows = np.asarray(data['G'])

# Element symbols; the position of a symbol in this tuple fixes its feature column.
metals = (
        'LI','BE','NA','MG','AL','K','CA','SC','TI','V','CR','MN','FE',
        'CO','NI','CU','ZN','GA','Y','ZR','NB','MO','TC',
        'RH','PD','AG','CD','IN','SN','BA','LA','CE','PR','ND','PM','SM','EU',
        'GD','TB','DY','HO','ER','HF','TA','W','RE','OS','IR',
        'PT','AU','HG','TL','PB','BI', 'C', "SI")

# np.round replaces np.round_, which was removed in NumPy 2.0.
t = np.round(np.asarray(data['t']), 2)
n_rows = len(data['materials'])
materials = np.zeros(shape=n_rows, dtype=tuple)
concentration = np.zeros(shape=n_rows, dtype=tuple)

# "('A', 'B', 'C')" -> ('A', 'B', 'C')
for row, item in enumerate(data['materials']):
    materials[row] = tuple(item.replace('(','').replace(')','').replace("'",'').replace(" ",'').split(","))

# "[x2, x3]" -> (1 - x2 - x3, x2, x3), each rounded to one decimal.
for row, item in enumerate(data['conc']):
    parts = item.replace('[','').replace(']','').replace("'",'').split(',')
    x2, x3 = float(parts[0]), float(parts[1])
    concentration[row] = (np.round(1 - (x2 + x3), 1), np.round(x2, 1), np.round(x3, 1))


# Column 0 holds t, columns 1.. hold one concentration slot per entry of `metals`.
training_nn_numeric_rows_valid = np.zeros(shape=(n_rows, (1+len(metals))))

for row, fractions in enumerate(concentration):
    training_nn_numeric_rows_valid[row][0] = t[row]
    f_item_id = metals.index(materials[row][0])
    s_item_id = metals.index(materials[row][1])
    th_item_id = metals.index(materials[row][2])
    training_nn_numeric_rows_valid[row][f_item_id+1] = fractions[0]
    training_nn_numeric_rows_valid[row][s_item_id+1] = fractions[1]
    training_nn_numeric_rows_valid[row][th_item_id+1] = fractions[2]


valid_predict_nn_rows = model.predict(x=training_nn_numeric_rows_valid)

# R2, MSE, MAE, MAPE of the CNN on the ternary rows, then the row count.
print(r2s(valid_target_nn_rows, valid_predict_nn_rows))
print(mean_squared_error(valid_target_nn_rows, valid_predict_nn_rows))
print(mean_absolute_error(valid_target_nn_rows, valid_predict_nn_rows))
print(mean_absolute_percentage_error(valid_target_nn_rows, valid_predict_nn_rows))
print(len(data))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# `dnn_values` was never assigned anywhere in the notebook, so this cell raised NameError
# on a fresh run. Compute the DNN predictions for the same validation matrix the CNN used.
# NOTE(review): assumes the DNN to compare against is the saved 15-epoch model — confirm.
dnn_values = load_model(
    '../models/dnn_model_15ep_0.2val_final.h5', custom_objects=dependencies
).predict(x=training_nn_numeric_rows_valid)

fig = plt.figure(figsize=(50,20), dpi=150)
ax = fig.subplots(1, 2)
ax[0].set_yticks(range(298, 3000, 300))
ax[0].set_xticks(range(0, -400000, -5000))

ax[1].set_yticks(range(298, 3000, 300))
ax[1].set_xticks(range(0, -400000, -5000))

ax[0].tick_params(axis='y', which='major', labelsize=25)
ax[1].tick_params(axis='y', which='major', labelsize=25)
ax[0].set_ylabel('T, K', fontsize=25)

# Title text (Russian): "DNN model on the left, CNN model on the right".
fig.suptitle('Слева DNN модель, справа CNN модель', fontsize=50)

# 2-D histograms of predicted G against temperature: DNN (left), CNN (right).
sns.histplot(x=dnn_values.flatten(), y= t, color="#F20587", bins=20, ax=ax[0])
sns.histplot(x=valid_predict_nn_rows.flatten(), y= t, color="#2E038C", bins=20, ax=ax[1])
sns.despine(left = False)

plt.savefig("DNN_CNN.png")
plt.show()