In [10]:
# Imports used to load and prepare the data
import numpy as np
import pickle
import pandas as pd

In [11]:
# Run settings. They are not used by the data preparation itself; they are
# only stored in the output pickle together with the arrays.
nb_runs, nb_iterations = 50, 100

In [12]:
# Load the GCMC results table from disk.
# NOTE: the file carries an unnamed leading column (displayed as 'Unnamed: 0');
# none of the cells below read it.
df = pd.read_csv(filepath_or_buffer='hydrogen_gcmc_data.csv')
# Bare last expression so the notebook renders a preview of the first five rows.
df.head(n=5)

Unnamed: 0,_id,Name,CSD refc.,DB Acr.,GT,Calc. Type,Density,GSA,VSA,VF,PV,LCD,PLD,UG at PS,UV at PS,Source,CSFT,CSA
0,1,DAKXOE_CSD17,DAKXOE,CSD17,Real,Grand Canonical Monte Carlo,0.24,7697.77,1871.97,0.9,3.72,12.42,10.55,13.35,39.89,DOI: 10.1021/acs.chemmater.7b00441,CIF (Not P1 Symmetry),Not open source
1,2,ENAPAL_CSD17,ENAPAL,CSD17,Real,Grand Canonical Monte Carlo,0.48,6108.18,2920.11,0.86,1.8,8.42,7.34,7.36,39.88,DOI: 10.1021/acs.chemmater.7b00441,CIF (Not P1 Symmetry),Not open source
2,3,BIVVEI_CSD17,BIVVEI,CSD17,Real,Grand Canonical Monte Carlo,0.37,6827.24,2544.55,0.89,2.39,11.4,10.01,9.28,39.79,DOI: 10.1021/acs.chemmater.7b00441,CIF (Not P1 Symmetry),Not open source
3,4,str_m3_o20_o20_f0_pcu.sym.1.out,Not applicable,UO,Hypothetical,Grand Canonical Monte Carlo,0.32,5637.21,1811.42,0.88,2.75,16.19,10.93,10.44,39.7,https://doi.org/10.1002/ejic.201600365,CIF (P1 Symmetry),DOI: 10.24435/materialscloud:2018.0016/v2
4,5,mof_5399,Not applicable,ToBaCCo,Hypothetical,Grand Canonical Monte Carlo,0.33,5777.0,1935.0,0.89,2.61,16.3,12.2,9.97,39.66,DOI: 10.1021/acs.cgd.7b00848,CIF (P1 Symmetry),Open Source (https://github.com/tobacco-mofs/t...


In [13]:
# Columns of the table that are used as input features.
features = [
    'Density',
    'GSA',
    'VSA',
    'VF',
    'PV',
    'LCD',
    'PLD',
]
# Show how many input features were selected.
print(len(features))

7


In [14]:
# Target values: the GCMC-calculated quantity for every row of the table.
# Two columns are available for this purpose, 'UG at PS' and 'UV at PS';
# 'UG at PS' is the one used here.
y = df['UG at PS'].to_numpy()
print('Shape of Y: ', y.shape)
print(y)

Shape of Y:  (98694,)
[13.35  7.36  9.28 ...  0.    0.    0.  ]


In [15]:
# Feature matrix: one row per table entry, one column per name in `features`.
# Ask for an explicit float copy instead of `.values`:
#   * `.values` may return a read-only view of the frame's data when pandas
#     Copy-on-Write is active, which would make later in-place edits of `x` fail;
#   * `dtype=float` guarantees that rescaled values in [0, 1] are not truncated
#     should any selected column have been parsed as an integer.
x = df[features].to_numpy(dtype=float, copy=True)
print('Shape of X: ', np.shape(x))
print(x)

Shape of X:  (98694, 7)
[[2.40000e-01 7.69777e+03 1.87197e+03 ... 3.72000e+00 1.24200e+01
  1.05500e+01]
 [4.80000e-01 6.10818e+03 2.92011e+03 ... 1.80000e+00 8.42000e+00
  7.34000e+00]
 [3.70000e-01 6.82724e+03 2.54455e+03 ... 2.39000e+00 1.14000e+01
  1.00100e+01]
 ...
 [1.43000e+00 0.00000e+00 0.00000e+00 ... 2.70000e-01 2.56000e+00
  1.54000e+00]
 [1.67000e+00 0.00000e+00 0.00000e+00 ... 1.70000e-01 2.14000e+00
  1.30000e+00]
 [3.46000e+00 0.00000e+00 0.00000e+00 ... 8.00000e-02 3.52000e+00
  9.50000e-01]]


In [16]:
# We need to normalize all of these values:
# min-max scale every feature column of `x` independently to the range [0, 1].
#
# Work on a fresh float array rather than writing into `x` column by column,
# so the cell also works when `x` is read-only or integer typed.
x = np.array(x, dtype=float)
feature_min = x.min(axis=0)
feature_range = x.max(axis=0) - feature_min
# A constant column has zero range; dividing by it would give 0/0 = NaN.
# Divide by 1 instead so such a column is mapped to 0.0 everywhere.
feature_range[feature_range == 0] = 1.0
x = (x - feature_min) / feature_range

# Report the resulting range of every column as a sanity check.
for i in range(np.shape(x)[1]):
    print("feature", i, " in [", np.min(x[:, i]), ",", np.max(x[:, i]), "]")

print(x)

feature 0  in [ 0.0 , 1.0 ]
feature 1  in [ 0.0 , 1.0 ]
feature 2  in [ 0.0 , 1.0 ]
feature 3  in [ 0.0 , 1.0 ]
feature 4  in [ 0.0 , 1.0 ]
feature 5  in [ 0.0 , 1.0 ]
feature 6  in [ 0.0 , 1.0 ]
[[0.0407767  0.78951487 0.46857588 ... 0.10411419 0.16837523 0.14755245]
 [0.08737864 0.62648    0.73093752 ... 0.05037783 0.11220334 0.10265734]
 [0.06601942 0.70022974 0.63693048 ... 0.06689057 0.1540514  0.14      ]
 ...
 [0.27184466 0.         0.         ... 0.00755668 0.02991153 0.02153846]
 [0.3184466  0.         0.         ... 0.00475791 0.02401348 0.01818182]
 [0.66601942 0.         0.         ... 0.00223901 0.04339278 0.01328671]]


In [17]:
# We need to also assign names to each COF:
# take the 'Name' column as a plain NumPy object array.
# `.values` would return a pandas extension array when the column has a pandas
# string dtype, so the object written to the pickle would depend on the pandas
# version; `to_numpy(dtype=object)` always yields a NumPy ndarray.
names = df['Name'].to_numpy(dtype=object)
print(names)

['DAKXOE_CSD17' 'ENAPAL_CSD17' 'BIVVEI_CSD17' ...
 'str_m9_o22_o22_f0_sra.sym.87.out' 'str_m9_o5_o2_f0_sra.sym.37.out'
 'str_m9_o5_o2_f0_sra.sym.95.out']


In [18]:
# Write the prepared arrays and the run settings to a single pickle file.
with open('hydrogen_input_output.pkl', mode='wb') as out_file:
    payload = {
        'x': x,                          # feature matrix
        'y': y,                          # target values
        'features': features,            # column names matching the columns of x
        'names': names,                  # one name per row
        'nb_COFs': np.size(y),           # number of entries in y
        'nb_runs': nb_runs,
        'nb_iterations': nb_iterations,
    }
    pickle.dump(payload, out_file)