In [1]:
import pandas as pd
import numpy as np
import re

Loading the csv-files

In [2]:
# Compound table and element lookup table, both read from the working directory.
db = pd.read_csv('./om_info.csv')
dat = pd.read_csv('./datosrahm.csv')

# Element symbol -> atomic number Z, built from the lookup table.
symtoz = dict(zip(dat['Symbol'].values, dat['Z'].values))
# 'D' is registered by hand with Z = 1.
symtoz['D'] = 1

The db dataframe is the complete collection of compounds. The dat dataframe is a support file that relates element symbols with atomic numbers

In [3]:
# NOTE(review): `df` is not defined in any cell visible in this notebook; unless it
# is created elsewhere, this cell raises NameError under Restart & Run All.
# NOTE(review): the table displayed further down shows no 'name' column in db —
# confirm that db['name'] exists in om_info.csv.
# Lookup from the 'cif' values of `df` to the matching 'formula' values.
diccio = {k:v for k,v in zip(df['cif'], df['formula'])}

formulas = list()

for row in range(db.shape[0]):
    
    # The text before the first '_' of db['name'] is parsed as the integer lookup key.
    cif = int(db['name'][row].split('_')[0])
    # Keys missing from `diccio` contribute None.
    formulas += [diccio.get(cif, None)]

# Store the looked-up formulas on db as a new 'formula' column.
db['formula'] = formulas

Now the unit-cell formula of each compound in `db` is converted into a 96x1 matrix: one row per element (row index = atomic number Z - 1), holding the number of atoms of that element within the unit cell.

In [3]:
# Decimal representation: atoms[i, Z - 1, 0] is the number of atoms of the
# element with atomic number Z in the unit cell of compound i.
atoms = np.zeros((db.shape[0], 96, 1))
for row in range(db.shape[0]):
    sample = db['formula_unit_cell'][row]
    for item in sample.split():
        # 'C19.99992' -> ['C', '19', '.', '99992', '']: the first piece is the
        # element symbol, the remaining pieces joined together are the count.
        splitem = re.split(r'(\d+)', item)
        el, ats = splitem[0], ''.join(splitem[1:])
        if el not in symtoz:
            # Fail with the offending token instead of an opaque `None - 1` TypeError.
            raise KeyError('unknown element symbol {!r} in row {}: {!r}'.format(el, row, sample))
        # Accumulate rather than overwrite: 'D' is registered with Z = 1, so two
        # tokens of the same formula can land on the same slot.
        atoms[row, symtoz[el] - 1] += float(ats)

Now we check whether any compound has fractional atom counts (ideally the next cell returns no rows).

In [4]:
# Select the compounds with at least one non-integer atom count. Each entry is
# tested on its own (instead of the per-compound total) so that fractional
# parts which happen to add up to a whole number, e.g. 0.5 + 0.5, are not missed.
has_fractional_count = (atoms % 1 != 0).any(axis=(1, 2))
db.iloc[has_fractional_count, :]

Unnamed: 0,cif,a,b,c,alpha,beta,gamma,formula_unit_cell,formula_om
5881,2010797,7.6940,8.0390,14.1300,90.0,90.0000,90.0,H24 C19.99992 I4 N3.99984 Cl8,C5 H6 Cl2 I N
7166,2017128,17.5480,17.5480,7.8980,90.0,90.0000,120.0,Cu12 H89.99991 C45 N30 Cl18 O3,C15 H30 Cl6 Cu4 N10 O
8215,2103906,7.2982,9.3593,29.2570,90.0,90.0000,90.0,H128 C68 N16 O16.0064,C17 H32 N4 O4
13436,4101626,11.6012,11.6012,11.6012,90.0,90.0000,90.0,Zn4 Cu4 H47.9988 C32 N20,C8 H12 Cu N5 Zn
14306,4323260,14.4730,18.3460,19.5500,90.0,90.0000,90.0,Si16 Mo8.0008 P16 H288 C128,C32 H72 Mo2 P4 Si4
...,...,...,...,...,...,...,...,...,...
95523,7118853,6.8611,18.3584,9.5657,90.0,90.0894,90.0,H40 C60.18 N8.024 O12.036,C15 H10 N2 O3
95627,7153154,12.6980,16.1029,19.8555,90.0,98.2570,90.0,Fe4 H204 C148 N8 Cl15.88 O8,C37 H51 Cl4 Fe N2 O2
95782,7204222,14.1610,7.5210,22.9670,90.0,105.7030,90.0,H120 C91.984 N16 O28.0,C23 H30 N4 O7
95812,7205487,11.3843,14.2937,15.1250,90.0,111.6560,90.0,H96.028 C128.028 O8,C32 H24 O2


The non-integer counts are all close to integers, so they are rounded to the nearest integer when the binary representation is built below.

Next, we check the largest number of atoms of a single element in any compound. This matters because it fixes the length of the binary input vector.

In [5]:
# Largest single-element atom count over all compounds.
np.max(atoms)

360.0

In the next cell we compare the matrix for each compound with the formula.

In [6]:
# Spot check: per-element counts of one compound next to its formula string.
row = 100
decimal_counts = atoms[row, :, 0]
formula_text = db['formula_unit_cell'][row]
decimal_counts, formula_text

(array([84.,  0.,  0.,  0.,  0., 80., 20.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  8., 16.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  8.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.]),
 'Cu8 H84 C80 S8 N20 Cl16')

Now, the array called atoms will be transformed into a binary representation of the number of atoms of each element.

In [7]:
# Parse the unit-cell formulas into per-element totals. Tokens that map to the
# same slot ('D' is registered with Z = 1) are summed instead of overwriting
# each other.
counts = np.zeros((db.shape[0], 96))
for row in range(db.shape[0]):
    sample = db['formula_unit_cell'][row]
    for item in sample.split():
        # 'C19.99992' -> ['C', '19', '.', '99992', '']: symbol first, count after.
        splitem = re.split(r'(\d+)', item)
        el, ats = splitem[0], ''.join(splitem[1:])
        if el not in symtoz:
            # Fail with the offending token instead of an opaque `None - 1` TypeError.
            raise KeyError('unknown element symbol {!r} in row {}: {!r}'.format(el, row, sample))
        counts[row, symtoz[el] - 1] += float(ats)

# Near-integer counts are rounded to whole atoms before being binarised.
counts = np.round(counts).astype(np.int64)

# Width of the binary vector: bits needed for the largest *rounded* count.
# Sizing it from the truncated maximum would be one bit short whenever the
# maximum rounds up across a power of two (e.g. 255.7 -> 256).
largest = int(counts.max()) if counts.size else 0
length = max(1, largest.bit_length())

# binatoms[i, Z - 1, :] is the count written in binary, most significant bit
# first and zero-padded to `length`. Filling one bit plane at a time keeps the
# temporaries at (n_compounds, 96) instead of the full 3-D size.
binatoms = np.zeros((db.shape[0], 96, length))
for bit, shift in enumerate(range(length - 1, -1, -1)):
    binatoms[:, :, bit] = (counts >> shift) & 1

Now we compare both representations (binary and decimal):

In [8]:
# Spot check: binary and decimal representations of the same compound.
row = 10
binary_counts = binatoms[row]
decimal_counts = atoms[row, :, 0]
binary_counts, decimal_counts

(array([[0., 0., 0., 1., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 0., 1., 0., 0.],
        [0., 0., 0., 0., 1., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0

Saving the binary representation

In [9]:
# Persist the binary representation; np.save appends '.npy' to the file name.
np.save(file='binformula', arr=binatoms)