In [1]:
import pandas as pd
import numpy as np
import re

Loading the csv-files

In [2]:
# Formula of each compound, looked up later through its 'cif' column.
df = pd.read_csv('orgcomp_formulas.csv')
# Collection of compounds; a 'formula' column is attached to it further down.
db = pd.read_csv('dfset_macro.csv')
# Support table with one 'Symbol' and one 'Z' entry per element.
dat = pd.read_csv('./datosrahm.csv')

# Lookup table: element symbol -> atomic number Z.
symtoz = dict(zip(dat['Symbol'].values, dat['Z'].values))
# 'D' is given Z = 1 (presumably deuterium, counted as hydrogen -- confirm).
symtoz['D'] = 1

The `df` dataframe contains the formula of each compound within the unit cell. The `db` dataframe is the complete collection of compounds. The `dat` dataframe is a support file that maps element symbols to atomic numbers.

In [3]:
# CIF id -> formula lookup built from the formula table.
diccio = dict(zip(df['cif'], df['formula']))

# The CIF id is the integer prefix of each name (the text before the first '_').
# Ids that have no entry in the lookup yield None.
formulas = [diccio.get(int(name.split('_')[0]), None) for name in db['name']]

db['formula'] = formulas

Now, the formula of each compound in db will be converted into a 96x1 matrix: one row per element (row index = atomic number - 1), holding the number of atoms of that element within the unit cell.

In [4]:
# Per-compound element counts: atoms[i, Z-1, 0] is the number of atoms of the
# element with atomic number Z in the formula of compound i.
atoms = np.zeros((db.shape[0], 96, 1))
for row, sample in enumerate(db['formula']):
    # A formula is a whitespace-separated list of "<Symbol><count>" tokens,
    # e.g. "H42 C38 N2 O6".
    for item in sample.split():
        # Raw string: '\d' inside a plain literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        splitem = re.split(r'(\d+)', item)
        # The first piece is the element symbol; the remaining pieces re-join
        # into the count, decimals included ("C19.99992" -> "C", "19.99992").
        el, ats = splitem[0], ''.join(splitem[1:])
        z = symtoz.get(el)
        if z is None:
            # Fail with the offending symbol instead of "NoneType - int".
            raise KeyError(
                'Unknown element symbol {!r} in formula {!r} (row {})'.format(el, sample, row)
            )
        # A bare symbol without digits means a count of 1 (usual formula
        # convention); previously this crashed on float('').
        atoms[row, z - 1] = float(ats) if ats else 1.0

Now we check that there are no fractional atom counts (ideally, the next result should be an empty table):

In [5]:
# Rows whose formula contains at least one non-integer atom count.
# The test is done per element: checking the row *sum* would miss fractional
# counts that happen to add up to a whole number (e.g. 0.5 + 0.5).
has_fractional = (atoms % 1 != 0).any(axis=(1, 2))
db.iloc[has_fractional, :]

Unnamed: 0,name,sgnum,formula
6452,2010797_macro,62,H24 C19.99992 I4 N3.99984 Cl8
7800,2017128_macro,146,Cu12 H89.99991 C45 N30 Cl18 O3
8951,2103906_macro,63,H128 C68 N16 O16.0064
14409,4101626_macro,198,Zn4 Cu4 H47.9988 C32 N20
15460,4323260_macro,62,Si16 Mo8.0008 P16 H288 C128
16054,4342250_macro,148,Si6 H89.9982 C96 N18
16882,7028981_macro,148,P13.999998 H228.000006 Ru6 C135.999998 N12 F42
17131,7112029_macro,173,Fe2 H95.9994 C66 S6 N10 O12
19522,2233677_macro,2,H16 C24.0018 S2 N4 O2 F4
22099,4023195_macro,4,H68 C36 S2 N10 O10.0008


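The fractional counts are all very close to integers, so they will be rounded to the nearest integer when building the binary representation.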

We check the highest number of atoms of a single element in any compound. This is important because it determines the length of the binary input vector.

In [6]:
# Largest single-element atom count found in any compound.
np.max(atoms)

360.0

In the next cell we compare the matrix for each compound with the formula.

In [7]:
# Spot-check one compound: its per-element counts next to its formula string.
row = 100
atoms[row, :, 0], db.loc[row, 'formula']

(array([42.,  0.,  0.,  0.,  0., 38.,  2.,  6.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.]), 'H42 C38 N2 O6')

Now, the array called atoms will be transformed into a binary representation of the number of atoms of each element.

In [8]:
# Number of bits needed for the largest atom count. The maximum is rounded
# (not truncated) so the width always matches the rounded counts encoded
# below: a maximum of 255.9999 must reserve 9 bits for 256, not 8 for 255.
length = len(bin(int(np.round(atoms.max())))[2:])

# binatoms[i, Z-1, :] is the zero-padded, most-significant-bit-first binary
# expansion of the (rounded) count of element Z in compound i.
binatoms = np.zeros((db.shape[0], 96, length))
for row, sample in enumerate(db['formula']):
    for item in sample.split():
        # Raw string: '\d' inside a plain literal is an invalid escape
        # sequence and triggers a warning on current Python versions.
        splitem = re.split(r'(\d+)', item)
        # Symbol first, then the count with any decimal part re-joined.
        el, ats = splitem[0], ''.join(splitem[1:])
        z = symtoz.get(el)
        if z is None:
            # Fail with the offending symbol instead of "NoneType - int".
            raise KeyError(
                'Unknown element symbol {!r} in formula {!r} (row {})'.format(el, sample, row)
            )
        # Near-integer counts (e.g. 19.99992) are rounded to the nearest
        # integer; a bare symbol without digits counts as 1.
        count = int(np.round(float(ats))) if ats else 1
        bits = format(count, 'b').zfill(length)
        # Store numeric bits explicitly rather than relying on NumPy casting
        # an array of '0'/'1' strings to float on assignment.
        binatoms[row, z - 1] = [int(bit) for bit in bits]

Now we compare both representations (binary and decimal):

In [9]:
# Show the binary and the decimal encoding of one compound side by side.
row = 10
binatoms[row], atoms[row, :, 0]

(array([[0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0

Saving the binary representation

In [10]:
# Persist the binary encoding; np.save appends '.npy' to a name without it.
np.save(file='binformula', arr=binatoms)