In [1]:
import scipy.sparse as sp
import scipy.sparse.linalg as sp_linalg
import pandas as pd
from itertools import chain
import multiprocessing as mp
import numpy as np
import sys
import re

## Depending on the dataset (and the memory we have) you may use the following dtypes:

- np.ubyte: 1 byte Unsigned char ( 0 ... 255 )
- np.byte : 1 byte Signed char ( -128 ... 127 )
- np.short: 2 bytes C short   ( -32,768 ... 32,767 )  <-- This is more than enough

In [2]:
def getVec_sparse_v2(tx,elemList): #Tx = molecular formula, e.g. C4H2ClBr
    """Turn a molecular formula into the (columns, counts) pair of one sparse row.

    Parameters
    ----------
    tx : str
        Molecular formula, e.g. ``'C4H2ClBr'``. A missing subindex means 1.
        Non-integer subindices (``'C6H16Na3O12.5'``) are recognised.
    elemList : list of str
        Known element symbols; the position of a symbol is its column index.

    Returns
    -------
    (col, data) : tuple of lists, or None
        ``col[k]`` is the column of the k-th distinct element (in order of
        first appearance) and ``data[k]`` its atom count as a float.
        ``None`` is returned when any subindex in the formula is fractional.

    Raises
    ------
    ValueError
        If a fragment contains no element symbol, or a symbol is not in
        ``elemList``.
    """
    # Split before every capital letter except the first one: 'H2O' -> ['H2', 'O'].
    tokens = re.split(r"(?<!^)(?=[A-Z])",tx)

    counts = {}          # column index -> accumulated atom count (insertion ordered)
    fractional = False
    for token in tokens:
        # Explicit [A-Za-z]: the range [A-z] would also accept '[', '\\', ']', '^', '_' and '`'.
        # The subindex group is optional, so no '1' has to be glued onto the text.
        found = re.search(r"([A-Za-z]+)((?:[0-9]*[.])?[0-9]+)?",token)
        if found is None:
            raise ValueError(f"Cannot parse fragment {token!r} of formula {tx!r}")
        symbol, subindex = found.groups()
        amount = 1.0 if subindex is None else float(subindex)
        if not amount.is_integer():
            fractional = True
        column = elemList.index(symbol)   # ValueError for an unknown element
        # A symbol may repeat (e.g. 'CH3COOH'); sum it so a row never carries
        # the same column index twice.
        counts[column] = counts.get(column, 0.0) + amount

    if fractional:
        return None
    return list(counts.keys()),list(counts.values())

def getElems(DataFile,NMax=None):
    """Return the sorted list of element symbols that occur in the dataset.

    Parameters
    ----------
    DataFile : str or path-like
        Tab-separated file without header; columns are ID, formula, year.
    NMax : int, optional
        Number of rows of the dataset to be used (all rows when None).

    Returns
    -------
    list of str
        Distinct element symbols found in the formula column, sorted.
    """
    col_names = ['ID','formula','year']
    sep = '\t'

    df = pd.read_csv(DataFile,header=None,sep=sep,nrows=NMax,names=col_names)  #Load data

    # Strip surrounding white space; rows with a missing formula carry no elements.
    formulas = df['formula'].str.strip().dropna()

    elems = set()
    for cmpnd in formulas:
        # Keep letters only (drops subindices). [A-Za-z] is spelled out because
        # the range [A-z] also matches '[', '\\', ']', '^', '_' and '`'.
        letters = ''.join(re.findall(r'[A-Za-z]',cmpnd))
        # Split before every capital except the first: 'CHNaO' -> ['C','H','Na','O'].
        elems.update(re.split(r"(?<!^)(?=[A-Z])",letters))

    # A formula without any letter would otherwise contribute an empty "element".
    elems.discard('')

    # Save this list of elements so it doesn't have to be calculated every time
    #with open("./Data/ElementList.txt", "w") as f:
    #    for A in elems:
    #        f.write(str(A) +"\n")

    return sorted(elems)  # Sorted list with all elements in dataset

def allVecs_sparse(DataFile,NMax=None):
    """Load the dataset and encode every usable formula as a row of a CSR matrix.

    Parameters
    ----------
    DataFile : str or path-like
        Tab-separated file without header; columns are ID, formula, year.
    NMax : int, optional
        Number of rows to read (all rows when None).

    Returns
    -------
    cmpnds : scipy.sparse.csr_matrix (dtype np.short)
        One row per formula that `getVec_sparse_v2` could encode, one column
        per entry of `elemList`.
    years, subsID : numpy arrays
        'year' and 'ID' values of the rows that were kept.
    elemList : list of str
        Element list returned by `getElems` for the same file and NMax.
    """
    table = pd.read_csv(DataFile,
                        header=None,
                        sep='\t',
                        nrows=NMax,
                        names=['ID','formula','year'])
    table['formula'] = table['formula'].str.strip()   # drop leading/trailing blanks

    elemList = getElems(DataFile,NMax)

    # One (col, data) pair per formula; None marks a formula that was rejected.
    encoded = [getVec_sparse_v2(formula,elemList) for formula in table['formula'].values]
    kept_rows = [pos for pos, vec in enumerate(encoded) if vec is not None]
    kept_vecs = [encoded[pos] for pos in kept_rows]

    # CSR triplet: indptr marks where each row starts inside indices/data.
    row_sizes = [len(cols) for cols, _ in kept_vecs]
    indptr = np.cumsum([0] + row_sizes)
    indices = np.array([c for cols, _ in kept_vecs for c in cols])
    data = np.array([v for _, vals in kept_vecs for v in vals])

    cmpnds = sp.csr_matrix((data, indices, indptr),
                           shape=(len(kept_vecs), len(elemList)),
                           dtype=np.short)

    years = table['year'].values[kept_rows]
    subsID = table['ID'].values[kept_rows]

    return cmpnds,years,subsID, elemList

In [3]:
# Encode the sample dataset: compounds-by-elements count matrix, plus the year,
# ID of every kept compound and the element list that labels the columns.
sparse , years , subsID , elemList = allVecs_sparse('../SendProfGR/sample_w_IDs.csv')
sparse

<11138x60 sparse matrix of type '<class 'numpy.int16'>'
	with 41691 stored elements in Compressed Sparse Row format>

In [4]:
def findRs_sparse(cmpnds,elemList):
    """Enumerate every reduced vector R(n) obtainable from the compounds.

    For each stored entry of each row, atoms of that element are removed one
    at a time; every intermediate state becomes an output row holding the
    reduced counts followed by n, the number of atoms removed so far.

    Parameters
    ----------
    cmpnds : scipy.sparse.csr_matrix
        Compounds (rows) by elements (columns) matrix of atom counts.
    elemList : list
        Not used by this function.

    Returns
    -------
    scipy.sparse.csr_matrix (dtype np.short)
        One row per reduced vector, with one extra last column holding n.
        Entries that were reduced to zero remain stored explicitly.
    """
    n_elems = cmpnds.shape[1]          # n is written to this extra column
    row_ptr = cmpnds.indptr
    all_cols = cmpnds.indices
    all_vals = cmpnds.data

    reduced_rows = []   # data of every output row: reduced counts + [n]
    per_cmpnd = []      # (number of output rows, column indices) per compound

    for start, stop in zip(row_ptr[:-1], row_ptr[1:]):
        cols = all_cols[start:stop]
        counts = all_vals[start:stop]

        for pos in range(cols.shape[0]):
            working = counts.copy()
            # Take atoms of this element away one by one, down to zero.
            for removed in range(1, int(working[pos]) + 1):
                working[pos] -= 1
                reduced_rows.append(np.append(working, removed))

        # Each compound yields as many output rows as it has atoms in total.
        per_cmpnd.append((counts.sum(), cols))

    # Repeat the column pattern of a compound once per output row it produced.
    col_patterns = [cols for total, cols in per_cmpnd for _ in range(total)]

    indptr = np.cumsum([0] + [len(cols) + 1 for cols in col_patterns])
    indices = np.array([c for cols in col_patterns for c in list(cols) + [n_elems]])
    data = np.array([v for row in reduced_rows for v in row])

    return sp.csr_matrix((data, indices, indptr),
                         shape=(len(reduced_rows), n_elems + 1),
                         dtype=np.short)

In [5]:
# np.unique(axis=0) both sorts and de-duplicates the rows, so the metadata
# must be re-indexed with the positions np.unique reports; slicing the first
# N entries would pair compounds with the years/IDs of unrelated rows.
uniq_cmpnds_arr, first_occurrence = np.unique(sparse.toarray(),axis=0,return_index=True)
uniq_sparse = sp.csr_matrix(uniq_cmpnds_arr,dtype=np.short)

# Each unique composition keeps the year and ID of its first occurrence in the data.
# NOTE: this rebinds `years`/`subsID`; re-run the loading cell before re-running this one.
years , subsID = years[first_occurrence] , subsID[first_occurrence]

In [6]:
%%time
# Build every reduced vector R(n) from the de-duplicated compounds.
Rs = findRs_sparse(uniq_sparse,elemList)
Rs

CPU times: user 4.25 s, sys: 182 ms, total: 4.43 s
Wall time: 4.53 s


<211425x61 sparse matrix of type '<class 'numpy.int16'>'
	with 1102416 stored elements in Compressed Sparse Row format>

---
#### Next step: Finding which Rs appear more than once in the dataset (DS).

1. Distribute the total list of Rs in chunks of non-overlapping sets
    - That is, if a vector `v` is in chunk $i$, then all of its repetitions (if any) must also be in chunk $i$ and in no other chunk $j \ne i$.
    
    - That way we can take advantage of multiple processors.


    
2. Find non-unique vectors within each chunk.

In [7]:
# Show the SciPy version used to produce the outputs in this notebook.
import scipy
scipy.__version__

'1.2.1'

In [8]:
def validRs(Rs):
    """Return the distinct rows of chunk `Rs` that occur more than once.

    Parameters
    ----------
    Rs : scipy.sparse matrix
        Chunk of R(n) vectors, one per row.

    Returns
    -------
    scipy.sparse.csr_matrix (dtype np.short)
        One copy of every row that appears at least twice, in the sorted
        order produced by ``np.unique``.
    """
    dense_chunk = Rs.toarray()
    distinct, occurrences = np.unique(dense_chunk, axis=0, return_counts=True)
    repeated = distinct[occurrences >= 2]
    return sp.csr_matrix(repeated, dtype=np.short)

def unique_mp(Rs,size=4,max_n=4):
    """Create data chunks for finding non-unique Rs in parallel.
    Test how this does on full DS. If pickling errors, implement recursive splitting.

    The rows are split by the value n in the last column, so identical rows
    always land in the same chunk: one chunk for each n in 1 .. max_n-1 and
    one chunk for every row with n >= max_n. Each chunk is passed to
    `validRs` in a worker process.

    Parameters
    ----------
    Rs : scipy.sparse.csr_matrix
        R(n) vectors, one per row, with n in the last column.
    size : int
        Number of worker processes.
    max_n : int
        Smallest n that goes into the final catch-all chunk; must be >= 1.

    Returns
    -------
    list
        `max_n` results of `validRs`, ordered by increasing n.

    Raises
    ------
    ValueError
        If `max_n` is smaller than 1.
    """
    if max_n < 1:
        raise ValueError("max_n must be >= 1")

    # Densify the n column once and reuse it for every chunk mask.
    ns = Rs[:,-1].toarray().ravel()

    # First split by n, the most natural choice.
    masks = [ns == i for i in range(1,max_n)] + [ns >= max_n]
    dist_list = [Rs[mask] for mask in masks]

    with mp.Pool(processes=size) as pool:
        R_results = [pool.apply_async(validRs,args=(r,))
                     for r in dist_list]

        # Collect inside the with-block: leaving it shuts the pool down.
        Rs_get = [r.get() for r in R_results]

    return Rs_get

In [9]:
# Repeated R(n) vectors, one sparse matrix per chunk (n = 1, 2, 3 and n >= 4).
uniqRs = unique_mp(Rs,size=4)
uniqRs

[<2715x61 sparse matrix of type '<class 'numpy.int16'>'
 	with 11245 stored elements in Compressed Sparse Row format>,
 <1874x61 sparse matrix of type '<class 'numpy.int16'>'
 	with 7781 stored elements in Compressed Sparse Row format>,
 <560x61 sparse matrix of type '<class 'numpy.int16'>'
 	with 2150 stored elements in Compressed Sparse Row format>,
 <965x61 sparse matrix of type '<class 'numpy.int16'>'
 	with 3759 stored elements in Compressed Sparse Row format>]

In [10]:
# Column index -> element symbol, the inverse lookup of a position in elemList.
elemDict = dict(enumerate(elemList))

In [14]:
def get_matches(Rs,cmpnds,years,subsID,elemDict):
    """Get matches. For each R(n) find all elements X such that compound R-Xn exists in dataset.

    A compound matches R(n) when it has exactly sum(R) + n atoms, contains R,
    and what is left after subtracting R belongs to a single element X
    (which therefore appears n times).

    Parameters
    ----------
    Rs : scipy.sparse matrix
        One R(n) per row; the last column holds n, the others the counts of R.
    cmpnds : scipy.sparse matrix
        Compounds (rows) by elements (columns) matrix of atom counts.
    years, subsID : numpy arrays
        Year and ID of every row of `cmpnds`, in the same order.
    elemDict : dict
        Maps a column index to its element symbol.

    Returns
    -------
    list
        One ``[element_set, years, subsID]`` entry per row of `Rs`, in order:
        the set of symbols X, and the years / IDs of the matching compounds.
    """
    # Densify once. Subtracting a dense row from the sparse matrix would
    # otherwise rebuild the full dense compound table on every iteration.
    ns = Rs[:,-1].toarray().ravel()
    R = Rs[:,:-1].toarray()
    cmpnds_dense = cmpnds.toarray()

    sumCmpnds = cmpnds_dense.sum(axis=1)   # atoms in every compound
    sumRaxis1 = R.sum(axis=1) + ns         # atoms a match for R(n) must have

    Matches = []
    for i,n in enumerate(ns):
        if i%1000==0:       print( f"\t{i}th R evaluated..." )

        # 1. Cheap prefilter: sum of atoms in cmpnd == sum(R) + n
        same_size = (sumCmpnds == sumRaxis1[i])

        # What each candidate has left once R is taken out of it
        cmpnds_no_R = cmpnds_dense[same_size] - R[i]

        # 2. R is contained in the compound (nothing goes negative) and
        # 3. the residual is due to one element only (X_n). With the atom
        #    total already fixed, a single non-zero residual entry equals n.
        cond = (cmpnds_no_R >= 0).all(axis=1) & ((cmpnds_no_R != 0).sum(axis=1) == 1)

        curr_years = years[same_size][cond]
        curr_subsID = subsID[same_size][cond]

        # Column of the single non-zero residual entry of every match -> symbol
        elemIndex = cmpnds_no_R[cond].nonzero()[1]
        curr_list = set(elemDict[x] for x in elemIndex)

        Matches.append( [curr_list, curr_years, curr_subsID] )

        ###########
        ## Deal with this after you get all data created above
        #Table_list.append(getTable(curr_list,curr_years,curr_subsID,useID=useID))

    # Hand the collected matches back instead of discarding them.
    return Matches
        
############



#    if rank==0:    print("Saving...")
#    table_list_arr = np.array(Table_list,dtype=np.intc)
#    np.save(f'./Data/TablesID_NMax{NMax}_P{rank}.npy',table_list_arr)
#    np.save(f'./Data/RVector_NMax{NMax}_P{rank}.npy',np.array(R_list,dtype=np.short))
#
#
#    ### Make a function here that produces a new array exchanging ID for year, just a mapping
#    if useID:
#        if rank==0:  print("\n\tProducing array filled with years from ID array...")
#
#        # Mapping going from ID to year
#        mapping = dict(zip(list(subsID),list(years)))
#        mapping[-1] = -1
#        # Apply mapping 
#        yearArray = np.vectorize(mapping.__getitem__)(table_list_arr)
#        np.save(f'./Data/TablesYears_NMax{NMax}_P{rank}.npy',np.array(yearArray,dtype=np.short))
#
#    return Comm_list,R_list  #This list contains lists (one for each R) of elements X such that R-X exist in dataset.
#
# Evaluate every chunk that unique_mp produced, however many there are,
# instead of assuming there are exactly four.
for Rs_chunk in uniqRs:
    get_matches(Rs_chunk,uniq_sparse,years,subsID,elemDict)

	0th R evaluated...
{'Ba', 'Th', 'N', 'As', 'B', 'Mn', 'Cu', 'Li', 'Cd', 'Au', 'V', 'P', 'Na', 'Si', 'Hg', 'Cs', 'Ce', 'Ru', 'Pd', 'Ni', 'Rh', 'Cl', 'I', 'Bi', 'Ca', 'C', 'W', 'Os', 'Pb', 'U', 'Be', 'Sb', 'Zr', 'Sn', 'Ta', 'Br', 'Al', 'H', 'Sr', 'In', 'Te', 'Ti', 'F', 'S', 'Zn', 'Mo', 'Mg', 'Cr', 'K', 'Co', 'O', 'Ag', 'Se', 'Pt', 'Fe', 'Tl', 'Ir', 'Rb'}
[2040 2042 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840
 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840
 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840
 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840
 1840 1840]
[   0    1    2    3    4    5    6    7    8    9   10   11   12   13
   21   26   68   70   72   73   78   81   88  121  124  302  326  418
  485  505  526  558  578  685  699  709  812  913 2041 2194 2367 2553
 2561 2719 2852 3493 3502 3619 3776 7641 8248 8370 8410 8662 8782 8846
 9175 9335]

{'P', 'O', 'Cu', 'S'}
[1840 1840 1840 1840]
[  28  

{'C', 'Te'}
[1840 1840]
[ 341 3792]

{'Mo', 'W', 'S', 'Te'}
[1840 1840 1840 1840]
[342 343 344 496]

{'O', 'S'}
[1840 1840]
[348 350]

{'O', 'Te'}
[1840 1840]
[348 354]

{'Te', 'S', 'Sn', 'Se', 'C', 'Si'}
[1840 1840 1840 1840 1840 1840]
[ 350  351  352  353  354 3793]

{'O', 'S'}
[1840 1840]
[355 360]

{'O', 'Se', 'S'}
[1840 1840 1840]
[355 356 361]

{'U', 'S', 'Mo', 'Cr', 'Mn', 'Se', 'Fe', 'W', 'Te'}
[1840 1840 1840 1840 1840 1840 1840 1840 1840]
[ 357  358  359  360  361  497  518 2062 2581]

{'H', 'Na'}
[1840 1840]
[394 980]

{'Ca', 'Zn'}
[1840 1840]
[ 376 3643]

{'Se', 'S'}
[1840 1840]
[388 389]

{'As', 'Sb'}
[1840 1840]
[ 389 8861]

{'Al', 'Sb', 'B'}
[1840 1840 1840]
[ 391 8664 9177]

{'P', 'As'}
[1840 1840]
[ 392 8863]

{'O', 'S'}
[1840 1840]
[392 394]

{'P', 'Sb', 'As'}
[1840 1840 1840]
[ 393  394 8864]

{'Zr', 'Si'}
[1840 1840]
[397 398]

{'O', 'S'}
[1840 1840]
[399 401]

{'Ti', 'Al', 'N', 'B', 'Cr', 'Co', 'O', 'Nb', 'Fe', 'P', 'V', 'C'}
[1840 1840 1840 1840 1840 1840 1840 1840

{'Li', 'K', 'O', 'H', 'Na'}
[1840 1840 1840 1840 1840]
[ 931  969 1016 1020 1060]

{'P', 'N'}
[1840 1840]
[916 984]

{'Ba', 'Th', 'Mn', 'Cu', 'Li', 'Nb', 'Na', 'Si', 'Cs', 'Pd', 'Ni', 'Rh', 'Cl', 'I', 'W', 'Os', 'Pb', 'U', 'Be', 'Sb', 'Zr', 'Sn', 'Ta', 'Br', 'Al', 'H', 'Te', 'Ti', 'S', 'Zn', 'Mo', 'Cr', 'K', 'Co', 'Pt', 'Fe', 'Tl', 'Ir', 'Rb'}
[1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840
 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840
 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840]
[ 918  919  920  921  922  923  924  925  926  927  928  930  931  941
  942  943  944  946  950  965  968  970 1002 1005 1017 1021 1040 1042
 1063 2079 2397 2556 2596 2747 2906 7660 8381 8446 9186]

{'C', 'S'}
[1840 1840]
[ 935 3837]

{'Cu', 'K', 'Cd', 'Fe', 'Al', 'Na'}
[1840 1840 1840 1840 1840 1840]
[ 974 1022 2080 2398 3535 9187]

{'Cl', 'Br'}
[1840 1840]
[2907 7661]

{'K', 'Co'}
[1840 1840]
[1023 2748]

{'Os', 'Zn', 'Pd', 'Mg', 'Be', 'Zr', 'C'}
[

{'Mo', 'W'}
[1840 1840]
[1292 1322]

{'Cd', 'Mn'}
[1840 1840]
[1326 3545]

{'F', 'O', 'Br', 'Cl', 'I'}
[1840 1840 1840 1840 1840]
[1299 1364 2203 2926 7667]

{'Cl', 'I', 'Br', 'H'}
[1840 1840 1840 1840]
[1365 1392 2927 7668]

{'N', 'As'}
[1840 1840]
[1303 8901]

{'N', 'Br', 'P', 'I', 'Cl'}
[1840 1840 1840 1840 1840]
[1300 1305 1366 2928 7669]

{'H', 'Cu'}
[1840 1840]
[1394 2410]

{'N', 'I', 'Cl'}
[1840 1840 1840]
[1307 1367 2929]

{'Mg', 'Zn'}
[1840 1840]
[1301 1333]

{'Al', 'Fe'}
[1840 1840]
[2089 9193]

{'C', 'Zn'}
[1840 1840]
[1302 3896]

{'O', 'C'}
[1840 1840]
[1303 3899]

{'O', 'S'}
[1840 1840]
[1304 1305]

{'Os', 'O'}
[1840 1840]
[1306 1307]

{'Cd', 'Ba'}
[1840 1840]
[3546 8468]

{'Hg', 'Zn'}
[1840 1840]
[1310 1378]

{'H', 'Pb'}
[1840 1840]
[1315 1401]

{'P', 'N'}
[1840 1840]
[1328 1333]

{'Mo', 'S'}
[1840 1840]
[1329 1334]

{'P', 'As'}
[1840 1840]
[1333 8902]

{'Os', 'W', 'Te'}
[1840 1840 1840]
[1344 1345 1346]

{'Cd', 'Mn', 'Be'}
[1840 1840 1840]
[1351 3547 8388]

{'Pd', 'Pt'}


{'Ti', 'Sn', 'Si'}
[1840 1840 1840]
[2327 2328 2329]

{'Zr', 'Si'}
[1840 1840]
[2332 2333]

{'Ni', 'Fe', 'Mg'}
[1840 1840 1840]
[2334 2337 2341]

{'Zn', 'Ni', 'Cu', 'Fe', 'Hg'}
[1840 1840 1840 1840 1840]
[2333 2335 2338 2342 2450]

{'Ti', 'Si'}
[1840 1840]
[2334 2335]

{'Ti', 'Si'}
[1840 1840]
[2341 2342]

{'As', 'Ta'}
[1840 1840]
[2343 8929]

{'Ti', 'Zr', 'Si'}
[1840 1840 1840]
[2351 2352 2353]

{'H', 'F', 'S', 'Zn', 'O', 'Se', 'Br', 'Cl', 'I', 'Te'}
[1840 1840 1840 1840 1840 1840 1840 1840 1840 1840]
[2368 2369 2370 2371 2375 2393 2396 2439 2947 7678]

{'Ag', 'Cu'}
[1840 1840]
[2455 9353]

{'Mo', 'W'}
[1840 1840]
[2373 2389]

{'H', 'O', 'Cu'}
[1840 1840 1840]
[2376 2397 2460]

{'C', 'S', 'Te'}
[1840 1840 1840]
[2377 2378 3942]

{'O', 'Cu'}
[1840 1840]
[2380 2461]

{'K', 'O'}
[1840 1840]
[2381 2391]

{'Cr', 'Se', 'W', 'S'}
[1840 1840 1840 1840]
[2379 2380 2381 2635]

{'Cl', 'F', 'W'}
[1840 1840 1840]
[2398 2440 2948]

{'W', 'S'}
[1840 1840]
[2406 2407]

{'O', 'Cu'}
[1840 1840]
[2414 2

{'Pd', 'Pt'}
[1840 1840]
[3254 3255]

{'Hg', 'Zn'}
[1840 1840]
[3256 3257]

{'I', 'Sb', 'Bi'}
[1840 1840 1840]
[3259 3261 8289]

{'Cd', 'Zn'}
[1840 1840]
[3264 3580]

{'Ba', 'Sr'}
[1840 1840]
[3268 8518]

{'Pd', 'Fe', 'Pt', 'Zn'}
[1840 1840 1840 1840]
[3270 3272 3273 3300]

{'Fe', 'Mn', 'Co'}
[1840 1840 1840]
[3276 3301 3314]

{'Cu', 'Zn'}
[1840 1840]
[3278 3303]

{'Ir', 'Pt', 'Te'}
[1840 1840 1840]
[3281 3282 3287]

{'Pb', 'Zn', 'Cu', 'Cd', 'Pt', 'Hg'}
[1840 1840 1840 1840 1840 1840]
[3283 3284 3286 3289 3305 3581]

{'Cu', 'Pt'}
[1840 1840]
[3291 3306]

{'Ti', 'Si'}
[1840 1840]
[3295 3296]

{'Mo', 'As', 'Sb', 'Ta', 'Nb', 'P', 'I', 'W'}
[1840 1840 1840 1840 1840 1840 1840 1840]
[3317 3318 3319 3321 3325 3326 3330 8947]

{'N', 'H'}
[1840 1840]
[3329 3332]

{'Ir', 'Bi'}
[1840 1840]
[3336 8290]

{'Rh', 'Fe'}
[1840 1840]
[3334 3349]

{'Bi', 'Sb'}
[1840 1840]
[3341 8292]

{'Mo', 'Ir', 'Pt', 'W'}
[1840 1840 1840 1840]
[3352 3354 3366 3375]

{'Fe', 'Tl'}
[1840 1840]
[3353 3433]

{'Ba', 'Ca'}


{'Ag', 'Au'}
[1840 1840]
[8811 9372]

{'Mg', 'Cu', 'Be'}
[1840 1840 1840]
[4051 4285 8397]

{'O', 'Hg'}
[1840 1840]
[4055 4061]

{'Cl', 'I', 'Br'}
[1840 1840 1840]
[4057 4325 7692]

{'K', 'H', 'Br', 'Na'}
[1840 1840 1840 1840]
[4069 4080 4083 7693]

{'N', 'Co', 'Cu'}
[1840 1840 1840]
[4073 4286 4311]

{'H', 'Na'}
[1840 1840]
[4070 4086]

{'Li', 'K', 'Tl', 'H', 'Na', 'Rb'}
[1840 1840 1840 1840 1840 1840]
[4066 4067 4071 4079 4081 4087]

{'Ni', 'U'}
[1840 1840]
[4074 4076]

{'H', 'O'}
[1840 1840]
[4082 4103]

{'H', 'O'}
[1840 1840]
[4086 4105]

{'Ba', 'Pb', 'Mg', 'Cu', 'Ca', 'Hg'}
[1840 1840 1840 1840 1840 1840]
[4088 4098 4102 4287 4426 8538]

{'C', 'Sr', 'Ca'}
[1840 1840 1840]
[4089 4427 4457]

{'Mg', 'Ca', 'Fe', 'Ba'}
[1840 1840 1840 1840]
[4099 4273 4428 8539]

{'H', 'Na'}
[1840 1840]
[4092 4109]

{'N', 'H'}
[1840 1840]
[4094 4112]

{'I', 'H'}
[1840 1840]
[4101 4122]

{'H', 'N', 'Br', 'Cl', 'I'}
[1840 1840 1840 1840 1840]
[4111 4120 4127 4326 7694]

{'H', 'F', 'N', 'O', 'Br', 'Cl', '

{'C', 'S'}
[1840 1840]
[4521 4763]

{'Cl', 'H'}
[1840 1840]
[4531 4609]

{'N', 'H', 'As', 'Cl'}
[1840 1840 1840 1840]
[4523 4532 4610 8969]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[4534 4611 7712]

{'N', 'H', 'Ag'}
[1840 1840 1840]
[4524 4536 9383]

{'N', 'H'}
[1840 1840]
[4525 4537]

{'O', 'C'}
[1840 1840]
[4525 4774]

{'Cr', 'Fe'}
[1840 1840]
[4584 4596]

{'O', 'C'}
[1840 1840]
[4527 4775]

{'Cl', 'I', 'H'}
[1840 1840 1840]
[4529 4539 4612]

{'H', 'N', 'Br', 'Cl', 'I'}
[1840 1840 1840 1840 1840]
[4532 4539 4541 4615 7713]

{'N', 'H', 'Cl'}
[1840 1840 1840]
[4534 4543 4616]

{'N', 'H', 'Br', 'Cl'}
[1840 1840 1840 1840]
[4536 4545 4617 7714]

{'P', 'H', 'N'}
[1840 1840 1840]
[4530 4537 4547]

{'O', 'C'}
[1840 1840]
[4534 4802]

{'O', 'S'}
[1840 1840]
[4533 4535]

{'O', 'S'}
[1840 1840]
[4535 4536]

{'O', 'C'}
[1840 1840]
[4537 4804]

{'O', 'C'}
[1840 1840]
[4538 4810]

{'O', 'C'}
[1840 1840]
[4543 4817]

{'O', 'C', 'S'}
[1840 1840 1840]
[4542 4544 4818]

{'O', 'C'}
[1840 1840]
[4545 4819]


{'Cu', 'Pt'}
[1840 1840]
[4976 5036]

{'K', 'Rb'}
[1840 1840]
[4989 4990]

{'Pd', 'Pt'}
[1840 1840]
[4998 5000]

{'O', 'C'}
[1840 1840]
[5013 5256]

{'K', 'H'}
[1840 1840]
[5055 5056]

{'K', 'Cu'}
[1840 1840]
[5058 5063]

{'Cl', 'H'}
[1840 1840]
[5075 5089]

{'Cl', 'H'}
[1840 1840]
[5077 5091]

{'Cl', 'H'}
[1840 1840]
[5078 5092]

{'O', 'C'}
[1840 1840]
[5078 5270]

{'Cl', 'H'}
[1840 1840]
[5084 5105]

{'Cl', 'H'}
[1840 1840]
[5089 5107]

{'C', 'Sn'}
[1840 1840]
[5094 5284]

{'Sr', 'Ca', 'Zn'}
[1840 1840 1840]
[5099 5100 5149]

{'Pd', 'Pt'}
[1840 1840]
[5101 5102]

{'C', 'Pt'}
[1840 1840]
[5115 5291]

{'Cl', 'H'}
[1840 1840]
[5116 5119]

{'O', 'C'}
[1840 1840]
[5118 5293]

{'Pt', 'Zn'}
[1840 1840]
[5136 5137]

{'Cl', 'H'}
[1840 1840]
[5154 5263]

{'I', 'Br', 'H'}
[1840 1840 1840]
[5153 5155 7733]

{'O', 'C'}
[1840 1840]
[5155 5323]

{'C', 'S'}
[1840 1840]
[5157 5326]

{'O', 'C'}
[1840 1840]
[5159 5335]

{'N', 'H'}
[1840 1840]
[5161 5164]

{'H', 'Br'}
[1840 1840]
[5165 7734]

{'Cl', 'H'

{'Al', 'N', 'As', 'Sb', 'B', 'P', 'Bi'}
[1840 1840 1840 1840 1840 1840 1840]
[5510 5513 5526 8306 8698 8984 9230]

{'Cl', 'I'}
[1840 1840]
[5531 5754]

{'I', 'Br', 'H'}
[1840 1840 1840]
[5532 5536 7763]

{'P', 'Sb'}
[1840 1840]
[5511 5514]

{'O', 'Se', 'S'}
[1840 1840 1840]
[5511 5512 5517]

{'As', 'Sb', 'Bi', 'P', 'I'}
[1840 1840 1840 1840 1840]
[5512 5515 5533 8307 8985]

{'Cl', 'I'}
[1840 1840]
[5534 5755]

{'O', 'Se', 'S'}
[1840 1840 1840]
[5514 5515 5518]

{'P', 'Sb', 'As'}
[1840 1840 1840]
[5517 5518 8986]

{'O', 'S'}
[1840 1840]
[5519 5521]

{'P', 'B', 'As', 'N'}
[1840 1840 1840 1840]
[5520 5527 8700 8987]

{'Cl', 'H'}
[1840 1840]
[5541 5756]

{'O', 'S'}
[1840 1840]
[5521 5523]

{'S', 'Pb', 'Sn', 'Te'}
[1840 1840 1840 1840]
[5531 5532 5533 5534]

{'As', 'Sb'}
[1840 1840]
[5535 8988]

{'C', 'Sn'}
[1840 1840]
[5536 5947]

{'O', 'C'}
[1840 1840]
[5539 5948]

{'Pb', 'S', 'Sn', 'C', 'Te'}
[1840 1840 1840 1840 1840]
[5537 5538 5539 5540 5949]

{'C', 'Si'}
[1840 1840]
[5541 5952]

{'N'

{'N', 'H'}
[1840 1840]
[6080 6094]

{'N', 'H'}
[1840 1840]
[6081 6095]

{'N', 'H'}
[1840 1840]
[6082 6096]

{'O', 'C'}
[1840 1840]
[6082 6312]

{'O', 'C'}
[1840 1840]
[6084 6313]

{'Ni', 'Ca', 'Hg'}
[1840 1840 1840]
[6088 6090 6281]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[6100 6236 7794]

{'N', 'H', 'Cl'}
[1840 1840 1840]
[6093 6102 6237]

{'N', 'H'}
[1840 1840]
[6094 6103]

{'N', 'H'}
[1840 1840]
[6096 6105]

{'O', 'C'}
[1840 1840]
[6092 6320]

{'O', 'C'}
[1840 1840]
[6094 6322]

{'O', 'C'}
[1840 1840]
[6096 6323]

{'N', 'H'}
[1840 1840]
[6097 6110]

{'O', 'C'}
[1840 1840]
[6098 6328]

{'Cl', 'H'}
[1840 1840]
[6111 6238]

{'O', 'S'}
[1840 1840]
[6101 6102]

{'O', 'C'}
[1840 1840]
[6103 6335]

{'O', 'C'}
[1840 1840]
[6104 6336]

{'C', 'S'}
[1840 1840]
[6104 6337]

{'O', 'C'}
[1840 1840]
[6105 6338]

{'C', 'S'}
[1840 1840]
[6106 6339]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[6114 6239 7795]

{'N', 'H'}
[1840 1840]
[6109 6115]

{'N', 'H'}
[1840 1840]
[6110 6116]

{'O', 'C'}
[1840 1840]
[6109 63

{'N', 'H'}
[1840 1840]
[6546 6549]

{'N', 'H'}
[1840 1840]
[6548 6552]

{'O', 'Hg', 'S', 'Zn'}
[1840 1840 1840 1840]
[6550 6551 6552 6557]

{'O', 'C', 'Si'}
[1840 1840 1840]
[6554 6555 6667]

{'N', 'I'}
[1840 1840]
[6562 6563]

{'H', 'Br'}
[1840 1840]
[6574 7818]

{'Cl', 'H'}
[1840 1840]
[6579 6590]

{'Cl', 'H'}
[1840 1840]
[6581 6593]

{'N', 'H'}
[1840 1840]
[6582 6583]

{'N', 'Cl'}
[1840 1840]
[6585 6595]

{'P', 'N'}
[1840 1840]
[6587 6588]

{'Cl', 'H'}
[1840 1840]
[6599 6602]

{'Cl', 'H'}
[1840 1840]
[6601 6604]

{'I', 'H'}
[1840 1840]
[6617 6622]

{'N', 'H'}
[1840 1840]
[6622 6624]

{'O', 'S'}
[1840 1840]
[6619 6621]

{'O', 'C'}
[1840 1840]
[6624 6704]

{'O', 'C'}
[1840 1840]
[6625 6705]

{'O', 'C'}
[1840 1840]
[6627 6716]

{'N', 'H'}
[1840 1840]
[6630 6632]

{'N', 'H'}
[1840 1840]
[6631 6633]

{'O', 'C'}
[1840 1840]
[6633 6734]

{'O', 'C'}
[1840 1840]
[6635 6736]

{'O', 'C'}
[1840 1840]
[6636 6737]

{'N', 'H'}
[1840 1840]
[6638 6642]

{'N', 'H'}
[1840 1840]
[6639 6643]

{'N', 'H'}

{'O', 'C'}
[1840 1840]
[7779 7788]

{'C', 'S'}
[1840 1840]
[7782 7790]

{'H', 'Br'}
[1840 1840]
[7784 8028]

{'N', 'H', 'Br'}
[1840 1840 1840]
[7786 7788 8030]

{'K', 'Br'}
[1840 1840]
[7787 8031]

{'N', 'H', 'Br'}
[1840 1840 1840]
[7788 7790 8032]

{'H', 'Br'}
[1840 1840]
[7791 8033]

{'O', 'C'}
[1840 1840]
[7791 7800]

{'H', 'Br'}
[1840 1840]
[7792 8036]

{'H', 'Br'}
[1840 1840]
[7794 8038]

{'Cl', 'Br'}
[1840 1840]
[7799 8040]

{'H', 'Br'}
[1840 1840]
[7798 8043]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[7802 7808 8046]

{'H', 'Br'}
[1840 1840]
[7804 8047]

{'N', 'Br'}
[1840 1840]
[7801 8048]

{'H', 'Br'}
[1840 1840]
[7806 8050]

{'H', 'Br'}
[1840 1840]
[7809 8051]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[7810 7818 8052]

{'H', 'Br'}
[1840 1840]
[7816 8054]

{'H', 'Br'}
[1840 1840]
[7820 8056]

{'N', 'Br'}
[1840 1840]
[7820 8057]

{'H', 'Br'}
[1840 1840]
[7825 8066]

{'H', 'Br'}
[1840 1840]
[7828 8074]

{'H', 'Br'}
[1840 1840]
[7830 8078]

{'H', 'Br'}
[1840 1840]
[7836 8083]

{'Ba', 'Mn', 'C

{'K', 'Na'}
[1840 1840]
[8911 8913]

{'H', 'Na'}
[1840 1840]
[8919 8920]

{'Fe', 'S'}
[1840 1840]
[8922 8924]

{'As', 'Co', 'S'}
[1840 1840 1840]
[8936 8937 9085]

{'O', 'Hg'}
[1840 1840]
[8939 8940]

{'As', 'H'}
[1840 1840]
[8949 9093]

{'O', 'S'}
[1840 1840]
[8953 8954]

{'I', 'H'}
[1840 1840]
[8957 8958]

{'Cl', 'I', 'Br', 'F'}
[1840 1840 1840 1840]
[8958 8962 8963 9005]

{'H', 'Ag'}
[1840 1840]
[8960 9430]

{'Cl', 'Cu'}
[1840 1840]
[8966 8967]

{'I', 'H'}
[1840 1840]
[8973 8974]

{'As', 'I'}
[1840 1840]
[8976 9103]

{'As', 'H'}
[1840 1840]
[8978 9107]

{'O', 'C'}
[1840 1840]
[8978 8980]

{'O', 'S'}
[1840 1840]
[8985 8986]

{'I', 'H'}
[1840 1840]
[8988 8989]

{'I', 'Br'}
[1840 1840]
[8989 9006]

{'As', 'I'}
[1840 1840]
[8991 9112]

{'H', 'Br'}
[1840 1840]
[9006 9007]

{'As', 'H'}
[1840 1840]
[9014 9126]

{'Co', 'Pt', 'Zn'}
[1840 1840 1840]
[9015 9027 9085]

{'S', 'Zn'}
[1840 1840]
[9024 9025]

{'Ba', 'O', 'Ca'}
[1840 1840 1840]
[9029 9093 9126]

{'Mg', 'O'}
[1840 1840]
[9038 9040]



{'Pb', 'N', 'B', 'Cr', 'Cl', 'C'}
[1840 1840 1840 1840 1840 1840]
[ 224  449 2640 3002 4012 8710]

{'H', 'Tl'}
[1840 1840]
[ 226 1089]

{'H', 'O'}
[1840 1840]
[ 258 1091]

{'U', 'N', 'As', 'Sb', 'Bi', 'Ta', 'Nb', 'P', 'V', 'I', 'W'}
[1840 1840 1840 1840 1840 1840 1840 1840 1840 1840 1840]
[ 229  230  231  232  233  239  324  450  748 8323 9029]

{'K', 'Na'}
[1840 1840]
[363 628]

{'K', 'Na'}
[1840 1840]
[365 629]

{'Cl', 'Br'}
[1840 1840]
[3003 7853]

{'K', 'Na', 'Cl'}
[1840 1840 1840]
[ 366  630 3004]

{'S', 'N', 'Sb', 'Br', 'P', 'I', 'Cl'}
[1840 1840 1840 1840 1840 1840 1840]
[ 240  244  257  452  749 3005 7854]

{'S', 'N', 'Sb', 'Br', 'P', 'I', 'Cl'}
[1840 1840 1840 1840 1840 1840 1840]
[ 241  245  258  453  750 3006 7855]

{'N', 'Cl'}
[1840 1840]
[ 454 3007]

{'K', 'H'}
[1840 1840]
[ 631 1095]

{'S', 'Cr', 'Bi', 'Fe', 'H'}
[1840 1840 1840 1840 1840]
[ 247 1096 2121 2641 8324]

{'S', 'K', 'Ag', 'H', 'Rb', 'Hg'}
[1840 1840 1840 1840 1840 1840]
[ 249  255  632  858 1098 9441]

{'O', '

{'I', 'Br'}
[1840 1840]
[1054 7878]

{'As', 'O', 'C', 'Te'}
[1840 1840 1840 1840]
[1058 1064 4083 9049]

{'O', 'S'}
[1840 1840]
[1061 1068]

{'P', 'H'}
[1840 1840]
[1062 1248]

{'N', 'C'}
[1840 1840]
[1129 4085]

{'C', 'S', 'Si'}
[1840 1840 1840]
[1075 1078 4086]

{'S', 'Rh', 'Cu', 'Mn', 'Cr', 'Al', 'C', 'Si'}
[1840 1840 1840 1840 1840 1840 1840 1840]
[1083 1086 1088 1146 2469 2664 4087 9258]

{'P', 'S'}
[1840 1840]
[1087 1091]

{'K', 'O'}
[1840 1840]
[1096 1165]

{'K', 'O'}
[1840 1840]
[1098 1166]

{'N', 'C'}
[1840 1840]
[1130 4089]

{'H', 'Li'}
[1840 1840]
[1155 1266]

{'N', 'B'}
[1840 1840]
[1131 8720]

{'As', 'Bi', 'Sb', 'S'}
[1840 1840 1840 1840]
[1095 1098 8330 9050]

{'Se', 'S'}
[1840 1840]
[1097 1100]

{'K', 'H', 'Na'}
[1840 1840 1840]
[1122 1168 1275]

{'Bi', 'Sb'}
[1840 1840]
[1106 8331]

{'P', 'Br', 'Cl'}
[1840 1840 1840]
[1107 3046 7881]

{'K', 'H', 'Ag', 'Na'}
[1840 1840 1840 1840]
[1124 1169 1277 9447]

{'Ba', 'Ca'}
[1840 1840]
[3731 8605]

{'H', 'Na'}
[1840 1840]
[1125 1

{'Cl', 'H'}
[1840 1840]
[2919 3149]

{'V', 'O'}
[1840 1840]
[2860 2865]

{'Cl', 'H', 'O', 'Hg'}
[1840 1840 1840 1840]
[2879 2904 2920 3155]

{'H', 'O'}
[1840 1840]
[2885 2921]

{'H', 'Hg'}
[1840 1840]
[2923 2926]

{'H', 'Hg'}
[1840 1840]
[2924 2927]

{'O', 'C'}
[1840 1840]
[2928 4331]

{'O', 'C'}
[1840 1840]
[2933 4338]

{'Cl', 'Br'}
[1840 1840]
[3189 7928]

{'Cl', 'I', 'Br'}
[1840 1840 1840]
[2960 3209 7929]

{'C', 'Se', 'S', 'Hg'}
[1840 1840 1840 1840]
[2974 2978 3033 4348]

{'Cl', 'O'}
[1840 1840]
[2993 3216]

{'Cl', 'O'}
[1840 1840]
[2994 3217]

{'Cl', 'Br'}
[1840 1840]
[3223 7930]

{'Cl', 'O', 'Cd'}
[1840 1840 1840]
[2996 3225 3594]

{'Cl', 'I'}
[1840 1840]
[3028 3226]

{'O', 'Pb'}
[1840 1840]
[2997 3002]

{'Cl', 'O'}
[1840 1840]
[3004 3235]

{'P', 'As'}
[1840 1840]
[3026 9091]

{'O', 'Se', 'S'}
[1840 1840 1840]
[3035 3036 3037]

{'C', 'Pb'}
[1840 1840]
[3045 4357]

{'Cl', 'H'}
[1840 1840]
[3070 3265]

{'Cl', 'C'}
[1840 1840]
[3272 4373]

{'Cl', 'Br'}
[1840 1840]
[3282 7931]

{'C'

{'O', 'S'}
[1840 1840]
[4214 4215]

{'O', 'C'}
[1840 1840]
[4219 4784]

{'N', 'C'}
[1840 1840]
[4223 4793]

{'O', 'S'}
[1840 1840]
[4224 4225]

{'N', 'O', 'C'}
[1840 1840 1840]
[4228 4233 4802]

{'O', 'C'}
[1840 1840]
[4231 4805]

{'N', 'C'}
[1840 1840]
[4237 4817]

{'H', 'C'}
[1840 1840]
[4247 4838]

{'O', 'C'}
[1840 1840]
[4238 4839]

{'N', 'C'}
[1840 1840]
[4247 4880]

{'Cl', 'Br'}
[1840 1840]
[4377 7963]

{'H', 'C'}
[1840 1840]
[4275 4988]

{'O', 'C'}
[1840 1840]
[4275 4991]

{'K', 'Na'}
[1840 1840]
[4280 4285]

{'H', 'C', 'S'}
[1840 1840 1840]
[4290 4292 5006]

{'O', 'C'}
[1840 1840]
[4292 5009]

{'H', 'C'}
[1840 1840]
[4297 5014]

{'K', 'Na'}
[1840 1840]
[4315 4316]

{'Cl', 'H'}
[1840 1840]
[4326 4382]

{'Cl', 'H'}
[1840 1840]
[4327 4383]

{'Cl', 'H'}
[1840 1840]
[4328 4384]

{'Cl', 'H'}
[1840 1840]
[4331 4386]

{'Cl', 'H', 'O'}
[1840 1840 1840]
[4328 4334 4387]

{'H', 'C'}
[1840 1840]
[4338 5072]

{'N', 'C'}
[1840 1840]
[4344 5076]

{'Cl', 'H'}
[1840 1840]
[4355 4395]

{'Cl', 'H

{'H', 'O'}
[1840 1840]
[4967 4976]

{'H', 'O'}
[1840 1840]
[4976 4980]

{'H', 'O'}
[1840 1840]
[4981 4982]

{'H', 'O'}
[1840 1840]
[4988 4991]

{'H', 'O'}
[1840 1840]
[5006 5009]

{'K', 'Li'}
[1840 1840]
[5007 5008]

{'H', 'O'}
[1840 1840]
[5016 5025]

{'N', 'C'}
[1840 1840]
[5023 5637]

{'N', 'C'}
[1840 1840]
[5024 5638]

{'N', 'C'}
[1840 1840]
[5025 5639]

{'C', 'Cu', 'S'}
[1840 1840 1840]
[5020 5046 5640]

{'N', 'O'}
[1840 1840]
[5033 5035]

{'H', 'O'}
[1840 1840]
[5056 5059]

{'H', 'O'}
[1840 1840]
[5067 5068]

{'O', 'C'}
[1840 1840]
[5072 5713]

{'Cl', 'H', 'C'}
[1840 1840 1840]
[5074 5106 5728]

{'Cl', 'H', 'O', 'C'}
[1840 1840 1840 1840]
[5073 5075 5107 5729]

{'H', 'O'}
[1840 1840]
[5075 5076]

{'H', 'C'}
[1840 1840]
[5077 5737]

{'O', 'C'}
[1840 1840]
[5078 5743]

{'H', 'C', 'Br'}
[1840 1840 1840]
[5086 5760 7991]

{'O', 'C'}
[1840 1840]
[5086 5765]

{'H', 'C'}
[1840 1840]
[5089 5766]

{'Cl', 'H'}
[1840 1840]
[5091 5108]

{'O', 'S'}
[1840 1840]
[5090 5092]

{'H', 'C'}
[1840 18

{'N', 'O'}
[1840 1840]
[5644 5646]

{'Cl', 'H'}
[1840 1840]
[5713 5788]

{'Cl', 'H'}
[1840 1840]
[5716 5789]

{'Cl', 'H'}
[1840 1840]
[5718 5790]

{'N', 'H'}
[1840 1840]
[5711 5719]

{'Cl', 'H', 'O'}
[1840 1840 1840]
[5713 5726 5791]

{'Cl', 'H'}
[1840 1840]
[5728 5792]

{'Cl', 'H'}
[1840 1840]
[5729 5793]

{'Cl', 'O'}
[1840 1840]
[5715 5794]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[5732 5795 8019]

{'H', 'O'}
[1840 1840]
[5716 5734]

{'H', 'O'}
[1840 1840]
[5717 5735]

{'N', 'O', 'Br', 'Cl', 'H'}
[1840 1840 1840 1840 1840]
[5718 5721 5736 5796 8020]

{'N', 'C'}
[1840 1840]
[5722 6226]

{'Cl', 'Br'}
[1840 1840]
[5798 8021]

{'N', 'O'}
[1840 1840]
[5729 5732]

{'N', 'H'}
[1840 1840]
[5733 5737]

{'O', 'C'}
[1840 1840]
[5735 6227]

{'H', 'C'}
[1840 1840]
[5739 6230]

{'H', 'C'}
[1840 1840]
[5743 6231]

{'H', 'C'}
[1840 1840]
[5744 6232]

{'N', 'C'}
[1840 1840]
[5738 6233]

{'N', 'C'}
[1840 1840]
[5741 6235]

{'H', 'C'}
[1840 1840]
[5746 6236]

{'H', 'C'}
[1840 1840]
[5747 6237]

{'H', 'O'}
[

{'K', 'Na'}
[1840 1840]
[6280 6281]

{'H', 'O'}
[1840 1840]
[6291 6304]

{'N', 'H', 'O'}
[1840 1840 1840]
[6292 6294 6305]

{'H', 'O'}
[1840 1840]
[6293 6307]

{'H', 'C'}
[1840 1840]
[6321 6613]

{'Cl', 'H'}
[1840 1840]
[6323 6424]

{'H', 'O'}
[1840 1840]
[6302 6325]

{'H', 'C'}
[1840 1840]
[6330 6614]

{'H', 'O'}
[1840 1840]
[6306 6335]

{'O', 'Br', 'Cl', 'H', 'C'}
[1840 1840 1840 1840 1840]
[6307 6337 6425 6615 8046]

{'N', 'O', 'Br', 'Cl', 'H'}
[1840 1840 1840 1840 1840]
[6309 6312 6338 6426 8047]

{'H', 'Br'}
[1840 1840]
[6339 8048]

{'N', 'H', 'O', 'C'}
[1840 1840 1840 1840]
[6310 6313 6340 6616]

{'H', 'O'}
[1840 1840]
[6311 6341]

{'H', 'O'}
[1840 1840]
[6313 6343]

{'H', 'O'}
[1840 1840]
[6314 6344]

{'H', 'O'}
[1840 1840]
[6315 6345]

{'H', 'O', 'C'}
[1840 1840 1840]
[6323 6348 6621]

{'H', 'O', 'C'}
[1840 1840 1840]
[6324 6349 6622]

{'N', 'H', 'O', 'Br'}
[1840 1840 1840 1840]
[6325 6328 6350 8049]

{'H', 'O'}
[1840 1840]
[6327 6351]

{'H', 'O'}
[1840 1840]
[6330 6353]

{'N',

{'H', 'O'}
[1840 1840]
[6778 6788]

{'H', 'O', 'C'}
[1840 1840 1840]
[6779 6789 7043]

{'H', 'C'}
[1840 1840]
[6791 7044]

{'H', 'O', 'C'}
[1840 1840 1840]
[6792 6797 7045]

{'Cl', 'H'}
[1840 1840]
[6836 6889]

{'Cl', 'H'}
[1840 1840]
[6840 6891]

{'Ir', 'Co'}
[1840 1840]
[6844 6860]

{'Co', 'Fe'}
[1840 1840]
[6845 6861]

{'Co', 'Fe'}
[1840 1840]
[6868 6870]

{'Cl', 'H'}
[1840 1840]
[6880 6892]

{'Cl', 'H'}
[1840 1840]
[6883 6894]

{'C', 'S'}
[1840 1840]
[6876 7072]

{'O', 'S'}
[1840 1840]
[6878 6880]

{'P', 'Sb', 'As'}
[1840 1840 1840]
[6886 6887 9117]

{'Cl', 'H'}
[1840 1840]
[6892 6898]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[6899 6903 8059]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[6917 6967 8060]

{'H', 'O'}
[1840 1840]
[6916 6918]

{'H', 'C'}
[1840 1840]
[6920 7088]

{'O', 'C'}
[1840 1840]
[6920 7089]

{'N', 'O'}
[1840 1840]
[6925 6927]

{'Cl', 'I', 'O', 'H'}
[1840 1840 1840 1840]
[6927 6929 6941 6968]

{'N', 'O', 'C'}
[1840 1840 1840]
[6931 6933 7097]

{'N', 'C'}
[1840 1840]
[6935 7098]

{'H', 'C'}
[1840 1840]
[8041 8053]

{'H', 'O'}
[1840 1840]
[8046 8050]

{'H', 'Br'}
[1840 1840]
[8056 8196]

{'N', 'C'}
[1840 1840]
[8055 8064]

{'N', 'C'}
[1840 1840]
[8057 8066]

{'H', 'Br'}
[1840 1840]
[8064 8197]

{'P', 'As'}
[1840 1840]
[8069 9121]

{'H', 'Br'}
[1840 1840]
[8071 8200]

{'H', 'Br'}
[1840 1840]
[8079 8202]

{'H', 'Br'}
[1840 1840]
[8080 8203]

{'H', 'Br'}
[1840 1840]
[8084 8204]

{'H', 'O', 'Br'}
[1840 1840 1840]
[8114 8115 8210]

{'O', 'Br'}
[1840 1840]
[8118 8211]

{'H', 'C'}
[1840 1840]
[8123 8130]

{'O', 'C'}
[1840 1840]
[8124 8141]

{'H', 'O'}
[1840 1840]
[8125 8128]

{'H', 'Br'}
[1840 1840]
[8129 8212]

{'H', 'O'}
[1840 1840]
[8126 8132]

{'H', 'O'}
[1840 1840]
[8127 8133]

{'H', 'O', 'Br'}
[1840 1840 1840]
[8132 8136 8213]

{'H', 'O'}
[1840 1840]
[8143 8144]

{'H', 'O'}
[1840 1840]
[8145 8146]

{'C', 'Tl'}
[1840 1840]
[8153 8181]

{'Cl', 'Br'}
[1840 1840]
[8178 8224]

{'H', 'Br'}
[1840 1840]
[8183 8241]

{'Cl', 'Br'}
[1840 1840]
[8185 8242]

{'H', 'O', 'Br'}


{'O', 'S'}
[1840 1840]
[2806 2809]

{'Cr', 'S'}
[1840 1840]
[2824 2825]

{'P', 'S'}
[1840 1840]
[2841 2842]

{'Cl', 'I'}
[1840 1840]
[2899 3226]

{'Cl', 'O'}
[1840 1840]
[2953 3312]

{'Cl', 'S'}
[1840 1840]
[2985 3322]

{'Pb', 'Cu', 'Hg'}
[1840 1840 1840]
[2997 3037 3120]

{'O', 'Pb'}
[1840 1840]
[3000 3008]

{'O', 'Hg'}
[1840 1840]
[3031 3038]

{'Cs', 'Li', 'K', 'Ag', 'Na', 'Rb', 'Hg'}
[1840 1840 1840 1840 1840 1840 1840]
[3376 3378 3379 3381 3385 3438 9505]

{'Bi', 'C'}
[1840 1840]
[4648 8359]

{'O', 'S'}
[1840 1840]
[3507 3514]

{'Cl', 'Br'}
[1840 1840]
[3576 8109]

{'O', 'S'}
[1840 1840]
[3625 3631]

{'O', 'S'}
[1840 1840]
[3626 3632]

{'O', 'S'}
[1840 1840]
[3790 3791]

{'O', 'S'}
[1840 1840]
[3792 3793]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[3878 3988 8110]

{'O', 'S'}
[1840 1840]
[3824 3825]

{'O', 'S'}
[1840 1840]
[3831 3832]

{'H', 'O', 'Br', 'Cl', 'I'}
[1840 1840 1840 1840 1840]
[3840 3855 3887 3989 8111]

{'Cl', 'H'}
[1840 1840]
[3891 3990]

{'Cl', 'H'}
[1840 1840]
[3892 3991]

{'Cl', 'H'}
[1840 1840]
[5726 5802]

{'Cl', 'H'}
[1840 1840]
[5729 5803]

{'N', 'H'}
[1840 1840]
[5741 5747]

{'Cl', 'H'}
[1840 1840]
[5765 5805]

{'Cl', 'H'}
[1840 1840]
[5791 5807]

{'H', 'Ag'}
[1840 1840]
[5822 9517]

{'H', 'Br'}
[1840 1840]
[5845 8138]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[5857 6012 8139]

{'H', 'Br'}
[1840 1840]
[5881 8140]

{'N', 'H'}
[1840 1840]
[5828 5883]

{'Cl', 'H'}
[1840 1840]
[5893 6014]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[5895 6015 8141]

{'N', 'I', 'Br', 'H'}
[1840 1840 1840 1840]
[5843 5852 5896 8142]

{'H', 'Br'}
[1840 1840]
[5905 8143]

{'O', 'C'}
[1840 1840]
[5864 6439]

{'O', 'C'}
[1840 1840]
[5873 6441]

{'N', 'H'}
[1840 1840]
[5886 5917]

{'N', 'H'}
[1840 1840]
[5887 5918]

{'N', 'H'}
[1840 1840]
[5888 5919]

{'H', 'Br'}
[1840 1840]
[5922 8144]

{'O', 'C'}
[1840 1840]
[5898 6449]

{'O', 'C'}
[1840 1840]
[5899 6450]

{'N', 'H'}
[1840 1840]
[5914 5928]

{'Cl', 'H'}
[1840 1840]
[5934 6016]

{'O', 'S'}
[1840 1840]
[5915 5917]

{'O', 'C'}
[1840 1840]
[

{'P', 'O', 'B', 'Cl'}
[1840 1840 1840 1840]
[ 115  212 3225 8744]

{'O', 'S'}
[1840 1840]
[ 57 213]

{'O', 'S'}
[1840 1840]
[ 58 215]

{'O', 'S'}
[1840 1840]
[ 63 243]

{'Cl', 'O'}
[1840 1840]
[ 275 3447]

{'P', 'Co', 'Cr'}
[1840 1840 1840]
[ 117 2704 2841]

{'N', 'As', 'S'}
[1840 1840 1840]
[  63  476 9140]

{'P', 'As'}
[1840 1840]
[ 118 9141]

{'Cl', 'I', 'F'}
[1840 1840 1840]
[ 782 2258 3226]

{'O', 'S'}
[1840 1840]
[ 86 277]

{'Cl', 'I', 'Br'}
[1840 1840 1840]
[ 789 3321 8207]

{'I', 'H'}
[1840 1840]
[ 783 1248]

{'O', 'Se', 'S'}
[1840 1840 1840]
[100 103 239]

{'O', 'S'}
[1840 1840]
[118 259]

{'O', 'S'}
[1840 1840]
[119 260]

{'O', 'S'}
[1840 1840]
[123 228]

{'P', 'Ni', 'Cu', 'Fe'}
[1840 1840 1840 1840]
[ 144  320 2160 2508]

{'Cl', 'Br', 'F'}
[1840 1840 1840]
[2259 3232 8160]

{'O', 'Zn'}
[1840 1840]
[136 249]

{'H', 'Na'}
[1840 1840]
[ 397 1251]

{'K', 'H'}
[1840 1840]
[ 675 1253]

{'H', 'Na'}
[1840 1840]
[ 398 1255]

{'In', 'Rh'}
[1840 1840]
[238 707]

{'P', 'S'}
[1840 1840]


{'Cl', 'H'}
[1840 1840]
[4615 4648]

{'Cl', 'H'}
[1840 1840]
[4607 4646]

{'Cl', 'H'}
[1840 1840]
[4608 4647]

{'Cl', 'H'}
[1840 1840]
[4607 4643]

{'Cl', 'H'}
[1840 1840]
[4615 4644]

{'N', 'C'}
[1840 1840]
[4605 5975]

{'H', 'C'}
[1840 1840]
[4623 5988]

{'Cl', 'H'}
[1840 1840]
[4630 4648]

{'Cl', 'H', 'Br'}
[1840 1840 1840]
[4627 4646 8185]

{'Cl', 'H'}
[1840 1840]
[4628 4647]

{'Cl', 'H'}
[1840 1840]
[4637 4648]

{'Cl', 'H'}
[1840 1840]
[4642 4648]

{'Cl', 'H'}
[1840 1840]
[4747 5118]

{'Cl', 'H'}
[1840 1840]
[4880 5123]

{'Cl', 'H'}
[1840 1840]
[4822 5122]

{'Cl', 'H'}
[1840 1840]
[4753 5119]

{'H', 'O'}
[1840 1840]
[4674 4727]

{'Mg', 'H'}
[1840 1840]
[4680 4747]

{'Cl', 'H'}
[1840 1840]
[4753 5116]

{'Cl', 'H'}
[1840 1840]
[4805 5117]

{'H', 'O'}
[1840 1840]
[4686 4773]

{'H', 'C'}
[1840 1840]
[4808 6025]

{'H', 'O'}
[1840 1840]
[4698 4810]

{'H', 'O'}
[1840 1840]
[4700 4817]

{'H', 'O'}
[1840 1840]
[4701 4872]

{'H', 'O'}
[1840 1840]
[4701 4822]

{'H', 'C'}
[1840 1840]
[4825 60

{'Cl', 'H'}
[1840 1840]
[5746 5808]

{'H', 'C'}
[1840 1840]
[5746 6573]

{'Cl', 'H'}
[1840 1840]
[5765 5807]

{'Cl', 'H'}
[1840 1840]
[5779 5808]

{'Cl', 'H'}
[1840 1840]
[5857 6017]

{'H', 'O'}
[1840 1840]
[5820 5888]

{'Cl', 'H'}
[1840 1840]
[5893 6018]

{'H', 'O'}
[1840 1840]
[5821 5896]

{'H', 'O'}
[1840 1840]
[5822 5898]

{'H', 'O'}
[1840 1840]
[5826 5922]

{'H', 'O'}
[1840 1840]
[5826 5905]

{'H', 'O'}
[1840 1840]
[5827 5906]

{'H', 'O'}
[1840 1840]
[5839 5910]

{'H', 'O'}
[1840 1840]
[5840 5911]

{'H', 'O'}
[1840 1840]
[5841 5912]

{'H', 'O'}
[1840 1840]
[5842 5913]

{'H', 'O'}
[1840 1840]
[5846 5914]

{'N', 'H'}
[1840 1840]
[5874 5918]

{'H', 'O'}
[1840 1840]
[5871 5922]

{'O', 'C'}
[1840 1840]
[5883 6612]

{'O', 'C'}
[1840 1840]
[5884 6613]

{'N', 'H', 'C'}
[1840 1840 1840]
[5909 5928 6615]

{'H', 'C'}
[1840 1840]
[5940 6915]

{'Cl', 'H'}
[1840 1840]
[5929 6019]

{'H', 'C'}
[1840 1840]
[5942 6916]

{'H', 'C'}
[1840 1840]
[5935 6619]

{'H', 'C'}
[1840 1840]
[5936 6622]

{'H', '

{'H', 'C'}
[1840 1840]
[6598 7336]

{'H', 'O'}
[1840 1840]
[6626 6649]

{'H', 'O'}
[1840 1840]
[6626 6643]

{'H', 'C'}
[1840 1840]
[6644 7089]

{'H', 'O', 'C'}
[1840 1840 1840]
[6634 6648 7097]

{'O', 'C'}
[1840 1840]
[6636 7276]

{'H', 'O', 'C'}
[1840 1840 1840]
[6635 6649 7098]

{'H', 'O'}
[1840 1840]
[6636 6651]

{'H', 'O'}
[1840 1840]
[6640 6655]

{'H', 'C'}
[1840 1840]
[6656 7105]

{'H', 'C'}
[1840 1840]
[6668 7344]

{'H', 'C'}
[1840 1840]
[6657 7108]

{'H', 'C'}
[1840 1840]
[6668 7245]

{'O', 'C'}
[1840 1840]
[6652 7112]

{'O', 'C'}
[1840 1840]
[6659 7290]

{'H', 'O'}
[1840 1840]
[6657 6667]

{'H', 'O', 'C'}
[1840 1840 1840]
[6659 6668 7116]

{'O', 'C'}
[1840 1840]
[6664 7121]

{'Cl', 'H'}
[1840 1840]
[6708 6898]

{'H', 'O'}
[1840 1840]
[6683 6739]

{'H', 'O'}
[1840 1840]
[6683 6718]

{'H', 'O'}
[1840 1840]
[6683 6689]

{'H', 'O'}
[1840 1840]
[6684 6732]

{'H', 'O'}
[1840 1840]
[6684 6697]

{'H', 'O'}
[1840 1840]
[6685 6768]

{'H', 'O'}
[1840 1840]
[6685 6758]

{'Cl', 'H'}
[1840 

{'O', 'S'}
[1840 1840]
[9011 9012]

{'O', 'Cu', 'S'}
[1840 1840 1840]
[9023 9029 9083]

{'K', 'Ag'}
[1840 1840]
[9044 9536]

{'K', 'H', 'Na'}
[1840 1840 1840]
[9036 9045 9054]

{'O', 'S'}
[1840 1840]
[9035 9036]

{'O', 'S'}
[1840 1840]
[9079 9080]

{'Mg', 'Ba'}
[1840 1840]
[9091 9138]

{'Sr', 'Ca'}
[1840 1840]
[9162 9163]

{'Cl', 'I', 'Br'}
[1840 1840 1840]
[9168 9169 9172]

{'Cl', 'I', 'Br'}
[1840 1840 1840]
[9185 9221 9236]

{'H', 'O'}
[1840 1840]
[9229 9230]

{'H', 'Na'}
[1840 1840]
[9255 9261]

{'Se', 'S'}
[1840 1840]
[9275 9276]

{'Se', 'S'}
[1840 1840]
[9277 9278]

{'N', 'C'}
[1840 1840]
[9378 9396]

{'O', 'S'}
[1840 1840]
[9434 9440]

{'H', 'Ag'}
[1840 1840]
[9450 9539]

{'Cl', 'Br'}
[1840 1840]
[9505 9520]

{'O', 'S'}
[1840 1840]
[9535 9536]



In [None]:
def getCommonal(Rs,rank,cmpnds,years,subsID,elemDict,useID=False):
    """Get commonalities: for each R(n) find all elements X such that compound R-Xn exists in dataset.

    Parameters
    ----------
    Rs : iterable of 1-D arrays
        Each entry holds an R vector followed by its subindex n as the last entry.
    rank : int
        Identifier of the calling process. Only rank 0 prints progress messages,
        and rank is used as the `_P{rank}` suffix of the saved files.
    cmpnds : 2-D array
        One row per compound, with the same number of columns as an R vector.
        (Presumably atom counts per element -- confirm against caller.)
    years, subsID : 1-D arrays
        Indexed with the same row masks as `cmpnds`, so they must be aligned with its rows.
    elemDict : mapping
        Maps a column index of `cmpnds` to the value stored in the commonality lists
        (presumably the element symbol -- confirm against caller).
    useID : bool
        Forwarded to `getTable`. When True, an additional array is saved in which
        every ID of the table array is replaced by its year.

    Returns
    -------
    (Comm_list, R_list)
        Comm_list contains one list per retained R(n) with the elements X such that
        R-Xn exists in the dataset; R_list contains the corresponding R(n) vectors.
        Only R(n) matched by more than one compound are retained.

    Side effects
    ------------
    Saves `TablesID`, `RVector` and (if useID) `TablesYears` arrays under ./Data/.
    File names use the notebook-level variable `NMax`, and tables are built by the
    notebook-level function `getTable`.
    """

    if rank==0:   print("\n Finding commonalities (finding sets for each (R,n) pair)...\n")

    Comm_list = []
    R_list = []
    Table_list = []

    # Total number of atoms per compound; it does not depend on R, so compute it once.
    sumCmpnds = cmpnds.sum(axis=1)

    for i,R_ in enumerate(Rs):
        if i%1000==0 and rank==0:       print( f"\t{i}th R evaluated..." )

        R = R_[:-1] #The actual R (the last entry of R_ is the subindex n)

        # Restrict the search to compounds fulfilling two conditions based on R:
        # 1. R is contained in compound
        cond1 = ((cmpnds - R) >= 0).all(axis=1)
        # 2. sum of atoms in cmpnd == sum of atoms in R_ (sum(R) + n)
        cond2 = (sumCmpnds == R_.sum())

        candidates = cond1 & cond2
        subsetCmpnds = cmpnds[candidates]
        curr_years = years[candidates]
        curr_subsID = subsID[candidates]

        # Now select only those cmpnds where residual is due to one element only (X_n).
        # The same mask is applied to the three arrays so that they stay aligned row by row.
        singleX = ((subsetCmpnds - R)!=0).sum(axis=1)==1
        subsetCmpnds = subsetCmpnds[singleX]
        curr_years = curr_years[singleX]
        curr_subsID = curr_subsID[singleX]

        if subsetCmpnds.shape[0] > 1:
            # Column index of the only non-zero residual of each compound -> element X
            curr_list = [elemDict[x] for x in (subsetCmpnds - R).nonzero()[1]]
            Table_list.append(getTable(curr_list,curr_years,curr_subsID,useID=useID))
            Comm_list.append(curr_list)
            R_list.append(R_)

    if rank==0:    print("Saving...")
    table_list_arr = np.array(Table_list,dtype=np.intc)
    np.save(f'./Data/TablesID_NMax{NMax}_P{rank}.npy',table_list_arr)
    np.save(f'./Data/RVector_NMax{NMax}_P{rank}.npy',np.array(R_list,dtype=np.short))

    # Produce a second array exchanging ID for year, just a mapping
    if useID:
        if rank==0:  print("\n\tProducing array filled with years from ID array...")

        # Mapping going from ID to year
        mapping = dict(zip(list(subsID),list(years)))
        mapping[-1] = -1   # -1 maps to itself (presumably the "no entry" marker of getTable -- confirm)
        # Apply mapping
        yearArray = np.vectorize(mapping.__getitem__)(table_list_arr)
        np.save(f'./Data/TablesYears_NMax{NMax}_P{rank}.npy',np.array(yearArray,dtype=np.short))

    return Comm_list,R_list  #This list contains lists (one for each R) of elements X such that R-X exist in dataset.

In [None]:
def distribRs_forUnique(Rs,max_n):
    """Partition the rows of Rs into a list of smaller arrays.

    Rows are first grouped by the value of their last column (n = 1 ... max_n-1).
    Any group whose size in MB (nbytes/1e6) exceeds the notebook-level threshold
    `maxWeightArray` is split further by the values of column 0, then column 1,
    and so on, until every piece is below the threshold.

    Returns the list of resulting arrays. Groups of the first level are always
    kept; pieces produced by the recursive splitting are kept only if they
    contain more than one row.
    """
    numBins = 5    # Maximum number of pieces a chunk is split into per column
    binWidth = 2   # Width (in column values) of each bin

    def split_chunk(chunk,i):
        # Bin the rows of chunk by column i: [0,2), [2,4), [4,6), [6,8) and [8,inf).
        # Rows with a negative value in column i fall into no bin.
        column = chunk[:,i]
        pieces = []
        for b in range(numBins):
            low = b*binWidth
            if b == numBins-1:
                selector = column>=low          # Last bin is open-ended
            else:
                selector = (column>=low) & (column<low+binWidth)
            pieces.append(chunk[selector])

        # Pieces that are still too heavy are split again using the next column
        kept = []
        for piece in pieces:
            if piece.nbytes/1e6 > maxWeightArray:
                print("\t** Found a very large chunk! (Inside recursive function)")
                kept.extend(split_chunk(piece,i+1))
            elif piece.shape[0]>1:
                kept.append(piece)   # Pieces with at most one row are discarded
        return kept

    # First level: one group per value of n (last column)
    groups_by_n = [Rs[Rs[:,-1]==n] for n in range(1,max_n)]

    print("\nStarting recursive splitting of Rs")

    chunks = []
    for group in groups_by_n:
        tooHeavy = group.nbytes/1e6 > maxWeightArray
        if not tooHeavy:
            chunks.append(group)
            continue
        print("\t** Found a very large chunk!")
        chunks += split_chunk(group,0)

    print("Ended recursion")

    return chunks


In [141]:
# Densify `sparse` (presumably the scipy sparse compound matrix built earlier -- confirm)
# and drop duplicated rows; then build the Rs from the unique compounds with findRs.
uniq_cmpnds = np.unique(sparse.toarray(),axis=0)
Rs = findRs(uniq_cmpnds)

# Collapse repeated rows of Rs, keeping in `c` how many times each distinct row occurred.
Rs,c=np.unique(Rs,axis=0,return_counts=True)
# Shape of the sub-array holding only the rows that occurred more than once.
Rs[c>1].shape

(6114, 61)