In [423]:
import multiprocessing as mp
import re
import sys
from itertools import chain

import numpy as np
import pandas as pd
import scipy.sparse as sp
import scipy.sparse.linalg as sp_linalg

## Depending on the dataset (and the memory we have) you may use the following dtypes:

- np.ubyte: 1 byte Unsigned char ( 0 ... 255 )
- np.byte : 1 byte Signed char ( -128 ... 127 )
- np.short: 2 bytes, C short ( -32,768 ... 32,767 )  <-- More than enough for atom counts; this is the dtype used for the sparse matrices below

In [13]:
def getVec_sparse_v2(tx,elemList): #Tx = molecular formula, e.g. C4H2ClBr
    """Parse a molecular formula into (column indices, atom counts).

    Parameters
    ----------
    tx : str
        Molecular formula, e.g. 'C4H2ClBr'. A symbol without a subindex
        counts as 1. Non-integer subindices such as 'O12.5' are recognised.
    elemList : list of str
        Known element symbols; the position of a symbol is its column index.

    Returns
    -------
    (col, data) or None
        `col` holds one column index per distinct element (in order of first
        appearance) and `data` the matching atom counts as floats. Counts of
        a symbol that appears several times in the formula are added up so a
        row never carries duplicate column indices. None is returned when any
        subindex is not a whole number.

    Raises
    ------
    ValueError
        If a token cannot be parsed as symbol+subindex or the symbol is not
        in `elemList`.
    """
    # Split before every capital letter except the first character: 'H2O' -> ['H2','O']
    tokens = re.split(r"(?<!^)(?=[A-Z])",tx)

    # [A-Za-z] rather than [A-z]: the latter also matches '[', '\', ']', '^', '_' and '`'
    has_subindex = re.compile(r"[A-Za-z]*(?:[0-9]*[.])?[0-9]+")
    symbol_count = re.compile(r"([A-Za-z]+)((?:[0-9]*[.])?[0-9]+)")

    counts = {}          # column index -> accumulated atom count (insertion ordered)
    all_integer = True
    for token in tokens:
        if not has_subindex.match(token):
            token += '1'   # No subindex given: 'O' -> 'O1'

        found = symbol_count.search(token)
        if found is None:
            raise ValueError(f"Cannot parse {token!r} in formula {tx!r}")

        column = elemList.index(found.group(1))   # ValueError if the symbol is unknown
        amount = float(found.group(2))
        all_integer = all_integer and amount.is_integer()
        counts[column] = counts.get(column, 0.0) + amount

    # Formulas with fractional subindices are discarded by the caller
    if not all_integer:
        return None
    return list(counts), list(counts.values())

def getElems(DataFile,NMax=None):
    """Return the sorted list of element symbols that occur in the dataset.

    DataFile is read as a header-less, tab-separated table with the columns
    ID, formula and year. NMax specifies number of rows of dataset to be used
    (None reads every row).
    """
    col_names = ['ID','formula','year']
    sep = '\t'

    df = pd.read_csv(DataFile,header=None,sep=sep,nrows=NMax,names=col_names)  #Load data

    formulas = df['formula'].str.strip()   #Remove white spaces at beginning and end of string

    elems = set()
    for cmpnd in formulas:
        # Keep letters only. [^A-Za-z] (not [A-z]) so that '[', ']', '_', '^', ... never
        # end up inside an element symbol.
        letters = re.sub(r'[^A-Za-z]', '', cmpnd)
        # Split before every capital letter except the first one: 'NaCl' -> ['Na','Cl']
        elems.update(symbol for symbol in re.split(r"(?<!^)(?=[A-Z])",letters) if symbol)

    return sorted(elems)  # This returns a list with all sorted elements in dataset

def allVecs_sparse(DataFile,NMax=None):
    """Build the compound-by-element CSR matrix for a dataset file.

    The file is read as a header-less, tab-separated table (ID, formula,
    year), limited to the first NMax rows when NMax is given. Formulas for
    which getVec_sparse_v2 returns None are left out.

    Returns (cmpnds, years, subsID, elemList): the np.short CSR matrix with
    one row per kept formula, the 'year' and 'ID' values of the kept rows,
    and the element list whose order defines the matrix columns.
    """
    table = pd.read_csv(DataFile,
                        header=None,
                        sep='\t',
                        nrows=NMax,
                        names=['ID','formula','year'])  #Load data
    table['formula'] = table['formula'].str.strip()   # Trim surrounding white space

    elemList = getElems(DataFile,NMax)

    # One entry per formula: (col, data), or None for a discarded formula
    parsed = [getVec_sparse_v2(formula,elemList) for formula in table['formula'].values]
    kept_rows = [pos for pos, entry in enumerate(parsed) if entry is not None]
    kept = [parsed[pos] for pos in kept_rows]

    # CSR triplet: row pointer from the row lengths, then flattened columns and values
    row_lengths = [len(cols) for cols, _ in kept]
    row_ptr = np.cumsum([0] + row_lengths)
    flat_cols = np.array([col for cols, _ in kept for col in cols])
    flat_vals = np.array([val for _, vals in kept for val in vals])

    cmpnds = sp.csr_matrix((flat_vals, flat_cols, row_ptr),
                           shape=(len(kept), len(elemList)),
                           dtype=np.short)

    years = table['year'].values[kept_rows]
    subsID = table['ID'].values[kept_rows]

    return cmpnds,years,subsID, elemList

In [112]:
# Build the compound x element matrix from the sample file; the years and IDs
# returned by allVecs_sparse are bound to throw-away names here.
sparse , _ , _1 , elemList = allVecs_sparse('../SendProfGR/sample_w_IDs.csv')
sparse

<11138x60 sparse matrix of type '<class 'numpy.int16'>'
	with 41691 stored elements in Compressed Sparse Row format>

In [35]:
%%timeit

# Timed statement: attribute access to the CSR matrix's stored-values array
a = sparse.data

80.9 ns ± 1.15 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


In [699]:
def findRs(cmpnds):
    """List every reduced compound (R) obtainable from a dense compound array.

    cmpnds is a 2-D array with one row per compound and one column per
    element. For each non-zero entry of a row, atoms of that element are
    removed one at a time; after each removal a copy of the row is stored
    with one extra trailing column that holds how many atoms have been
    removed so far.

    Returns a list of 1-D np.short arrays of length cmpnds.shape[1] + 1
    (an empty list when cmpnds has no rows). Duplicates are NOT filtered
    here; callers run np.unique on the result.
    """
    Rs = []

    cShape = cmpnds.shape[1]
    for c in cmpnds:
        for i in np.nonzero(c)[0]:      #Loop through the non-zero entries
            c_ = np.zeros(cShape+1,dtype=np.short)
            c_[:-1] = c
            n_atoms = int(c_[i])
            for removed in range(1, n_atoms+1):  #Loop through ith element's subindex
                c_[i] -= 1          #Remove one
                c_[-1] = removed    #How many atoms of this element have been removed

                Rs.append(c_.copy())  #Append the compound with a reduced entry (R-Xn-1)

    return Rs

In [718]:
def findRs_sparse(cmpnds,elemList):
    """Sparse counterpart of findRs: build the matrix of reduced compounds.

    cmpnds is a CSR matrix (compounds x elements). For every stored entry of
    every row, atoms are removed one at a time; each step yields a new row
    with the same stored columns as its parent plus one extra last column
    holding the number of atoms removed. Entries that reach zero stay stored
    explicitly. elemList is accepted but not used by this function.

    Returns a CSR matrix of dtype np.short with cmpnds.shape[1] + 1 columns.
    """
    col_idx, values, row_ptr = cmpnds.indices, cmpnds.data, cmpnds.indptr
    n_elems = cmpnds.shape[1]   # Number of elements; also the index of the extra column

    fragments = []   # Stored values of every R (reduced counts + removed-atoms count)
    frag_cols = []   # Column indices of the parent compound, one entry per R

    for start, stop in zip(row_ptr[:-1], row_ptr[1:]):
        cols = col_idx[start:stop]
        counts = values[start:stop]

        for pos in range(cols.shape[0]):
            reduced = counts.copy()
            # Strip the atoms of this element one by one
            for removed in range(1, int(reduced[pos]) + 1):
                reduced[pos] -= 1
                fragments.append(np.append(reduced.copy(), removed))

        # This compound contributed as many Rs as it has atoms in total
        frag_cols.extend([cols] * int(counts.sum()))

    # Assemble the CSR triplet; every R has one more stored entry than its parent
    row_sizes = [len(cols) + 1 for cols in frag_cols]
    new_ptr = np.cumsum([0] + row_sizes)
    new_cols = np.array([col for cols in frag_cols for col in (*cols, n_elems)])
    new_vals = np.array([val for frag in fragments for val in frag])

    return sp.csr_matrix((new_vals, new_cols, new_ptr),
                         shape=(len(fragments), n_elems+1),
                         dtype=np.short)

In [719]:
%%time
# NOTE(review): `uniq_sparse` is not defined in any visible cell — confirm where it comes from.
Rs = findRs_sparse(uniq_sparse,elemList)
Rs

CPU times: user 3.66 s, sys: 51.4 ms, total: 3.71 s
Wall time: 3.72 s


<211425x61 sparse matrix of type '<class 'numpy.int16'>'
	with 1102416 stored elements in Compressed Sparse Row format>

In [720]:
# Persist the sparse Rs matrix to ./sparse.npz
sp.save_npz('./sparse.npz',Rs)

In [721]:
# Dense copy of `uniq_sparse`, used as input for the dense findRs
uniq_arr = uniq_sparse.toarray()

In [723]:
%%time
# Dense run for timing comparison; note this rebinds `Rs` to whatever findRs returns
Rs = findRs(uniq_arr)
Rs

CPU times: user 898 ms, sys: 7.78 ms, total: 905 ms
Wall time: 907 ms


[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int16),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int16),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int16),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1], dtype=int16),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [725]:
# Write the current `Rs` to ./array.npy
np.save('./array.npy',Rs)

In [726]:
# Ratio of two hard-coded numbers; their meaning is not recorded in the notebook — TODO document
25793978/949350

27.170145889292673

In [698]:
%%time
# Unique rows of the densified Rs and how often each occurs (requires Rs to be a sparse matrix here)
a,c = np.unique(Rs.toarray(),axis=0,return_counts=True)
# (total number of Rs, number of distinct Rs that occur more than once)
Rs.shape[0],a[c>1].shape[0]

CPU times: user 2.03 s, sys: 7.64 ms, total: 2.04 s
Wall time: 2.04 s


(211425, 6114)

In [440]:
# Dimensions of Rs
Rs.shape

(211425, 61)

In [339]:
# For i = 0..9, select the rows of Rs where `n == i` and count the distinct rows that repeat.
# NOTE(review): `n` is not defined in any visible cell; presumably one value per row of Rs — confirm.
suma = 0
for i in range(10):
    rs = Rs[n==i]
    #print(rs.shape)
    
    rslil = rs.tolil()
    # Element-wise `+` of the two per-row object arrays, so np.unique compares values and columns together
    rslil_un, c = np.unique(rslil.data + rslil.rows  ,return_counts=True)
    print(sum(c>1))
    rslil_un = rslil_un[c>1]
    
    # Running total of repeated rows over all i
    suma+=rslil_un.shape[0]
    print(i,rslil_un.shape[0])


0
788
821
184
277
31
89
4
13
3


In [334]:
# Dense cross-check: append `n` as an extra column, keep the rows with n == i,
# and print how many distinct rows occur more than once.
# NOTE(review): `n` is not defined in any visible cell — confirm its origin.
for i in range(10):
    rs = np.concatenate([Rs.toarray(),n.reshape(-1,1)],axis=1)[n==i]
    #print(rs.shape)
    a,c = np.unique(rs,axis=0,return_counts=True)
    print(i,a[c>1].shape[0])

0 0
1 2715
2 1874
3 560
4 604
5 92
6 192
7 15
8 35
9 5


In [344]:
# Same dense construction as the loop over i, restricted to n == 9
rs = np.concatenate([Rs.toarray(),n.reshape(-1,1)],axis=1)[n==9]

a,c = np.unique(rs,axis=0,return_counts=True)


# Print the repeated rows as (row, column) value triples
print(sp.lil_matrix(a[c>1]))

  (0, 9)	4
  (0, 20)	11
  (0, 31)	1
  (0, 35)	1
  (0, 60)	9
  (1, 9)	4
  (1, 13)	1
  (1, 35)	1
  (1, 60)	9
  (2, 9)	5
  (2, 13)	1
  (2, 35)	3
  (2, 60)	9
  (3, 9)	6
  (3, 20)	6
  (3, 31)	1
  (3, 60)	9
  (4, 9)	9
  (4, 20)	3
  (4, 31)	4
  (4, 60)	9


In [353]:
# Rows of Rs with n == 9, in LIL format.
# NOTE(review): `n` is not defined in any visible cell — confirm its origin.
rs = Rs[n==9].tolil()

print(rs.shape)

# Use the `rs` built in this cell rather than the `rslil` left over from another cell,
# so the result does not depend on which cell ran last.
# Element-wise `+` combines each row's stored values with its column indices before np.unique.
rslil_un, c = np.unique(rs.data + rs.rows  ,return_index=True)
# With return_index=True, `c` holds first-occurrence indices, not counts
c.shape

(7034, 60)


(7031,)

In [349]:
# Per-row lists of stored values of the LIL matrix
rslil.data

array([list([11, 2, 5]), list([1, 3]), list([5, 2, 4]), ...,
       list([14, 48, 20, 87, 2]), list([15, 2, 15]), list([24, 2, 6])],
      dtype=object)

In [92]:
def findRs(cmpnds):
    """Generate all reduced compounds (Rs) and keep the ones selected by getRepeated.

    cmpnds is a 2-D array with one row per compound and one column per
    element. Each R is a copy of a compound row with atoms of one element
    removed, plus a trailing column holding the number of atoms removed.
    Rs whose removed-atoms count is >= max_n are dropped, the rest are split
    into chunks and filtered in worker processes.

    NOTE(review): `distribRs_forUnique`, `getRepeated` and `size` are not
    defined in any visible cell; presumably they chunk the array, keep the
    repeated rows, and give the worker count — confirm before running.
    """
    # Find all unique Rs
    Rs = []

    cShape = cmpnds.shape[1]
    for c in cmpnds:
        indx = np.nonzero(c)  #Get index of non-zero entries
        for i in indx[0]:     #Loop through these
            c_ = np.zeros(cShape+1,dtype=np.short)
            c_[:-1] = c
            n = int(c_[i])
            for j in range(n):  #Loop through ith element's subindex
                c_[i] -= 1      #Remove one
                c_[-1] = j+1    #How many atoms of this element have been removed

                Rs.append(c_.copy())  #Append the compound with a reduced entry (R-Xn-1)

    # Get only the Rs that were produced more than once (meaning it's guaranteed at least 2 elems per R)

    # Split Rs so np.unique runs in parallel + faster as new subprocesses are much lighter

    Rs = np.array(Rs)
    max_n = 160               # Choose wisely, this may bias results a little (the bigger the better). Read next comment
    Rs = Rs[Rs[:,-1]<max_n]  # Cut Rs by the n. If n>max_n it's very (very) likely no two compounds share it

    Rs_distrib_list = distribRs_forUnique(Rs,max_n)  # Create chunks of Rs for parallel processing

    print("\n\t Let's see how big our chunks are:")
    for r in Rs_distrib_list:
        print(f"{r.shape}, {r.nbytes/1e6} MB")

    with mp.Pool(processes=size) as pool:
        # starts the sub-processes without blocking
        # pass the chunk to each worker process

        R_results = [pool.apply_async(getRepeated,
                                      args=(Rs_i,))
                     for Rs_i in Rs_distrib_list]

        # blocks until all results are fetched
        R_results_get = [r.get() for r in R_results]   # A list of arrays

    # Merge filtered Rs by concatenating the resulting list
    Rs = np.concatenate(R_results_get,axis=0)

    return Rs

# Distinct compound rows of the densified compound matrix
uniq_cmpnds = np.unique(sparse.toarray(),axis=0)

In [104]:
%%time
# Time the dense findRs on the unique compounds and report how many Rs it produced
orig = findRs(uniq_cmpnds)
len(orig)

CPU times: user 1.1 s, sys: 15.8 ms, total: 1.11 s
Wall time: 1.13 s


211425

In [141]:
# Distinct compounds -> all Rs -> distinct Rs with their multiplicity
uniq_cmpnds = np.unique(sparse.toarray(),axis=0)
Rs = findRs(uniq_cmpnds)

Rs,c=np.unique(Rs,axis=0,return_counts=True)
# Shape of the Rs that were produced more than once
Rs[c>1].shape

(6114, 61)

(6114, 61)

In [170]:
# NOTE(review): this unpacks two values, but the findRs_sparse defined in this notebook
# returns a single matrix — this cell presumably ran against a different version; confirm.
Rs1,n = findRs_sparse(uniq_sparse,elemList)
Rs2 = findRs(uniq_cmpnds)

In [183]:
# Print row i of the sparse result (with its `n` value) next to row i of the dense result
i=7

print(Rs1[i].toarray(),n[i])
print()
print(Rs2[i])

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]] 1

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
