In [3]:
import re
import sys
from itertools import chain

import numpy as np
import pandas as pd
import scipy.sparse as sp

## Depending on the dataset (and the available memory), one of the following dtypes can be used:

- np.ubyte: 1 byte Unsigned char ( 0 ... 255 )
- np.byte : 1 byte Signed char ( -128 ... 127 )
- np.short: 2 bytes C short   ( -32,768 ... 32,767 )  <-- This is more than enough

In [68]:
# Dense reference array: 10000 x 1000 cells of one unsigned byte each, all zero to start with.
np_arr = np.zeros((10000,1000),dtype=np.ubyte)
# Coordinates and values of the 11 cells that are written below.
rows = [1,210,241,12,417,71,723,121,73,13,20]
cols = [12,110,221,2,117,81,773,131,23,413,20]
vals = [1,12,21,1,121,81,73,31,123,3,1]

# Fancy indexing: np_arr[rows[k], cols[k]] = vals[k] for every k.
np_arr[rows,cols] = vals

# Size of the dense array object in bytes (10000*1000 data bytes plus the array header).
print(sys.getsizeof(np_arr))

10000112


In [69]:
# Every scipy.sparse container class that is tried on the dense array above.
sparse_forms = [sp.bsr_matrix,sp.coo_matrix,sp.csc_matrix,
                sp.csr_matrix,sp.dia_matrix,sp.dok_matrix,
                sp.lil_matrix]

# Convert the dense array to each format and print sys.getsizeof of the result.
# NOTE(review): sys.getsizeof only accounts for the wrapper object itself, not for the
# numpy buffers it refers to, so these numbers are not the real memory footprint.
# For CSR/CSC use data.nbytes + indices.nbytes + indptr.nbytes instead.
for form in sparse_forms:
    spar = form(np_arr,dtype=np.ubyte)
    print(sys.getsizeof(spar))

64
64
64
64
64
672
64


In [70]:
spar_csr = sp.csr_matrix(np_arr,dtype=np.ubyte)
spar_csc = sp.csc_matrix(np_arr,dtype=np.ubyte)

In [75]:
%%timeit

n = spar_csr[812]

77.9 µs ± 2.93 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [76]:
%%timeit

n = spar_csc[812]

78.5 µs ± 1.11 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [77]:
%%timeit

n = np_arr[812]

193 ns ± 9.45 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [87]:
# 1000 random unsigned bytes used as the vector to subtract in the timings below.
# NOTE: randint's upper bound is exclusive, so the values lie in 0..254, and no seed is set,
# so `subs` differs from run to run.
subs = np.random.randint(0,255,1000,dtype=np.ubyte)
# CSR version of the same vector, for the sparse - sparse timing.
subs_csr = sp.csr_matrix(subs,dtype=np.ubyte)

In [90]:
%%timeit

spar_csr[817] - subs

105 µs ± 2.37 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [91]:
%%timeit

spar_csr[817] - subs_csr

188 µs ± 5.59 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [84]:
%%timeit

spar_csc - subs

4.63 ms ± 72.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [86]:
%%timeit

np_arr - subs

987 µs ± 35.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [93]:
subs[0] = np.array([80],dtype=np.int64)

In [100]:
sp.csr_matrix((1,10))

<1x10 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [114]:
import re
def getVec_sparse(tx,elemList): #Tx = molecular formula, e.g. C4H2ClBr
    """Encode a molecular formula as a 1 x len(elemList) CSR row of atom counts.

    tx       : formula string, e.g. 'C4H2ClBr' or 'C6H16Na3O12.5'.
    elemList : list of element symbols; column j holds the count of elemList[j].

    Returns None as soon as a token with a non-integer subindex is met, otherwise
    a scipy CSR matrix of dtype np.short. A symbol that is not in elemList makes
    list.index raise ValueError.

    NOTE(review): the character class [A-z] also matches the six ASCII characters
    between 'Z' and 'a' ([ \\ ] ^ _ `); it is kept as found.
    """
    symbol_and_count = r"([A-z]+)(([0-9]*[.])?[0-9]+)"   # (symbol)(subindex); subindex may be decimal

    columns = []
    counts = []
    # Cut in front of every capital letter except the first one: 'H2O' -> ['H2', 'O']
    for token in re.split(r"(?<!^)(?=[A-Z])",tx):
        # A token without subindex gets an explicit '1': 'O' -> 'O1'
        if not re.match(r'[A-z]*([0-9]*[.])?[0-9]+',token):
            token = token+'1'
        parts = re.split(symbol_and_count,token)   # ['', symbol, subindex, ...]
        count = float(parts[2])
        if count != int(count):
            return None   # fractional subindex: the whole formula is rejected
        columns.append(elemList.index(parts[1]))
        counts.append(count)

    # Everything goes in row 0. A repeated element (e.g. C3H2COOH) appears as a repeated
    # (row, col) pair; the coordinate-style constructor adds such duplicates together.
    row_idx = [0]*len(columns)
    return sp.csr_matrix((counts,(row_idx,columns)), shape=(1,len(elemList)),dtype=np.short)
    

    
def getVec(tx,elemList): #Tx = molecular formula, e.g. C4H2ClBr
    """Encode a molecular formula as a dense vector of atom counts.

    tx       : formula string, e.g. 'C4H2ClBr' or 'C6H16Na3O12.5'.
    elemList : list of element symbols; entry j holds the count of elemList[j].

    Counts of a repeated element are added up (C3H2COOH -> 3 H, 2 O, ...).
    Returns the float vector when every accumulated count is a whole number,
    otherwise None. A symbol that is not in elemList makes list.index raise ValueError.
    """
    vec = np.zeros(len(elemList))

    # Cut in front of every capital letter except the first one: 'H2O' -> ['H2', 'O']
    for token in re.split(r"(?<!^)(?=[A-Z])",tx):
        # A token without subindex gets an explicit '1': 'O' -> 'O1'
        has_subindex = bool(re.match(r'[A-z]*([0-9]*[.])?[0-9]+',token))
        full_token = token if has_subindex else token+'1'

        # Separate symbol and subindex: 'H2' -> ['', 'H', '2', None, '']
        parts = re.split(r"([A-z]+)(([0-9]*[.])?[0-9]+)",full_token)
        elem = parts[1]
        mul = float(parts[2])
        vec[elemList.index(elem)] += mul

    # Only formulas whose totals are all integers are accepted
    is_integral = np.all(vec == vec.astype(int))
    return vec if is_integral else None

In [125]:
%%timeit

getVec('CH2O3HOC2.1OOH2He3N2O',elemList)

63.7 µs ± 1.8 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [126]:
%%timeit

getVec_sparse('CH2O3HOC2.1OOH2He3N2O',elemList)

26.4 µs ± 311 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [346]:
def getVec_sparse_v2(tx,elemList): #Tx = molecular formula, e.g. C4H2ClBr
    """Parse a molecular formula into the (col, data) lists needed to fill one sparse row.

    tx       : formula string, e.g. 'C4H2ClBr' or 'C6H16Na3O12.5'.
    elemList : list of element symbols; col holds positions in this list.

    Returns (col, data): col[k] is the index in elemList of the k-th token's element
    and data[k] its subindex as a float. Tokens are kept in formula order and a
    repeated element yields repeated entries (nothing is summed here).
    Returns None when any subindex is not a whole number. A symbol that is not in
    elemList makes list.index raise ValueError.
    """
    has_subindex = r'[A-z]*([0-9]*[.])?[0-9]+'
    symbol_and_count = r"([A-z]+)(([0-9]*[.])?[0-9]+)"

    parsed = []
    # Cut in front of every capital letter except the first one: 'H2O' -> ['H2', 'O']
    for token in re.split(r"(?<!^)(?=[A-Z])",tx):
        # A token without subindex gets an explicit '1': 'O' -> 'O1'
        if not re.match(has_subindex,token):
            token = token+'1'
        # 'H2' -> ['', 'H', '2', None, '']
        parsed.append(re.split(symbol_and_count,token))

    col = [elemList.index(parts[1]) for parts in parsed]   # where each count goes
    data = [float(parts[2]) for parts in parsed]           # how many atoms

    # Reject the formula if any subindex is fractional
    if any(count != int(count) for count in data):
        return None
    return col,data

def getElems(DataFile,NMax=None):
    """Return the sorted list of element symbols found in the formulas of DataFile.

    DataFile : path to a tab-separated file without header row whose columns are
               ID, formula, year.
    NMax     : number of rows of the dataset to be used. None or 0 means the
               whole file (0 is accepted because existing calls pass it).

    Returns a sorted list of strings, one per distinct symbol.
    """
    col_names = ['ID','formula','year']
    sep = '\t'

    # Fix: this call used to pass an undefined name `nrows`, so NMax was ignored
    # (or the call raised NameError on a fresh kernel).
    df = pd.read_csv(DataFile,header=None,sep=sep,nrows=NMax or None,names=col_names)  #Load data

    formulas = df['formula'].str.strip()   #Remove white spaces at beginning and end of string

    elems = set()
    for cmpnd in formulas:
        # Drop the subindices, keeping only the letters: 'C6H12O6' -> 'CHO'.
        # NOTE(review): [A-z] also matches [ \ ] ^ _ ` ; kept unchanged because the
        # formula parsers in this file use the same character class.
        txt = ''.join(re.findall(r'[A-z]',cmpnd))
        # Cut in front of every capital letter except the first one and collect the symbols
        elems.update(re.split(r"(?<!^)(?=[A-Z])",txt))

    # Save this list of elements so it doesn't have to be calculated every time
    #with open("./Data/ElementList.txt", "w") as f:
    #    for A in elems:
    #        f.write(str(A) +"\n")

    return sorted(elems)  # This returns a list with all sorted elements in dataset


def allVecs_sparse(DataFile,NMax=None):
    """Build the compound x element count matrix for a whole dataset file.

    DataFile : path to a tab-separated file without header row whose columns are
               ID, formula, year. The file is read here and once more inside getElems.
    NMax     : number of rows of the dataset to be used. None or 0 means the
               whole file (0 is accepted because existing calls pass it).

    Formulas for which getVec_sparse_v2 returns None (non-integer subindex) are dropped.

    Returns (cmpnds, years, subsID, elemList):
      cmpnds   : CSR matrix of dtype np.short, one row per kept formula and one
                 column per entry of elemList.
      years    : 'year' values of the kept rows.
      subsID   : 'ID' values of the kept rows.
      elemList : sorted element symbols, as returned by getElems.
    """
    col_names = ['ID','formula','year']
    sep = '\t'

    # Fix: this call used to pass an undefined name `nrows`, so NMax was ignored
    # (or the call raised NameError on a fresh kernel).
    df = pd.read_csv(DataFile,header=None,sep=sep,nrows=NMax or None,names=col_names)  #Load data

    formulas = df['formula'].str.strip()   #Remove white spaces at beginning and end of string

    elemList = getElems(DataFile,NMax)

    # One (col, data) pair per formula, or None for a rejected formula
    parsed = [getVec_sparse_v2(formula,elemList) for formula in formulas.values]
    index = [i for i, pair in enumerate(parsed) if pair is not None]
    kept = [parsed[i] for i in index]

    # CSR layout: row i owns the slice indptr[i]:indptr[i+1] of indices and data.
    # Explicit dtypes keep the arrays well-typed even when no formula was kept.
    indptr = np.cumsum([0]+[len(cols) for cols, _ in kept])
    indices = np.array(list(chain.from_iterable(cols for cols, _ in kept)), dtype=np.intp)
    data = np.array(list(chain.from_iterable(vals for _, vals in kept)), dtype=float)

    cmpnds = sp.csr_matrix((data, indices, indptr),
                           shape=(len(kept), len(elemList)),
                           dtype=np.short)
    # A formula such as CH3COOH lists the same element more than once, which puts the
    # same column index several times in one row. Merge those entries so each
    # (row, col) is stored once with the summed count. This also sorts indices per row.
    cmpnds.sum_duplicates()

    years = df['year'].values[index]
    subsID = df['ID'].values[index]

    return cmpnds,years,subsID, elemList

In [351]:
lel = [1,2,None,4,5]

[i for i in lel if i is not None]

[1, 2, 4, 5]

In [349]:
subs_csr = allVecs_sparse(path,0)[0]
subs_csr

<11138x60 sparse matrix of type '<class 'numpy.int16'>'
	with 41691 stored elements in Compressed Sparse Row format>

In [335]:
sys.getsizeof(subs_csr.toarray())

(1336672, 64)

In [342]:
a = subs_csr

a.data.nbytes + a.indptr.nbytes + a.indices.nbytes

294702

In [350]:
11356-11138

218

In [345]:
1336560/294702

4.535293279312661

In [339]:
from collections.abc import Mapping, Container
from sys import getsizeof

def deep_getsizeof(o, ids=None):
    """Find the memory footprint of a Python object

    This is a recursive function that drills down a Python object graph
    like a dictionary holding nested dictionaries with lists of lists
    and tuples and sets.

    The sys.getsizeof function does a shallow size of only. It counts each
    object inside a container as pointer only regardless of how big it
    really is.

    :param o: the object
    :param ids: set of id() values of objects already counted, so an object that
        is reachable more than once is only counted the first time. It is
        updated in place. A fresh set is used when omitted.
    :return: size in bytes of o plus everything reachable from it that was
        not already in ids
    """
    if ids is None:
        ids = set()

    if id(o) in ids:
        return 0

    r = getsizeof(o)
    ids.add(id(o))

    # Strings and byte strings are Containers as well, but getsizeof already
    # includes their payload, so they are leaves of the traversal.
    if isinstance(o, (str, bytes, bytearray)):
        return r

    if isinstance(o, Mapping):
        return r + sum(deep_getsizeof(k, ids) + deep_getsizeof(v, ids) for k, v in o.items())

    if isinstance(o, Container):
        return r + sum(deep_getsizeof(x, ids) for x in o)

    return r

  """Entry point for launching an IPython kernel.


In [None]:
# Worked example of the (data, indices, indptr) CSR constructor:
# row i holds the values data[indptr[i]:indptr[i+1]] in the columns
# indices[indptr[i]:indptr[i+1]].
indptr = np.array([0, 2, 3, 6])

indices = np.array([0, 2, 2, 0, 1, 2])

data = np.array([1, 2, 3, 4, 5, 6])

# Only the `sp` alias is imported in this notebook, so the class is reached through it.
sp.csr_matrix((data, indices, indptr), shape=(3, 3)).toarray()
# Expected result:
# array([[1, 0, 2],
#        [0, 0, 3],
#        [4, 5, 6]])

In [233]:
# Symbols of the 118 chemical elements, ordered by atomic number
# (elemList[Z-1] is the element with atomic number Z).
# Fix: element 113 was spelled 'Nn'; its symbol is 'Nh'.
elemList = [
    'H', 'He', 'Li', 'Be', 'B', 'C', 'N', 'O', 'F', 'Ne',
    'Na', 'Mg', 'Al', 'Si', 'P', 'S', 'Cl', 'Ar', 'K', 'Ca',
    'Sc', 'Ti', 'V', 'Cr', 'Mn', 'Fe', 'Co', 'Ni', 'Cu', 'Zn',
    'Ga', 'Ge', 'As', 'Se', 'Br', 'Kr', 'Rb', 'Sr', 'Y', 'Zr',
    'Nb', 'Mo', 'Tc', 'Ru', 'Rh', 'Pd', 'Ag', 'Cd', 'In', 'Sn',
    'Sb', 'Te', 'I', 'Xe', 'Cs', 'Ba', 'La', 'Ce', 'Pr', 'Nd',
    'Pm', 'Sm', 'Eu', 'Gd', 'Tb', 'Dy', 'Ho', 'Er', 'Tm', 'Yb',
    'Lu', 'Hf', 'Ta', 'W', 'Re', 'Os', 'Ir', 'Pt', 'Au', 'Hg',
    'Tl', 'Pb', 'Bi', 'Po', 'At', 'Rn', 'Fr', 'Ra', 'Ac', 'Th',
    'Pa', 'U', 'Np', 'Pu', 'Am', 'Cm', 'Bk', 'Cf', 'Es', 'Fm',
    'Md', 'No', 'Lr', 'Rf', 'Db', 'Sg', 'Bh', 'Hs', 'Mt', 'Ds',
    'Rg', 'Cn', 'Nh', 'Fl', 'Mc', 'Lv', 'Ts', 'Og',
]
len(elemList)

118

In [238]:
%%timeit 

getVec_sparse_v2('CH2O3HOC2OOH2He3N2O',elemList)

38.8 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [239]:
%%timeit

getVec('CH2O3HOC2OOH2He3N2O',elemList)

58.7 µs ± 5.84 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [240]:
sys.getsizeof(getVec_sparse_v2('CH2O3HOC2OOH2He3N2OHe3Lv',elemList))

72

In [241]:
sys.getsizeof(getVec('CH2O3HOC2OOH2He3N2OHe3Lv',elemList))

1040

In [242]:
getVec_sparse_v2('CH2',elemList)

([5, 0], [1.0, 2.0])

In [243]:
getVec('CH2',elemList)

array([2., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [229]:
1040/72

14.444444444444445