In [2]:
import pandas as pd
import numpy as np
from scipy import sparse

In [4]:
# Example of the 'counts' file
counts = pd.read_hdf('cookbooks/cookbook.0005.h5', 'counts')
counts[:1000:100][['count', 'id']]

Unnamed: 0,count,id
0,2,26
100,1,423
200,1,649
300,1,42
400,1,339
500,2,29
600,1,615
700,1,1246
800,1,146
900,1,298


In [25]:
def make_sparse_doc_vec(arr, size):
    data = arr[:,0]
    ids = arr[:,1]
    page_freq_vec = sparse.csr_matrix((data,
                                       ids,
                                       [0, len(data)]),
                                      shape=(1, size),
                                      dtype=np.uint64)
    return page_freq_vec
a = counts[:1000:100][['count', 'id']].values
make_sparse_doc_vec(a, 1400)

<1x1400 sparse matrix of type '<class 'numpy.uint64'>'
	with 10 stored elements in Compressed Sparse Row format>

In [None]:
# How big is this data?
(big_matrix.data.nbytes + big_matrix.indices.nbytes + big_matrix.indptr.nbytes) / 1024 / 1024 / 1024

In [103]:
import glob
paths = glob.glob('cookbooks/cookbook*h5')
all_pages = []
doc_ref = []
vocab_size = 13529
i = 0 
for path in paths:
    if i % 200 == 0:
        print(i)
    try:
        counts = pd.read_hdf(path, 'counts')
    except:
        continue
    page_matrix = counts.groupby(['volid', 'page']).apply(lambda x: make_sparse_doc_vec(x[['count', 'id']].values, vocab_size))
    all_pages += page_matrix.tolist()
    doc_ref += page_matrix.index.tolist()
    i += 1

0
200
400
600
800
1000
1200
1400
1600


In [114]:
doc_ref_df = pd.DataFrame(doc_ref, columns=['volid', 'page'])

In [72]:
big_matrix = sparse.vstack(all_pages)
del all_pages

<280861x13529 sparse matrix of type '<class 'numpy.uint64'>'
	with 26030446 stored elements in Compressed Sparse Row format>

In [115]:
sparse.save_npz(file='page_vecs_sparse.npz', matrix=big_matrix, compressed=True)
doc_ref_df.to_hdf('page_vecs_order_ref.h5', 'order', complib='blosc')

# Create a big cooccurence matrix

Where the dictionary size is 13529, this creates a 13529x13529 matrix of counts for all words co-occurring. Each row is multiplied by itself and added with 

In [None]:
sparse_bytes = lambda x: (x.data.nbytes + x.indices.nbytes + x.indptr.nbytes) / 1024 / 1024 / 1024

In [97]:
cooc

<13529x13529 sparse matrix of type '<class 'numpy.uint64'>'
	with 18993706 stored elements in Compressed Sparse Column format>

In [96]:
for i in range(0, big_matrix.shape[0]):
    if i % 150 == 0:
        print(i, (cooc.data.nbytes + cooc.indices.nbytes + cooc.indptr.nbytes) / 1024 / 1024 / 1024)
    page_cooc = big_matrix[i,:].T * big_matrix[i,:]
    if i == 0:
        cooc = page_cooc
    else:
        cooc += page_cooc

0 0.02312443032860756
150 0.015728596597909927
300 0.02932947501540184
450 0.04581984132528305
600 0.05173022300004959
750 0.05717368796467781
900 0.0599420964717865
1050 0.06140926480293274
1200 0.06321552023291588
1350 0.06654242053627968
1500 0.07131785899400711
1650 0.0789937824010849
1800 0.08364346995949745
1950 0.08647402748465538
2100 0.08742445707321167
2250 0.08865708857774734
2400 0.09179692715406418
2550 0.09959365054965019
2700 0.10403022542595863
2850 0.10815862566232681
3000 0.1137116476893425
3150 0.11554813385009766
3300 0.11745193228125572
3450 0.11880869418382645
3600 0.12166506797075272
3750 0.12468011677265167
3900 0.12708017975091934
4050 0.12936751171946526
4200 0.13144905120134354
4350 0.14200489595532417
4500 0.15351784229278564
4650 0.1578560583293438
4800 0.15900981053709984
4950 0.16065575927495956
5100 0.1612863577902317
5250 0.1628401167690754
5400 0.16409866511821747
5550 0.1659129559993744
5700 0.16779081523418427
5850 0.16932614520192146
6000 0.17285283

KeyboardInterrupt: 

In [None]:
print(i, sparse_bytes(cooc)

In [None]:
cooc.setdiag(0)

In [None]:
sparse.save_npz(file='word_cooc_sparse.npz', matrix=cooc, compressed=True)

In [21]:
sparse.vstack([page_freq_vec, page_freq_vec])

<2x1400 sparse matrix of type '<class 'numpy.uint64'>'
	with 20 stored elements in Compressed Sparse Row format>

In [4]:
def make_cooc_mat(arr, size, log=False):
    ''' Take an array of values, where arr[:,0] is the word counts, and arr[:1] is the corresponding indices'''
    if log:
        nwords = arr[:,0].sum()
        data = np.log(1+(arr[:,0]/nwords))
        dtype = np.float
    else:
        data = arr[:,0]
        dtype = np.uint64
    
    page_freq_vec = sparse.csr_matrix((data,
                                       arr[:,1],
                                       [0, arr.shape[0]]),
                                      shape=(1, size),
                                      dtype=dtype)
    # Multiply counts to get cooccurances
    # does NOT set the diagonals to 0: that's more efficient to do at the end
    # so that the sparse matrix format isn't being tinkered with too much
    page_cooc = (page_freq_vec.T * page_freq_vec)
        
    return page_cooc

In [None]:
arr = counts[:1000:100][['count', 'id']].values
a = make_cooc_mat(arr, size=1246)