In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
%load_ext cython

In [3]:
from bokeh.io import output_notebook
output_notebook()

In [4]:
import collections
import glob
import json
import pickle

from multiprocessing.pool import Pool, ThreadPool

import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize

In [5]:
import random_indexing as ri

In [13]:
# Bag of parsed tweets: the JSON-lines dump is read as text in blocks of
# 200 KiB and every line is decoded with json.loads.
# Alternative input file kept from the original cell: 'rehydrated_tweets_dump.jsonl'
raw_lines = db.read_text('dump10K.jsonl', blocksize=200 * 1024)
load_tweets = raw_lines.map(json.loads)

In [7]:
%%time
_ = (
    load_tweets
    .pluck('text', '')
    .flatten()
    .frequencies()
)

with ProgressBar():
    _ = _.compute()

vocabulary = pd.DataFrame.from_records(
    _,
    columns=['letter', 'count'],
    index='letter',
)
vocabulary.loc[''] = 0

vocabulary = vocabulary['count'].sort_values(ascending=False)

[########################################] | 100% Completed |  6.9s
CPU times: user 6.92 s, sys: 392 ms, total: 7.31 s
Wall time: 7.33 s


In [8]:
# Size of the vocabulary: the counted symbols plus the manually added '' entry.
len(vocabulary)

4680

In [9]:
# Value passed to ri.Features as its `D` argument. The packed vectors written
# later have 1250 uint8 columns, i.e. D / 8 for this setting.
D = 10_000

In [10]:
%%time
# Build the ri.Features object from the vocabulary's symbols (the index of the
# `vocabulary` Series), passing D. It is later handed to pool_init().
features = ri.Features(vocabulary.index, D=D)

CPU times: user 1.05 s, sys: 64 ms, total: 1.11 s
Wall time: 1.11 s


In [11]:
%%cython

def pool_init(features):
    """Publish ``features`` to the current process as ``__ctx.features``.

    A fresh module named ``__ctx`` is created and registered in
    ``sys.modules`` with ``features`` stored on it, so that code running later
    in the same process can retrieve it with ``from __ctx import features``.
    Calling the function again replaces the previously registered module.

    Parameters
    ----------
    features
        Object to expose; it is stored as-is.
    """
    import sys
    import types

    # types.ModuleType is the supported replacement for imp.new_module: the
    # `imp` module is deprecated and no longer exists in Python 3.12+.
    ctx_module = types.ModuleType('__ctx')
    ctx_module.features = features
    sys.modules['__ctx'] = ctx_module


def vectorize(tweets):
    """Encode each tweet of a partition as a bit-packed vector.

    For every tweet, ``ri.vectorize`` is called on ``tweet['text']`` with
    ``window_length=3``; the vectors it returns are stacked and multiplied
    element-wise. Components of the product equal to 1 become set bits in the
    ``np.packbits`` output.

    ``features`` is taken from the ``__ctx`` module, so ``pool_init`` must have
    been called in the current process beforehand.

    Parameters
    ----------
    tweets
        Iterable of mappings that each have a ``'text'`` key.

    Returns
    -------
    list
        One packed ``uint8`` array per tweet, in input order.
    """
    import numpy as np
    import random_indexing as ri

    from __ctx import features

    packed_vectors = []
    for tweet in tweets:
        window_vectors = ri.vectorize(tweet['text'], window_length=3, features=features)
        combined = np.stack(window_vectors).prod(axis=0)
        packed_vectors.append(np.packbits(combined == 1))
    return packed_vectors

In [44]:
%%time

with Pool(initializer=pool_init, initargs=[features]) as pool:
    with dask.set_options(
        pool=pool,
        get=dask.multiprocessing.get,
    ), ProgressBar():
        with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:

            pool_init(features)    

            vectors = (
                load_tweets
                .map_partitions(vectorize)
            )
            
            tweet_ids = load_tweets.pluck('tweet_id').to_dataframe(meta=[('tweet_id', 'uint64')])['tweet_id']
            
            result = (
                vectors.to_dataframe(meta=[(i, 'uint8')for i in range(1250)])
                .set_index(tweet_ids, sorted=True)
                .repartition(npartitions=100)
            )
            
            result.to_hdf('tweet_vectors.hdf', key='data/v*', mode='w')

[########################################] | 100% Completed |  9.6s
[########################################] | 100% Completed | 10.6s
CPU times: user 1.81 s, sys: 484 ms, total: 2.3 s
Wall time: 20.8 s


In [28]:
# Peek at the first rows of the packed vectors, indexed by tweet_id.
result.head()

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1240,1241,1242,1243,1244,1245,1246,1247,1248,1249
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
853035540559671296,148,21,1,143,207,151,224,21,249,215,...,82,162,106,21,247,253,228,77,123,143
853036662594707456,111,48,13,201,162,155,46,62,190,27,...,25,214,209,248,157,190,115,26,172,51
853038492380794880,6,224,172,108,105,199,238,215,182,31,...,55,137,67,1,205,192,83,174,151,20
853040176620326914,112,138,103,175,125,49,196,85,86,103,...,141,130,136,9,192,84,180,167,170,23
853046831227514884,236,222,47,4,90,151,198,212,231,42,...,158,67,251,87,173,74,203,247,44,41


In [15]:
# Render the task, resource and cache profiles held in prof / rprof / cprof.
# The trailing ';' suppresses the repr of the return value, matching the other
# visualize cell in this notebook.
visualize(prof, rprof, cprof);

In [37]:
# Read the stored vectors back and fetch two tweets by id, profiling the work.
# Note that this rebinds `result` (until now the dask frame that was written
# to HDF) to the computed lookup result, and rebinds prof / rprof / cprof.
with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof, ProgressBar():
    data = dd.read_hdf('tweet_vectors.hdf', key='data/v*', sorted_index=True)

    # Both ids appear in the head() output of the stored frame.
    wanted_ids = [853035540559671296, 853046831227514884]
    result = data.loc[wanted_ids].compute()

[########################################] | 100% Completed |  0.1s


In [39]:
# Render the profiles currently held in prof / rprof / cprof; the trailing ';'
# suppresses the repr of the return value.
visualize(prof, rprof, cprof);

In [40]:
# Display the rows fetched by the id lookup.
result

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1240,1241,1242,1243,1244,1245,1246,1247,1248,1249
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
853035540559671296,148,21,1,143,207,151,224,21,249,215,...,82,162,106,21,247,253,228,77,123,143
853046831227514884,236,222,47,4,90,151,198,212,231,42,...,158,67,251,87,173,74,203,247,44,41
