In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from bokeh.io import output_notebook
output_notebook()

In [33]:
import collections
import glob
import json
import pickle

from multiprocessing.pool import Pool, ThreadPool

import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd
import dask.bag as db
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize

In [4]:
import random_indexing as ri

In [57]:
load_tweets = (
    db.read_text('rehydrated_tweets_dump.jsonl', blocksize=200*(1024**1))
    .map(json.loads)
)

In [6]:
%%time
_ = (
    load_tweets
    .pluck('text', '')
    .flatten()
    .frequencies()
)

with ProgressBar():
    _ = _.compute()

vocabulary = pd.DataFrame.from_records(
    _,
    columns=['letter', 'count'],
    index='letter',
)
vocabulary.loc[''] = 0

vocabulary = vocabulary['count'].sort_values(ascending=False)

[########################################] | 100% Completed | 25.1s
CPU times: user 13.8 s, sys: 856 ms, total: 14.7 s
Wall time: 25.9 s


In [7]:
len(vocabulary)

4680

In [8]:
D = 10_000

In [9]:
features = ri.Features(vocabulary.index, D=D)

In [60]:
%%time

def pool_init(features):
    import imp, sys
    __ctx = imp.new_module('__ctx')
    sys.modules['__ctx'] = __ctx
    
    __ctx.features = features

def vectorize(tweet):
    import numpy as np
    import random_indexing as ri
    
    from __ctx import features
    
    text = tweet['text']
    tweet_vector = np.stack(ri.vectorize(text, window_length=3, features=features)).prod(axis=0)
    
    return np.concatenate([[tweet['tweet_id']], tweet_vector])

with Pool(processes=2, initializer=pool_init, initargs=[features]) as pool:
    with dask.set_options(
        pool=pool,
        get=dask.multiprocessing.get,
    ), ProgressBar():
        with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:

            pool_init(features)    

            result = (
                load_tweets
                .map(vectorize)
                .take(10_000, compute=False, npartitions=2)
                .to_dataframe(meta=[('tweet_id', 'int64')] + [(str(k), 'int16') for k in range(D)])
                .set_index('tweet_id', sorted=True, npartitions='auto')
            )

            dd.to_parquet(result, 'tweet_vectors.parquet', write_index=True)

[#############                           ] | 33% Completed | 23.4s

  "larger `npartitions` to `take`.".format(n, len(r)))


[########################################] | 100% Completed | 34.3s
[####################                    ] | 50% Completed | 20.7s

  "larger `npartitions` to `take`.".format(n, len(r)))


[########################################] | 100% Completed | 38.8s
CPU times: user 7.3 s, sys: 6.54 s, total: 13.8 s
Wall time: 1min 18s


In [61]:
visualize([prof, rprof, cprof])

In [48]:
visualize([prof, rprof, cprof])

In [67]:
dd.read_parquet('tweet_vectors.parquet').count()

Dask Series Structure:
npartitions=1
0       int64
9999      ...
dtype: int64
Dask Name: dataframe-count-agg, 3 tasks