In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
%load_ext cython

In [3]:
from bokeh.io import output_notebook
output_notebook()

In [4]:
import collections
import glob
import json
import pickle

from multiprocessing.pool import Pool, ThreadPool

import numpy as np
import pandas as pd

import dask
import dask.dataframe as dd
import dask.array as da
import dask.bag as db
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize

In [5]:
import random_indexing as ri

In [6]:
# Lazy dask bag over the tweet dump: every line of the file is parsed as one
# JSON document. Nothing is read until a downstream .compute() / to_hdf().
load_tweets = (
    db.read_text(
        'rehydrated_tweets_dump.jsonl',
        # Alternative input file (presumably a smaller sample for quick runs -- confirm):
        #'dump10K.jsonl',
        # Partition size in bytes: 200 * 1024 = 200 KiB of text per partition.
        blocksize=200*(1024**1)
    )
    .map(json.loads)
)

In [7]:
%%time
# Count how often each character occurs across all tweet texts.
# Tweets without a 'text' field contribute the default '' (i.e. no characters);
# flatten() then iterates every text string character by character.
#
# Descriptive names are used instead of `_`: in IPython `_` is the
# "last output" variable, and assigning to it by hand hides that history.
letter_frequencies = (
    load_tweets
    .pluck('text', '')
    .flatten()
    .frequencies()
)

with ProgressBar():
    # List of (letter, count) pairs.
    letter_counts = letter_frequencies.compute()

vocabulary = pd.DataFrame.from_records(
    letter_counts,
    columns=['letter', 'count'],
    index='letter',
)
# Add an explicit entry for the empty string with a count of zero.
vocabulary.loc[''] = 0

# Keep only the counts, as a Series indexed by letter, most frequent first.
vocabulary = vocabulary['count'].sort_values(ascending=False)

[########################################] | 100% Completed |  7.4s
CPU times: user 7.17 s, sys: 376 ms, total: 7.54 s
Wall time: 7.88 s


In [8]:
# Number of vocabulary entries (distinct characters plus the '' entry added above).
len(vocabulary)

4680

In [9]:
# Dimensionality parameter handed to ri.Features (as D=) in the next cell.
D = 10_000

In [10]:
%%time
# Build the random_indexing feature object from the vocabulary's index
# (the letters) with dimensionality D.
features = ri.Features(vocabulary.index, D=D)

CPU times: user 1.08 s, sys: 52 ms, total: 1.13 s
Wall time: 1.13 s


In [11]:
%%cython

def pool_init(features):
    """Publish ``features`` through a synthetic module named ``__ctx``.

    After this call, ``from __ctx import features`` works anywhere in the
    current process. The function is written to be used as a
    ``multiprocessing.Pool`` initializer so that each worker process gets
    the object once instead of receiving it with every task.

    Parameters
    ----------
    features
        Arbitrary object to expose as ``__ctx.features``.

    Returns
    -------
    None
        The only effect is registering/replacing ``sys.modules['__ctx']``.
    """
    import sys
    import types

    # ``imp.new_module`` is deprecated and ``imp`` was removed in Python 3.12;
    # ``types.ModuleType`` is the supported way to create an empty module.
    ctx = types.ModuleType('__ctx')
    ctx.features = features

    # Register only after the attribute is set, so importers never see a
    # half-initialised module.
    sys.modules['__ctx'] = ctx


def vectorize(tweets):
    """Turn a batch of tweets into bit-packed vectors.

    For every tweet, ``ri.vectorize`` is called on ``tweet['text']`` with a
    window length of 3 and the ``features`` object published by
    ``pool_init``. The arrays it returns are stacked and multiplied
    element-wise; positions whose product equals 1 become set bits, and the
    resulting boolean vector is packed into bytes with ``np.packbits``.

    Parameters
    ----------
    tweets
        Iterable of mappings, each with a ``'text'`` key.

    Returns
    -------
    list
        One packed ``uint8`` array per tweet, in input order.
    """
    import numpy as np
    import random_indexing as ri

    # Shared state installed by pool_init() in this process.
    from __ctx import features

    packed_vectors = []
    for tweet in tweets:
        window_vectors = ri.vectorize(tweet['text'], window_length=3, features=features)
        product = np.stack(window_vectors).prod(axis=0)
        packed_vectors.append(np.packbits(product == 1))
    return packed_vectors

In [None]:
%%time

# Vectorize every tweet on a process pool and write the packed vectors to HDF.
# Each worker process runs pool_init(features) once at start-up; that is how
# vectorize() later finds `features` through the synthetic __ctx module.
# NOTE(review): dask.set_options is the legacy configuration API (newer dask
# releases use dask.config.set) -- confirm against the installed dask version.
with Pool(initializer=pool_init, initargs=[features]) as pool:
    with dask.set_options(
        pool=pool,
        get=dask.multiprocessing.get,
    ), ProgressBar():
        with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof:

            # Register __ctx in this (parent) process as well.
            # NOTE(review): presumably needed for work that runs outside the pool -- confirm.
            pool_init(features)    

            # One list of packed uint8 vectors per partition of tweets.
            vectors = (
                load_tweets
                .map_partitions(vectorize)
            )
            
            # Tweet ids read from the same bag, as a uint64 dask Series.
            tweet_ids = load_tweets.pluck('tweet_id').to_dataframe(meta=[('tweet_id', 'uint64')])['tweet_id']
            
            # One row per tweet with 1250 uint8 columns, indexed by tweet id.
            # NOTE(review): 1250 presumably equals D / 8 (np.packbits stores 8 values
            # per byte, D = 10_000) -- confirm, and consider deriving it from D.
            # sorted=True tells dask the index is already in sorted order.
            from_scratch = (
                vectors.to_dataframe(meta=[(i, 'uint8')for i in range(1250)])
                .set_index(tweet_ids, sorted=True)
                .repartition(npartitions=100)
            )
            
            # mode='w' overwrites any existing file; the '*' in the key is a
            # per-partition placeholder filled in by dask.
            from_scratch.to_hdf('tweet_vectors.hdf', key='data/v*', mode='w')

[                                        ] | 1% Completed | 20.6s

In [None]:
# Render the task, resource and cache profiles recorded by the previous cell.
# dask.diagnostics.visualize takes the profilers as ONE argument (a profiler or
# a list of profilers); its following positional parameters are the output
# file path and the `show` flag, so the three profilers must be wrapped in a list.
visualize([prof, rprof, cprof]);

In [None]:
%%time
# Lazily reopen the stored tweet vectors; the 'data/v*' pattern matches the keys
# written by to_hdf above, and sorted_index=True declares the index as sorted.
from_hdf = dd.read_hdf('tweet_vectors.hdf', key='data/v*', sorted_index=True)

In [None]:
# Profile a label-based lookup of two tweet ids in the HDF-backed frame.
# The computed rows are discarded; only the profiler traces are kept
# (this rebinds prof, rprof and cprof).
with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof, ProgressBar():    
    from_hdf.loc[[853035540559671296, 853046831227514884]].compute()

In [None]:
# Small hand-written tweet_id -> screen_name table (six rows).
# Note that tweet id 853046831227514884 appears twice in the index, once for
# '@three' and once for '@SOME'.
screen_names = pd.DataFrame(
    {
        'screen_name': [
            '@three',
            '@two',
            '@two',
            '@three',
            '@three', '@SOME', 
        ],
    },
    index=[
            853035540559671296,
            853036662594707456, 
            853038492380794880,
            853040176620326914,
            853046831227514884, 853046831227514884
            # Disabled extra index value:
            #1,
        ],
)

In [None]:
# Display the hand-written sample table.
screen_names

In [None]:
def unpack(uint8values):
    """Expand bit-packed rows into a matrix of +1 / -1 integers.

    Parameters
    ----------
    uint8values
        2-D ``uint8`` array; each row is unpacked along axis 1, giving
        8 output columns per input byte.

    Returns
    -------
    numpy.ndarray
        Integer array in which a set bit is represented by 1 and a cleared
        bit by -1.
    """
    bits = np.unpackbits(uint8values, axis=1)
    # Map {0, 1} -> {-1, +1}.
    return 2 * bits.astype(int) - 1

In [None]:
def by_screen_name(screen_names):
    """Sum the unpacked tweet vectors of every screen name.

    ``screen_names`` is right-joined onto the module-level ``from_hdf``
    frame, the result is grouped by its ``'screen_name'`` column, and each
    group is reduced to the column-wise sum of its unpacked vectors.

    Parameters
    ----------
    screen_names
        Frame with a ``'screen_name'`` column, joined on its index.
        NOTE(review): the index is presumably the tweet id -- confirm.

    Returns
    -------
    The computed (non-lazy) result of the grouped apply: one summed vector
    per screen name.
    """
    def sum_unpacked(group):
        # Rows with any missing value are dropped (a right join can produce
        # rows that have no stored vector).
        complete_rows = group.dropna(axis=0, how='any')
        # The last column is excluded before casting to uint8.
        # NOTE(review): presumably that is the joined 'screen_name' column -- confirm.
        packed = complete_rows.values[:, :-1].astype('uint8')
        return unpack(packed).sum(axis=0)

    joined = from_hdf.join(screen_names, how='right')
    summed = joined.groupby('screen_name').apply(sum_unpacked, meta=('x', int))
    return summed.compute()

In [None]:
# Load the screen-name table, indexed by its 'tweet_id' column.
all_screen_names = pd.read_csv('rehydrated_tweets_tweet_screen_names.csv', index_col='tweet_id')
# Drop every row whose screen name starts with '@@'.
all_screen_names = all_screen_names[~all_screen_names['screen_name'].str.startswith('@@')]

In [None]:
# Subset of rows whose screen name starts with '@'.
# NOTE(review): not referenced by any later cell visible here -- confirm it is still needed.
tracked_screen_names = all_screen_names[
    all_screen_names['screen_name'].str.startswith('@')
]

In [None]:
# Number of rows left after dropping the '@@' screen names.
len(all_screen_names)

In [None]:
# Aggregate the vectors for all screen names while profiling the run.
# NOTE: this rebinds `vectors`, which the vectorizing cell used for the lazy
# bag of packed tweet vectors; from here on it holds the per-screen-name sums.
with Profiler() as prof, ResourceProfiler(dt=0.25) as rprof, CacheProfiler() as cprof, ProgressBar():
    vectors = by_screen_name(all_screen_names)
# The profilers are passed to visualize as a single list.
visualize([prof, rprof, cprof]);

In [None]:
# Stack the per-screen-name vectors into one 2-D table (one row per screen
# name, keeping the original index) and write it to 'screen_names.csv'.
pd.DataFrame(np.vstack(vectors.values), index=vectors.index).to_csv('screen_names.csv')