# Distributed Dask (Phase 5)

#### Input: 
   Models trained in stage 5.
#### Output:
   Distributed and cached dataframes that can scale up to a cluster.
#### Algorithm:
   a) Read pandas dataframes and convert them to dask dataframes. <br>
   b) Persist the dataframes into distributed memory.<br>
   c) Use joblib to cache the TfidfVectorizer, SVD and Doc2Vec models.<br>

In [1]:
from dask.distributed import Client
import dask.array as da
import dask.dataframe as dd
import joblib
import time
import gc
from gensim.models import Doc2Vec

In [2]:
# Create a Dask distributed client with default arguments. The recorded cell
# output shows a scheduler on 127.0.0.1 with 4 workers.
client = Client()

In [3]:
# Display the client summary (scheduler address, dashboard URL, workers,
# cores and memory) as the cell output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:40605  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 4  Memory: 16.69 GB


In [4]:
def convert_to_distributed(path, partitions):
    """Load a joblib-pickled pandas frame and return it as a Dask DataFrame.

    Args:
        path: Location of the file to read with ``joblib.load``.
        partitions: Number of partitions passed to ``dd.from_pandas``.

    Returns:
        The Dask DataFrame built from the loaded frame. The frame's index is
        reset first, so the previous index is kept as an ordinary column.
    """
    pandas_frame = joblib.load(path)
    # Move the existing index into a column and give the frame a fresh
    # default index before it is split into partitions.
    pandas_frame.reset_index(inplace=True)
    distributed_frame = dd.from_pandas(pandas_frame, npartitions=partitions)

    # Drop the local reference to the pandas frame and run a collection pass
    # before handing back the Dask version.
    del pandas_frame
    gc.collect()
    return distributed_frame

In [5]:
# Load the LSA embeddings pickle as a Dask DataFrame with 10 partitions.
svd_feature_matrix = convert_to_distributed("./model/lsa_embeddings.pkl", 10)

In [6]:
# Load the doc2vec embeddings pickle as a Dask DataFrame with 4 partitions.
doc2vec_feature_matrix = convert_to_distributed("./model/doc2vec_embeddings.pkl", 4)

In [7]:
# Remove the "files" column from both embedding frames.
# NOTE(review): the column(s) added by reset_index() inside
# convert_to_distributed are still present after this drop -- confirm that
# downstream code does not treat them as embedding features.
svd_feature_matrix = svd_feature_matrix.drop("files", axis=1)
doc2vec_feature_matrix = doc2vec_feature_matrix.drop("files", axis=1)

In [8]:
# Load the dataset frame (4 partitions) only to pull out two of its columns.
df = convert_to_distributed("./model/dataset.pkl", 4)
files = df["files"]
weights = df["weights"]
# Drop the name bound to the full frame and run a collection pass; the
# value returned by gc.collect() is what the cell displays.
del df
gc.collect()

0

In [9]:
# Persist each collection on the cluster and rebind the name to the persisted
# version, so later operations reuse the data held by the workers.
# NOTE(review): the recorded output warns that a large object is embedded in
# the task graph and suggests client.scatter; consider whether that applies.
files = client.persist(files)
svd_feature_matrix = client.persist(svd_feature_matrix)
doc2vec_feature_matrix = client.persist(doc2vec_feature_matrix)
weights = client.persist(weights)

  (        level_0   index                           ... umns], 'files')
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
  % (format_bytes(len(b)), s)


In [10]:
# Publish the four persisted collections on the scheduler under their
# variable names so that other clients can look them up by name.
client.publish_dataset(
    svd_feature_matrix=svd_feature_matrix,
    doc2vec_feature_matrix=doc2vec_feature_matrix,
    weights=weights,
    files=files,
)

In [11]:
def cache_models():
    """Load the three saved models from the ``./model`` directory.

    Returns:
        A 3-tuple, in this order: the object returned by ``Doc2Vec.load`` for
        ``./model/doc2vec_model``, then the objects unpickled by joblib from
        ``./model/tfidf_model.pkl`` and ``./model/svd_model.pkl``.
    """
    doc2vec_model = Doc2Vec.load("./model/doc2vec_model")
    tfidf_model = joblib.load("./model/tfidf_model.pkl")
    svd_model = joblib.load("./model/svd_model.pkl")
    return doc2vec_model, tfidf_model, svd_model

In [12]:
# joblib cache whose storage location is ./model/joblib.
memory = joblib.Memory(location="./model/joblib")

In [13]:
# Wrap cache_models with the joblib cache: calls go through `memory`, which
# stores the returned models and can reuse them on later calls.
costly_compute_cached = memory.cache(func=cache_models)

In [14]:
# Time the (possibly cached) model load.
# NOTE(review): start/end are recorded but the elapsed time (end - start) is
# not printed or stored in this cell.
start = time.time()
dv, tf, svd = costly_compute_cached()
end = time.time()

# End of Phase 5