In [41]:
import os

import numpy as np
import pandas as pd
import ray
from sadedegel import Doc
from sadedegel.config import tokenizer_context
from sadedegel.extension.sklearn import TfidfVectorizer, Text2Doc
from sklearn.pipeline import Pipeline

In [43]:
# Stop any Ray runtime that is still attached to this kernel. Presumably this
# is here so that the ray.init(...) cell can be executed again afterwards
# without hitting an "already initialized" error - confirm intent.
ray.shutdown()

In [3]:
# Location of the gzipped training CSV. Set the TELCO_SENTIMENT_TRAIN
# environment variable to point at your own copy of the file; the fallback is
# the path this notebook was originally run with, so an existing setup keeps
# working unchanged.
TRAIN_CSV_PATH = os.environ.get(
    'TELCO_SENTIMENT_TRAIN',
    '/Users/dorukhanafacan/Desktop/sadedegel_turkcell/telco_sentiment/telco_sentiment/telco_sentiment_train.csv.gz',
)

# Later cells read the `tweet` column of this frame.
df = pd.read_csv(TRAIN_CSV_PATH)

In [20]:
SAMPLE_SIZE = 10000   # number of rows drawn from df for the benchmark
N_PARTITIONS = 4      # number of chunks, one per remote task
RANDOM_STATE = 42     # fixed seed so a fresh Restart & Run All draws the same rows

# Cap the request at len(df): DataFrame.sample draws without replacement by
# default and raises ValueError when asked for more rows than the frame holds.
dfs = np.array_split(
    df.sample(min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE),
    N_PARTITIONS,
)

In [7]:
# Smoke test of the tf-idf API on a single sentence: tokenizer_context('simple')
# yields the document constructor bound here as Doc2, and get_tfidf is called
# with tf_method='raw' and idf_method='smooth' (presumably raw term counts with
# a smoothed idf - confirm against the sadedegel docs). The result is kept in
# `ts`; no later cell shown here reads it.
with tokenizer_context('simple') as Doc2:
    ts = Doc2("merhaba dünya biz dostuz selamlar.").get_tfidf(tf_method='raw', idf_method='smooth')

2021-04-11 00:01:54.058 | INFO     | sadedegel.ml.sbd:predict:22 - Loading sbd model from /anaconda3/envs/py37/lib/python3.7/site-packages/sadedegel/ml/model/sbd.pickle


In [46]:
@ray.remote
def get_len_parallel(df):
    """Ray task: measure the length of every entry in ``df.tweet``.

    Args:
        df: object exposing a ``tweet`` column (the notebook passes one
            partition of the sampled DataFrame).

    Returns:
        The result of applying the built-in ``len`` element-wise to
        ``df.tweet``.
    """
    tweets = df.tweet
    return tweets.apply(len)

@ray.remote
def text2doc_parallel(df):
    """Ray task: turn every entry of ``df.tweet`` into a document object.

    The document constructor is the one yielded by
    ``tokenizer_context('simple')``; it is called once per tweet.

    Args:
        df: object exposing a ``tweet`` column (the notebook passes one
            partition of the sampled DataFrame).

    Returns:
        The result of applying the constructor element-wise to ``df.tweet``.
    """
    with tokenizer_context('simple') as doc_factory:

        def _to_doc(text):
            # One constructor call per tweet, made while the context is active.
            return doc_factory(text)

        converted = df.tweet.apply(_to_doc)
        return converted
                              
def get_len(df):
    """Serial baseline: measure the length of every entry in ``df.tweet``.

    Args:
        df: object exposing a ``tweet`` column, e.g. a pandas DataFrame.

    Returns:
        The result of applying the built-in ``len`` element-wise to
        ``df.tweet``.
    """
    lengths = df.tweet.apply(len)
    return lengths

@ray.remote
def get_tfidf_parallel(df):
    """Ray task: run the Text2Doc -> TfidfVectorizer pipeline over ``df``.

    A fresh two-step pipeline is built on every call and ``transform`` is
    invoked on it directly, with no preceding ``fit``.

    Args:
        df: input forwarded unchanged to ``Pipeline.transform``. Despite the
            name, the notebook passes the ``tweet`` column of a partition
            rather than a whole DataFrame.

    Returns:
        Whatever ``Pipeline.transform`` returns for ``df``.
    """
    steps = [
        ("Text2Doc", Text2Doc('simple')),
        ("TfIdf", TfidfVectorizer(show_progress=False)),
    ]
    tfidf_pipeline = Pipeline(steps)
    return tfidf_pipeline.transform(df)

In [44]:
# Start a local Ray runtime limited to 4 CPUs.
# ignore_reinit_error=True keeps this cell re-runnable: without it a second
# ray.init() in the same kernel raises because Ray is already initialized.
# Left as the last expression so the connection info is still displayed.
ray.init(num_cpus=4, ignore_reinit_error=True)

2021-04-11 00:17:30,877	INFO services.py:1174 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


{'node_ip_address': '192.168.1.104',
 'raylet_ip_address': '192.168.1.104',
 'redis_address': '192.168.1.104:6379',
 'object_store_address': '/tmp/ray/session_2021-04-11_00-17-30_429470_20696/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-04-11_00-17-30_429470_20696/sockets/raylet',
 'webui_url': '127.0.0.1:8265',
 'session_dir': '/tmp/ray/session_2021-04-11_00-17-30_429470_20696',
 'metrics_export_port': 64530,
 'node_id': '2412e42ce4e191b1d4c9a69aad4b8bdfcbc4f35d9156b1e18c59704e'}

In [16]:
# Submit one get_len_parallel task per partition in dfs and keep the returned
# handles; they are resolved with ray.get in a following cell.
future_lens = list(map(get_len_parallel.remote, dfs))

In [23]:
%%time
lens = ray.get(future_lens)

CPU times: user 1.1 ms, sys: 112 µs, total: 1.21 ms
Wall time: 1.14 ms


In [33]:
%%time
# Serial baseline for the length computation.
# NOTE(review): this runs on the full `df`, while the Ray run operates on
# `dfs`, which is built from a 10000-row sample of `df` - the two timings are
# therefore not measured on the same input. It also rebinds `lens`, replacing
# the result gathered from the Ray tasks.
lens = get_len(df)

CPU times: user 5.93 ms, sys: 88 µs, total: 6.02 ms
Wall time: 5.94 ms


In [19]:
# Submit one text2doc_parallel task per partition in dfs and keep the returned
# handles; they are resolved with ray.get in a following cell.
future_docs = list(map(text2doc_parallel.remote, dfs))

In [21]:
# Resolve the text2doc_parallel handles; `docs` receives one result per entry
# of future_docs, in the same order. The recorded output below shows each
# worker process logging that it loads the sbd model.
docs = ray.get(future_docs)

[2m[36m(pid=22448)[0m 2021-04-11 00:09:15.638 | INFO     | sadedegel.ml.sbd:predict:22 - Loading sbd model from /anaconda3/envs/py37/lib/python3.7/site-packages/sadedegel/ml/model/sbd.pickle
[2m[36m(pid=22448)[0m 2021-04-11 00:09:15.638 | INFO     | sadedegel.ml.sbd:predict:22 - Loading sbd model from /anaconda3/envs/py37/lib/python3.7/site-packages/sadedegel/ml/model/sbd.pickle
[2m[36m(pid=22449)[0m 2021-04-11 00:09:15.701 | INFO     | sadedegel.ml.sbd:predict:22 - Loading sbd model from /anaconda3/envs/py37/lib/python3.7/site-packages/sadedegel/ml/model/sbd.pickle
[2m[36m(pid=22449)[0m 2021-04-11 00:09:15.701 | INFO     | sadedegel.ml.sbd:predict:22 - Loading sbd model from /anaconda3/envs/py37/lib/python3.7/site-packages/sadedegel/ml/model/sbd.pickle
[2m[36m(pid=22447)[0m 2021-04-11 00:09:15.862 | INFO     | sadedegel.ml.sbd:predict:22 - Loading sbd model from /anaconda3/envs/py37/lib/python3.7/site-packages/sadedegel/ml/model/sbd.pickle
[2m[36m(pid=22447)[0m 2021-

In [54]:
# Submit one get_tfidf_parallel task per partition, passing only the tweet
# column of each chunk; the returned handles are resolved with ray.get in a
# following cell.
future_vecs = list(get_tfidf_parallel.remote(chunk.tweet) for chunk in dfs)

In [55]:
# Resolve the get_tfidf_parallel handles, one result per partition.
# NOTE(review): this rebinds `docs`, which an earlier cell used for the
# text2doc_parallel results; here it holds the pipeline outputs instead (the
# recorded display below shows sparse matrices). A distinct name would avoid
# the ambiguity.
docs = ray.get(future_vecs)
# Tear the Ray runtime down now that all results have been collected.
ray.shutdown()

In [56]:
# Display the gathered tf-idf results (one entry per partition).
docs

[<2500x61954 sparse matrix of type '<class 'numpy.float32'>'
 	with 17603 stored elements in Compressed Sparse Row format>,
 <2500x61954 sparse matrix of type '<class 'numpy.float32'>'
 	with 17707 stored elements in Compressed Sparse Row format>,
 <2500x61954 sparse matrix of type '<class 'numpy.float32'>'
 	with 17983 stored elements in Compressed Sparse Row format>,
 <2500x61954 sparse matrix of type '<class 'numpy.float32'>'
 	with 17394 stored elements in Compressed Sparse Row format>]