# Dask for NLP

In [19]:
import dask.dataframe as dd
import coiled
import pandas as pd
import dask.bag as db
import re

In [2]:
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

## 1. Spin up Cluster

In [62]:
cluster = coiled.Cluster(
    name="dask-nlp",
    software="dask-nlp",
    n_workers=25,
    worker_cpu=4,
    worker_memory="16Gib",
    backend_options={'spot':'True'}
)

Output()

Found software environment build
Created fw rule: inbound [8786-8787] [0.0.0.0/0] []
Created FW rules: coiled-dask-rrpelgr71-120656-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-120656-firewall -> coiled-dask-rrpelgr71-120656-firewall]
Created FW rules: coiled-dask-rrpelgr71-120656-cluster-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-120656-cluster-firewall -> coiled-dask-rrpelgr71-120656-cluster-firewall]
Created scheduler VM: coiled-dask-rrpelgr71-120656-scheduler (type: t3.medium, ip: ['3.226.241.172'])


In [63]:
from distributed import Client
client = Client(cluster)
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://3.226.241.172:8787,

0,1
Dashboard: http://3.226.241.172:8787,Workers: 10
Total threads: 40,Total memory: 151.76 GiB

0,1
Comm: tls://10.4.2.13:8786,Workers: 10
Dashboard: http://10.4.2.13:8787/status,Total threads: 40
Started: Just now,Total memory: 151.76 GiB

0,1
Comm: tls://10.4.9.185:46269,Total threads: 4
Dashboard: http://10.4.9.185:35177/status,Memory: 15.18 GiB
Nanny: tls://10.4.9.185:44129,
Local directory: /dask-worker-space/worker-wd5ybur6,Local directory: /dask-worker-space/worker-wd5ybur6

0,1
Comm: tls://10.4.7.19:40059,Total threads: 4
Dashboard: http://10.4.7.19:46031/status,Memory: 15.18 GiB
Nanny: tls://10.4.7.19:45791,
Local directory: /dask-worker-space/worker-kq9_4_eo,Local directory: /dask-worker-space/worker-kq9_4_eo

0,1
Comm: tls://10.4.2.81:44857,Total threads: 4
Dashboard: http://10.4.2.81:43041/status,Memory: 15.18 GiB
Nanny: tls://10.4.2.81:46825,
Local directory: /dask-worker-space/worker-0n8zs7dy,Local directory: /dask-worker-space/worker-0n8zs7dy

0,1
Comm: tls://10.4.3.89:35793,Total threads: 4
Dashboard: http://10.4.3.89:38797/status,Memory: 15.18 GiB
Nanny: tls://10.4.3.89:37397,
Local directory: /dask-worker-space/worker-t39hkz4o,Local directory: /dask-worker-space/worker-t39hkz4o

0,1
Comm: tls://10.4.5.91:41785,Total threads: 4
Dashboard: http://10.4.5.91:37039/status,Memory: 15.18 GiB
Nanny: tls://10.4.5.91:34063,
Local directory: /dask-worker-space/worker-fxcxxu8x,Local directory: /dask-worker-space/worker-fxcxxu8x

0,1
Comm: tls://10.4.3.62:40473,Total threads: 4
Dashboard: http://10.4.3.62:41125/status,Memory: 15.18 GiB
Nanny: tls://10.4.3.62:33231,
Local directory: /dask-worker-space/worker-04s7huq9,Local directory: /dask-worker-space/worker-04s7huq9

0,1
Comm: tls://10.4.11.140:45051,Total threads: 4
Dashboard: http://10.4.11.140:40905/status,Memory: 15.18 GiB
Nanny: tls://10.4.11.140:33037,
Local directory: /dask-worker-space/worker-8aw5stnx,Local directory: /dask-worker-space/worker-8aw5stnx

0,1
Comm: tls://10.4.1.111:45219,Total threads: 4
Dashboard: http://10.4.1.111:45619/status,Memory: 15.18 GiB
Nanny: tls://10.4.1.111:44723,
Local directory: /dask-worker-space/worker-6w4y3umq,Local directory: /dask-worker-space/worker-6w4y3umq

0,1
Comm: tls://10.4.1.133:35969,Total threads: 4
Dashboard: http://10.4.1.133:36295/status,Memory: 15.18 GiB
Nanny: tls://10.4.1.133:33433,
Local directory: /dask-worker-space/worker-kzzy69rj,Local directory: /dask-worker-space/worker-kzzy69rj

0,1
Comm: tls://10.4.7.140:33697,Total threads: 4
Dashboard: http://10.4.7.140:42683/status,Memory: 15.18 GiB
Nanny: tls://10.4.7.140:45895,
Local directory: /dask-worker-space/worker-jz0f1xmz,Local directory: /dask-worker-space/worker-jz0f1xmz


## 2. Load Clean Twitter Data

The cleaned dataset contains ~6 million tweets = ~650MB.

Let's load in with Dask to save time then load only the tweet contents into local memory.

- `df_full` contains the unlemmatized unique tweets (works with sklearn hopefully)
- `df` contains the lemmatized tweets (works with Gensim)

In [6]:
# read in cleaned, full-text data (only tweet_text column)
df_full = dd.read_parquet(
    's3://coiled-datasets/arabic-tweets/unique_tweets_whole.parquet',
    columns=['tweet_text', 'hashtags', 'is_retweet', 'retweet_tweetid'],
    engine='pyarrow',
).repartition(npartitions=4)

In [7]:
df_full.head()

Unnamed: 0,tweet_text,hashtags,is_retweet,retweet_tweetid
0,السلام عليكم ورحمة الله وبركاته مرحبا عملاء م...,,True,9.986493e+17
1,للتأجير لبيع النطيطات زحاليق مائيه صابونية مل...,"[للتأجير, لبيع النطيطات, زحاليق مائيه صابونية,...",True,9.996373e+17
2,مظلات وسواتر آفاق الرياض مظلات استراحات مظلات...,"[مظلات, آفاق الرياض, مظلات استراحات, مظلات مسا...",True,9.993939e+17
3,فيديو شاهد مواطن يوثق بالفيديو كميات كبيرة من...,,True,9.983516e+17
4,أستغفر الله العظيم وأتوب إليه,,False,


In [8]:
df_full

Unnamed: 0_level_0,tweet_text,hashtags,is_retweet,retweet_tweetid
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,bool,float64
,...,...,...,...
,...,...,...,...
,...,...,...,...
,...,...,...,...


In [4]:
# load in cleaned AND lemmatized data
df = dd.read_parquet(
    's3://coiled-datasets/arabic-tweets/arabic_twitter_clean.parquet',
)

In [5]:
df.head()

Unnamed: 0,tweet_text,hashtags,is_retweet,retweet_tweetid,timestamp_first,user_reference_id
0,"[سَلام_1, عَلَى_1, رَحْمَة_1, اللَّه_1, بَرَكَ...",,True,9.986493e+17,2018-05-25 00:15:00,58
1,"[تَأْجِير_1, بَيْع_1, النطيطات_0, زحاليق_0, ما...","[للتأجير, لبيع النطيطات, زحاليق مائيه صابونية,...",True,9.996373e+17,2018-04-17 12:22:00,0
2,"[مِظَلَّة_1, ساتِر_1, أُفُق_1, رِياض_1, مِظَلّ...","[مظلات, آفاق الرياض, مظلات استراحات, مظلات مسا...",True,9.993939e+17,2018-05-25 00:15:00,58
3,"[فِيدْيُو_1, شاهَد_1, مُواطِن_1, وَثِق-ia_1, ف...",,True,9.983516e+17,2018-05-25 13:06:00,1
4,"[ٱِسْتَغْفَر_1, اللَّه_1, عَظِيم_2, تاب-u_1]",,False,,2014-04-12 03:34:00,657


In [6]:
# get only tweet content
docs = df.tweet_text

This Dask Series is about 70MB in size. That's small enough to load into local memory and continue working locally from there.

In [8]:
# load into local memory
docs = docs.compute()

In [10]:
type(docs)

pandas.core.series.Series

In [11]:
docs

0          [سَلام_1, عَلَى_1, رَحْمَة_1, اللَّه_1, بَرَكَ...
1          [تَأْجِير_1, بَيْع_1, النطيطات_0, زحاليق_0, ما...
2          [مِظَلَّة_1, ساتِر_1, أُفُق_1, رِياض_1, مِظَلّ...
3          [فِيدْيُو_1, شاهَد_1, مُواطِن_1, وَثِق-ia_1, ف...
4               [ٱِسْتَغْفَر_1, اللَّه_1, عَظِيم_2, تاب-u_1]
                                 ...                        
6145778    [أنا_1_0, قلب_3_0, تركي_1_0, ال_1_0, شيخ_2_0, ...
6145779    [أخت_1_0, جوز_2_0, شافه_1_0, طالع_1_0, مسجد_1_...
6145780    [رمضان_1_0, كريم_1_0, الدحيل_0_0, عين_1_0, قدس...
6145781    [رسول_1_0, الله_1_0, جمعة_1_0, ساعة_1_0, وافق_...
6145782    [إنجاز_2_0, شخص_1_0, عضو_1_0, شنو_0_0, إنجاز_2...
Name: tweet_text, Length: 6145783, dtype: object

## 3. Run Arabic Preprocessing

In [34]:
tweets_before_process = df_full['tweet_text'].compute()

In [35]:
tweets_before_process.sample(10)

5799406     عزيزي المتقاعد اشركت في السنة بمبلغ تافه تاكد...
1946587    واااو توقيع مذكرة تعاون بين الإمارات وكينيا ال...
5248444     يظِل الله تعالى المتصدقين يو القيامة في يومَ ...
2651230     القايد يبارك تمويل شخصي بدون كفيل تمويل شرعي ...
5077225     الامير المعاند محب قلم فاخر يسبك الحروف بوشم ...
2961894     اللهُمَ أستَودعتكـ مَن بَات فِي قَبره وحِيداً...
1525804    كيف يحق لنا إن نكره او نكون ضد المجلس العسكري ...
3251222    لن تموت إذا خسرت من تحب ولكنك ستعيش كالميت إذا...
3409278     أتشرف بمتابعتكم لي على الإنستقرام أدخلوا على ...
344683      نائب كويتي يدعو لتبني حملة عربية موحدة بشأن ا...
Name: tweet_text, dtype: object

In [9]:
%%time
tweets = df_full['tweet_text'].compute()

CPU times: user 7.57 s, sys: 2.3 s, total: 9.86 s
Wall time: 3min 20s


### Remove Repeating Characters
Using the regex pattern below, we will replace any character that is repeated more than twice with a single instance of that character. This is to account for informal text input such as (the Arabic equivalents of): "yeeeees" or "haaaahaaa", etc.

In [20]:
# define function
def remove_repeating_char(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

In [31]:
# show what preprocessing function does
remove_repeating_char("ههههه")

'ه'

In [25]:
%%time
tweets = tweets.apply(remove_repeating_char)
tweets

CPU times: user 21.2 s, sys: 66 ms, total: 21.3 s
Wall time: 21.3 s


0           السلام عليكم ورحمة الله وبركاته مرحبا عملاء م...
1           للتأجير لبيع النطيطات زحاليق مائيه صابونية مل...
2           مظلات وسواتر آفاق الرياض مظلات استراحات مظلات...
3           فيديو شاهد مواطن يوثق بالفيديو كميات كبيرة من...
4                             أستغفر الله العظيم وأتوب إليه 
                                 ...                        
6145778          وأنا بقلّب في تركى آل شيخ لقيت التايم لاين 
6145779     اختي جوزها شافها وهي طالعه من مسجد بالعشر الا...
6145780     رمضان كريم الدحيل العين القدس عاصمه فلسطين ال...
6145781     قال رسول الله إنَّ في الجمُعةِ لساعَةٌ لا يوا...
6145782     إنجازات شخصية للأعضاء فقط شنو انجازات المجلس ...
Name: tweet_text, Length: 6145783, dtype: object

### Orthographic Normalization
Let's now move on to normalize spellings to account for inconsistencies across dialects and common spelling 'mistakes'. This will reduce data sparsity.

In [27]:
def ortho_normalize(text):
    text = normalize_alef_maksura_ar(text)
    text = normalize_alef_ar(text)
    text = normalize_teh_marbuta_ar(text)
    return text

`camel-tools` does this by removing particular symbols from particular letters (e.g. the dots from the teh-marbuta and the hamza from the alef). For more details see [the documentation](https://camel-tools.readthedocs.io/en/latest/api/utils/normalize.html).

In [37]:
# show what preprocessing function does
ortho_normalize("أحمر حمرة")

'احمر حمره'

In [28]:
%%time
tweets = tweets.apply(ortho_normalize)

CPU times: user 5.65 s, sys: 137 ms, total: 5.79 s
Wall time: 5.8 s


### Dediacritization
Now let's proceed to remove the diacritics, again to significnatly reduce data sparsity.

*NB: diacritics are, loosely put, the Arabic equivalent of vowels. They are symbols written above or below the main characters that change the pronunciation (and possibly the meaning) of the word. This means that, technically speaking, the different words can look the same when we remove the diacritics. However, fluent Arabic-speaking people can ascertain the correct meaning of the word from context. For example, most Arabic newspapers are written without the diacritics.*

We use the dediac_ar function included in the camel_tools library.

In [32]:
# show what preprocessing function does
dediac_ar("حَرَكَات")

'حركات'

In [30]:
%%time
tweets = tweets.apply(dediac_ar)

CPU times: user 3.89 s, sys: 544 ms, total: 4.44 s
Wall time: 4.63 s


We have now done the basic NLP preprocessing. Let's save this intermediate file containing the **clean, unlemmatized tweets**.

In [38]:
tweets.sample(5)

4860071     الاما ن اللي ب تنا ضح صارت علي مر السنين تب ي...
3829917                           بناديله ولا بيسمعني النوم 
1897250     اصبحنا واصبح الملك لله اللهم اني اسالك خير هذ...
381439     شركه النيل للانتاج الاذاعي تقاضي المطربه شيرين...
1499596     اللهم امين ستحل معظم المشاكل وسيتم القضاء علي...
Name: tweet_text, dtype: object

In [40]:
df_tweets = pd.DataFrame(data=tweets)

In [42]:
# write intermediary file to S3
df_tweets.to_parquet("s3://coiled-datasets/arabic-tweets/tweets_cleaned_unlemmatized.parquet")

We're now set to input these documents into a `Td-Idf Vectorizer` and then perform LDA Topic Modelling with `scikit-learn`. See Section 4

**Alternatively** you can continue performing morphological disambiguation (incl. lemmatization) below. This means you'll have to use Gensim instead of Scikit-Learn for LDA as Gensim works better with lemmatized tokens. Scikit-learn performs the tokenization (as well as stopword removal) as part of the `Tf-Idf Vectorizer`.

### Alternative route: Morphological Disambiguation (incl. Lemmatization)

In [None]:
# First, we need to load a morphological database.
# Here, we load the default database which is used for analyzing
# Modern Standard Arabic. 
db = MorphologyDB.builtin_db()

analyzer = Analyzer(db)

analyses = analyzer.analyze('سيحاسب')

for analysis in analyses:
    print(analysis, '\n')

### Simple Word Tokenize

## 4. Tf-Idf Vectorizer with Sklearn

In [50]:
# turn tweets into list of strings
docs = list(tweets)

In [57]:
# define stopwords
with open('/Users/rpelgrim/Desktop/data/arabic-stopwords.txt', 'r') as file:
    stopwords = file.read()
    stopwords_list = stopwords.split('\n')

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
vectorizer = TfidfVectorizer(stop_words=stopwords_list)

In [61]:
%%time
X = vectorizer.fit_transform(docs)

CPU times: user 49.3 s, sys: 1.44 s, total: 50.7 s
Wall time: 52.7 s


## 5. LDA with Sklearn

In [64]:
from sklearn.decomposition import LatentDirichletAllocation

In [65]:
lda = LatentDirichletAllocation(
    n_components=15,
    random_state=42,
)

In [None]:
%%time
lda.fit(X)

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/tcp.py", line 426, in connect
    stream = await self.client.connect(
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/tornado/tcpclient.py", line 275, in connect
    af, addr, stream = await connector.start(connect_timeout=timeout)
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/asyncio/tasks.py", line 490, in wait_for
    return fut.result()
asyncio.exceptions.CancelledError

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/Users/rpelgrim/mambaforge/envs/dask-nlp/lib/python3.9/site-packages/distributed/comm/core.py", l

## Gensim

### Create BOW Dictionary with Gensim

In [17]:
import gensim

In [18]:
%%time
# create BOW dictionary
dictionary = gensim.corpora.Dictionary(docs)

CPU times: user 49.2 s, sys: 788 ms, total: 50 s
Wall time: 50.4 s


In [19]:
# filter extreme cases out of dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [20]:
%%time
# map docs to bag of words
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client


CPU times: user 37.4 s, sys: 553 ms, total: 38 s
Wall time: 38 s


In [22]:
# inspect
bow_doc_300 = bow_corpus[300]

for i in range(len(bow_doc_300)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_300[i][0], 
                                                     dictionary[bow_doc_300[i][0]],
                                                     bow_doc_300[i][1]))

Word 175 ("تويتر_0") appears 1 time(s).
Word 253 ("بَرْنامَج_1") appears 2 time(s).
Word 912 ("تَخَلُّص_1") appears 1 time(s).
Word 1113 ("تَنْحِيف_1") appears 1 time(s).
Word 1242 ("حَقِيقِيّ_1") appears 1 time(s).
Word 1243 ("مُسْتَعِير_1") appears 1 time(s).
Word 1244 ("ٱِسْم_1") appears 1 time(s).
Word 1331 ("وَزْن_1") appears 1 time(s).
Word 1348 ("كِيلُو_1") appears 1 time(s).
Word 1676 ("الكورس_0") appears 1 time(s).
Word 1677 ("تَثْبِيت_1") appears 1 time(s).
Word 1680 ("وَرْس_1") appears 1 time(s).


### Run LDA with Gensim

In [23]:
%%time
lda_model_5 = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=5, 
                                         id2word=dictionary, 
                                         passes=2, 
                                         workers=7,
                                         random_state=21)

CPU times: user 1min 10s, sys: 29.8 s, total: 1min 40s
Wall time: 2min 29s


### Visualize LDA with pyLDAvis

In [25]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [26]:
%%time
# prepare visualisation data
vis_data_LDA_5 = gensimvis.prepare(lda_model_5, bow_corpus, dictionary)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


CPU times: user 3min 57s, sys: 2.55 s, total: 4min
Wall time: 4min 3s


In [29]:
# create filepath to save HTML visualisation
filepath = "/Users/rpelgrim/Desktop/LDA_5.html"

In [30]:
# save visualisation to HTML in repo
pyLDAvis.save_html(vis_data_LDA_5, filepath)

Excellent. This works.

BUT I actually really want to get the SKLEARN version working so I can use Dask as a backend.

## SKlearn

### Vectorize

Vectorizing isn't possible at the moment because the cleaned dataframe contains numpy arrays of the lemmas. The `Vectorizers` expect a string per document. 

**TO DO: Try loading in the untokenized, cleaned tweet texts and Vectorizing those directly. NO >> Arabic-specific preprocessing to do. OR find a way to write custom preprocessor and tokenizers.**

To do that I'll probably have to:
- input custom preprocessors/tokenizers.
- input the list of stop words (we have it somewhere)
- 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
CountVec = CountVectorizer(ngram_range=(2,2))
Count_data = CountVec.fit_transform(docs)

## Dask-ML

The array of documents `X` is only 47MB. Doesn't make sense to use Dask-ML for this. Instead use `sklearn` tf-idf vectorizer and then train LDA in parallel with Dask backend.

In [10]:
# vectorize contents
from dask_ml.feature_extraction.text import HashingVectorizer
from dask_ml.feature_extraction.text import CountVectorizer

### Hashing Vectorizer

In [11]:
vect = HashingVectorizer(lowercase=False)

In [12]:
X = df_full['tweet_text'].to_dask_array(lengths=True)

In [18]:
X

Unnamed: 0,Array,Chunk
Bytes,46.89 MiB,46.89 MiB
Shape,"(6145783,)","(6145783,)"
Count,3 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 46.89 MiB 46.89 MiB Shape (6145783,) (6145783,) Count 3 Tasks 1 Chunks Type object numpy.ndarray",6145783  1,

Unnamed: 0,Array,Chunk
Bytes,46.89 MiB,46.89 MiB
Shape,"(6145783,)","(6145783,)"
Count,3 Tasks,1 Chunks
Type,object,numpy.ndarray


In [17]:
X[1].compute()

' للتأجير لبيع النطيطات زحاليق مائيه صابونية ملاعب صابونيه زحاليق في جدة ألعاب أولاد بنات بالرياض '

In [34]:
docs_vect = vect.fit_transform(docs)

In [35]:
docs_vect.compute_chunk_sizes()

TypeError: cannot use a string pattern on a bytes-like object

In [None]:
docs_local = docs_vect.compute().toarray()

## X. Preprocessing with Dask Bags (not working)

In [35]:
# cast tweet texts into a Dask bag
bag = df_full['tweet_text'].to_bag(index=False)

In [19]:
# get number of items in bag
bag.count().compute()

6145783

In [36]:
t = bag.take(1)

In [37]:
t

(' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر ',)

In [38]:
type(t)

tuple

In [39]:
t[0]

' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر '

In [40]:
# extract value from tuple
def get_tweets(element):
    return element[0]

In [41]:
tweets = bag.map(get_tweets)

In [42]:
tweets.take(1)

(' ',)

In [27]:
t[0]

' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر '

In [25]:
type(t)

tuple

I think there's an issue with how the values are cast into the Bag. Seems like they're being cast as tuples when I actually just want the value. Is that what's tripping up the `bag.apply` and killing workers?

### Remove Repeating Characters

In [44]:
# remove repeating characters if character repeats more than once
def remove_repeating_char(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

In [46]:
# apply regex function to contents of Dask bag
bag2 = db.map(remove_repeating_char, bag)

dask.bag<remove_repeating_char, npartitions=4>

In [47]:
bag2.take(1)

(' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر ',)