# Detecting Arabic-Language Misinformation on Twitter

In [1]:
import dask.dataframe as dd
import coiled
import pandas as pd
import dask.bag as db
import re

In [2]:
from camel_tools.tokenizers.word import simple_word_tokenize
from camel_tools.utils.normalize import normalize_alef_maksura_ar
from camel_tools.utils.normalize import normalize_alef_ar
from camel_tools.utils.normalize import normalize_teh_marbuta_ar
from camel_tools.utils.dediac import dediac_ar
from camel_tools.morphology.database import MorphologyDB
from camel_tools.morphology.analyzer import Analyzer
from camel_tools.disambig.mle import MLEDisambiguator
from camel_tools.tokenizers.morphological import MorphologicalTokenizer

## 1. Spin up Cluster

In [3]:
cluster = coiled.Cluster(
    name="dask-nlp",
    software="dask-nlp",
    n_workers=25,
    worker_cpu=4,
    worker_memory="16Gib",
    backend_options={'spot':'True'},
    scheduler_options={'idle_timeout':'2 hours'}
)

Output()

Found software environment build
Created fw rule: inbound [8786-8787] [0.0.0.0/0] []
Created FW rules: coiled-dask-rrpelgr71-121100-firewall
Created FW rules: coiled-dask-rrpelgr71-121100-cluster-firewall
Created fw rule: cluster [0-65535] [None] [coiled-dask-rrpelgr71-121100-cluster-firewall -> coiled-dask-rrpelgr71-121100-cluster-firewall]
Created scheduler VM: coiled-dask-rrpelgr71-121100-scheduler (type: t3.medium, ip: ['3.220.169.225'])


In [4]:
from distributed import Client
client = Client(cluster)
client


+---------+----------------+---------------+---------------+
| Package | client         | scheduler     | workers       |
+---------+----------------+---------------+---------------+
| msgpack | 1.0.3          | 1.0.2         | 1.0.2         |
| python  | 3.9.10.final.0 | 3.9.7.final.0 | 3.9.7.final.0 |
+---------+----------------+---------------+---------------+
Notes: 
-  msgpack: Variation is ok, as long as everything is above 0.6


0,1
Connection method: Cluster object,Cluster type: coiled.Cluster
Dashboard: http://3.220.169.225:8787,

0,1
Dashboard: http://3.220.169.225:8787,Workers: 15
Total threads: 60,Total memory: 229.94 GiB

0,1
Comm: tls://10.4.9.7:8786,Workers: 15
Dashboard: http://10.4.9.7:8787/status,Total threads: 60
Started: Just now,Total memory: 229.94 GiB

0,1
Comm: tls://10.4.10.139:35837,Total threads: 4
Dashboard: http://10.4.10.139:45483/status,Memory: 15.34 GiB
Nanny: tls://10.4.10.139:41581,
Local directory: /dask-worker-space/worker-d5r79fs0,Local directory: /dask-worker-space/worker-d5r79fs0

0,1
Comm: tls://10.4.3.85:36921,Total threads: 4
Dashboard: http://10.4.3.85:40223/status,Memory: 15.34 GiB
Nanny: tls://10.4.3.85:46501,
Local directory: /dask-worker-space/worker-tclwigrk,Local directory: /dask-worker-space/worker-tclwigrk

0,1
Comm: tls://10.4.9.137:39001,Total threads: 4
Dashboard: http://10.4.9.137:36991/status,Memory: 15.34 GiB
Nanny: tls://10.4.9.137:40463,
Local directory: /dask-worker-space/worker-6rs0k8ox,Local directory: /dask-worker-space/worker-6rs0k8ox

0,1
Comm: tls://10.4.1.230:43709,Total threads: 4
Dashboard: http://10.4.1.230:46241/status,Memory: 15.34 GiB
Nanny: tls://10.4.1.230:39923,
Local directory: /dask-worker-space/worker-si85s3_3,Local directory: /dask-worker-space/worker-si85s3_3

0,1
Comm: tls://10.4.4.122:36191,Total threads: 4
Dashboard: http://10.4.4.122:46201/status,Memory: 15.34 GiB
Nanny: tls://10.4.4.122:33237,
Local directory: /dask-worker-space/worker-asfhajhe,Local directory: /dask-worker-space/worker-asfhajhe

0,1
Comm: tls://10.4.4.37:41293,Total threads: 4
Dashboard: http://10.4.4.37:45687/status,Memory: 15.34 GiB
Nanny: tls://10.4.4.37:45503,
Local directory: /dask-worker-space/worker-qs78fzhe,Local directory: /dask-worker-space/worker-qs78fzhe

0,1
Comm: tls://10.4.9.31:36013,Total threads: 4
Dashboard: http://10.4.9.31:41925/status,Memory: 15.34 GiB
Nanny: tls://10.4.9.31:46199,
Local directory: /dask-worker-space/worker-x5l9bz3b,Local directory: /dask-worker-space/worker-x5l9bz3b

0,1
Comm: tls://10.4.6.186:43801,Total threads: 4
Dashboard: http://10.4.6.186:42073/status,Memory: 15.34 GiB
Nanny: tls://10.4.6.186:37097,
Local directory: /dask-worker-space/worker-qd7mnmey,Local directory: /dask-worker-space/worker-qd7mnmey

0,1
Comm: tls://10.4.14.254:37995,Total threads: 4
Dashboard: http://10.4.14.254:36255/status,Memory: 15.34 GiB
Nanny: tls://10.4.14.254:36159,
Local directory: /dask-worker-space/worker-jojeamqh,Local directory: /dask-worker-space/worker-jojeamqh

0,1
Comm: tls://10.4.6.201:41027,Total threads: 4
Dashboard: http://10.4.6.201:46633/status,Memory: 15.18 GiB
Nanny: tls://10.4.6.201:45861,
Local directory: /dask-worker-space/worker-4d4eq9aw,Local directory: /dask-worker-space/worker-4d4eq9aw

0,1
Comm: tls://10.4.1.107:41871,Total threads: 4
Dashboard: http://10.4.1.107:35103/status,Memory: 15.34 GiB
Nanny: tls://10.4.1.107:38897,
Local directory: /dask-worker-space/worker-q8c6aq6o,Local directory: /dask-worker-space/worker-q8c6aq6o

0,1
Comm: tls://10.4.11.86:43647,Total threads: 4
Dashboard: http://10.4.11.86:38473/status,Memory: 15.34 GiB
Nanny: tls://10.4.11.86:35055,
Local directory: /dask-worker-space/worker-u828wv86,Local directory: /dask-worker-space/worker-u828wv86

0,1
Comm: tls://10.4.1.111:39457,Total threads: 4
Dashboard: http://10.4.1.111:46623/status,Memory: 15.34 GiB
Nanny: tls://10.4.1.111:45963,
Local directory: /dask-worker-space/worker-t5rytauy,Local directory: /dask-worker-space/worker-t5rytauy

0,1
Comm: tls://10.4.10.235:42559,Total threads: 4
Dashboard: http://10.4.10.235:42897/status,Memory: 15.34 GiB
Nanny: tls://10.4.10.235:36391,
Local directory: /dask-worker-space/worker-z18ew92y,Local directory: /dask-worker-space/worker-z18ew92y

0,1
Comm: tls://10.4.14.240:43471,Total threads: 4
Dashboard: http://10.4.14.240:37991/status,Memory: 15.34 GiB
Nanny: tls://10.4.14.240:41853,
Local directory: /dask-worker-space/worker-cioxc8g8,Local directory: /dask-worker-space/worker-cioxc8g8


## 2. Load Clean Twitter Data

The cleaned dataset contains ~6 million tweets = ~650MB.

Let's load in with Dask to save time then load only the tweet contents into local memory.

- `df_full` contains the unlemmatized unique tweets (works with sklearn hopefully)
- `df` contains the lemmatized tweets (works with Gensim)

In [5]:
# read in cleaned, full-text data (only tweet_text column)
df_full = dd.read_parquet(
    's3://coiled-datasets/arabic-tweets/unique_tweets_whole.parquet',
    columns=['tweet_text', 'hashtags', 'is_retweet', 'retweet_tweetid'],
    engine='pyarrow',
).repartition(npartitions=4)

In [6]:
df_full.head()

Unnamed: 0,tweet_text,hashtags,is_retweet,retweet_tweetid
0,السلام عليكم ورحمة الله وبركاته مرحبا عملاء م...,,True,9.986493e+17
1,للتأجير لبيع النطيطات زحاليق مائيه صابونية مل...,"[للتأجير, لبيع النطيطات, زحاليق مائيه صابونية,...",True,9.996373e+17
2,مظلات وسواتر آفاق الرياض مظلات استراحات مظلات...,"[مظلات, آفاق الرياض, مظلات استراحات, مظلات مسا...",True,9.993939e+17
3,فيديو شاهد مواطن يوثق بالفيديو كميات كبيرة من...,,True,9.983516e+17
4,أستغفر الله العظيم وأتوب إليه,,False,


In [7]:
df_full

Unnamed: 0_level_0,tweet_text,hashtags,is_retweet,retweet_tweetid
npartitions=4,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,bool,float64
,...,...,...,...
,...,...,...,...
,...,...,...,...
,...,...,...,...


## 3. Run Arabic Preprocessing

In [8]:
%%time
tweets = df_full['tweet_text'].compute()

CPU times: user 10.4 s, sys: 2.54 s, total: 12.9 s
Wall time: 13min 24s


### Remove Repeating Characters
Using the regex pattern below, we will replace any character that is repeated more than twice with a single instance of that character. This is to account for informal text input such as (the Arabic equivalents of): "yeeeees" or "haaaahaaa", etc.

In [9]:
# define function
def remove_repeating_char(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

In [10]:
# show what preprocessing function does
remove_repeating_char("ههههه")

'ه'

In [11]:
%%time
tweets = tweets.apply(remove_repeating_char)
tweets

CPU times: user 21.4 s, sys: 61.6 ms, total: 21.5 s
Wall time: 21.5 s


0           السلام عليكم ورحمة الله وبركاته مرحبا عملاء م...
1           للتأجير لبيع النطيطات زحاليق مائيه صابونية مل...
2           مظلات وسواتر آفاق الرياض مظلات استراحات مظلات...
3           فيديو شاهد مواطن يوثق بالفيديو كميات كبيرة من...
4                             أستغفر الله العظيم وأتوب إليه 
                                 ...                        
6145778          وأنا بقلّب في تركى آل شيخ لقيت التايم لاين 
6145779     اختي جوزها شافها وهي طالعه من مسجد بالعشر الا...
6145780     رمضان كريم الدحيل العين القدس عاصمه فلسطين ال...
6145781     قال رسول الله إنَّ في الجمُعةِ لساعَةٌ لا يوا...
6145782     إنجازات شخصية للأعضاء فقط شنو انجازات المجلس ...
Name: tweet_text, Length: 6145783, dtype: object

### Orthographic Normalization
Let's now move on to normalize spellings to account for inconsistencies across dialects and common spelling 'mistakes'. This will reduce data sparsity.

In [12]:
def ortho_normalize(text):
    text = normalize_alef_maksura_ar(text)
    text = normalize_alef_ar(text)
    text = normalize_teh_marbuta_ar(text)
    return text

`camel-tools` does this by removing particular symbols from particular letters (e.g. the dots from the teh-marbuta and the hamza from the alef). For more details see [the documentation](https://camel-tools.readthedocs.io/en/latest/api/utils/normalize.html).

In [13]:
# show what preprocessing function does
ortho_normalize("أحمر حمرة")

'احمر حمره'

In [14]:
%%time
tweets = tweets.apply(ortho_normalize)

CPU times: user 5.69 s, sys: 153 ms, total: 5.84 s
Wall time: 5.86 s


### Dediacritization
Now let's proceed to remove the diacritics, again to significnatly reduce data sparsity.

*NB: diacritics are, loosely put, the Arabic equivalent of vowels. They are symbols written above or below the main characters that change the pronunciation (and possibly the meaning) of the word. This means that, technically speaking, the different words can look the same when we remove the diacritics. However, fluent Arabic-speaking people can ascertain the correct meaning of the word from context. For example, most Arabic newspapers are written without the diacritics.*

We use the dediac_ar function included in the camel_tools library.

In [15]:
# show what preprocessing function does
dediac_ar("حَرَكَات")

'حركات'

In [16]:
%%time
tweets = tweets.apply(dediac_ar)

CPU times: user 3.96 s, sys: 281 ms, total: 4.24 s
Wall time: 4.32 s


We have now done the basic NLP preprocessing. Let's save this intermediate file containing the **clean, unlemmatized tweets**.

In [31]:
tweets.sample(5)

4742556     كم نسبتك بالثانويه تسديد القروض البنكيه الراج...
5075488     الابتعاد عن الاشخاص الذين يتعمدوا في تعكير مز...
674847      مناحل ابو سلطان اقسم بالله عسل سدر بلدي طبيعي...
287904     اللهم اجعل امي لاتشكي هما ولا تتالم وجعا واسعد...
5211478     سقوط جرحي من المدنيين اثر قصف مدفعي لقوات الن...
Name: tweet_text, dtype: object

In [32]:
df_tweets = pd.DataFrame(data=tweets)

In [42]:
# write intermediary file to S3
df_tweets.to_parquet("s3://coiled-datasets/arabic-tweets/tweets_cleaned_unlemmatized.parquet")

We're now set to input these documents into a `Td-Idf Vectorizer` and then perform LDA Topic Modelling with `scikit-learn`. See Section 4

**Alternatively** you can continue performing morphological disambiguation (incl. lemmatization) below. This means you'll have to use Gensim instead of Scikit-Learn for LDA as Gensim works better with lemmatized tokens. Scikit-learn performs the tokenization (as well as stopword removal) as part of the `Tf-Idf Vectorizer`.

### Alternative route: Morphological Disambiguation (incl. Lemmatization)
Arabic has a very rich inflectional system. A verb could have up to 5400 inflections (compared to 6 in English and 1 in Chinese). So the trick is knowing...what does a word mean? Especially when stripped of its diacritics?

CAMeL Tools allows us to perform analysis against a morphological database to get all of that word's possible meanings. We can then select one.

In [33]:
# First, we need to load a morphological database.
# Here, we load the default database which is used for analyzing
# Modern Standard Arabic. 
db = MorphologyDB.builtin_db()

analyzer = Analyzer(db)

analyses = analyzer.analyze('سيحاسب')

for analysis in analyses:
    print(analysis, '\n')

{'diac': 'سَيُحاسِب', 'lex': 'حاسَب', 'bw': 'سَ/FUT_PART+يُ/IV3MS+حاسِب/IV', 'gloss': 'will_+_he;it+hold_responsible;get_even_with', 'pos': 'verb', 'prc3': '0', 'prc2': '0', 'prc1': 'sa_fut', 'prc0': '0', 'per': '3', 'asp': 'i', 'vox': 'a', 'mod': 'i', 'stt': 'na', 'cas': 'na', 'enc0': '0', 'rat': 'n', 'source': 'lex', 'form_gen': 'm', 'form_num': 's', 'd3seg': 'سَ+_يُحاسِب', 'caphi': 's_a_y_u_7_aa_s_i_b', 'd1tok': 'سَيُحاسِب', 'd2tok': 'سَ+_يُحاسِب', 'pos_logprob': -1.023208, 'd3tok': 'سَ+_يُحاسِب', 'd2seg': 'سَ+_يُحاسِب', 'pos_lex_logprob': -5.099521, 'num': 's', 'ud': 'AUX+VERB', 'gen': 'm', 'catib6': 'PRT+VRB', 'root': 'ح.س.ب', 'bwtok': 'سَ+_يُ+_حاسِب', 'pattern': 'سَيُ1ا2ِ3', 'lex_logprob': -5.099521, 'atbtok': 'سَ+_يُحاسِب', 'atbseg': 'سَ+_يُحاسِب', 'd1seg': 'سَيُحاسِب', 'stem': 'حاسِب', 'stemgloss': 'hold_responsible;get_even_with', 'stemcat': 'IV_yu'} 



### Simple Word Tokenize
Before we can perform Morpohological Disambiguation (select a particular meaning and form of our word from the range of possibilities), we need to perform a simple word tokenizing in order to be able to feed these into the disambiguating algorithm.

While testing this tool, we discovered that the word يارب was not being tokenized correctly. It is, in fact, two words, but because some tweets include it as one word it was getting processed incorrectly. Therefore, let's first split the instances of يارب and insert a whitespace in between them so that it's tokenized properly.

In [34]:
# define variables with strings to avoid problems with right-to-left order in .replace() call
yarab = 'يارب'
ya_rab = 'يا رب'

In [35]:
def split_yarab(text):
    text = text.replace(yarab, ya_rab)
    return text

In [36]:
%%time
tweets = tweets.apply(split_yarab)

CPU times: user 1.22 s, sys: 24.1 ms, total: 1.24 s
Wall time: 1.24 s


Let's now use the `simple_word_tokenize` function to tokenize our tweets.

In [37]:
%%time
tokens = tweets.apply(simple_word_tokenize)

CPU times: user 19.9 s, sys: 12.6 s, total: 32.6 s
Wall time: 1min 2s


### Removing Stop Words
Using [this Github text file](https://github.com/mohataher/arabic-stop-words), we will define our set of Arabic stop words to remove from the tokenized tweet_text column.

In [38]:
# define stopwords
with open('/Users/rpelgrim/Desktop/data/arabic-stopwords.txt', 'r') as file:
    stopwords = file.read()
    stopwords_list = stopwords.split('\n')

In [39]:
def remove_stopwords(tokenized_text):
    tokens_without_sw = [word for word in tokenized_text if word not in stopwords_list]
    return tokens_without_sw

In [40]:
%%time
tokens_nostop = tokens.apply(remove_stopwords)

CPU times: user 8min 28s, sys: 35.9 s, total: 9min 4s
Wall time: 10min 52s


In [41]:
df_tokens_nostop = pd.DataFrame(data=tokens_nostop)

In [43]:
# write intermediary file to S3
df_tokens_nostop.to_parquet("s3://coiled-datasets/arabic-tweets/tweets_tokenized_nostopwords.parquet")

### Morphological Disambiguation
The next and final step is to conduct morphological disambiguation: to reduce the range of possible forms and meanings of the words in our Arabic text (which has been dediacritized and therefore can have multiple meanings) to a single form and meaning.

For this project we will also use this step to directly lemmatize our tokens. There are many different ways to create 'morphological tokens' (using 9 different schemas built into the CAMeL Morphological Disambiguator). But since we will be conducting Topic Modelling on the text, the lemmas will suffice for our purposes.

In [44]:
# instantiate the Maximum Likelihood Disambiguator
mle = MLEDisambiguator.pretrained()

Let's run on a sample sentence to see how it works:

In [45]:
# The disambiguator expects pre-tokenized text
sentence = simple_word_tokenize('نجح بايدن في الانتخابات')

disambig = mle.disambiguate(sentence)

# For each disambiguated word d in disambig, d.analyses is a list of analyses
# sorted from most likely to least likely. Therefore, d.analyses[0] would
# be the most likely analysis for a given word. Below we extract different
# features from the top analysis of each disambiguated word into seperate lists.
diacritized = [d.analyses[0].analysis['diac'] for d in disambig]
pos_tags = [d.analyses[0].analysis['pos'] for d in disambig]
lemmas = [d.analyses[0].analysis['lex'] for d in disambig]

# Print the combined feature values extracted above
for triplet in zip(diacritized, pos_tags, lemmas):
    print(triplet)

# print lemmas
print(lemmas)

('نَجَحَ', 'verb', 'نَجَح')
('بايدن', 'noun_prop', 'بايدن')
('فِي', 'prep', 'فِي')
('الاِنْتِخاباتِ', 'noun', 'ٱِنْتِخاب')
['نَجَح', 'بايدن', 'فِي', 'ٱِنْتِخاب']


The above example from the CAMeL documentation works perfectly.

Let's now adapt so that we can get just the lemmas.

**NOTE**: We included the try/except clauses because some list indexing was throwing an 'out of range' error. **The function now returns NaN if it can't lemmatize a token.**

In [46]:
def get_lemmas(tokenized_text):
    disambig = mle.disambiguate(tokenized_text)
    try:
        lemmas = [d.analyses[0].analysis['lex'] for d in disambig]
        return lemmas
    except:
        return np.nan

In [None]:
%%time
# NOTE: this cell takes a long time to run (>1 hour on 8-core Macbook Pro)
lemmas = tokens_nostop.apply(get_lemmas)

Awesome -- we've now got our lemmatized tokens and are ready to continue on to our Topic Modelling.

## 4. Topic Modelling: LDA with Gensim

In [49]:
# load in cleaned AND lemmatized data
df = pd.read_parquet(
    's3://coiled-datasets/arabic-tweets/arabic_twitter_clean.parquet',
)

In [51]:
df.head()

Unnamed: 0,tweet_text,hashtags,is_retweet,retweet_tweetid,timestamp_first,user_reference_id
0,"[سَلام_1, عَلَى_1, رَحْمَة_1, اللَّه_1, بَرَكَ...",,True,9.986493e+17,2018-05-25 00:15:00,58
1,"[تَأْجِير_1, بَيْع_1, النطيطات_0, زحاليق_0, ما...","[للتأجير, لبيع النطيطات, زحاليق مائيه صابونية,...",True,9.996373e+17,2018-04-17 12:22:00,0
2,"[مِظَلَّة_1, ساتِر_1, أُفُق_1, رِياض_1, مِظَلّ...","[مظلات, آفاق الرياض, مظلات استراحات, مظلات مسا...",True,9.993939e+17,2018-05-25 00:15:00,58
3,"[فِيدْيُو_1, شاهَد_1, مُواطِن_1, وَثِق-ia_1, ف...",,True,9.983516e+17,2018-05-25 13:06:00,1
4,"[ٱِسْتَغْفَر_1, اللَّه_1, عَظِيم_2, تاب-u_1]",,False,,2014-04-12 03:34:00,657


In [52]:
# get only tweet content
docs = df.tweet_text

In [53]:
docs

0          [سَلام_1, عَلَى_1, رَحْمَة_1, اللَّه_1, بَرَكَ...
1          [تَأْجِير_1, بَيْع_1, النطيطات_0, زحاليق_0, ما...
2          [مِظَلَّة_1, ساتِر_1, أُفُق_1, رِياض_1, مِظَلّ...
3          [فِيدْيُو_1, شاهَد_1, مُواطِن_1, وَثِق-ia_1, ف...
4               [ٱِسْتَغْفَر_1, اللَّه_1, عَظِيم_2, تاب-u_1]
                                 ...                        
6145778    [أنا_1_0, قلب_3_0, تركي_1_0, ال_1_0, شيخ_2_0, ...
6145779    [أخت_1_0, جوز_2_0, شافه_1_0, طالع_1_0, مسجد_1_...
6145780    [رمضان_1_0, كريم_1_0, الدحيل_0_0, عين_1_0, قدس...
6145781    [رسول_1_0, الله_1_0, جمعة_1_0, ساعة_1_0, وافق_...
6145782    [إنجاز_2_0, شخص_1_0, عضو_1_0, شنو_0_0, إنجاز_2...
Name: tweet_text, Length: 6145783, dtype: object

### Create BOW Dictionary with Gensim

In [54]:
import gensim

In [55]:
%%time
# create BOW dictionary
dictionary = gensim.corpora.Dictionary(docs)

CPU times: user 51.3 s, sys: 764 ms, total: 52.1 s
Wall time: 53.3 s


In [56]:
# filter extreme cases out of dictionary
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

In [57]:
%%time
# map docs to bag of words
bow_corpus = [dictionary.doc2bow(doc) for doc in docs]

CPU times: user 30.4 s, sys: 20.3 s, total: 50.6 s
Wall time: 2min 17s


Let's test our Bag-of-Words for good measure.

In [58]:
# inspect
bow_doc_300 = bow_corpus[300]

for i in range(len(bow_doc_300)):
    print("Word {} (\"{}\") appears {} time(s).".format(bow_doc_300[i][0], 
                                                     dictionary[bow_doc_300[i][0]],
                                                     bow_doc_300[i][1]))

Word 175 ("تويتر_0") appears 1 time(s).
Word 253 ("بَرْنامَج_1") appears 2 time(s).
Word 912 ("تَخَلُّص_1") appears 1 time(s).
Word 1113 ("تَنْحِيف_1") appears 1 time(s).
Word 1242 ("حَقِيقِيّ_1") appears 1 time(s).
Word 1243 ("مُسْتَعِير_1") appears 1 time(s).
Word 1244 ("ٱِسْم_1") appears 1 time(s).
Word 1331 ("وَزْن_1") appears 1 time(s).
Word 1348 ("كِيلُو_1") appears 1 time(s).
Word 1676 ("الكورس_0") appears 1 time(s).
Word 1677 ("تَثْبِيت_1") appears 1 time(s).
Word 1680 ("وَرْس_1") appears 1 time(s).


### Run LDA with Gensim

Experimentation in [a separate notebook](https://github.com/rrpelgrim/portfolio/blob/master/0_FINAL_CAPSTONE_Identifying_Politiical_Misinformation/notebooks/03-rrp-topic-modelling.ipynb) showed that the LDA Model with 15 topics performed the best out of 5 tested options. Below, we provide a summary of our in-depth analysis of the LDA Visualisation Report of this 15-Topic LDA Model.

- LDA Visualisation shows Top 30 words that occur in each Topic
- This more in-depth view of the topics confirms our initial 'First-Glance Analysis':
  - There are 2 clearly political clusters
  - There is 1 cluster mixing political with misc. content
- The 15 clusters are not evenly distributed throughout the clustering space. Instead, there is one cluster on one side, and all 14 other clusters are overlapping on the other (see screenshot). This may be a sign that this clustering is not functioning entirely as it should.

In [59]:
%%time
lda_model_15 = gensim.models.LdaMulticore(bow_corpus, 
                                         num_topics=15, 
                                         id2word=dictionary, 
                                         passes=2, 
                                         workers=7,
                                         random_state=21)

CPU times: user 1min 7s, sys: 3min 41s, total: 4min 48s
Wall time: 5min 33s


In [60]:
from gensim.models import CoherenceModel

In [61]:
# evaluate model using Topic Coherence score
cm_15 = CoherenceModel(model=lda_model_15, corpus=bow_corpus, texts=docs, coherence='c_v')
coherence_15 = cm_15.get_coherence()  # get coherence value

In [62]:
coherence_15

0.6059190414540195

### Visualize LDA with pyLDAvis

In [63]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [68]:
%%time
# prepare visualisation data
vis_data = gensimvis.prepare(lda_model_15, bow_corpus, dictionary)

  default_term_info = default_term_info.sort_values(
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload


CPU times: user 3min 56s, sys: 3.51 s, total: 3min 59s
Wall time: 4min 7s


In [65]:
# create filepath to save HTML visualisation
filepath = "/Users/rpelgrim/Desktop/LDA_5.html"

In [None]:
# save visualisation to HTML in repo
pyLDAvis.save_html(vis_data, filepath)

In [None]:
from IPython.display import HTML
HTML(filename='/Users/rpelgrim/Desktop/LDA_5.html')

### Subset Political from LDA Output

In [None]:
# define function to get topics
def get_topics_LDA_15(row):
    index = int(row.name)
    try:
        topic = sorted(lda_model_15.get_document_topics(bow_corpus[index], minimum_probability=0.4), reverse=True)[0][0]
        return topic
    except:
        return np.nan

In [None]:
%%time
# assign topic labels 
df['topic'] = df.apply(get_topics_LDA_15, axis=1)

In [None]:
# filter tweets with political topics
df_pol = df[df.topic.isin([5,11,13])]

In [None]:
# get shape
df_pol.shape

Extracting just the tweets labelled with Topic 5, 11 or 13, yields a dataframe of **just under 350K political tweets**.

### Run GSDMM on LDA Output

In [None]:
# create array of documents
docs_pol = df_pol.tweet_text.to_numpy()

In [None]:
%%time
# create BOW dictionary
dictionary_pol = gensim.corpora.Dictionary(docs_pol)

In [None]:
# get vocab length
vocab_length_pol = len(dictionary_pol)

In [None]:
%%time
# map docs to bag of words
bow_corpus_pol = [dictionary_pol.doc2bow(doc) for doc in docs_pol]

In [None]:
# instantiate GSDMM
gsdmm_pol = MovieGroupProcess(K=30, alpha=0.4, beta=1, n_iters=12)

In [None]:
%%time
y_pol = gsdmm_pol.fit(docs_pol, vocab_length_pol)

## 4. Tf-Idf Vectorizer with Sklearn

In [19]:
# turn tweets into list of strings
docs = list(tweets)

In [20]:
# define stopwords
with open('/Users/rpelgrim/Desktop/data/arabic-stopwords.txt', 'r') as file:
    stopwords = file.read()
    stopwords_list = stopwords.split('\n')

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
vectorizer = TfidfVectorizer(stop_words=stopwords_list)

In [23]:
%%time
X = vectorizer.fit_transform(docs)



CPU times: user 51.8 s, sys: 1.06 s, total: 52.9 s
Wall time: 53.2 s


## 5. LDA with Sklearn

In [24]:
from sklearn.decomposition import LatentDirichletAllocation

In [29]:
lda = LatentDirichletAllocation(
    n_components=5,
    random_state=42,
    n_jobs=-1
)

In [26]:
import joblib

In [None]:
%%time
with joblib.parallel_backend("dask"):
    lda.fit(X)

^^ This gives "module not found scipy.sparse..."

- still not working, even after updating coiled s-env to explicitly include scipy and scikit-learn

In [28]:
%%time
lda.fit(X)

CPU times: user 53min 33s, sys: 15.2 s, total: 53min 48s
Wall time: 53min 46s


LatentDirichletAllocation(n_components=5, random_state=42)

In [30]:
%%time
lda.fit(X)

CPU times: user 1min 47s, sys: 21.4 s, total: 2min 8s
Wall time: 21min 56s


LatentDirichletAllocation(n_components=5, n_jobs=-1, random_state=42)

## Gensim

## SKlearn

### Vectorize

Vectorizing isn't possible at the moment because the cleaned dataframe contains numpy arrays of the lemmas. The `Vectorizers` expect a string per document. 

**TO DO: Try loading in the untokenized, cleaned tweet texts and Vectorizing those directly. NO >> Arabic-specific preprocessing to do. OR find a way to write custom preprocessor and tokenizers.**

To do that I'll probably have to:
- input custom preprocessors/tokenizers.
- input the list of stop words (we have it somewhere)
- 

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
CountVec = CountVectorizer(ngram_range=(2,2))
Count_data = CountVec.fit_transform(docs)

## Dask-ML

The array of documents `X` is only 47MB. Doesn't make sense to use Dask-ML for this. Instead use `sklearn` tf-idf vectorizer and then train LDA in parallel with Dask backend.

In [10]:
# vectorize contents
from dask_ml.feature_extraction.text import HashingVectorizer
from dask_ml.feature_extraction.text import CountVectorizer

### Hashing Vectorizer

In [11]:
vect = HashingVectorizer(lowercase=False)

In [12]:
X = df_full['tweet_text'].to_dask_array(lengths=True)

In [18]:
X

Unnamed: 0,Array,Chunk
Bytes,46.89 MiB,46.89 MiB
Shape,"(6145783,)","(6145783,)"
Count,3 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 46.89 MiB 46.89 MiB Shape (6145783,) (6145783,) Count 3 Tasks 1 Chunks Type object numpy.ndarray",6145783  1,

Unnamed: 0,Array,Chunk
Bytes,46.89 MiB,46.89 MiB
Shape,"(6145783,)","(6145783,)"
Count,3 Tasks,1 Chunks
Type,object,numpy.ndarray


In [17]:
X[1].compute()

' للتأجير لبيع النطيطات زحاليق مائيه صابونية ملاعب صابونيه زحاليق في جدة ألعاب أولاد بنات بالرياض '

In [34]:
docs_vect = vect.fit_transform(docs)

In [35]:
docs_vect.compute_chunk_sizes()

TypeError: cannot use a string pattern on a bytes-like object

In [None]:
docs_local = docs_vect.compute().toarray()

## X. Preprocessing with Dask Bags (not working)

In [35]:
# cast tweet texts into a Dask bag
bag = df_full['tweet_text'].to_bag(index=False)

In [19]:
# get number of items in bag
bag.count().compute()

6145783

In [36]:
t = bag.take(1)

In [37]:
t

(' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر ',)

In [38]:
type(t)

tuple

In [39]:
t[0]

' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر '

In [40]:
# extract value from tuple
def get_tweets(element):
    return element[0]

In [41]:
tweets = bag.map(get_tweets)

In [42]:
tweets.take(1)

(' ',)

In [27]:
t[0]

' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر '

In [25]:
type(t)

tuple

I think there's an issue with how the values are cast into the Bag. Seems like they're being cast as tuples when I actually just want the value. Is that what's tripping up the `bag.apply` and killing workers?

### Remove Repeating Characters

In [44]:
# remove repeating characters if character repeats more than once
def remove_repeating_char(text):
    return re.sub("(.)\\1{2,}", "\\1", text)

In [46]:
# apply regex function to contents of Dask bag
bag2 = db.map(remove_repeating_char, bag)

dask.bag<remove_repeating_char, npartitions=4>

In [47]:
bag2.take(1)

(' السلام عليكم ورحمة الله وبركاته مرحبا عملاء متجر ون واي وكل عام وانتم بخير نعتذر لكم عن تاخرنا في العودة بسبب بعض الظر ',)