# Convert snippets to conversations

## first pass:

 - [x] get posts + comments
 - [x] append toplevel posts and comments
 - [x] groupby root_id
 - [x] sort by root_id, parent_id, time?
 - [x] make a single row per post + comments with display text: f"@{post_author}:{post_text} \n @{reply_author}: {reply_text} etc." up to x00 words.
 - [x] Save on local_artifacts
 - [x] show in prodigy
 
## second pass?

 Get a better way to annotate specific parts of the text? spancat? create custom 

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import datetime

import pandas as pd
import tentaclio
import spacy

from phoenix.common import artifacts, run_params, utils

In [None]:
# Notebook-wide output and logging setup via the project helpers in
# phoenix.common.utils (NOTE(review): exact effects are defined there, not here).
utils.setup_notebook_output()
utils.setup_notebook_logging()

In [None]:
# Files
# Every input and output path sits under the "prodigy" folder of the local
# artifacts URL returned by artifacts.urls.get_local().
posts_df_path = f"{artifacts.urls.get_local()}/prodigy/reddit_posts_2019_5.csv"
comments_df_path = f"{artifacts.urls.get_local()}/prodigy/reddit_comments_2019_5.csv"
twitter_df_path = f"{artifacts.urls.get_local()}/prodigy/2022-6.parquet"
output_path = f"{artifacts.urls.get_local()}/prodigy/reddit_conversations.csv"
twitter_output_path = f"{artifacts.urls.get_local()}/prodigy/twitter_2022-6_conversations.csv"

# Read the posts and comments CSV files into dataframes.
with tentaclio.open(posts_df_path, "r") as fb:
    posts_df = pd.read_csv(fb)
    
with tentaclio.open(comments_df_path, "r") as fb:
    comments_df = pd.read_csv(fb)

In [None]:
# Fetch the artifact stored at twitter_df_path and keep only its dataframe.
df = artifacts.dataframes.get(twitter_df_path).dataframe

In [None]:
# Load the AraVec spaCy model; the path is relative, so the model directory
# must be present in the notebook's current working directory.
nlp = spacy.load("./spacy.aravec.model/")

In [None]:
import gensim
import re
import spacy

# Clean/Normalize Arabic Text
def clean_str(text):
    """Clean and normalize an Arabic text string.

    The steps run in this order:

    1. remove tashkeel (diacritic marks);
    2. shrink every run of one repeated character (newlines excepted) to
       exactly two occurrences;
    3. reduce a doubled waw, yeh or alef to a single letter;
    4. apply the literal ``search`` -> ``replace`` substitutions, one pair at
       a time in list order;
    5. strip leading and trailing whitespace.

    Returns the cleaned string.
    """
    search = ["أ","إ","آ","ة","_","-","/",".","،"," و "," يا ",'"',"ـ","'","ى","\\",'\n', '\t','&quot;','?','؟','!']
    replace = ["ا","ا","ا","ه"," "," ","","",""," و"," يا","","","","ي","",' ', ' ',' ',' ? ',' ؟ ',' ! ']

    # Remove tashkeel.
    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', "", text)

    # Remove elongation: "(.)\1+" matches a run of the same character.
    text = re.sub(r'(.)\1+', r"\1\1", text)

    for doubled, single in (('وو', 'و'), ('يي', 'ي'), ('اا', 'ا')):
        text = text.replace(doubled, single)

    # Each substitution sees the result of the previous one, so keep list order.
    for old, new in zip(search, replace):
        text = text.replace(old, new)

    return text.strip()

In [None]:
# Define the preprocessing Class
class Preprocessor:
    """Callable wrapper that cleans text with ``clean_str`` before tokenizing."""

    def __init__(self, tokenizer, **cfg):
        """Store the tokenizer to delegate to.

        Extra keyword arguments are accepted but not used.
        """
        self.tokenizer = tokenizer

    def __call__(self, text):
        """Return the wrapped tokenizer's result for the cleaned ``text``."""
        return self.tokenizer(clean_str(text))

In [None]:
# Apply the `Preprocessor` class.
# Guard against re-running this cell: wrapping an already wrapped tokenizer
# would run clean_str twice on every text, and clean_str is not idempotent
# (e.g. "!" gets padded with spaces on each pass), which changes the tokens.
if not isinstance(nlp.tokenizer, Preprocessor):
    nlp.tokenizer = Preprocessor(nlp.tokenizer)

In [None]:
# Display the slice of rows 102710 up to (not including) 102720.
df[102710:102720]

In [None]:
# Independent copy of the first five rows.
# NOTE(review): not referenced by any other cell shown here.
test_df = df[:5].copy()

In [None]:
# Per-document OOV statistics, collected in a single pass through the pipeline.
# Tokens made up of ASCII characters only are left out of the OOV count and
# of the OOV word list; sum_tokens still counts every token in the document.
oov_words = []
is_oov = []
sum_tokens = []
for doc in nlp.pipe(df["text"].astype('unicode').values):
    non_ascii_oov = [str(tok) for tok in doc if tok.is_oov and not str(tok).isascii()]
    is_oov.append(len(non_ascii_oov))
    sum_tokens.append(len(doc))
    oov_words.append(" ".join(non_ascii_oov))
        

In [None]:
# Rebuild oov_words on its own: for every document, the space-joined tokens
# that are OOV and not pure ASCII. This is the same filter and join that the
# previous cell's loop applies, at the cost of another full nlp.pipe pass.
oov_words = []
for doc in nlp.pipe(df["text"].astype('unicode').values):
    oov_words.append(" ".join([str(n) for n in doc if n.is_oov and not str.isascii(str(n)) ]))

In [75]:
oov_words

['جريوال تحتفظون',
 '',
 '',
 '',
 '« « «',
 '',
 'ب«جدري',
 'يااهل فوليتم لارذل المفحوصين وشاورمجيه للعوره فاستاسد القطرجالكم القوراب',
 '',
 '',
 'مشروعن بتخففوا',
 '',
 '',
 'ستزود هيمارس',
 'جافلين وستينغر',
 '',
 '',
 '',
 '',
 '',
 'كوزو اوكاموتو',
 '',
 '',
 '',
 '',
 'ممافيدلك بيعجبكن ياانتو منكن',
 'جعجعات',
 'نعطيهن لتتبلور',
 'سنتردد',
 '',
 '٣٠ ٩ الحزب',
 'غروندبرغ',
 'مصر ل250 و150',
 '« طاجيكستان',
 '',
 '',
 '',
 'didn’t',
 '«',
 'مصر «',
 'يحميلنا ️Elias',
 'للرابح والطاغي 🧡',
 'وانغلاق',
 'الاقتصادي+',
 '«',
 'لاسيره',
 'والعونيه',
 'باكستان و«طالبان',
 'كزدوره',
 '',
 '«',
 'ستزود هيمارس',
 'سوری کے کو اسلامی ٹچ والے مشورے مسلمہ کے لیڈر کے چہرے پہ پڑا اتار دیا فوبیا ڈرامہ تقریریں فلاپ کر دیں میڈیا ٹیم پشاور منجانب جمیل داودزئی',
 '',
 'ل80',
 '',
 '« اللاافق',
 '',
 '',
 'وو',
 '🤦',
 '',
 'لعقاراتها الجنسيه',
 '',
 'عحسابن بيوقفني وبفكرو',
 '',
 'ديكنسن تناقضاتنا',
 'التغيري الحزب اليهودانت الطاءفيه الطاءفيه اسقطوهماما طاءفي',
 'التغيري الحزب اليهودانت الطاءفيه الطاءف

In [None]:
# Store the per-document counts from the OOV loop as columns
# (one list entry per row of df, in the same order as df["text"]).
df["num_oov"] = is_oov
df["num_tokens"] = sum_tokens

In [None]:
# Space-joined non-ASCII OOV tokens of each document.
df["oov_words"] = oov_words

In [None]:
# Despite the name this is a fraction (0-1), not a percentage (0-100).
# num_oov only counts non-ASCII OOV tokens, while num_tokens counts every token.
df["percentage_oov"] = df["num_oov"] / df["num_tokens"]

In [79]:
# Within the text snippets, on average about 5% of the tokens are out of vocabulary.
# This is a relatively low number.
df["percentage_oov"].describe()

count    102718.000000
mean          0.052952
std           0.076947
min           0.000000
25%           0.000000
50%           0.035714
75%           0.076923
max           0.909091
Name: percentage_oov, dtype: float64

In [80]:
# There are many genuine Arabic OOV words that we would want to double-check with an expert,
# such as "حزب", "ديبايت", and "كاريش", but a lot of the OOV words are emojis.
# Split each row's space-joined OOV words into single words and show the 50 most frequent.
df["oov_words"].str.split(expand=True).stack().value_counts()[:50]

«                4846
حزب              3488
ديبايت           2132
🤣                1630
كاريش            1127
الحزب            1101
هناك              880
؛                 858
هوكشتاين          686
🛑                 683
مصر               671
🇸                 572
وهناك             566
🇾                 562
🇱                 499
🇪                 457
🇧                 427
🖤                 416
مصادر             408
مينينديز          357
🤦                 354
مصانع             354
🏻‍                345
العقو             324
مصدر              313
زيلينسكي          305
🧡                 296
انك               291
🇵                 285
الشراونه          277
مصرف              260
🤍                 260
🇮                 251
منك               248
🥰                 245
كوخافي            235
سيفيرودونيتسك     230
لmtv              220
لالجديد           212
المصارف           199
الخيرᅠ            199
قسد               196
🇺                 194
٢٩                191
التغيرين          183
مصلحه     