# Data Preprocessing

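In this notebook we preprocess the tweets of the ArSen-20 dataset: we run them through the AraBERT preprocessor, remove unnecessary strings (link, user-mention and e-mail placeholders, underscores, hashtag symbols and extra whitespace), split the data into train/validation/test sets, and log the result as a Weights & Biases dataset artifact.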


In [1]:
import re
import wandb
import pandas as pd
from typing import Optional
from dotenv import load_dotenv
from arabert.preprocess import ArabertPreprocessor
from sklearn.model_selection import train_test_split

from rich import print
from pathlib import Path
from tqdm.auto import tqdm
from collections import defaultdict

In [2]:
# Load variables from a local .env file into the environment
# (presumably including the W&B credentials -- TODO confirm), then log in to W&B.
load_dotenv()
wandb.login()
# Register `progress_apply` on pandas objects so long `apply` calls show a progress bar.
tqdm.pandas()

[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# wandb configs: metadata attached to the W&B run created at the end of this notebook.
PROJECT_NAME = 'ARABIC_DISAMBIGUATION_STUDY'
JOB_TYPE = 'DATA_PREPROCESSING'
TAGS = ['ARABERT_PREPROCESSOR', 'FARASAPY', 'REGEX']
# Free-text run description (typo "calssification" fixed).
NOTES = 'Preprocessing data for sentiment classification with disambiguation'
RUN_NAME = 'Data_Preprocessing'
# Nested run config; `defaultdict(dict)` lets later cells write
# `config['section'][key] = ...` without creating the section first.
config = defaultdict(dict)

In [4]:
# Random seed reused for both train/valid/test splits.
seed = 42
# Model name handed to ArabertPreprocessor.
arabert_model = 'aubmindlab/bert-base-arabertv2'
# Raw input CSV and the directory that receives the processed splits
# (both relative to the notebook's working directory).
data_path = Path('../data/ArSen-20_publish.csv')
processed_data_path = Path('../data/processed_data/')

In [5]:
# Only CSV input is supported. Fail loudly for anything else instead of silently
# leaving `data` undefined (which would surface later as a confusing NameError).
if data_path.suffix != '.csv':
    raise ValueError(
        f'Unsupported data file type {data_path.suffix!r} for {data_path}; expected a .csv file')
data = pd.read_csv(data_path)

In [6]:
# Inspect column names, dtypes and non-null counts of the raw dataset.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   tweet id         20000 non-null  float64
 1   label            20000 non-null  object 
 2   author id        20000 non-null  float64
 3   created_at       20000 non-null  object 
 4   lang             20000 non-null  object 
 5   like_count       20000 non-null  int64  
 6   quote_count      20000 non-null  int64  
 7   reply_count      20000 non-null  int64  
 8   retweet_count    20000 non-null  int64  
 9   tweet            20000 non-null  object 
 10  user_verified    20000 non-null  bool   
 11  followers_count  20000 non-null  int64  
 12  following_count  20000 non-null  int64  
 13  tweet_count      20000 non-null  int64  
 14  listed_count     20000 non-null  int64  
 15  name             20000 non-null  object 
 16  username         20000 non-null  object 
 17  user_created

In [7]:
# Peek at the first two raw rows.
data.head(2)

Unnamed: 0,tweet id,label,author id,created_at,lang,like_count,quote_count,reply_count,retweet_count,tweet,user_verified,followers_count,following_count,tweet_count,listed_count,name,username,user_created_at,description
0,1.4374e+18,neutral,1.43017e+18,2021-09-13 13:03:40+00:00,ar,0,0,0,1,أفكار تحليل وتواصل بلا حدود: هدية للجامية http...,False,1,0,1707,0,Majid Khatiri,MajidKhatiri1,2021-08-24T14:10:47.000Z,ولكن إذا حم القضاء على امرئ فليس له بر يقيه ول...
1,1.24461e+18,neutral,1.15576e+18,2020-03-30 12:46:36+00:00,ar,2,0,0,3,"سبب تسمية كورونا:\n COVID-19\n""CO""\nأول حرفي...",False,60,118,460,0,صالح الوكاع Saleh Alwakkaa,saleh_alwakkaa,2019-07-29T08:49:30.000Z,أحارب الجوع و الجهل ما استطعت


In [8]:
# Keep only the tweet text and its sentiment label. `.copy()` makes the result an
# independent frame, so the new columns assigned later do not trigger pandas'
# SettingWithCopyWarning.
data = data[['tweet', 'label']].copy()

In [9]:
# Row label of the sample tweet used for the spot checks in the following cells.
i = 100  # 14 is presumably another sample index tried earlier -- TODO confirm
print(data.loc[i, 'tweet'])

In [10]:
# Build the AraBERT preprocessor for the configured model; `clean_text` uses it below.
arabert_prep = ArabertPreprocessor(model_name=arabert_model)



In [11]:
# Show what the AraBERT preprocessor alone does to the sample tweet.
print(arabert_prep.preprocess(data.loc[i, 'tweet']))

In [12]:
def clean_text(tweet: str, is_arabert: bool = False, preprocessor=None) -> str:
    '''Preprocess a tweet with the AraBERT preprocessor and remove leftover markers.

    The tweet is first passed through ``preprocess``. Unless ``is_arabert`` is
    True, the result is then passed through ``unpreprocess``. Finally the
    bracketed placeholder tokens (link / user / mail), underscores and hashtag
    symbols are each replaced by a space, and every run of whitespace is
    collapsed into a single space.

    Args:
        tweet: Raw tweet text.
        is_arabert: If True, keep the output of ``preprocess`` as is; if False,
            additionally apply ``unpreprocess`` before the regex clean-up.
        preprocessor: Object exposing ``preprocess(str) -> str`` and
            ``unpreprocess(str) -> str``. Defaults to the notebook-level
            ``arabert_prep``; mainly useful for reuse and testing.

    Returns:
        The cleaned text. Leading/trailing single spaces are intentionally kept
        so the output matches what earlier runs of this notebook produced.
    '''
    prep = arabert_prep if preprocessor is None else preprocessor
    text = prep.preprocess(tweet)
    if not is_arabert:
        text = prep.unpreprocess(text)
    # regex patterns
    link = r'\[رابط\]'
    user = r'\[مستخدم\]'
    mail = r'\[بريد\]'
    underscore = r'_'
    hashtag = r'#'
    space = r'\s+'
    # Whitespace must be handled last: it collapses the gaps left by the
    # replacements that precede it.
    for pattern in (link, user, mail, underscore, hashtag, space):
        text = re.sub(pattern, ' ', text)
    return text

In [13]:
# Spot-check the full cleaning pipeline (default is_arabert=False) on the sample tweet.
print(clean_text(data.loc[i, 'tweet']))

In [14]:
# Cleaned text with the AraBERT preprocessing undone via `unpreprocess` (is_arabert=False).
data['text'] = data.tweet.progress_apply(clean_text, is_arabert=False)

  0%|          | 0/20000 [00:00<?, ?it/s]

In [15]:
# Cleaned text that keeps the AraBERT-preprocessed form (is_arabert=True).
data['text_arabert'] = data.tweet.progress_apply(clean_text, is_arabert=True)

  0%|          | 0/20000 [00:00<?, ?it/s]

In [16]:
# Print every column of the sample row: the column name, then its value on the next line.
sample_row = data.loc[i].to_dict()
for column, value in sample_row.items():
    print(f'{column}:\n{value}')

In [17]:
# Confirm the two new text columns exist and contain no nulls.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet         20000 non-null  object
 1   label         20000 non-null  object
 2   text          20000 non-null  object
 3   text_arabert  20000 non-null  object
dtypes: object(4)
memory usage: 625.1+ KB


In [18]:
# Stratified 80/10/10 split: first hold out 20% of the rows, then cut that
# hold-out in half to obtain the validation and test sets.
train, holdout = train_test_split(
    data,
    test_size=0.2,
    stratify=data.label.to_list(),
    random_state=seed,
)
valid, test = train_test_split(
    holdout,
    test_size=0.5,
    stratify=holdout.label.to_list(),
    random_state=seed,
)

In [19]:
# Size and schema of the training split.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16000 entries, 2456 to 6123
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet         16000 non-null  object
 1   label         16000 non-null  object
 2   text          16000 non-null  object
 3   text_arabert  16000 non-null  object
dtypes: object(4)
memory usage: 625.0+ KB


In [20]:
# Size and schema of the validation split.
valid.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 11818 to 2623
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet         2000 non-null   object
 1   label         2000 non-null   object
 2   text          2000 non-null   object
 3   text_arabert  2000 non-null   object
dtypes: object(4)
memory usage: 78.1+ KB


In [21]:
# Size and schema of the test split.
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2000 entries, 17770 to 12684
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   tweet         2000 non-null   object
 1   label         2000 non-null   object
 2   text          2000 non-null   object
 3   text_arabert  2000 non-null   object
dtypes: object(4)
memory usage: 78.1+ KB


In [22]:
# Relative frequency of each label in the training split (fractions summing to 1).
train.label.value_counts(normalize=True).to_dict()

{'neutral': 0.863125, 'negative': 0.093, 'positive': 0.043875}

In [23]:
# W&B artifact that bundles the three processed splits.
data_artifact = wandb.Artifact(name='Sentiment-Classification-Dataset', type='dataset')

data_dict = {'train': train, 'valid': valid, 'test': test}

# Create the output directory once, before the loop. `parents=True` also creates
# missing parent directories, so a fresh checkout does not fail here.
processed_data_path.mkdir(parents=True, exist_ok=True)

for n, d in data_dict.items():
    split_path = processed_data_path.joinpath(f'{n}.csv')
    # Record per-split metadata in the run config. The path is stored as a plain
    # string so the config contains only JSON-friendly values.
    config['data'][n] = {
        'size': len(d),
        'local_path': str(split_path),
        'label_distribution': d.label.value_counts(normalize=True).to_dict(),
    }
    # Write the split to disk and register the file in the artifact under the
    # split name ('train' / 'valid' / 'test').
    d.to_csv(split_path, index=False)
    data_artifact.add_file(local_path=str(split_path), name=n)

In [24]:
# Review the run config (split sizes, paths, label distributions) before starting the run.
print(config)

In [25]:
# Start the W&B run with the metadata constants and the collected config.
run = wandb.init(project=PROJECT_NAME, job_type=JOB_TYPE, name=RUN_NAME, notes=NOTES, tags=TAGS, config=config)

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011367479422218215, max=1.0…

In [26]:
# Log the dataset artifact to the run; assigning to `_` keeps the returned object
# from being echoed as cell output.
_ = run.log_artifact(data_artifact)

In [27]:
# Close the W&B run.
run.finish()

VBox(children=(Label(value='17.393 MB of 17.412 MB uploaded\r'), FloatProgress(value=0.9989003562983617, max=1…