# Data Preprocessing

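In this notebook we load the SANAD Arabic news articles, clean the article text by removing unnecessary strings, and save stratified train/validation/test splits for topic classification.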


In [1]:
import re
import os
import wandb
import pandas as pd
from typing import Optional
from dotenv import load_dotenv
from arabert.preprocess import ArabertPreprocessor
from sklearn.model_selection import train_test_split

from huggingface_hub import login
from datasets import load_dataset

from rich import print
from pathlib import Path
from tqdm.auto import tqdm
from collections import defaultdict

In [2]:
# Register `progress_apply` on pandas objects (used later for per-row transforms).
tqdm.pandas()

# Load environment variables (presumably from a local .env file) so that
# HF_TOKEN is available to the Hugging Face login below.
load_dotenv()

# Authenticate against Weights & Biases, then against the Hugging Face Hub.
wandb.login()
login(
    token=os.getenv('HF_TOKEN'),
    add_to_git_credential=True,
    write_permission=True,
)

[34m[1mwandb[0m: Currently logged in as: [33me_hossam96[0m. Use [1m`wandb login --relogin`[0m to force relogin


Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /home/hossam/.cache/huggingface/token
Login successful


In [3]:
# Weights & Biases run metadata (passed to `wandb.init` at the end of the notebook).
PROJECT_NAME = 'ARABIC_DISAMBIGUATION_STUDY'
RUN_NAME = 'Data_Preprocessing_SANAD'
JOB_TYPE = 'DATA_PREPROCESSING'
NOTES = 'Preprocessing data for topic classification with disambiguation'
TAGS = [
    'ARABERT_PREPROCESSOR',
    'FARASAPY',
    'REGEX',
    'TOPIC',
]

# Nested run configuration; a missing top-level key starts out as an empty dict,
# so sections can be filled with `config[section][key] = value`.
config = defaultdict(dict)

In [4]:
# Random seed reused by every `train_test_split` call in this notebook.
seed = 42

# Hugging Face Hub identifiers: the AraBERT checkpoint and the dataset repo.
arabert_model = 'aubmindlab/bert-base-arabertv2'
data_ckpt = 'arbml/SANAD'

# Local output directory for the processed CSV splits.
# data_path = Path('../data/ArSen-20_publish.csv')
processed_data_path = Path('../data/processed_data/sanad/')

In [5]:
# if data_path.suffix == '.csv':
#     data = pd.read_csv(data_path)

In [6]:
# Load the dataset identified by `data_ckpt`.
# NOTE(review): trust_remote_code=True presumably lets the dataset repo's own
# loading script run on this machine — confirm the repo is trusted (and
# consider pinning a revision) before re-running.
data = load_dataset(data_ckpt, trust_remote_code=True)
data

Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['Article', 'label'],
        num_rows: 141807
    })
})

In [7]:
print(data['train'][0])

In [8]:
# Class names of the `label` feature; a label's integer id is used as the
# index into this list when ids are mapped to names.
label_names = data['train'].features['label'].names
print(label_names)

In [9]:
# Convert the `train` split (the only split in the loaded dataset) to a DataFrame.
# NOTE: `data` is rebound from the loaded dataset object to a DataFrame here,
# so this cell cannot be re-run without re-running the loading cell first.
data = data['train'].to_pandas()
data.head(2)

Unnamed: 0,Article,label
0,يحيي كل من العروسي وعواطف وعمار والعنبري أمجاد...,5
1,أخبارنا المغربية ـ هدى جميعي\nتحول فنان مغربي ...,5


In [10]:
# Replace integer class ids with their names using a vectorized lookup
# instead of a per-row Python lambda.
# The dtype guard keeps the cell idempotent: once the labels are strings,
# re-running it is a no-op rather than an indexing error.
if pd.api.types.is_integer_dtype(data['label']):
    data['label'] = data['label'].map(dict(enumerate(label_names)))

  0%|          | 0/141807 [00:00<?, ?it/s]

In [11]:
data.head(2)

Unnamed: 0,Article,label
0,يحيي كل من العروسي وعواطف وعمار والعنبري أمجاد...,Culture
1,أخبارنا المغربية ـ هدى جميعي\nتحول فنان مغربي ...,Culture


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141807 entries, 0 to 141806
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Article  141807 non-null  object
 1   label    141807 non-null  object
dtypes: object(2)
memory usage: 2.2+ MB


In [13]:
# Drop rows with missing values, then drop articles that are empty or contain
# only whitespace (those would be reduced to a single space by the cleaning step).
data = data.dropna()
data = data[data.Article.str.strip().str.len() > 0]
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 141729 entries, 0 to 141806
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   Article  141729 non-null  object
 1   label    141729 non-null  object
dtypes: object(2)
memory usage: 3.2+ MB


In [14]:
print(data.label.value_counts(normalize=True).to_dict())

In [15]:
# Draw a label-stratified subsample: the "test" side of the split is kept and
# the remainder is discarded.
subsample_size = 10_000
_, data = train_test_split(
    data,
    test_size=subsample_size,
    stratify=data.label.to_list(),
    random_state=seed,
)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 35816 to 84237
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Article  10000 non-null  object
 1   label    10000 non-null  object
dtypes: object(2)
memory usage: 234.4+ KB


In [16]:
data.head(2)

Unnamed: 0,Article,label
35816,ارتفعت الأسهم الأميركية اليوم الجمعة، وسجلت مك...,Finance
110582,يعلن المدرب الهولندي فان مارفيك، مدرب المنتخب ...,Sports


In [17]:
# Rename the article column to `text` and rebuild a fresh 0..n-1 index, so
# that label-based lookups such as `data.loc[i, 'text']` work on the subsample.
data = data.rename(columns={'Article': 'text'}).reset_index(drop=True)

In [18]:
# Row used as the running example in the inspection cells below.
# NOTE(review): the trailing "14" looks like an alternative example index — confirm.
i = 100  # 14
print(data.loc[i, 'text'])

In [19]:
# Preprocessor configured for the checkpoint named in `arabert_model`;
# `clean_text` reads this module-level object.
arabert_prep = ArabertPreprocessor(model_name=arabert_model)



In [20]:
print(arabert_prep.preprocess(data.loc[i, 'text']))

In [21]:
# Placeholder tokens (link / user / mail — presumably inserted by the AraBERT
# preprocessor) plus `_` and `#`. Compiled once because `clean_text` is
# applied to every row of the dataset.
_NOISE_RE = re.compile(r'\[رابط\]|\[مستخدم\]|\[بريد\]|_|#')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(tweet: str, is_arabert: Optional[bool] = False,
               preprocessor=None) -> str:
    '''Preprocess and clean text using the AraBERT preprocessor and regex rules.

    Args:
        tweet: Raw input text (any text, not only tweets).
        is_arabert: If truthy, keep the preprocessor's output form; otherwise
            the output is passed through `unpreprocess` before the regex
            cleanup.
        preprocessor: Object exposing `preprocess(str)` and
            `unpreprocess(str)`. Defaults to the module-level `arabert_prep`.

    Returns:
        The cleaned text: placeholder tokens, underscores and hashes replaced
        by spaces, whitespace runs collapsed to one space, and leading/trailing
        whitespace removed.
    '''
    prep = arabert_prep if preprocessor is None else preprocessor
    text = prep.preprocess(tweet)
    if not is_arabert:
        text = prep.unpreprocess(text)
    # Turn every noise token into a space first, then collapse whitespace, so
    # neighbouring tokens never leave a run of spaces behind.
    text = _NOISE_RE.sub(' ', text)
    return _WHITESPACE_RE.sub(' ', text).strip()

In [22]:
print(clean_text(data.loc[i, 'text']))

In [23]:
# Cleaned text with the preprocessor's output reverted via `unpreprocess`
# (is_arabert=False); `progress_apply` shows a progress bar over the rows.
data['processed_text'] = data.text.progress_apply(clean_text, is_arabert=False)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [24]:
# Cleaned text kept in the preprocessor's own output form (is_arabert=True,
# so `unpreprocess` is skipped inside `clean_text`).
data['processed_text_arabert'] = data.text.progress_apply(clean_text, is_arabert=True)

  0%|          | 0/10000 [00:00<?, ?it/s]

In [25]:
for k, v in data.loc[i].to_dict().items():
    print(f'{k}:\n{v}')

In [26]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   text                    10000 non-null  object
 1   label                   10000 non-null  object
 2   processed_text          10000 non-null  object
 3   processed_text_arabert  10000 non-null  object
dtypes: object(4)
memory usage: 312.6+ KB


In [27]:
# Label-stratified split: hold out 20% of the rows first, then divide that
# holdout evenly into validation and test sets.
train, holdout = train_test_split(
    data,
    test_size=0.2,
    random_state=seed,
    stratify=data.label.to_list(),
)
valid, test = train_test_split(
    holdout,
    test_size=0.5,
    random_state=seed,
    stratify=holdout.label.to_list(),
)

In [28]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8000 entries, 8672 to 4794
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   text                    8000 non-null   object
 1   label                   8000 non-null   object
 2   processed_text          8000 non-null   object
 3   processed_text_arabert  8000 non-null   object
dtypes: object(4)
memory usage: 312.5+ KB


In [29]:
valid.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 2073 to 3629
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   text                    1000 non-null   object
 1   label                   1000 non-null   object
 2   processed_text          1000 non-null   object
 3   processed_text_arabert  1000 non-null   object
dtypes: object(4)
memory usage: 39.1+ KB


In [30]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 458 to 1178
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   text                    1000 non-null   object
 1   label                   1000 non-null   object
 2   processed_text          1000 non-null   object
 3   processed_text_arabert  1000 non-null   object
dtypes: object(4)
memory usage: 39.1+ KB


In [31]:
print(train.label.value_counts(normalize=True).to_dict())

In [32]:
# Artifact that bundles the three CSV splits for upload to Weights & Biases.
data_artifact = wandb.Artifact(
    name='Topic-Classification-Dataset', type='dataset')

data_dict = {'train': train, 'valid': valid, 'test': test}

# Create the output directory once, including any missing parent directories,
# so the export also works on a fresh checkout.
processed_data_path.mkdir(parents=True, exist_ok=True)

for n, d in data_dict.items():
    split_path = processed_data_path.joinpath(f'{n}.csv')
    # Record per-split metadata in the run config; the path is stored as a
    # plain string so the config holds only simple, serializable values.
    config['data'][n] = {
        'size': len(d),
        'local_path': str(split_path),
        'label_distribution': d.label.value_counts(normalize=True).to_dict(),
    }
    d.to_csv(split_path, index=False)
    # The artifact entry is named after the split ('train', 'valid', 'test').
    data_artifact.add_file(local_path=str(split_path), name=n)

In [33]:
print(config)

In [34]:
# Start the W&B run using the metadata constants defined near the top of the
# notebook and the `config` mapping filled in while exporting the splits.
run = wandb.init(project=PROJECT_NAME, job_type=JOB_TYPE,
                 name=RUN_NAME, notes=NOTES, tags=TAGS, config=config)

In [35]:
# Attach the dataset artifact to the run; the return value is assigned to `_`
# so the cell displays nothing.
_ = run.log_artifact(data_artifact)

In [36]:
# Close the run; the captured output below shows the upload progress at that point.
run.finish()

VBox(children=(Label(value='83.393 MB of 83.412 MB uploaded (0.004 MB deduped)\r'), FloatProgress(value=0.9997…