In [1]:
import os
import pandas as pd
import re
import os
import string

import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, BertModel
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
DATA_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/data"
MODEL_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/models"

In [3]:
# task = "sarcasm"
# task = "sentiment"
task = "stance"

num_labels = {
    "sarcasm": 2,
    "sentiment": 3,
    "stance": 2
}

In [4]:
bert_models = [
    "aubmindlab/bert-base-arabertv02-twitter", 
    "aubmindlab/bert-base-arabertv02",
    "UBC-NLP/MARBERT",
    "CAMeL-Lab/bert-base-arabic-camelbert-da"
]

model_name = bert_models[3]
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels[task])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at CAMeL-Lab/bert-base-arabic-camelbert-da and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
df = pd.read_csv(os.path.join(DATA_PATH, "Mawqif_AllTargets_Train.csv"))
df = df.dropna(subset=[task])
df.head()

Unnamed: 0,ID,text,target,stance,stance:confidence,against_reason,favor_reason,none_reason,sarcasm,sarcasm:confidence,sentiment,sentiment:confidence,datetime,Date
0,1,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترن...,Women empowerment,Against,0.5116,A_Explicit,,,No,1.0,Negative,1.0,2021-01-16 03:19:19+00:00,16/01/2021
2,4,هذا ما يُعرّف بـ'فوبيا المرأة المُتمكنة' آفة ف...,Women empowerment,Favor,0.8171,,F_Explicit,,Yes,0.8145,Negative,0.8251,2022-04-02 07:45:42+00:00,02/04/2022
3,6,#LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في الم...,Digital Transformation,Favor,1.0,,F_Explicit,,No,1.0,Positive,0.7531,2022-02-02 18:24:09+00:00,02/02/2022
4,7,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,Digital Transformation,Favor,0.7559,,F_Explicit,,No,1.0,Neutral,0.8116,2022-03-27 10:36:04+00:00,27/03/2022
5,8,فخورين بنساء الوطن 🇸🇦 وكلنا فخر بتقدم تمكين ا...,Women empowerment,Favor,1.0,,F_Explicit,,No,1.0,Positive,1.0,2021-03-08 14:54:45+00:00,08/03/2021


In [6]:
mapping_sarcasm = {"No": 0, "Yes": 1}
mapping_stance = {"Favor": 1, "Against": 0}
mapping_sentiment = {"Negative": 0, "Neutral": 1, "Positive": 2}

df['sarcasm'] = df['sarcasm'].map(lambda x: mapping_sarcasm[x])
df['sentiment'] = df['sentiment'].map(lambda x: mapping_sentiment[x])
df['stance'] = df['stance'].map(lambda x: mapping_stance[x])

In [7]:
df = df[["text", task]]
df = df.rename(columns={task: "labels"})

In [8]:
arabic_punctuations = '''`÷×؛<>()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

def remove_hash_URL_MEN(text):
    text = re.sub(r'#',' ',text)
    text = re.sub(r'_',' ',text)
    text = re.sub(r'URL','',text)
    text = re.sub(r'MENTION','',text)
    return text

def normalize_arabic(text):
    text = re.sub("[إآ]", "ا", text)
    text = re.sub("گ", "ك", text)
    return text

def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

def process_tweet(tweet):     
    tweet=remove_hash_URL_MEN(tweet)
    tweet = re.sub('@[^\s]+', ' ', str(tweet))
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',str(tweet))    
    tweet= normalize_arabic(str(tweet))
    
    return tweet

arabert_prep = ArabertPreprocessor(model_name=model_name)
df.text = df.text.apply(lambda x: process_tweet(x))
df.text = df.text.apply(lambda x: arabert_prep.preprocess(x))



In [9]:
df.head()

Unnamed: 0,text,labels
0,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترند...,0
2,هذا ما يعرف ب ' فوبيا المرأة المتمكنة ' افة فك...,1
3,LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في المجا...,1
4,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,1
5,فخورين بنساء الوطن وكلنا فخر بتقدم تمكين المرأ...,1


In [10]:
def tokenize_text(df):
    return tokenizer(df["text"], padding="max_length", max_length=128, truncation=True)

dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/3169 [00:00<?, ? examples/s]

In [11]:
train_val_split = dataset.train_test_split(test_size=0.15)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

In [12]:
val_dataset

Dataset({
    features: ['text', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 476
})

In [13]:
def accuracy_metric(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": (labels == preds).mean().item()}

In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=8,
    save_steps=1000,
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: accuracy_metric(pred),
)

trainer.train()

  0%|          | 0/6740 [00:00<?, ?it/s]

{'loss': 0.5603, 'learning_rate': 4.6290801186943624e-05, 'epoch': 0.74}
{'loss': 0.519, 'learning_rate': 4.258160237388724e-05, 'epoch': 1.48}
{'loss': 0.6118, 'learning_rate': 3.887240356083086e-05, 'epoch': 2.23}
{'loss': 0.6424, 'learning_rate': 3.516320474777448e-05, 'epoch': 2.97}
{'loss': 0.6418, 'learning_rate': 3.14540059347181e-05, 'epoch': 3.71}
{'loss': 0.6352, 'learning_rate': 2.774480712166172e-05, 'epoch': 4.45}
{'loss': 0.6209, 'learning_rate': 2.4035608308605344e-05, 'epoch': 5.19}
{'loss': 0.577, 'learning_rate': 2.0326409495548962e-05, 'epoch': 5.93}
{'loss': 0.486, 'learning_rate': 1.661721068249258e-05, 'epoch': 6.68}
{'loss': 0.4965, 'learning_rate': 1.29080118694362e-05, 'epoch': 7.42}
{'loss': 0.5019, 'learning_rate': 9.198813056379822e-06, 'epoch': 8.16}
{'loss': 0.5189, 'learning_rate': 5.489614243323442e-06, 'epoch': 8.9}
{'loss': 0.4186, 'learning_rate': 1.7804154302670625e-06, 'epoch': 9.64}
{'train_runtime': 720.7349, 'train_samples_per_second': 37.365, 't

TrainOutput(global_step=6740, training_loss=0.5517127424743834, metrics={'train_runtime': 720.7349, 'train_samples_per_second': 37.365, 'train_steps_per_second': 9.352, 'train_loss': 0.5517127424743834, 'epoch': 10.0})

In [15]:
model = model.to("mps")
trainer.evaluate()

  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.7906274199485779,
 'eval_accuracy': 0.8067226890756303,
 'eval_runtime': 2.4487,
 'eval_samples_per_second': 194.389,
 'eval_steps_per_second': 24.503,
 'epoch': 10.0}

In [33]:
def predict_sarcasm(new_tweet):
    new_encoding = tokenizer(new_tweet, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**new_encoding)
    predicted_class = torch.argmax(output.logits, dim=-1).item()
    if predicted_class == 0: return "No"
    return "Yes"

model = model.to("cpu")
new_tweet = "أنا أؤيد قرار الحكومة الجديدة"
new_tweet = process_tweet(new_tweet)
new_tweet = arabert_prep.preprocess(new_tweet)
predicted_sarcasm = predict_sarcasm(new_tweet)
print(f"Predicted {task} for '{new_tweet}': {predicted_sarcasm}")

Predicted stance for 'أنا أؤيد قرار الحكومة الجديدة': Yes


In [34]:
trainer.save_model(f"../models/STL_ARABERT_TWITTER_{task}")