In [1]:
import os
import pandas as pd
import re
import os
import string

import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import warnings
warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
DATA_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/data"
MODEL_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/models"

In [None]:
task = "sarcasm"
task = "sentiment"
task = "stance"

num_labels = {
    "sarcasm": 2,
    "sentiment": 3,
    "stance": 2
}

In [3]:
model_name = "aubmindlab/bert-base-arabertv02-twitter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels[task])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['classifier.weight', 'classifier.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
df = pd.read_csv(os.path.join(DATA_PATH, "Mawqif_AllTargets_Train.csv"))
df = df.dropna(subset=[task])
df.head()

Unnamed: 0,text,sarcasm
0,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترن...,No
1,روح حلل محد يم تطعيم كورونا شف الحرم البارح م...,Yes
2,هذا ما يُعرّف بـ'فوبيا المرأة المُتمكنة' آفة ف...,Yes
3,#LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في الم...,No
4,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,No


In [5]:
df.sarcasm.unique()

array(['No', 'Yes'], dtype=object)

In [6]:
mapping_sarcasm = {"No": 0, "Yes": 1}
mapping_stance = {"Favor": 1, "Against": 0}
mapping_sentiment = {"Negative": 0, "Neutral": 1, "Positive": 2}

df['sarcasm'] = df['sarcasm'].map(lambda x: mapping_sarcasm[x])
df['sentiment'] = df['sentiment'].map(lambda x: mapping_sentiment[x])
df['stance'] = df['stance'].map(lambda x: mapping_stance[x])

In [None]:
df = df[["text", task]]

In [7]:
arabic_punctuations = '''`÷×؛<>()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

def remove_hash_URL_MEN(text):
    text = re.sub(r'#',' ',text)
    text = re.sub(r'_',' ',text)
    text = re.sub(r'URL','',text)
    text = re.sub(r'MENTION','',text)
    return text

def normalize_arabic(text):
    text = re.sub("[إآ]", "ا", text)
    text = re.sub("گ", "ك", text)
    return text

def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

def process_tweet(tweet):     
    tweet=remove_hash_URL_MEN(tweet)
    tweet = re.sub('@[^\s]+', ' ', str(tweet))
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',str(tweet))    
    tweet= normalize_arabic(str(tweet))
    
    return tweet

arabert_prep = ArabertPreprocessor(model_name=model_name)
df.text = df.text.apply(lambda x: process_tweet(x))
df.text = df.text.apply(lambda x: arabert_prep.preprocess(x))

In [8]:
df.head()

Unnamed: 0,text,sarcasm,label
0,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترند...,No,0
1,روح حلل محد يم تطعيم كورونا شف الحرم البارح مل...,Yes,1
2,هذا ما يعرف ب ' فوبيا المرأة المتمكنة ' افة فك...,Yes,1
3,LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في المجا...,No,0
4,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,No,0


In [9]:
def tokenize_text(df):
    return tokenizer(df["text"], padding="max_length", max_length=128, truncation=True)

dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/3502 [00:00<?, ? examples/s]

In [10]:
train_val_split = dataset.train_test_split(test_size=0.15)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

In [11]:
def accuracy_metric(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": (labels == preds).mean().item()}

In [12]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: accuracy_metric(pred),
)

trainer.train()

Step,Training Loss
500,0.1794
1000,0.0823
1500,0.0289
2000,0.0048
2500,0.0087
3000,0.0017
3500,0.0007


TrainOutput(global_step=3720, training_loss=0.041251400245293494, metrics={'train_runtime': 749.2499, 'train_samples_per_second': 39.72, 'train_steps_per_second': 4.965, 'total_flos': 1957546251878400.0, 'train_loss': 0.041251400245293494, 'epoch': 10.0})

In [13]:
trainer.evaluate()

{'eval_loss': 0.4181869626045227,
 'eval_accuracy': 0.9505703422053232,
 'eval_runtime': 3.0478,
 'eval_samples_per_second': 172.584,
 'eval_steps_per_second': 21.655,
 'epoch': 10.0}

In [14]:
def predict_sarcasm(new_tweet):
    new_encoding = tokenizer(new_tweet, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**new_encoding)
    predicted_class = torch.argmax(output.logits, dim=-1).item()
    if predicted_class == 0: return "No"
    return "Yes"

model = model.to("cpu")
new_tweet = "أنا أؤيد قرار الحكومة الجديدة"
new_tweet = process_tweet(new_tweet)
new_tweet = arabert_prep.preprocess(new_tweet)
predicted_sarcasm = predict_sarcasm(new_tweet)
print(f"Predicted {task} for '{new_tweet}': {predicted_sarcasm}")

Predicted sarcasm for 'أنا أؤيد قرار الحكومة الجديدة': No


In [15]:
trainer.save_model(f"../models/STL_ARABERT_TWITTER_{task}")