In [1]:
import os
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import pickle
import re
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
from datasets import ClassLabel, load_dataset, Dataset, DatasetDict
import string
from typing import Dict ,List
import transformers

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from transformers import BertTokenizerFast
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import warnings

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation and pandas copy warnings raised by later cells.
warnings.filterwarnings("ignore")
# Presumably turns off the Hugging Face tokenizers thread pool (and its fork warning) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Directory that holds the input CSV ("cleaned.csv") read by the data-loading cell.
# NOTE(review): absolute, machine-specific path — must be edited before running elsewhere.
DATA_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/data"
# Directory for trained models.
# NOTE(review): not referenced by any cell visible here; the final save uses a
# relative "../models/..." path instead.
MODEL_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/models"

In [3]:
# Checkpoint identifier shared by the tokenizer, the classifier and (later) the
# ArabertPreprocessor.
model_name = "aubmindlab/bert-base-arabertv02-twitter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Two output classes, matching the Favor/Against label mapping defined in a later cell.
# The classification head is newly initialised (see the message printed below this
# cell), so the model is only usable after fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['classifier.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Load the cleaned tweets and keep only the two columns used for training.
df = pd.read_csv(os.path.join(DATA_PATH, "cleaned.csv"))
# Rows without a stance cannot be labelled, and rows without text cannot be run
# through the regex-based cleaning, so drop both. `.copy()` makes `df` an
# independent frame, so the column assignments in later cells write to `df`
# itself rather than to a slice of the frame returned by read_csv.
df = df[["text", "stance"]].dropna(subset=["text", "stance"]).copy()
df.head()

Unnamed: 0,text,stance
0,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترن...,Against
2,هذا ما يُعرّف بـ'فوبيا المرأة المُتمكنة' آفة ف...,Favor
3,#LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في الم...,Favor
4,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,Favor
5,فخورين بنساء الوطن 🇸🇦 وكلنا فخر بتقدم تمكين ا...,Favor


In [5]:
# Integer class ids for the two stance labels.
mapping_stance = {"Favor": 1, "Against": 0}
# Direct dict indexing inside the lambda: a stance value other than the two keys
# above raises KeyError instead of silently producing a missing label.
df['label'] = df['stance'].map(lambda x: mapping_stance[x])

In [6]:
# Arabic-script and typographic punctuation characters (includes the tatweel "ـ").
arabic_punctuations = '''`÷×؛<>()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
# ASCII punctuation from the standard library.
english_punctuations = string.punctuation
# Combined character set deleted by remove_punctuations(); duplicates between the
# two strings are harmless because it is only used to build a deletion table.
punctuations_list = arabic_punctuations + english_punctuations

def remove_hash_URL_MEN(text):
    """Strip hashtag/underscore markers and the URL / MENTION placeholder tokens.

    '#' and '_' are each replaced by a single space; the literal substrings
    'URL' and 'MENTION' are deleted wherever they occur.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement) pairs, applied in this order.
    substitutions = (
        (r'#', ' '),
        (r'_', ' '),
        (r'URL', ''),
        (r'MENTION', ''),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def normalize_arabic(text):
    """Unify a few Arabic letter variants.

    Alef with hamza below (إ) and alef with madda (آ) become bare alef (ا);
    the Persian gaf (گ) becomes the Arabic kaf (ك). All other characters are
    left unchanged.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    alef_unified = re.sub("[إآ]", "ا", text)
    return re.sub("گ", "ك", alef_unified)

def remove_punctuations(text):
    """Delete every character listed in the module-level `punctuations_list`.

    Args:
        text: The string to clean.

    Returns:
        `text` with all listed punctuation characters removed.
    """
    # Mapping each character to None tells str.translate to drop it.
    deletion_table = str.maketrans(dict.fromkeys(punctuations_list))
    return text.translate(deletion_table)

def remove_repeating_char(text):
    """Collapse each run of an identical character down to a single occurrence.

    The pattern uses '.', so newline characters are never collapsed.

    Args:
        text: The string to process.

    Returns:
        The string with consecutive duplicate characters squeezed to one.
    """
    run_of_same_char = re.compile(r'(.)\1+')
    return run_of_same_char.sub(r'\1', text)

def process_tweet(tweet):
    """Clean a raw tweet before it is handed to the AraBERT preprocessor.

    Steps, in order:
      1. Coerce the input to `str`, so non-string values do not break the
         regex calls.
      2. Replace @mentions with a space.
      3. Replace URLs (``www.…`` or ``http(s)://…``) with a space.
      4. Strip '#', '_' and the URL / MENTION placeholder tokens
         (remove_hash_URL_MEN).
      5. Normalise Arabic letter variants (normalize_arabic).

    Mentions and URLs are removed *before* underscores are turned into spaces.
    Doing it the other way round splits ``@user_name`` or ``https://x/a_b`` at
    the underscore, and the part after it survives as a stray token.

    Args:
        tweet: The raw tweet; any value accepted by `str()`.

    Returns:
        The cleaned tweet text.
    """
    tweet = str(tweet)
    # Raw strings keep `\s` and `\.` as regex escapes, not string escapes.
    tweet = re.sub(r'@[^\s]+', ' ', tweet)
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', ' ', tweet)
    tweet = remove_hash_URL_MEN(tweet)
    return normalize_arabic(tweet)

# Preprocessor configured for the same checkpoint as the tokenizer and model.
arabert_prep = ArabertPreprocessor(model_name=model_name)
# Clean every tweet with process_tweet, then run the AraBERT preprocessor on the
# result. This overwrites the `text` column, so re-running the cell processes
# already-processed text.
df["text"] = df["text"].apply(process_tweet).apply(arabert_prep.preprocess)

In [7]:
df.head()

Unnamed: 0,text,stance,label
0,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترند...,Against,0
2,هذا ما يعرف ب ' فوبيا المرأة المتمكنة ' افة فك...,Favor,1
3,LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في المجا...,Favor,1
4,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,Favor,1
5,فخورين بنساء الوطن 🇸 🇦 وكلنا فخر بتقدم تمكين ا...,Favor,1


In [8]:
def tokenize_text(df):
    """Tokenize the "text" entry of `df` with the module-level `tokenizer`.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        df: A mapping with a "text" entry. Despite the name, when called through
            `Dataset.map` this is presumably a batch, not a DataFrame — confirm.

    Returns:
        Whatever the tokenizer returns for those texts.
    """
    texts = df["text"]
    return tokenizer(
        texts,
        truncation=True,
        max_length=128,
        padding="max_length",
    )

# Convert the cleaned DataFrame (text, stance, label) into a Hugging Face Dataset.
# NOTE(review): df's index has gaps after dropna, so from_pandas presumably keeps it
# as an extra column — confirm whether preserve_index=False is wanted.
dataset = Dataset.from_pandas(df)
# Add the tokenizer outputs as new columns; batched=True passes tokenize_text
# several rows per call instead of one.
dataset = dataset.map(tokenize_text, batched=True)

Map:   0%|          | 0/3169 [00:00<?, ? examples/s]

In [9]:
# Hold out 15% of the examples for validation. The fixed seed makes the shuffle,
# and therefore the reported validation score, reproducible across kernel restarts.
train_val_split = dataset.train_test_split(test_size=0.15, seed=42)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

In [10]:
def accuracy_metric(pred):
    """Compute classification accuracy for a prediction object.

    Args:
        pred: Object exposing `label_ids` (true class ids) and `predictions`
            (per-class scores along the last axis).

    Returns:
        A dict ``{"accuracy": value}`` where value is the fraction of examples
        whose highest-scoring class equals the true label, as a Python float.
    """
    predicted_classes = pred.predictions.argmax(-1)
    is_correct = pred.label_ids == predicted_classes
    return {"accuracy": is_correct.mean().item()}

In [11]:
# Fine-tuning configuration: 10 epochs, batch size 8 for both training and evaluation.
training_args = TrainingArguments(
    output_dir="./results",  # checkpoints are written here, relative to the working directory
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    # NOTE(review): no evaluation-strategy argument is set, and the training log
    # under this cell lists only training loss, so this interval appears to have
    # no effect. Confirm against the TrainingArguments docs for the installed version.
    eval_steps=500,
)

# NOTE(review): the tokenizer is not passed to the Trainer, so it presumably is not
# written out with the model when the trainer saves — confirm.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # The lambda only forwards to accuracy_metric.
    compute_metrics=lambda pred: accuracy_metric(pred),
)

# Runs the fine-tuning loop; updates `model` in place.
trainer.train()

Step,Training Loss
500,0.3298
1000,0.1264
1500,0.0236
2000,0.0101
2500,0.0017
3000,0.0009


TrainOutput(global_step=3370, training_loss=0.07307359941043955, metrics={'train_runtime': 678.0142, 'train_samples_per_second': 39.719, 'train_steps_per_second': 4.97, 'total_flos': 1771395180211200.0, 'train_loss': 0.07307359941043955, 'epoch': 10.0})

In [12]:
# Score the trained model on val_dataset (the eval_dataset given to the Trainer).
# NOTE(review): the eval loss printed below (~1.20) is far above the final training
# loss (~0.001), which suggests overfitting — consider fewer epochs or early stopping.
trainer.evaluate()

{'eval_loss': 1.2029073238372803,
 'eval_accuracy': 0.8592436974789915,
 'eval_runtime': 2.6499,
 'eval_samples_per_second': 179.63,
 'eval_steps_per_second': 22.642,
 'epoch': 10.0}

In [20]:
def predict_stance(new_tweet):
    """Predict the stance label for a single, already-preprocessed tweet.

    Uses the module-level `tokenizer` and `model`.

    Args:
        new_tweet: Tweet text, cleaned the same way as the training data
            (process_tweet followed by arabert_prep.preprocess).

    Returns:
        "Against" when the predicted class id is 0, otherwise "Favor".
    """
    # Inference must run with dropout disabled; otherwise repeated calls on the
    # same text can disagree.
    model.eval()
    new_encoding = tokenizer(new_tweet, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    # Put the inputs on whatever device the model currently lives on, so the
    # function works without the caller first moving the model to the CPU.
    new_encoding = new_encoding.to(model.device)
    with torch.no_grad():
        output = model(**new_encoding)
    predicted_class = torch.argmax(output.logits, dim=-1).item()
    predicted_label = "Against" if predicted_class == 0 else "Favor"
    return predicted_label

# Move the model to the CPU, presumably so it sits on the same device as the
# tensors the tokenizer returns inside predict_stance.
model = model.to("cpu")
new_tweet = "أنا أؤيد قرار الحكومة الجديدة"
# Apply the same two preprocessing steps that were applied to the training text.
new_tweet = process_tweet(new_tweet)
new_tweet = arabert_prep.preprocess(new_tweet)
predicted_stance = predict_stance(new_tweet)
# Note: the printed tweet is the preprocessed text, because new_tweet was reassigned above.
print(f"Predicted stance for '{new_tweet}': {predicted_stance}")

Predicted stance for 'أنا أؤيد قرار الحكومة الجديدة': Favor


In [23]:
# Persist the fine-tuned model (weights + config).
# NOTE(review): this path is relative to the working directory; MODEL_PATH from the
# configuration cell is not used here — confirm which location is intended.
save_dir = "../models/STL_ARABERT_TWITTER_stance"
trainer.save_model(save_dir)
# The Trainer was created without a tokenizer, so write the tokenizer files
# explicitly. The directory can then be loaded on its own with from_pretrained
# for both model and tokenizer.
tokenizer.save_pretrained(save_dir);