In [1]:
import os
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
import pickle
import re
import os
import sys
from dataclasses import dataclass, field
from typing import Optional
import datasets
from datasets import ClassLabel, load_dataset, Dataset, DatasetDict, concatenate_datasets
import string
from typing import Dict ,List
import transformers

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from transformers import BertTokenizerFast
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from arabert.preprocess import ArabertPreprocessor
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import warnings

warnings.filterwarnings("ignore")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
DATA_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/data"
MODEL_PATH = "/Users/gufran/Developer/Projects/AI/MawqifStanceDetection/models"

In [3]:
model_name = "aubmindlab/bert-base-arabertv02-twitter"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at aubmindlab/bert-base-arabertv02-twitter and are newly initialized: ['classifier.weight', 'classifier.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
df_supp = pd.read_csv(os.path.join(DATA_PATH, "ASTAD_train.csv"))
df_supp = df_supp[["text", "sentiment"]]
df_supp = df_supp.sample(frac=1, random_state=42)
df_supp = df_supp.reset_index().drop(["index"], axis=1)
df_supp.head()

Unnamed: 0,text,sentiment
0,لخسارة وزنك بشكل طبيعي ما الك غير مجموعة كلين ...,Negative
1,لا يكلف الله نفسا إلا وسعها 🌹,Positive
2,وكل السعوديه بدو ..حتى الحاضره منهم من اصل بدو...,Negative
3,المونافري طلع برهوش .. سدينا 😤,Negative
4,منو خر بيتهم مثلنا 😩,Negative


In [7]:
df = pd.read_csv(os.path.join(DATA_PATH, "cleaned.csv"))
df = df[["text", "sentiment"]]
df = df.dropna(subset=["sentiment"])
df.head()

Unnamed: 0,text,sentiment
0,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترن...,Negative
1,روح حلل محد يم تطعيم كورونا شف الحرم البارح م...,Neutral
2,هذا ما يُعرّف بـ'فوبيا المرأة المُتمكنة' آفة ف...,Negative
3,#LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في الم...,Positive
4,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,Neutral


In [8]:
df.sentiment.unique()

array(['Negative', 'Neutral', 'Positive'], dtype=object)

In [9]:
mapping_sentiment = {"Negative": 0, "Neutral": 1, "Positive": 2}
df['label'] = df['sentiment'].map(lambda x: mapping_sentiment[x])
df_supp['label'] = df_supp['sentiment'].map(lambda x: mapping_sentiment[x])

In [10]:
arabic_punctuations = '''`÷×؛<>()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

def remove_hash_URL_MEN(text):
    text = re.sub(r'#',' ',text)
    text = re.sub(r'_',' ',text)
    text = re.sub(r'URL','',text)
    text = re.sub(r'MENTION','',text)
    return text

def normalize_arabic(text):
    text = re.sub("[إآ]", "ا", text)
    text = re.sub("گ", "ك", text)
    return text

def remove_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def remove_repeating_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

def process_tweet(tweet):     
    tweet=remove_hash_URL_MEN(tweet)
    tweet = re.sub('@[^\s]+', ' ', str(tweet))
    tweet = re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',str(tweet))    
    tweet= normalize_arabic(str(tweet))
    
    return tweet

arabert_prep = ArabertPreprocessor(model_name=model_name)
df.text = df.text.apply(lambda x: process_tweet(x))
df.text = df.text.apply(lambda x: arabert_prep.preprocess(x))

df_supp.text = df_supp.text.apply(lambda x: process_tweet(x))
df_supp.text = df_supp.text.apply(lambda x: arabert_prep.preprocess(x))

In [11]:
df.head()

Unnamed: 0,text,sentiment,label
0,عشان يلمع صورته ويعنني تمكين المرأة ويصير ترند...,Negative,0
1,روح حلل محد يم تطعيم كورونا شف الحرم البارح مل...,Neutral,1
2,هذا ما يعرف ب ' فوبيا المرأة المتمكنة ' افة فك...,Negative,0
3,LEAP22 مؤتمر يجمع اشهر وابرز المؤثرين في المجا...,Positive,2
4,خصوصية البيانات وحمايتها في المنظمة مطلب ولكن ...,Neutral,1


In [12]:
df_supp.head()

Unnamed: 0,text,sentiment,label
0,لخسارة وزنك بشكل طبيعي ما الك غير مجموعة كلين ...,Negative,0
1,لا يكلف الله نفسا الا وسعها 🌹,Positive,2
2,وكل السعوديه بدو . . حتى الحاضره منهم من اصل ب...,Negative,0
3,المونافري طلع برهوش . . سدينا 😤,Negative,0
4,منو خر بيتهم مثلنا 😩,Negative,0


In [13]:
def tokenize_text(df):
    return tokenizer(df["text"], padding="max_length", max_length=128, truncation=True)

dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize_text, batched=True)

train_dataset = Dataset.from_pandas(df_supp)
dataset_supp = dataset_supp.map(tokenize_text, batched=True)

Map:   0%|          | 0/3502 [00:00<?, ? examples/s]

Map:   0%|          | 0/45273 [00:00<?, ? examples/s]

In [15]:
train_val_split = dataset.train_test_split(test_size=0.15)
train_dataset = train_val_split["train"]
val_dataset = train_val_split["test"]

In [16]:
train_supp_dataset = concatenate_datasets([train_dataset, train_dataset])

In [17]:
def accuracy_metric(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    return {"accuracy": (labels == preds).mean().item()}

In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    save_steps=1000,
    eval_steps=500,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset= train_supp_dataset, #train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda pred: accuracy_metric(pred),
)

trainer.train()

TypeError: TrainingArguments.__init__() missing 1 required positional argument: 'output_dir'

In [13]:
trainer.evaluate()

{'eval_loss': 0.9622489213943481,
 'eval_accuracy': 0.4695817490494297,
 'eval_runtime': 3.2279,
 'eval_samples_per_second': 162.954,
 'eval_steps_per_second': 20.447,
 'epoch': 5.0}

In [14]:
def predict_sentiment(new_tweet):
    new_encoding = tokenizer(new_tweet, padding="max_length", max_length=128, truncation=True, return_tensors="pt")
    with torch.no_grad():
        output = model(**new_encoding)
    predicted_class = torch.argmax(output.logits, dim=-1).item()
    mapping_sentiment = {"Negative": 0, "Neutral": 1, "Positive": 2}
    if predicted_class == 0: return "Negative"
    if predicted_class == 1: return "Neutral"
    return "Positive"

model = model.to("cpu")
new_tweet = "أنا أؤيد قرار الحكومة الجديدة"
new_tweet = process_tweet(new_tweet)
new_tweet = arabert_prep.preprocess(new_tweet)
predicted_sentiment = predict_sentiment(new_tweet)
print(f"Predicted sentiment for '{new_tweet}': {predicted_sentiment}")

Predicted sentiment for 'أنا أؤيد قرار الحكومة الجديدة': Neutral


In [15]:
trainer.save_model("../models/STL_ARABERT_TWITTER_sentiment")