# Setup Dependencies

In [1]:
# Run this cell for one time and then restart the kernal and don't run it agian
# !pip install optuna==2.3.0
# !pip install transformers==4.2.1
# !pip install farasapy
# !pip install pyarabic
# !git clone https://github.com/aub-mind/arabert

In [2]:
import glob
import pandas as pd
import torch
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from sklearn.model_selection import train_test_split

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
torch.cuda.empty_cache()

# Data  Processing

In [3]:
def get_SMADC_folder_data():
    """Returns a dataframe with Text and Region columns. Requires tree like this data/SMADC/*.txt"""
    files = glob.glob("data/SMADC/*.txt")
    dataframes = []

    for file in files:
        region = file[-7:-4]
        temp_df = pd.read_csv(file, encoding="utf8", delimiter="\n", names=["Text"])
        temp_df["Region"] = region
        dataframes.append(temp_df)
        
    return pd.concat(dataframes)

In [4]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(path_to_model, return_dict=True, num_labels=len(label_map))

In [5]:
# You could add any metric you want
def compute_metrics(p): 
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(p.label_ids)

    macro_f1 = f1_score(p.label_ids,preds,average='macro')
    macro_precision = precision_score(p.label_ids,preds,average='macro')
    macro_recall = recall_score(p.label_ids,preds,average='macro')
    acc = accuracy_score(p.label_ids,preds)
    return {
      'macro_f1' : macro_f1,
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
      }

In [6]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [7]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
        super(BERTDataset).__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map
      

    def __len__(self):
        return len(self.text)

    def __getitem__(self,item):
        text = str(self.text[item])
        text = " ".join(text.split())


        
        input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
          )
    
        attention_mask = [1] * len(input_ids)

        # Zero-pad to the max length.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
    
        return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [8]:
df = get_SMADC_folder_data()

In [9]:
df.columns = ["text", "label"]
label_list_df = ["EGY", "NOR","LEV","GLF","IRQ"]
#print(df["label"].value_counts())
train_set, test_set = train_test_split(df, test_size=0.8, random_state=42)
data_dilect = Dataset("Dilect", train_set, test_set, label_list_df)

In [10]:
model_name= "AraBERTv0.2-large"
max_len = 100
text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
arabert_prep = ArabertPreprocessor(model_name=model_name)
#arabert_prep.preprocess(df.head(1)["text"][0])



In [11]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060
Sun Sep 12 14:09:33 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 466.11       Driver Version: 466.11       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| 53%   48C    P0    41W / 170W |   1599MiB / 12288MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+------------

In [12]:
data_dilect.train["text"] = data_dilect.train["text"].apply(arabert_prep.preprocess)
data_dilect.test["text"] = data_dilect.test["text"].apply(arabert_prep.preprocess)
#print(data_dilect.train["text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dilect.train["text"] = data_dilect.train["text"].apply(arabert_prep.preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dilect.test["text"] = data_dilect.test["text"].apply(arabert_prep.preprocess)


In [13]:
path_to_model = "aubmindlab/bert-large-arabertv02"
label_map = { v:index for index, v in enumerate(data_dilect.label_list) }
#print(label_map)
train_dataset = BERTDataset(data_dilect.train["text"].to_list(),data_dilect.train["label"].to_list(),path_to_model,max_len,label_map)
test_dataset = BERTDataset(data_dilect.test["text"].to_list(),data_dilect.test["label"].to_list(),path_to_model,max_len,label_map)
#print(train_dataset[0])

# Training

In [14]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 128
training_args.per_device_eval_batch_size = 128
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 5


steps_per_epoch = (len(data_dilect.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 10000
training_args.save_total = 10 
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

1100
5500


In [15]:
trainer = Trainer(
    model = model_init(),
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at aubmindlab/bert-large-arabertv02 were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification we

In [16]:
trainer.train()

***** Running training *****
  Num examples = 281691
  Num Epochs = 5
  Instantaneous batch size per device = 128
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 2
  Total optimization steps = 5500


RuntimeError: CUDA out of memory. Tried to allocate 80.00 MiB (GPU 0; 12.00 GiB total capacity; 10.38 GiB already allocated; 0 bytes free; 10.43 GiB reserved in total by PyTorch)

# Saving Model

In [None]:
trainer.save_model("big_model")

# Testing Model

In [None]:
text_test = ["آني ذاك الظلت عيونه عليج وماكدر يخطيله خطوه"
              ,"هو فعلا مفيش مدرب مصري يصلح وفعلا تاكيس جونياس كويس لاكن لو قدرنا نجيب مدرب اجنبي افضل يبقي تمام ولو اتحاد الكوره موصر علي مدرب مصري يبقي حسام حسن مفيش غيره",
             "هاض عرس سوري مش اردني تحياتي لك من ادلب",
             "وا ماكدبوش ملي قالو لا تيقة فيك اليام ولاد ناس فيك قلالو و لبنات كرهو الغرام",
             "اذا في يوم من الايام صرت قد كلامك تعال انا موجود ",
             "في ظلام الليل نسير نافيغي زماني نزدم وندير ندير نحقق الاماني",
            "يا خبر النهاردة بفلوس بكره ينزل عليه أوكازيون",
            "انفخ يا شريم قال ماكو برطم",
            "كثرة الدق يفج اللحام"]
text_label = ["IRQ","EGY","LEV","NOR","GLF","NOR","EGY","GLF","GLF"]

In [None]:
text_token = []
for text in text_test:
    text_token.append(arabert_prep.preprocess(text))
text_token_set = BERTDataset(text_token,text_label,path_to_model,max_len,label_map)

In [None]:
raw_pred, _, _ = trainer.predict(text_token_set)
y_pred = np.argmax(raw_pred, axis=1)
y_pred

In [None]:
#"EGY": 0,
#"NOR": 1,
#"LEV": 2,
#"GLF": 3,
#"IRQ": 4,

In [None]:
trainer.evaluate(test_dataset)