# Setup Dependencies

In [1]:
# Run this cell for one time and then restart the kernal and don't run it agian
# !pip install optuna==2.3.0
# !pip install transformers==4.2.1
# !pip install farasapy
# !pip install pyarabic
# !git clone https://github.com/aub-mind/arabert

In [46]:
import glob
import pandas as pd
import torch
from arabert.preprocess import ArabertPreprocessor
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
from sklearn.model_selection import train_test_split

from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer , TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.utils import resample
import logging
import torch
import optuna 
logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)
torch.cuda.empty_cache()

# Data  Processing

In [47]:
def get_SMADC_folder_data():
    """Returns a dataframe with Text and Region columns. Requires tree like this data/SMADC/*.txt"""
    files = glob.glob("data/SMADC/*.txt")
    dataframes = []

    for file in files:
        region = file[-7:-4]
        temp_df = pd.read_csv(file, encoding="utf8", delimiter="\n", names=["Text"])
        temp_df["Region"] = region
        dataframes.append(temp_df)
        
    return pd.concat(dataframes)

In [48]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(path_to_model, return_dict=True, num_labels=len(label_map))

In [49]:
# You could add any metric you want
def compute_metrics(p): 
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(p.label_ids)

    macro_f1 = f1_score(p.label_ids,preds,average='macro')
    macro_precision = precision_score(p.label_ids,preds,average='macro')
    macro_recall = recall_score(p.label_ids,preds,average='macro')
    acc = accuracy_score(p.label_ids,preds)
    return {
      'macro_f1' : macro_f1,
      'macro_precision': macro_precision,
      'macro_recall': macro_recall,
      'accuracy': acc
      }

In [50]:
class Dataset:
    def __init__(
        self,
        name,
        train,
        test,
        label_list,
    ):
        self.name = name
        self.train = train
        self.test = test
        self.label_list = label_list

In [51]:
class BERTDataset(Dataset):
    def __init__(self, text, target, model_name, max_len, label_map):
        super(BERTDataset).__init__()
        self.text = text
        self.target = target
        self.tokenizer_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_len = max_len
        self.label_map = label_map
      

    def __len__(self):
        return len(self.text)

    def __getitem__(self,item):
        text = str(self.text[item])
        text = " ".join(text.split())


        
        input_ids = self.tokenizer.encode(
          text,
          add_special_tokens=True,
          max_length=self.max_len,
          truncation='longest_first'
          )
    
        attention_mask = [1] * len(input_ids)

        # Zero-pad to the max length.
        padding_length = self.max_len - len(input_ids)
        input_ids = input_ids + ([self.tokenizer.pad_token_id] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
    
        return InputFeatures(input_ids=input_ids, attention_mask=attention_mask, label=self.label_map[self.target[item]])

In [52]:
df = get_SMADC_folder_data()

In [53]:
df.columns = ["text", "label"]
label_list_df = ["EGY", "NOR","LEV","GLF","IRQ"]
#print(df["label"].value_counts())
train_set, test_set = train_test_split(df, test_size=0.8, random_state=42)
data_dilect = Dataset("Dilect", train_set, test_set, label_list_df)

In [54]:
model_name="bert-base-arabertv2"
max_len = 200
text = "ولن نبالغ إذا قلنا إن هاتف أو كمبيوتر المكتب في زمننا هذا ضروري"
arabert_prep = ArabertPreprocessor(model_name=model_name)
#arabert_prep.preprocess(df.head(1)["text"][0])



In [55]:
# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))
    !nvidia-smi

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 3060
Wed Sep  8 17:37:47 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 466.11       Driver Version: 466.11       CUDA Version: 11.3     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0  On |                  N/A |
| 53%   50C    P2    44W / 170W |   4648MiB / 12288MiB |      4%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+------------

In [56]:
data_dilect.train["text"] = data_dilect.train["text"].apply(lambda text:arabert_prep.preprocess(text))
data_dilect.test["text"] = data_dilect.test["text"].apply(lambda text:arabert_prep.preprocess(text))
#print(data_dilect.train["text"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dilect.train["text"] = data_dilect.train["text"].apply(lambda text:arabert_prep.preprocess(text))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_dilect.test["text"] = data_dilect.test["text"].apply(lambda text:arabert_prep.preprocess(text))


In [57]:
path_to_model = "aubmindlab/bert-base-arabertv02"
label_map = { v:index for index, v in enumerate(data_dilect.label_list) }
#print(label_map)
train_dataset = BERTDataset(data_dilect.train["text"].to_list(),data_dilect.train["label"].to_list(),path_to_model,max_len,label_map)
test_dataset = BERTDataset(data_dilect.test["text"].to_list(),data_dilect.test["label"].to_list(),path_to_model,max_len,label_map)
#print(train_dataset[0])

loading configuration file https://huggingface.co/aubmindlab/bert-base-arabertv02/resolve/main/config.json from cache at C:\Users\mohnd/.cache\huggingface\transformers\411eec8d9e12bf4c11eebebb4c5fecd46da787616f45bcfd6cb187e0917afae0.2f0d0092105af7b8b42b899ffb7f801dc48e93516d509483f6cfbd86155d49ea
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64000
}

loading file https://huggingface.co/aubmindlab/bert-base-arabertv02/resolve/main/

# Training

In [58]:
training_args = TrainingArguments("./train")
training_args.evaluate_during_training = True
training_args.adam_epsilon = 1e-8
training_args.learning_rate = 5e-5
training_args.fp16 = True
training_args.per_device_train_batch_size = 128
training_args.per_device_eval_batch_size = 128
training_args.gradient_accumulation_steps = 2
training_args.num_train_epochs= 25


steps_per_epoch = (len(data_dilect.train)// (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps))
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
#Warmup_ratio
warmup_ratio = 0.1
training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly 

training_args.evaluation_strategy = EvaluationStrategy.EPOCH
# training_args.logging_steps = 200
training_args.save_steps = 100000 #don't want to save any model, there is probably a better way to do this :)
training_args.seed = 42
training_args.disable_tqdm = False
training_args.lr_scheduler_type = 'cosine'

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


1100
27500


In [27]:
trainer = Trainer(
    model = trainer.model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [28]:
trainer.train()

***** Running training *****
  Num examples = 281691
  Num Epochs = 25
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 2
  Total optimization steps = 220075


Step,Training Loss,Validation Loss


  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./train\checkpoint-100000
Configuration saved in ./train\checkpoint-100000\config.json
Model weights saved in ./train\checkpoint-100000\pytorch_model.bin
  nn.utils.clip_grad_norm_(
Saving model checkpoint to ./train\checkpoint-200000
Configuration saved in ./train\checkpoint-200000\config.json
Model weights saved in ./train\checkpoint-200000\pytorch_model.bin
  nn.utils.clip_grad_norm_(


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=220075, training_loss=0.12656088520258268, metrics={'train_runtime': 89098.1632, 'train_samples_per_second': 79.04, 'train_steps_per_second': 2.47, 'total_flos': 7.2380871746901e+17, 'train_loss': 0.12656088520258268, 'epoch': 25.0})

# Saving Model

In [30]:
trainer.save_model("25_epoch")

Saving model checkpoint to 25_epoch
Configuration saved in 25_epoch\config.json
Model weights saved in 25_epoch\pytorch_model.bin


# Testing Model

In [31]:
text_test = ["آني ذاك الظلت عيونه عليج وماكدر يخطيله خطوه"
              ,"هو فعلا مفيش مدرب مصري يصلح وفعلا تاكيس جونياس كويس لاكن لو قدرنا نجيب مدرب اجنبي افضل يبقي تمام ولو اتحاد الكوره موصر علي مدرب مصري يبقي حسام حسن مفيش غيره",
             "هاض عرس سوري مش اردني تحياتي لك من ادلب",
             "وا ماكدبوش ملي قالو لا تيقة فيك اليام ولاد ناس فيك قلالو و لبنات كرهو الغرام",
             "اذا في يوم من الايام صرت قد كلامك تعال انا موجود ",
             "في ظلام الليل نسير نافيغي زماني نزدم وندير ندير نحقق الاماني",
            "يا خبر النهاردة بفلوس بكره ينزل عليه أوكازيون",
            "انفخ يا شريم قال ماكو برطم",
            "كثرة الدق يفج اللحام"]
text_label = ["IRQ","EGY","LEV","NOR","GLF","NOR","EGY","GLF","GLF"]

In [32]:
text_token = []
for text in text_test:
    text_token.append(arabert_prep.preprocess(text))
text_token_set = BERTDataset(text_token,text_label,path_to_model,max_len,label_map)

loading configuration file https://huggingface.co/aubmindlab/bert-base-arabertv02/resolve/main/config.json from cache at C:\Users\mohnd/.cache\huggingface\transformers\411eec8d9e12bf4c11eebebb4c5fecd46da787616f45bcfd6cb187e0917afae0.2f0d0092105af7b8b42b899ffb7f801dc48e93516d509483f6cfbd86155d49ea
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.10.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 64000
}

loading file https://huggingface.co/aubmindlab/bert-base-arabertv02/resolve/main/

In [33]:
raw_pred, _, _ = trainer.predict(text_token_set)
y_pred = np.argmax(raw_pred, axis=1)
y_pred

***** Running Prediction *****
  Num examples = 9
  Batch size = 16


array([4, 0, 2, 1, 3, 1, 0, 0, 1], dtype=int64)

In [34]:
#"EGY": 0,
#"NOR": 1,
#"LEV": 2,
#"GLF": 3,
#"IRQ": 4,

In [59]:
trainer = Trainer(
    model = epoch25,
    args = training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)
trainer.evaluate(test_dataset)

Using amp fp16 backend
***** Running Evaluation *****
  Num examples = 1126765
  Batch size = 128


{'eval_loss': 1.4585100412368774,
 'eval_macro_f1': 0.8017849717637787,
 'eval_macro_precision': 0.8067391787039732,
 'eval_macro_recall': 0.7971792630112084,
 'eval_accuracy': 0.8348067254485185,
 'eval_runtime': 5409.7077,
 'eval_samples_per_second': 208.286,
 'eval_steps_per_second': 1.627}

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
accuracy_score()

In [35]:
test_dataset[100]

InputFeatures(input_ids=[2, 9016, 17, 3824, 17, 141, 1045, 838, 17, 138, 113, 17, 31917, 17, 3824, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], attention_mask=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [33]:
epoch25 = AutoModelForSequenceClassification.from_pretrained("25_epoch")