In [1]:
import os
import logging
from glob import glob
from datetime import datetime
from collections import namedtuple
from pickle import dump, load

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import transformers
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, EarlyStoppingCallback, BatchEncoding
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
import optuna

from arabert.preprocess import ArabertPreprocessor
from utilities import *



# Parameters

In [11]:
# --- Preprocessing ---
# Sequence length passed to `tokenize`.
# NOTE(review): the effect of using a length other than the one used at training time is unverified.
sequence_length: int = 32

# --- Model ---
# Alternative checkpoint tried earlier: "bashar-talafha/multi-dialect-bert-base-arabic"
# This single name is not read by the evaluation sweep below, which iterates `model_names`.
model_name: str = "aubmindlab/bert-large-arabertv2"

# Hub ids handed to `test` as `model_name`, one sweep per entry.
# Other candidates that have been toggled on and off:
#   "aubmindlab/bert-base-arabertv02-twitter"
#   "bashar-talafha/multi-dialect-bert-base-arabic"
model_names = ["aubmindlab/bert-base-arabertv2"]

# Folder names (under models/finalized_models) of the fine-tuned classifiers to evaluate.
# To evaluate every finalized model instead of a fixed list:
# pretrained_classifier_names = [name[name.rindex("\\")+1:] for name in glob("models\\finalized_models\\*")]
# pretrained_classifier_names.pop(-1)
pretrained_classifier_names = ["2021-12-05-train-0.898193359375"]

# --- Data ---
# `df` and dfs["annotated_data"] come from two separate loader calls, so they are distinct objects.
df: pd.DataFrame = get_annotated_data_folder_data()
dfs = dict(
    annotated_data=get_annotated_data_folder_data(),
    arabic_dialects=get_arabic_dialects_dataset_folder_data(),
    dart=get_dart_folder_data(),
)

# --- Paths ---
code_folder_path: str = ""

# --- Training ---
batch_size: int = 128

# --- Misc ---
open_tensorboard: bool = True

In [12]:
def test(model_name, pretrained_classifier_name, df):
    """Evaluate one fine-tuned classifier on one labelled dataset.

    Args:
        model_name: Hub id used to build the ``ArabertPreprocessor`` and the tokenizer.
        pretrained_classifier_name: Folder name of the fine-tuned classifier under
            ``<code_folder_path>/models/finalized_models``.
        df: DataFrame with ``"Text"`` and ``"Region"`` columns. It is left unmodified.

    Returns:
        The ``metrics`` attribute of the ``Trainer.predict`` result. The evaluation
        sweep reads the keys ``test_loss``, ``test_macro_f1``, ``test_macro_precision``,
        ``test_macro_recall`` and ``test_accuracy`` from it.

    Reads the notebook-level settings ``code_folder_path``, ``sequence_length`` and
    ``batch_size``.
    """
    # Model
    pretrained_classifier_path = os.path.join(
        code_folder_path, "models", "finalized_models", pretrained_classifier_name
    )
    pretrained_classifier = AutoModelForSequenceClassification.from_pretrained(pretrained_classifier_path)
    arabert_prep = ArabertPreprocessor(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Label ids follow the order in which regions first appear in the SMADC data.
    # NOTE(review): presumably this is the mapping the classifier was trained with — confirm.
    smadc_df = get_SMADC_folder_data(code_folder_path)
    class_to_index = {class_: index for index, class_ in enumerate(smadc_df["Region"].unique())}

    # Preprocessing
    # Derive labels and texts without writing back into `df`: the same frames are reused
    # for every (model, classifier) pair, and preprocessing text that has already been
    # preprocessed is not guaranteed to be a no-op.
    # Regions that do not occur in the SMADC data map to None (dict.get).
    labels = df["Region"].apply(class_to_index.get).to_list()
    texts = df["Text"].apply(arabert_prep.preprocess).to_list()
    df_encoding = tokenize(tokenizer, texts, sequence_length)

    test_set = Dialect_dataset(df_encoding, labels)

    trainer = Trainer(
        model=pretrained_classifier,
        compute_metrics=compute_metrics,
        args=generate_training_args("models", do_warmup=False, batch_size=batch_size)
    )
    prediction = trainer.predict(test_set)
    return prediction.metrics

In [13]:
import gc

# Keys read from each `test` result, in the order they are appended to a result row.
metric_keys = ("test_loss", "test_macro_f1", "test_macro_precision", "test_macro_recall", "test_accuracy")

# Release whatever earlier cells left behind before starting the sweep.
gc.collect()
torch.cuda.empty_cache()

# One row per (model, classifier, dataset):
# (model name, classifier name, dataset name, dataset frame, *metrics).
results = []
for model_name_ in model_names:
    for pretrained_classifier_name_ in pretrained_classifier_names:
        for name_df, df_ in dfs.items():
            run_metrics = test(model_name_, pretrained_classifier_name_, df_)
            row = (model_name_, pretrained_classifier_name_, name_df, df_)
            row += tuple(run_metrics[key] for key in metric_keys)
            results.append(row)
            # Free memory between runs.
            gc.collect()
            torch.cuda.empty_cache()

loading configuration file models\finalized_models\2021-12-05-train-0.898193359375\config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-large-arabertv2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "NOR",
    "1": "EGY",
    "2": "LEV",
    "3": "GLF",
    "4": "IRQ"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "EGY": 1,
    "GLF": 3,
    "IRQ": 4,
    "LEV": 2,
    "NOR": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab

In [14]:
# Column names, in the same order as the tuples collected in `results`.
result_columns = [
    "Model name",
    "Pretrained classifier name",
    "Dataset",
    "df",
    "Loss",
    "Macro F1",
    "Macro precision",
    "Macro recall",
    "Accuracy",
]
df_results = pd.DataFrame(results, columns=result_columns)

# Show the runs ordered from highest to lowest macro F1.
df_results.sort_values(by="Macro F1", ascending=False)

Unnamed: 0,Model name,Pretrained classifier name,Dataset,df,Loss,Macro F1,Macro precision,Macro recall,Accuracy
2,aubmindlab/bert-base-arabertv2,2021-12-05-train-0.898193359375,dart,...,1.070189,0.764301,0.77594,0.766432,0.761811
0,aubmindlab/bert-base-arabertv2,2021-12-05-train-0.898193359375,annotated_data,Region ...,1.513531,0.62462,0.613687,0.679687,0.721758
1,aubmindlab/bert-base-arabertv2,2021-12-05-train-0.898193359375,arabic_dialects,...,1.658692,0.457107,0.469979,0.447111,0.642535


In [6]:
# For every dataset keep the run that sorts last by macro F1.
df_results.groupby("Dataset").apply(
    lambda runs: runs.sort_values("Macro F1").iloc[-1]
)

Unnamed: 0_level_0,Model name,Pretrained classifier name,Dataset,df,Loss,Macro F1,Macro precision,Macro recall,Accuracy
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
annotated_data,aubmindlab/bert-large-arabertv2,2022-03-23-train-bert-large-arabertv2-0.946777...,annotated_data,Region ...,7.294701,0.063544,0.123827,0.082429,0.068684
arabic_dialects,aubmindlab/bert-large-arabertv2,2022-03-23-train-bert-large-arabertv2-0.946777...,arabic_dialects,...,6.864985,0.082905,0.139608,0.073939,0.079095
dart,aubmindlab/bert-large-arabertv2,2022-03-23-train-bert-large-arabertv2-0.946777...,dart,...,6.710032,0.066968,0.074348,0.066111,0.067589


In [7]:
# LaTeX table of the best run (last when sorted by macro F1) for each dataset.
df_displayed = df_results.groupby("Dataset").apply(lambda runs: runs.sort_values("Macro F1").iloc[-1]).copy()
cols = ["Model name", "Dataset", "Macro F1", "Macro precision", "Macro recall"]

# Keep only the part after the last "/". Unlike `rindex`, `rsplit` also accepts
# model ids that carry no namespace prefix instead of raising ValueError.
df_displayed["Model name"] = df_displayed["Model name"].str.rsplit("/", n=1).str[-1]

# The number after the last "-" in the classifier folder name, rounded to 6 decimals.
# This column is not part of `cols`, so it does not appear in the printed table.
df_displayed["SMADC Accuracy"] = df_displayed["Pretrained classifier name"].apply(
    lambda name: round(float(name[name.rindex("-")+1:]), 6)
)

# print() is intentional: the raw LaTeX source is the desired output.
print(df_displayed[cols].to_latex(index=False))

\begin{tabular}{llrrr}
\toprule
          Model name &         Dataset &  Macro F1 &  Macro precision &  Macro recall \\
bert-large-arabertv2 &  annotated\_data &  0.063544 &         0.123827 &      0.082429 \\
\midrule
bert-large-arabertv2 & arabic\_dialects &  0.082905 &         0.139608 &      0.073939 \\
bert-large-arabertv2 &            dart &  0.066968 &         0.074348 &      0.066111 \\
\bottomrule
\end{tabular}



  print(df_displayed[cols].to_latex(index=False,))


In [8]:
# LaTeX table with one row per model: the run that sorts last by macro F1.
# Model names are kept fully qualified here (the namespace prefix is not stripped).
df_acc = (
    df_results
    .groupby("Model name")
    .apply(lambda runs: runs.sort_values("Macro F1").iloc[-1])
    .copy()
)

# The number after the last "-" in the classifier folder name, rounded to 6 decimals.
df_acc["SMADC Accuracy"] = df_acc["Pretrained classifier name"].apply(
    lambda folder: round(float(folder[folder.rindex("-") + 1:]), 6)
)

# print() is intentional: the raw LaTeX source is the desired output.
print(
    df_acc[["Model name", "SMADC Accuracy"]].to_latex(index=False)
)

\begin{tabular}{lr}
\toprule
                     Model name &  SMADC Accuracy \\
aubmindlab/bert-large-arabertv2 &        0.946777 \\
\bottomrule
\end{tabular}



  print(df_acc[["Model name", "SMADC Accuracy"]].to_latex(index=False))


In [9]:
# LaTeX table with one row per evaluation run: short model name and SMADC accuracy.
df_displayed = df_results[["Model name", "Pretrained classifier name"]].copy()

# Keep only the part after the last "/". Unlike `rindex`, `rsplit` also accepts
# model ids that carry no namespace prefix instead of raising ValueError.
df_displayed["Model name"] = df_displayed["Model name"].str.rsplit("/", n=1).str[-1]

# The number after the last "-" in the classifier folder name, rounded to 6 decimals.
df_displayed["SMADC Accuracy"] = df_displayed["Pretrained classifier name"].apply(
    lambda name: round(float(name[name.rindex("-")+1:]), 6)
)

# print() is intentional: the raw LaTeX source is the desired output.
print(df_displayed[["Model name", "SMADC Accuracy"]].to_latex(index=False))

\begin{tabular}{lr}
\toprule
          Model name &  SMADC Accuracy \\
\midrule
bert-large-arabertv2 &        0.946777 \\
bert-large-arabertv2 &        0.946777 \\
bert-large-arabertv2 &        0.946777 \\
\bottomrule
\end{tabular}



  print(df_displayed[["Model name", "SMADC Accuracy"]].to_latex(index=False,))


In [10]:
# Display the per-run frame assembled in the previous cell (rich DataFrame output).
df_displayed

Unnamed: 0,Model name,Pretrained classifier name,SMADC Accuracy
0,bert-large-arabertv2,2022-03-23-train-bert-large-arabertv2-0.946777...,0.946777
1,bert-large-arabertv2,2022-03-23-train-bert-large-arabertv2-0.946777...,0.946777
2,bert-large-arabertv2,2022-03-23-train-bert-large-arabertv2-0.946777...,0.946777


# Delete anything after this cell

In [18]:
# NOTE: this cell re-assigns several names already set in the "Parameters" cell near the
# top of the notebook (model_name, sequence_length, code_folder_path, batch_size,
# open_tensorboard); batch_size in particular changes from 128 to 64.

# --- Reproducibility ---
seed: int = 1

# --- Data ---
data_proportion: float = 1.0  # proportion of the data to be loaded in df
test_validation_proportion: float = 0.013  # share of df held out for test + validation
load_data: bool = False
save_data: bool = False

# --- Model ---
model_name = "aubmindlab/bert-large-arabertv2"
pretrained_classifier_name: str = "2021-12-05-train-0.898193359375"
from_pretrained_classifier: bool = False

# --- Preprocessing ---
sequence_length: int = 32
tokenize_in_batches: bool = False  # helps reduce memory footprint

# --- Paths ---
code_folder_path: str = ""

# --- Training ---
epochs: int = 4
batch_size: int = 64
validation_size: int = 4096
learning_rate: float = 1e-5
do_warmup: bool = True
warmup_ratio: float = 0.2
# The author flagged the three switches below as possibly having no effect with transformers.
save_model_while_training: bool = True
eval_while_training: bool = True
save_model_after_finish: bool = True

# --- Misc ---
open_tensorboard: bool = True

In [21]:
# Load the fine-tuned classifier chosen in the config cell, plus the matching
# AraBERT preprocessor and tokenizer for `model_name`.
# The folder name comes from `pretrained_classifier_name` rather than a second
# hard-coded copy of the same string, so changing the config is enough.
pretrained_classifier_path = os.path.join(
    code_folder_path, "models", "finalized_models", pretrained_classifier_name
)
pretrained_classifier = AutoModelForSequenceClassification.from_pretrained(pretrained_classifier_path)
arabert_prep = ArabertPreprocessor(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file models\finalized_models\2021-12-05-train-0.898193359375\config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-large-arabertv2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "NOR",
    "1": "EGY",
    "2": "LEV",
    "3": "GLF",
    "4": "IRQ"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "EGY": 1,
    "GLF": 3,
    "IRQ": 4,
    "LEV": 2,
    "NOR": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab

In [23]:
# Display the validation split (columns "Text" and "Region" in the captured output).
# NOTE(review): no cell above this one assigns `validate`, so this relies on kernel state
# created by a later cell or an earlier session — it will not survive Restart & Run All here.
validate

Unnamed: 0,Text,Region
0,يا عم أنجز تعالى خلي +نا ناخد إجاز +ة,EGY
1,هه طاح حض +ك,IRQ
2,في أشياء تبقى حلو +ه ب+ ال+ سر تستلذ في +ها أن...,GLF
3,أين ال+ مصدقيه في ال+ توظيف منذو زمن طويل و+ أ...,GLF
4,ألف مبرووك حمددلله,GLF
...,...,...
4091,هب +ة شتردين من +ي ل+ خاطر الله هه,IRQ
4092,يسلم ذوق +ك هي +ك ال+ مفروض على ال+ فنان ال+ م...,LEV
4093,اللي هو ازاى يعنى,EGY
4094,حرام علي +ك يا وزير ال+ تموين و+ ال+ رز اللي ي...,EGY


In [30]:
# Rebuild the SMADC splits, label ids and datasets from scratch.
#
# The load / split / tokenize steps are executed here (they used to be commented out),
# because otherwise this cell only works when `train`, `validate`, `test`, the
# `*_encoding` objects and an SMADC-backed `df` already exist in the kernel.
# Needs `arabert_prep` and `tokenizer` from the model-loading cell and the settings
# from the config cell. NOTE: this reloads and re-tokenizes the full data set.
df = get_SMADC_folder_data()
df["Text"] = df["Text"].apply(arabert_prep.preprocess)

# Hold out `test_validation_proportion` of the rows, then keep `validation_size` of the
# held-out rows for validation; whatever remains is the test split.
train, holdout = train_test_split(df, test_size=test_validation_proportion, random_state=seed)
# The test frame is called `test_df` so that it does not overwrite the `test` function.
validate, test_df = train_test_split(holdout, test_size=len(holdout) - validation_size, random_state=seed)
train = train.reset_index(drop=True)
validate = validate.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Tokenize
validate_encoding = tokenize(tokenizer, validate["Text"].to_list(), sequence_length)
test_encoding = tokenize(tokenizer, test_df["Text"].to_list(), sequence_length)
train_encoding = tokenize(tokenizer, train["Text"].to_list(), sequence_length)

# Label ids follow the order in which regions first appear in `df`.
classes = df["Region"].unique()
num_labels = len(classes)
class_to_index = {class_: index for index, class_ in enumerate(classes)}
index_to_class = {index: class_ for index, class_ in enumerate(classes)}
test_df["Labels"] = test_df["Region"].apply(class_to_index.get)
validate["Labels"] = validate["Region"].apply(class_to_index.get)
train["Labels"] = train["Region"].apply(class_to_index.get)

# Make Dataset
validate_dataset = Dialect_dataset(validate_encoding, validate["Labels"].to_list())
test_dataset = Dialect_dataset(test_encoding, test_df["Labels"].to_list())
train_dataset = Dialect_dataset(train_encoding, train["Labels"].to_list())

In [31]:
# Evaluate the loaded classifier on the test dataset; `res` holds the returned metrics.
eval_args = generate_training_args("models", do_warmup=False, batch_size=batch_size)
res = Trainer(
    model=pretrained_classifier,
    args=eval_args,
    compute_metrics=compute_metrics,
).evaluate(test_dataset)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp fp16 backend
***** Running Evaluation *****
  Num examples = 14214
  Batch size = 64
100%|██████████| 223/223 [00:26<00:00,  7.18it/s]Trainer is attempting to log a value of "{'NOR': {'precision': 0.9333740831295844, 'recall': 0.9666086406076911, 'f1-score': 0.9497006919070202, 'support': 6319}, 'IRQ': {'precision': 0.8625565610859729, 'recall': 0.8630447085455575, 'f1-score': 0.8628005657708628, 'support': 1767}, 'LEV': {'precision': 0.8424396442185514, 'recall': 0.8236024844720496, 'f1-score': 0.8329145728643216, 'support': 1610}, 'EGY': {'precision': 0.8195848855774348, 'recall': 0.7754279959718026, 'f1-score': 0.7968952134540751, 'support': 1986}, 'GLF': {'precision': 0.86647611269

In [33]:
# Show the per-class and averaged precision / recall / F1 report stored in the evaluation result.
res["eval_report"]

{'NOR': {'precision': 0.9333740831295844,
  'recall': 0.9666086406076911,
  'f1-score': 0.9497006919070202,
  'support': 6319},
 'IRQ': {'precision': 0.8625565610859729,
  'recall': 0.8630447085455575,
  'f1-score': 0.8628005657708628,
  'support': 1767},
 'LEV': {'precision': 0.8424396442185514,
  'recall': 0.8236024844720496,
  'f1-score': 0.8329145728643216,
  'support': 1610},
 'EGY': {'precision': 0.8195848855774348,
  'recall': 0.7754279959718026,
  'f1-score': 0.7968952134540751,
  'support': 1986},
 'GLF': {'precision': 0.8664761126990609,
  'recall': 0.8380726698262243,
  'f1-score': 0.852037743425015,
  'support': 2532},
 'accuracy': 0.8879273955255382,
 'macro avg': {'precision': 0.8648862573421209,
  'recall': 0.853351299884665,
  'f1-score': 0.858869757484259,
  'support': 14214},
 'weighted avg': {'precision': 0.8864548474769547,
  'recall': 0.8879273955255382,
  'f1-score': 0.8869222734248672,
  'support': 14214}}