In [181]:
import os
import logging
from glob import glob
from datetime import datetime
from collections import namedtuple
from pickle import dump, load

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import transformers
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, EarlyStoppingCallback, BatchEncoding
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
import optuna

from arabert.preprocess import ArabertPreprocessor
from utilities import *

# Parameters

In [158]:
# Preprocessing params
sequence_length: int = 32 # Not sure what happens when setting it with a number different from what its trained on

# Model 
model_name = "aubmindlab/bert-base-arabertv2"
# model_name: str = "aubmindlab/bert-large-arabertv2"
model_names = [
    "aubmindlab/bert-large-arabertv2",
    "aubmindlab/bert-base-arabertv2",
    "aubmindlab/bert-base-arabertv02-twitter"
]

pretrained_classifier_name: str = "2021-09-30-train-0.8921535648994515"
pretrained_classifier_names = [name[name.rindex("\\")+1:] for name in glob("models\\finalized_models\\*")]
pretrained_classifier_names.pop(-1)

# Data
df: pd.DataFrame = get_annotated_data_folder_data()
dfs = {
    "annotated_data": get_annotated_data_folder_data(),
    "arabic_dialects": get_arabic_dialects_dataset_folder_data(),
    "dart": get_dart_folder_data()
}

# Paths
code_folder_path: str = ""

# Training 
batch_size: int = 128

# Etc
open_tensorboard: bool = True

In [159]:
def test(model_name, pretrained_classifier_name, df):
    # Model
    pretrained_classifier_path = join(code_folder_path, "models", "finalized_models", pretrained_classifier_name)
    pretrained_classifier = AutoModelForSequenceClassification.from_pretrained(pretrained_classifier_path)
    arabert_prep = ArabertPreprocessor(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Preprocessing
    temp_df = get_SMADC_folder_data(code_folder_path)
    classes = temp_df["Region"].unique()
    num_labels = len(classes)
    class_to_index = {class_:index for class_, index in zip(classes, range(num_labels))}
    index_to_class = {index:class_ for class_, index in zip(classes, range(num_labels))}
    temp_df["Labels"] = temp_df["Region"].apply(class_to_index.get)

    df["Labels"] = df["Region"].apply(class_to_index.get)
    df["Text"] = df["Text"].apply(arabert_prep.preprocess)
    df_encoding = tokenize(tokenizer, df["Text"].to_list(), sequence_length)

    test_set = Dialect_dataset(df_encoding, df["Labels"].to_list())

    trainer = Trainer(
        model=pretrained_classifier, 
        compute_metrics=compute_metrics, 
        args=generate_training_args("models", do_warmup=False, batch_size=batch_size)
    )
    prediction = trainer.predict(test_set)
    return prediction.metrics

In [None]:
import gc
gc.collect()
torch.cuda.empty_cache()
results = []
for model_name_ in model_names:
    for pretrained_classifier_name_ in pretrained_classifier_names:
        for name_df, df_ in dfs.items():
            r = test(model_name_, pretrained_classifier_name_, df_)
            results.append((model_name_, pretrained_classifier_name_, name_df, df_, 
                 r["test_loss"], r["test_macro_f1"], r["test_macro_precision"], r["test_macro_recall"], r["test_accuracy"]
            ))
            gc.collect()
            torch.cuda.empty_cache()

In [161]:
for model_name_ in ["aubmindlab/bert-base-arabertv02-twitter"]:
    for pretrained_classifier_name_ in pretrained_classifier_names:
        for name_df, df_ in dfs.items():
            r = test(model_name_, pretrained_classifier_name_, df_)
            results.append((model_name_, pretrained_classifier_name_, name_df, df_, 
                 r["test_loss"], r["test_macro_f1"], r["test_macro_precision"], r["test_macro_recall"], r["test_accuracy"]
            ))
            gc.collect()
            torch.cuda.empty_cache()

loading configuration file models\finalized_models\2021-09-30-train-0.8921535648994515\config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-base-arabertv2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "EGY",
    "1": "GLF",
    "2": "IRQ",
    "3": "LEV",
    "4": "NOR"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "EGY": 0,
    "GLF": 1,
    "IRQ": 2,
    "LEV": 3,
    "NOR": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_si

In [163]:
df_results = pd.DataFrame(results, columns=["Model name", "Pretrained classifier name", "Dataset", "df", "Loss", "Macro F1", "Macro precision", "Macro recall", "Accuracy"])
df_results.sort_values("Macro F1", ascending=False)

Unnamed: 0,Model name,Pretrained classifier name,Dataset,df,Loss,Macro F1,Macro precision,Macro recall,Accuracy
20,aubmindlab/bert-base-arabertv2,2021-12-04-train-0.87158203125,dart,...,0.783378,0.760816,0.7665,0.759033,0.756127
5,aubmindlab/bert-large-arabertv2,2021-12-04-train-0.87158203125,dart,...,0.788159,0.760212,0.765969,0.758317,0.755344
2,aubmindlab/bert-large-arabertv2,2021-09-30-train-0.8921535648994515,dart,...,1.22497,0.751637,0.767417,0.751757,0.748136
8,aubmindlab/bert-large-arabertv2,2021-12-05-train-0.898193359375,dart,...,1.114726,0.731175,0.732053,0.732794,0.727254
23,aubmindlab/bert-base-arabertv2,2021-12-05-train-0.898193359375,dart,...,1.114564,0.731025,0.731854,0.732672,0.727089
17,aubmindlab/bert-base-arabertv2,2021-09-30-train-0.8921535648994515,dart,...,1.323409,0.702196,0.703231,0.704094,0.69904
0,aubmindlab/bert-large-arabertv2,2021-09-30-train-0.8921535648994515,annotated_data,Region ...,1.668126,0.608431,0.597582,0.660434,0.702917
3,aubmindlab/bert-large-arabertv2,2021-12-04-train-0.87158203125,annotated_data,Region ...,1.051848,0.593083,0.596362,0.653978,0.700269
18,aubmindlab/bert-base-arabertv2,2021-12-04-train-0.87158203125,annotated_data,Region ...,1.05234,0.591879,0.595272,0.653304,0.698814
21,aubmindlab/bert-base-arabertv2,2021-12-05-train-0.898193359375,annotated_data,Region ...,1.626672,0.572821,0.575651,0.642667,0.675832


In [173]:
df_results.groupby("Dataset").apply(lambda df: df.sort_values("Macro F1").iloc[-1])

Unnamed: 0_level_0,Model name,Pretrained classifier name,Dataset,df,Loss,Macro F1,Macro precision,Macro recall,Accuracy
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
annotated_data,aubmindlab/bert-large-arabertv2,2021-09-30-train-0.8921535648994515,annotated_data,Region ...,1.668126,0.608431,0.597582,0.660434,0.702917
arabic_dialects,aubmindlab/bert-large-arabertv2,2021-09-30-train-0.8921535648994515,arabic_dialects,...,1.839932,0.44906,0.45921,0.44077,0.635834
dart,aubmindlab/bert-base-arabertv2,2021-12-04-train-0.87158203125,dart,...,0.783378,0.760816,0.7665,0.759033,0.756127


In [183]:
df_displayed = df_results.groupby("Dataset").apply(lambda df: df.sort_values("Macro F1").iloc[-1]).copy()
cols = ["Model name", "SMADC Accuracy", "Dataset", "Macro F1", "Macro precision", "Macro recall"]
df_displayed["Model name"] = df_displayed["Model name"].apply(lambda name: name[name.rindex("/")+1:])
df_displayed["SMADC Accuracy"] = df_displayed["Pretrained classifier name"].apply(lambda name: round(float(name[name.rindex("-")+1:]), 6))
print(df_displayed[cols].to_latex(index=False,))

\begin{tabular}{lrlrrr}
\toprule
          Model name &  SMADC Accuracy &         Dataset &  Macro F1 &  Macro precision &  Macro recall \\
bert-large-arabertv2 &        0.892154 &  annotated\_data &  0.608431 &         0.597582 &      0.660434 \\
\midrule
bert-large-arabertv2 &        0.892154 & arabic\_dialects &  0.449060 &         0.459210 &      0.440770 \\
 bert-base-arabertv2 &        0.871582 &            dart &  0.760816 &         0.766500 &      0.759033 \\
\bottomrule
\end{tabular}



In [186]:
df_displayed = df_results[["Model name", "Pretrained classifier name"]].copy()
df_displayed["Model name"] = df_displayed["Model name"].apply(lambda name: name[name.rindex("/")+1:])
df_displayed["SMADC Accuracy"] = df_displayed["Pretrained classifier name"].apply(lambda name: round(float(name[name.rindex("-")+1:]), 6))

print(df_displayed[["Model name", "SMADC Accuracy"]].to_latex(index=False,))

ValueError: could not convert string to float: 'twitter'

In [187]:
df_displayed

Unnamed: 0,Model name,Pretrained classifier name
0,bert-large-arabertv2,2021-09-30-train-0.8921535648994515
1,bert-large-arabertv2,2021-09-30-train-0.8921535648994515
2,bert-large-arabertv2,2021-09-30-train-0.8921535648994515
3,bert-large-arabertv2,2021-12-04-train-0.87158203125
4,bert-large-arabertv2,2021-12-04-train-0.87158203125
5,bert-large-arabertv2,2021-12-04-train-0.87158203125
6,bert-large-arabertv2,2021-12-05-train-0.898193359375
7,bert-large-arabertv2,2021-12-05-train-0.898193359375
8,bert-large-arabertv2,2021-12-05-train-0.898193359375
9,bert-large-arabertv2,2021-12-09-train-0.963134765625
