In [16]:
import os
import logging
from glob import glob
from datetime import datetime
from collections import namedtuple
from pickle import dump, load

import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import transformers
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, EarlyStoppingCallback, BatchEncoding
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
import optuna

from arabert.preprocess import ArabertPreprocessor
from utilities import *

# Parameters

In [17]:
# --- Paths ---
# Root folder handed to the path/data helpers in the setup cell; empty string here.
code_folder_path: str = ""

# --- Model ---
# Checkpoint name used for both the tokenizer and the AraBERT preprocessor.
model_name: str = "aubmindlab/bert-base-arabertv2"
# model_name: str = "aubmindlab/bert-large-arabertv2"
# Folder name of the fine-tuned classifier (looked up under models/finalized_models).
pretrained_classifier_name: str = "2021-09-30-train-0.8921535648994515"

# --- Preprocessing ---
# Length passed to `tokenize`. The effect of using a value different from the
# one the classifier was trained with has not been verified.
sequence_length: int = 32

# --- Evaluation ---
# Batch size forwarded to `generate_training_args`.
batch_size: int = 128

# --- Misc ---
# NOTE(review): not referenced by any cell visible in this notebook section.
open_tensorboard: bool = True

# --- Data ---
# Annotated data that is evaluated below (columns used later: "Region", "Text").
df: pd.DataFrame = get_annotated_data_folder_data()

## Derived setup (don't touch)

In [18]:
# Model
# `os.path.join` is spelled out so the cell does not depend on a bare `join`
# that is only in scope through `from utilities import *`.
pretrained_classifier_path = os.path.join(
    code_folder_path, "models", "finalized_models", pretrained_classifier_name
)
pretrained_classifier = AutoModelForSequenceClassification.from_pretrained(pretrained_classifier_path)
arabert_prep = ArabertPreprocessor(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Preprocessing
# Label indices follow the order in which regions first appear in the SMADC data.
temp_df = get_SMADC_folder_data(code_folder_path)
classes = temp_df["Region"].unique()
num_labels = len(classes)
class_to_index = {class_: index for index, class_ in enumerate(classes)}
index_to_class = {index: class_ for class_, index in class_to_index.items()}

# Guard against a silent label shuffle: when the checkpoint's config names the
# same classes, its indices must agree with the ones derived from the data,
# otherwise every metric computed below would compare mismatched labels.
_model_label_to_index = dict(getattr(pretrained_classifier.config, "label2id", None) or {})
if set(_model_label_to_index) == set(class_to_index) and _model_label_to_index != class_to_index:
    raise ValueError(
        "Label indices derived from the SMADC data do not match the classifier config: "
        f"data={class_to_index}, model={_model_label_to_index}"
    )

temp_df["Labels"] = temp_df["Region"].map(class_to_index)

loading configuration file models\finalized_models\2021-09-30-train-0.8921535648994515\config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-base-arabertv2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "EGY",
    "1": "GLF",
    "2": "IRQ",
    "3": "LEV",
    "4": "NOR"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "EGY": 0,
    "GLF": 1,
    "IRQ": 2,
    "LEV": 3,
    "NOR": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_si

# Preprocessing the test data

In [19]:
%%time

# Translate region names into the integer label indices of `class_to_index`.
df["Labels"] = df["Region"].map(class_to_index)

# Fail early with a readable message instead of passing null labels on to the dataset.
unknown_regions = df.loc[df["Labels"].isna(), "Region"].unique()
if len(unknown_regions) > 0:
    raise ValueError(f"Regions without a label index in class_to_index: {list(unknown_regions)}")

# Keep the untouched text in its own column so that re-running this cell always
# preprocesses the raw text rather than text that was already preprocessed.
if "RawText" not in df.columns:
    df["RawText"] = df["Text"]
df["Text"] = df["RawText"].apply(arabert_prep.preprocess)

# Tokenize the preprocessed text and wrap encodings + labels for the Trainer.
df_encoding = tokenize(tokenizer, df["Text"].to_list(), sequence_length)

test_set = Dialect_dataset(df_encoding, df["Labels"].to_list())

Wall time: 38.9 s


In [20]:

# Show one row per region from the SMADC frame (fixed seed, so the pick is repeatable).
(
    temp_df
    .groupby("Region")
    .sample(n=1, random_state=1)
)

Unnamed: 0,Text,Region,Labels
155371,ههههههه جريدة مسخرة ومعرضاها أوي,EGY,0
167543,عشان ماعندهم حراج مثل حراج ابن قاسم نبتت تسذا,GLF,1
76120,ساجد زويد,IRQ,2
101870,الله يتغمده برحمته وجميع شهدائنا الابرار يارب,LEV,3
70711,الطرامواي اصبح تايتانيك بحلة مغربية هههههه,NOR,4


In [21]:
# Show one row per region from the evaluated frame (fixed seed, so the pick is repeatable).
(
    df
    .groupby("Region")
    .sample(n=1, random_state=1)
)

Unnamed: 0,Region,Text,Labels
8825,EGY,بس كفاي +ه ان ال+ أعمى يشوف ان ال+ أهلي خد ال+...,0
5308,GLF,و+ الله ال+ مشروع جبار و+ ال+ ملك عبدالله وطني...,1
128,IRQ,ما ودي أقول وشل +ون تسم +ون +ها مسلم +ه ل+ اني...,2
3179,LEV,أنا ما ب+ عرف اح +نا في ال+ أردن اذ +ا جبن +ا ...,3
1155,NOR,يا بن شيخ +ة يا بن شيخ +ة يابن شيخ +ة ماتخلطش ...,4


# Testing

In [22]:
# Arguments come from the project helper; warm-up is switched off and the
# notebook-level batch size is forwarded.
eval_args = generate_training_args("models", do_warmup=False, batch_size=batch_size)

# The Trainer is used here only to run prediction with the fine-tuned classifier.
trainer = Trainer(
    model=pretrained_classifier,
    args=eval_args,
    compute_metrics=compute_metrics,
)

# Run the classifier over the test set; the metrics dict is the cell's output.
prediction = trainer.predict(test_set)
prediction.metrics

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using amp fp16 backend
100%|██████████| 210/210 [07:13<00:00,  2.06s/it]
***** Running Prediction *****
  Num examples = 26804
  Batch size = 128
100%|██████████| 210/210 [00:14<00:00, 15.11it/s]

{'test_loss': 1.6681255102157593,
 'test_macro_f1': 0.6084307295532555,
 'test_macro_precision': 0.5975817387564133,
 'test_macro_recall': 0.6604344877841063,
 'test_accuracy': 0.7029174750037308,
 'test_runtime': 15.1074,
 'test_samples_per_second': 1774.236,
 'test_steps_per_second': 13.901}

In [23]:
# Predicted class per example = index of the largest score in its row.
# One vectorized argmax over the class axis replaces the Python-level loop, and
# the named `predictions` field replaces positional indexing into the output.
# `list(...)` keeps `y_pred` a list of NumPy integers, as before.
y_pred = list(np.argmax(prediction.predictions, axis=1))
y_true = df["Labels"]

# Cross-check of the accuracy value reported in the Trainer metrics.
accuracy_score(y_true, y_pred)

0.7029174750037308

In [24]:
# Free-text sample to classify with the fine-tuned checkpoint.
text = " الاثنين غير متمكنين من توظيف اراءهم في منطوق رصين وقوي ومقنع، الاحظ الارتباك وتكرار الكلام وربما مدير الحوار لم يوفق الى القدره على استخلاص الافكار وتوجيه مسار المناظرة بشكل يجعل المتناظرين والمستمعين قادرين على تحديد وجهة النظر السليمة"

# Same preprocessing function, tokenizer and sequence length as the test-set evaluation.
predict_dialect(
    pretrained_classifier_path,
    text,
    tokenizer,
    arabert_prep.preprocess,
    sequence_length,
)

loading configuration file models\finalized_models\2021-09-30-train-0.8921535648994515\config.json
Model config BertConfig {
  "_name_or_path": "aubmindlab/bert-base-arabertv2",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "EGY",
    "1": "GLF",
    "2": "IRQ",
    "3": "LEV",
    "4": "NOR"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "EGY": 0,
    "GLF": 1,
    "IRQ": 2,
    "LEV": 3,
    "NOR": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "type_vocab_si

'NOR'

100%|██████████| 210/210 [00:32<00:00, 15.11it/s]