In [9]:
import logging
from os import system
from glob import glob
from os.path import join
from pickle import dump, load
from datetime import datetime
from collections import namedtuple


import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import transformers
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, EarlyStoppingCallback, BatchEncoding
import optuna 

from arabert.preprocess import ArabertPreprocessor

import tensorboard_analysis 
from utilities import *

# Parameters

In [10]:
# Config: runtime device, logging verbosity and the seed shared by the notebook
device = torch.device("cuda")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
logging.disable(logging.WARNING)  # silence WARNING and below from every logger
seed = 1

# Data params
data_proportion = 1.0  # proportion of the data to be loaded in df
load_data = True  # load previously saved datasets instead of preprocessing again
save_data = False  # save the datasets built in this run
test_validation_proportion = 0.1  # share of df held out for test + validation

# Model params
# model_name = "aubmindlab/bert-base-arabertv2"
model_name = "aubmindlab/bert-large-arabertv2"

# Preprocessing params
sequence_length = 64
tokenize_in_batches = False  # Helps reduce memory footprint
arabert_prep = ArabertPreprocessor(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Paths (the model folders are stamped with today's date)
code_folder_path = ""
train_path = f"./models/{datetime.today().date()}-train"
search_path = f"./models/{datetime.today().date()}-search"
dataset_string = f"{{}}_dataset-seqlen{sequence_length}"

# Training params
validation_size = 4096
batch_size = 32
learning_rate = 1e-5
epochs = 5
warmup_ratio = 0.1
save_model_while_training = True  # NOTE: flagged by the original author as not working as expected
do_warmup = True
eval_while_training = True
save_model_after_finish = True

# Etc
open_tensorboard = True

# Data (loading, preprocessing, tokenizing)

In [11]:
%%time
# Data to dataframe ~(2.9 s)
df = get_SMADC_folder_data(code_folder_path)
# Seed the shuffle/subsample. The label ids below follow the order in which
# regions first appear in df, so an unseeded sample would hand out different
# ids on every run while the train/validation/test split is already seeded.
# NOTE(review): datasets cached before this change were encoded with whatever
# order that run produced - regenerate them if the ids no longer line up.
df = df.sample(frac=data_proportion, random_state=seed)

# Encode Y ~(307 ms)
classes = df["Region"].unique()
num_labels = len(classes)
# Two lookup tables: region name -> integer id, and integer id -> region name.
class_to_index = {class_: index for index, class_ in enumerate(classes)}
index_to_class = {index: class_ for index, class_ in enumerate(classes)}
df["Labels"] = df["Region"].apply(class_to_index.get)

Wall time: 11.6 s


### Further preprocessing

In [12]:
if not load_data:
    # Run the AraBERT preprocessor over every text ~(16min 22s)
    df["Text"] = df["Text"].apply(arabert_prep.preprocess)

    # Hold out `test_validation_proportion` of df, then divide that hold-out so
    # that `validation_size` rows become validation and the rest test ~(323ms)
    train, test = train_test_split(df, test_size=test_validation_proportion, random_state=seed)
    validate, test = train_test_split(test, test_size=len(test) - validation_size, random_state=seed)
    train = train.reset_index(drop=True)
    validate = validate.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # Tokenize each split, either in one call or chunk by chunk
    if not tokenize_in_batches:
        validate_encoding = tokenize(tokenizer, validate["Text"].to_list(), sequence_length)
        test_encoding = tokenize(tokenizer, test["Text"].to_list(), sequence_length)
        train_encoding = tokenize(tokenizer, train["Text"].to_list(), sequence_length)
    else:
        # The third argument differs per split (10 / 100 / 500) - presumably
        # the chunk size used by batch_tokenize; confirm in utilities.
        validate_encoding = batch_tokenize(tokenizer, validate["Text"], 10, sequence_length)
        test_encoding = batch_tokenize(tokenizer, test["Text"], 100, sequence_length)
        train_encoding = batch_tokenize(tokenizer, train["Text"], 500, sequence_length)

    # Pair every encoding with its integer labels
    validate_dataset = Dialect_dataset(validate_encoding, validate["Labels"].to_list())
    test_dataset = Dialect_dataset(test_encoding, test["Labels"].to_list())
    train_dataset = Dialect_dataset(train_encoding, train["Labels"].to_list())

# Save & load preprocessed data

In [13]:
# Save the datasets built in this session under fixed names.
if save_data:
    for split_dataset, stored_name in (
        (validate_dataset, "preprocessed_validation"),
        (test_dataset, "preprocessed_test"),
        (train_dataset, "preprocessed_train"),
    ):
        save_preprocessed_data(split_dataset, stored_name)

# Restore the three datasets from the same names ~(3mins)
if load_data:
    validate_dataset, test_dataset, train_dataset = [
        load_preprocessed_data(stored_name)
        for stored_name in ("preprocessed_validation", "preprocessed_test", "preprocessed_train")
    ]

Wall time: 3min 3s


# Training

In [14]:
def generate_training_args(output_dir, epochs=5, do_warmup=True, warmup_ratio=0.05, save_model=True, eval_while_training=True):
    """Build the ``TrainingArguments`` used to fine-tune the model.

    Besides its parameters, this function reads the notebook-level names
    ``learning_rate``, ``batch_size``, ``seed`` and ``train_dataset``.

    Args:
        output_dir: Directory passed to ``TrainingArguments``.
        epochs: Number of training epochs. A falsy value (e.g. ``None``) keeps
            the ``TrainingArguments`` default.
        do_warmup: Whether to configure learning-rate warmup steps.
        warmup_ratio: Fraction of the estimated total step count used for warmup.
        save_model: When true, configure step-based checkpointing.
        eval_while_training: When true, configure step-based evaluation and
            reloading of the best model at the end of training.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    training_args = TrainingArguments(output_dir)

    training_args.adam_epsilon = 1e-8
    training_args.learning_rate = learning_rate

    training_args.fp16 = True

    training_args.per_device_train_batch_size = batch_size
    training_args.per_device_eval_batch_size = batch_size

    training_args.gradient_accumulation_steps = 1

    if epochs:
        training_args.num_train_epochs = epochs

    # Estimate the number of optimizer steps for the whole run; the warmup
    # length below is derived from it.
    steps_per_epoch = len(train_dataset) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
    total_steps = steps_per_epoch * training_args.num_train_epochs

    if do_warmup:
        # warmup_steps is a step count, so store a whole number: the product is
        # a float whenever warmup_ratio (or the epoch count) is a float.
        training_args.warmup_steps = int(total_steps * warmup_ratio)

    training_args.logging_steps = 1024

    if eval_while_training:
        # The step-based strategy is written under several attribute names -
        # presumably to cover different transformers versions; TODO confirm
        # which one the installed version actually reads.
        training_args.evaluation_strategy = EvaluationStrategy.STEPS
        training_args.eval_strategy = EvaluationStrategy.STEPS
        training_args.evaluate_during_training = True
        training_args.load_best_model_at_end = True
        training_args.eval_steps = 1024 # defaults to logging_steps
        training_args.metric_for_best_model= "eval_loss"

    # NOTE(review): save_model=False only skips these overrides; it does not
    # set any option that turns checkpointing off.
    if save_model:
        training_args.save_steps = 1024
        training_args.save_total_limit = 10
        training_args.save_strategy = "steps"

    training_args.seed = seed

    training_args.lr_scheduler_type = 'cosine'

    return training_args

# Training arguments for the main run, driven by the values from the config cell.
training_args = generate_training_args(
    train_path,
    epochs=epochs,
    do_warmup=do_warmup,
    warmup_ratio=warmup_ratio,
    save_model=save_model_while_training,
    eval_while_training=eval_while_training,
)

# Begin train

In [15]:
# The model is supplied through `model_init` (a zero-argument factory) rather
# than as an instance, and receives both label lookup tables.
# NOTE(review): depending on the transformers version and reporting settings a
# TensorBoardCallback may already be registered by default - confirm the
# explicit one below does not duplicate it.
trainer = Trainer(
    args=training_args,
    model_init=lambda: model_init(
        model_name,
        len(classes),
        label2id=class_to_index,
        id2label=index_to_class,
    ),
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(early_stopping_patience=3),
        TensorBoardCallback(),
    ],
)

In [16]:
if open_tensorboard:
    # Imported here, only when the dashboard is requested.
    from tensorboard import program

    # Point TensorBoard at the "models" folder under code_folder_path.
    tb = program.TensorBoard()
    tb.configure(argv=[None, '--logdir', join(code_folder_path, "models")])
    # The value returned by launch() is printed as the address to open;
    # the message now names the right tool (TensorBoard, not Tensorflow).
    print(f"TensorBoard listening on {tb.launch()}")

Tensorflow listening on http://localhost:6006/


In [17]:
class EvaluatePleaseCallback(transformers.TrainerCallback):
    """Trainer callback that runs an evaluation every time the ``on_save`` hook fires."""

    def on_save(self, args, state, control, model, **kwargs):
        # Evaluates through the notebook-level `trainer`; the hook's own
        # arguments are not used.
        # NOTE(review): presumably a workaround for step-based evaluation not
        # triggering on its own - confirm it does not evaluate twice per step.
        trainer.evaluate()
 
# Register the callback on the existing trainer, then start training.
trainer.add_callback(EvaluatePleaseCallback())
trainer.train()

  0%|          | 101/198065 [01:05<24:19:41,  2.26it/s]

# Analysis

In [None]:
# Run an evaluation with the trainer's default arguments; as the last
# expression of the cell, the returned metrics dict is displayed below.
trainer.evaluate()


 11%|█         | 21588/198065 [3:34:24<21:11:39,  2.31it/s]

{'eval_loss': 0.49099254608154297, 'eval_macro_f1': 0.7990344686502608, 'eval_macro_precision': 0.82856805555997, 'eval_macro_recall': 0.7783906309962003, 'eval_accuracy': 0.831787109375, 'eval_runtime': 14.9573, 'eval_samples_per_second': 273.845, 'eval_steps_per_second': 8.558, 'epoch': 0.54}


{'eval_loss': 0.49099254608154297,
 'eval_macro_f1': 0.7990344686502608,
 'eval_macro_precision': 0.82856805555997,
 'eval_macro_recall': 0.7783906309962003,
 'eval_accuracy': 0.831787109375,
 'eval_runtime': 14.9573,
 'eval_samples_per_second': 273.845,
 'eval_steps_per_second': 8.558,
 'epoch': 0.54}

# Save & load

In [None]:
if save_model_after_finish:
    # Name the saved model after the last component of the training output
    # directory plus the accuracy from a fresh evaluation run.
    run_name = trainer.args.output_dir.split("/")[-1]
    final_accuracy = trainer.evaluate()["eval_accuracy"]
    trainer.save_model(f'models/finalized_models/{run_name}-{final_accuracy}')

100%|██████████| 128/128 [00:13<00:00,  9.34it/s]


# Hyperparameter search

In [None]:
# def objective(trial: optuna.Trial):     
    
#     training_args = generate_training_args(search_path, epochs=None, do_warmup=False, save_model=False, eval_while_training=False)
#     training_args.learning_rate= trial.suggest_loguniform('learning_rate', low=4e-5, high=0.01)
#     training_args.weight_decay= trial.suggest_loguniform('weight_decay', 4e-5, 0.01)
#     training_args.num_train_epochs= trial.suggest_int('num_train_epochs', low=2, high=5)
    
#     trainer = Trainer(
#         model_init=lambda:model_init(model_name, len(classes), label2id=class_to_index, id2label=index_to_class),
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=validate_dataset,
#         compute_metrics=compute_metrics,
#         callbacks=[TensorBoardCallback()]
#     )
    
#     result = trainer.train()     
    
#     return result.training_loss # Or result.training_loss["metric_name"] ps: change direction in study if necessary
   
# We want to minimize the loss! 
# study = optuna.create_study(study_name='hyper-parameter-search', direction='minimize') 
# study.optimize(func=objective, n_trials=15)

# print(study.best_value) 
# print(study.best_params) 
# print(study.best_trial)

# trainer.hyperparameter_search(n_trials=10)
