In [1]:
import logging
from os import system
from glob import glob
from os.path import join
from pickle import dump, load
from datetime import datetime
from collections import namedtuple


import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
import transformers
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, EarlyStoppingCallback, BatchEncoding
import optuna 

from arabert.preprocess import ArabertPreprocessor

import tensorboard_analysis 
from utilities import *

# Parameters

In [4]:
# Config
seed: int = 1 # random_state for the data splits and the seed handed to the training arguments

# Data
data_proportion: float = 1.0 # proportion of the raw data kept in df (passed to df.sample as frac)
load_data: bool = True # load the previously saved datasets instead of preprocessing and tokenizing again
save_data: bool = False # write the datasets built in this session to disk
test_validation_proportion: float = 0.02 # share of df held out for test and validation combined

# Model
# model_name = "aubmindlab/bert-base-arabertv2"
model_name: str = "aubmindlab/bert-large-arabertv2" # also selects the ArabertPreprocessor and the tokenizer
from_pretrained_classifier: bool = True # True: train the saved classifier named below; False: build a new model via model_init
pretrained_classifier_name: str = "2021-12-09-train-0.963134765625" # folder name under models/finalized_models

# Preprocessing
sequence_length: int = 32 # passed to the tokenize helpers; presumably the token length per example - TODO confirm in utilities
tokenize_in_batches: bool = False # Helps reduce memory footprint

# Paths
code_folder_path: str = "" # root folder for the data loader, the finalized models and the TensorBoard log dir

# Training
validation_size: int = 4096 # number of held-out rows used for validation; the remaining held-out rows form the test set
batch_size: int = 64 # per-device batch size for both training and evaluation
learning_rate: float = 1e-5
epochs: int = 10
warmup_ratio: float = 0.1 # only used when do_warmup is True
save_model_while_training: bool = True # NOTE(review): original author was unsure this takes effect - verify that checkpoints appear
do_warmup: bool = False
eval_while_training: bool = True # NOTE(review): original author was unsure this takes effect - verify that eval metrics are logged
save_model_after_finish: bool = True # NOTE(review): original author was unsure this takes effect - verify the finalized model folder

# Etc
open_tensorboard: bool = True # start a TensorBoard server from inside the notebook

## Don't touch 

In [6]:
# Config
# Only name the CUDA device when one is actually available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
# Silences every logging call at WARNING level and below, process-wide.
logging.disable(logging.WARNING)

# General
date = str(datetime.today().date())

# Preprocessing
arabert_prep = ArabertPreprocessor(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Model
pretrained_classifier_path = join(code_folder_path, "models", "finalized_models", pretrained_classifier_name)
# Load the saved classifier only when it will be used: reading it unconditionally
# costs time and memory, and fails outright when the checkpoint folder is absent
# even though training from `model_init` does not need it.
pretrained_classifier = (
    AutoModelForSequenceClassification.from_pretrained(pretrained_classifier_path)
    if from_pretrained_classifier
    else None
)

# Paths
train_path = f"./models/{date}-train"
search_path = f"./models/{date}-search"
# Template with one `{}` placeholder left for the split name.
dataset_string = "{}_dataset-seqlen" + str(sequence_length)


In [7]:
# Fail fast on configuration values the rest of the notebook cannot work with.
assert data_proportion > 0 and data_proportion <= 1, "data_proportion must be right side inclusively between 0 and 1"
assert warmup_ratio >= 0 and warmup_ratio <= 1, "warmup_ratio must be inclusively between 0 and 1"
assert test_validation_proportion > 0 and test_validation_proportion < 1, "test_validation_proportion must be exclusively between 0 and 1"

# Sizes and counts only need to be strictly positive.
assert sequence_length > 0, "sequence_length must be positive"
assert epochs > 0, "epochs must be positive"
assert batch_size > 0, "batch_size must be positive"
assert validation_size > 0, "validation_size must be positive"


# Data (loading, preprocessing, tokenizing)

In [8]:
%%time
# Data to dataframe ~(2.9 s)
df = get_SMADC_folder_data(code_folder_path)
# Seed the shuffle: the label ids below follow the order in which regions first
# appear, so an unseeded sample would hand out different ids on every run.
df = df.sample(frac=data_proportion, random_state=seed)

# Encode Y ~(307 ms)
classes = df["Region"].unique()
num_labels = len(classes)
class_to_index = {class_: index for index, class_ in enumerate(classes)}
index_to_class = {index: class_ for index, class_ in enumerate(classes)}
# NOTE(review): datasets loaded with load_data=True were label-encoded by an
# earlier run - confirm their ids agree with this mapping before relying on it.
df["Labels"] = df["Region"].apply(class_to_index.get)

Wall time: 3.59 s


### Further preprocessing

In [9]:
if not load_data:
    # Preprocess X ~(16min 22s)
    # NOTE: overwrites df["Text"], so re-running this cell preprocesses the text again.
    df["Text"] = df["Text"].apply(arabert_prep.preprocess)

    # Split ~(323ms): hold out `test_validation_proportion` of the rows, then take
    # exactly `validation_size` of the held-out rows as the validation set.
    train, holdout = train_test_split(df, test_size=test_validation_proportion, random_state=seed)
    if not 0 < validation_size < len(holdout):
        # Without this check sklearn rejects the derived test_size with a message
        # that does not mention validation_size at all.
        raise ValueError(
            f"validation_size ({validation_size}) must be smaller than the {len(holdout)} held-out rows; "
            "lower validation_size or raise test_validation_proportion"
        )
    validate, test = train_test_split(holdout, test_size=len(holdout) - validation_size, random_state=seed)

    # Rebind instead of mutating in place: the split results are derived from df.
    train = train.reset_index(drop=True)
    validate = validate.reset_index(drop=True)
    test = test.reset_index(drop=True)

    # Tokenize
    if tokenize_in_batches:
        validate_encoding = batch_tokenize(tokenizer, validate["Text"], 10, sequence_length)
        test_encoding = batch_tokenize(tokenizer, test["Text"], 100, sequence_length)
        train_encoding = batch_tokenize(tokenizer, train["Text"], 500, sequence_length)
    else:
        validate_encoding = tokenize(tokenizer, validate["Text"].to_list(), sequence_length)
        test_encoding = tokenize(tokenizer, test["Text"].to_list(), sequence_length)
        train_encoding = tokenize(tokenizer, train["Text"].to_list(), sequence_length)

    # Make Dataset
    validate_dataset = Dialect_dataset(validate_encoding, validate["Labels"].to_list())
    test_dataset = Dialect_dataset(test_encoding, test["Labels"].to_list())
    train_dataset = Dialect_dataset(train_encoding, train["Labels"].to_list())

# Save & load preprocessed data

In [10]:
if save_data:
    if load_data:
        # The datasets are only built when load_data is False, so with both flags
        # set a fresh kernel has nothing to save and would stop on a NameError.
        raise ValueError(
            "save_data=True requires load_data=False: the datasets are only built when load_data is False"
        )
    save_preprocessed_data(validate_dataset, "preprocessed_validation")
    save_preprocessed_data(test_dataset, "preprocessed_test")
    save_preprocessed_data(train_dataset, "preprocessed_train")

if load_data:
    # ~(3mins)
    # NOTE(review): if the helper unpickles these files, only load files produced
    # by this project - unpickling untrusted data can execute arbitrary code.
    validate_dataset = load_preprocessed_data("preprocessed_validation")
    test_dataset = load_preprocessed_data("preprocessed_test")
    train_dataset = load_preprocessed_data("preprocessed_train")

# Training

In [11]:
def generate_training_args(output_dir, epochs=5, do_warmup=True, warmup_ratio=0.05, save_model=True, eval_while_training=True):
    """Build the `TrainingArguments` used by the Trainers in this notebook.

    Every setting is passed to the `TrainingArguments` constructor instead of
    being assigned afterwards. The constructor's `__post_init__` is what turns
    the strategy strings into enums, derives `greater_is_better` from
    `metric_for_best_model` and cross-checks the eval/save schedule; attributes
    set after construction skip all of that.

    Reads the notebook-level `learning_rate`, `batch_size` and `seed`.

    Args:
        output_dir: Folder for checkpoints and logs.
        epochs: Number of training epochs; a falsy value keeps the library default.
        do_warmup: Whether to warm the learning rate up at the start of training.
        warmup_ratio: Fraction of all training steps spent warming up
            (ignored unless `do_warmup` is true).
        save_model: Save a checkpoint every 10**4 steps, keeping at most 120.
        eval_while_training: Evaluate every 10**4 steps and reload the
            checkpoint with the best `macro_f1` when training ends.

    Returns:
        The configured `TrainingArguments` instance.
    """
    step_interval = 10 ** 4  # shared cadence for logging, evaluation and checkpoints

    settings = dict(
        adam_epsilon=1e-8,
        learning_rate=learning_rate,
        # Mixed precision needs a CUDA device; the constructor rejects fp16 on CPU.
        fp16=torch.cuda.is_available(),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=1,
        logging_steps=step_interval,
        seed=seed,
    )

    if epochs:
        settings["num_train_epochs"] = epochs

    if do_warmup:
        # Let the library turn the ratio into a whole number of steps from the
        # real schedule length instead of assigning a float to `warmup_steps`.
        settings["warmup_ratio"] = warmup_ratio

    if eval_while_training:
        settings.update(
            evaluation_strategy="steps",
            eval_steps=step_interval,
            load_best_model_at_end=True,
            metric_for_best_model="macro_f1",
            greater_is_better=True,  # F1: higher is better
            # Best-model tracking requires checkpoints on the evaluation schedule.
            save_strategy="steps",
            save_steps=step_interval,
        )

    if save_model:
        settings.update(
            save_strategy="steps",
            save_steps=step_interval,
            save_total_limit=120,
        )

    # lr_scheduler_type="cosine" was tried and left disabled.

    return TrainingArguments(output_dir, **settings)

# Training arguments for the main run, driven entirely by the config cell.
training_args = generate_training_args(
    train_path,
    epochs=epochs,
    do_warmup=do_warmup,
    warmup_ratio=warmup_ratio,
    save_model=save_model_while_training,
    eval_while_training=eval_while_training,
)

# Begin training

In [12]:
# Everything except the source of the model is the same for both start modes.
shared_trainer_kwargs = dict(
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3), TensorBoardCallback()],
)

if from_pretrained_classifier:
    # Train the already-saved classifier.
    trainer = Trainer(pretrained_classifier, **shared_trainer_kwargs)
else:
    # Let the Trainer build the model itself through `model_init`.
    trainer = Trainer(
        model_init=lambda: model_init(model_name, len(classes), label2id=class_to_index, id2label=index_to_class),
        **shared_trainer_kwargs,
    )

In [13]:
if open_tensorboard:
    # Local import kept from the original cell; only needed when the dashboard is requested.
    from tensorboard import program

    tb = program.TensorBoard()
    # Serve every run stored under the shared models folder.
    tb.configure(argv=[None, "--logdir", join(code_folder_path, "models")])
    # `launch()` starts the server and returns the URL it is listening on.
    print(f"TensorBoard listening on {tb.launch()}")

Tensorflow listening on http://localhost:6006/


In [None]:
class EvaluatePleaseCallback(transformers.TrainerCallback):
    """Trainer callback that runs an evaluation pass from its `on_save` hook.

    NOTE(review): the hook uses the notebook-level `trainer` variable rather
    than an object handed in by the framework, so it evaluates whichever
    Trainer that name is bound to at the moment the hook fires.
    """
    def on_save(self, args, state, control, model, **kwargs):
        # No dataset is passed and the returned metrics are discarded here.
        trainer.evaluate()

# Register the hook, then start training. `trainer.train()` is the last
# expression of the cell, so its return value is shown as the cell output.
trainer.add_callback(EvaluatePleaseCallback())
trainer.train()

  5%|▍         | 10000/215670 [1:10:07<22:48:25,  2.50it/s]

{'loss': 0.8199, 'learning_rate': 4.6313641843642774e-06, 'epoch': 0.46}


  5%|▍         | 10009/215670 [1:10:10<22:54:43,  2.49it/s]ERROR:tensorboard:File models\2021-12-08-train\runs\Dec08_13-51-22_DESKTOP-QN3OJOT\events.out.tfevents.1638960710.DESKTOP-QN3OJOT.3312.0 updated even though the current file is models\2021-12-08-train\runs\Dec08_13-51-22_DESKTOP-QN3OJOT\events.out.tfevents.1638960710.DESKTOP-QN3OJOT.3312.2
  9%|▉         | 20000/215670 [2:20:47<23:27:04,  2.32it/s]

{'loss': 0.5325, 'learning_rate': 9.267365297227119e-06, 'epoch': 0.93}


 14%|█▍        | 30000/215670 [3:28:57<22:14:07,  2.32it/s]

{'loss': 0.4682, 'learning_rate': 9.566418002720192e-06, 'epoch': 1.39}


 19%|█▊        | 40000/215670 [4:38:28<20:52:30,  2.34it/s]

{'loss': 0.4365, 'learning_rate': 9.051333305856654e-06, 'epoch': 1.85}


 23%|██▎       | 50000/215670 [5:49:57<20:15:31,  2.27it/s]

{'loss': 0.3917, 'learning_rate': 8.536351646539999e-06, 'epoch': 2.32}


 28%|██▊       | 60000/215670 [6:59:50<17:41:38,  2.44it/s]

{'loss': 0.3699, 'learning_rate': 8.021266949676463e-06, 'epoch': 2.78}


 32%|███▏      | 70000/215670 [8:09:08<16:26:06,  2.46it/s]

{'loss': 0.3384, 'learning_rate': 7.506285290359808e-06, 'epoch': 3.25}


 37%|███▋      | 80000/215670 [9:16:22<15:12:23,  2.48it/s]

{'loss': 0.3145, 'learning_rate': 6.991252112269712e-06, 'epoch': 3.71}


 42%|████▏     | 90000/215670 [10:24:15<13:55:23,  2.51it/s]

{'loss': 0.2995, 'learning_rate': 6.476321971726498e-06, 'epoch': 4.17}


 46%|████▋     | 100000/215670 [11:30:48<12:48:31,  2.51it/s]

{'loss': 0.2722, 'learning_rate': 5.961237274862961e-06, 'epoch': 4.64}


 51%|█████     | 110000/215670 [12:37:21<11:43:44,  2.50it/s]

{'loss': 0.2615, 'learning_rate': 5.446307134319747e-06, 'epoch': 5.1}


 56%|█████▌    | 120000/215670 [13:44:05<10:37:38,  2.50it/s]

{'loss': 0.2353, 'learning_rate': 4.9313769937765325e-06, 'epoch': 5.56}


 60%|██████    | 130000/215670 [14:50:39<9:29:35,  2.51it/s]

{'loss': 0.2349, 'learning_rate': 4.416292296912996e-06, 'epoch': 6.03}


 65%|██████▍   | 140000/215670 [15:57:17<8:24:23,  2.50it/s]

{'loss': 0.208, 'learning_rate': 3.9012591188229e-06, 'epoch': 6.49}


 70%|██████▉   | 150000/215670 [17:03:49<7:16:59,  2.50it/s]

{'loss': 0.2078, 'learning_rate': 3.3862259407328037e-06, 'epoch': 6.96}


 74%|███████▍  | 160000/215670 [18:10:23<6:10:46,  2.50it/s]

{'loss': 0.1863, 'learning_rate': 2.8711927626427074e-06, 'epoch': 7.42}


 79%|███████▉  | 170000/215670 [19:17:00<5:03:26,  2.51it/s]

{'loss': 0.1874, 'learning_rate': 2.356159584552611e-06, 'epoch': 7.88}


 83%|████████▎ | 180000/215670 [20:23:44<3:58:36,  2.49it/s]

{'loss': 0.1731, 'learning_rate': 1.841126406462515e-06, 'epoch': 8.35}


 88%|████████▊ | 190000/215670 [21:31:53<3:07:40,  2.28it/s]

{'loss': 0.1684, 'learning_rate': 1.32614474714586e-06, 'epoch': 8.81}


 93%|█████████▎| 200000/215670 [22:40:13<1:45:50,  2.47it/s]

{'loss': 0.1619, 'learning_rate': 8.11163087829205e-07, 'epoch': 9.27}


 97%|█████████▋| 210000/215670 [23:51:17<1:16:42,  1.23it/s]

{'loss': 0.1577, 'learning_rate': 2.9623294728599104e-07, 'epoch': 9.74}


100%|██████████| 215670/215670 [24:58:54<00:00,  2.40it/s]

{'train_runtime': 89935.0351, 'train_samples_per_second': 153.476, 'train_steps_per_second': 2.398, 'train_loss': 0.3019960471451429, 'epoch': 10.0}





TrainOutput(global_step=215670, training_loss=0.3019960471451429, metrics={'train_runtime': 89935.0351, 'train_samples_per_second': 153.476, 'train_steps_per_second': 2.398, 'train_loss': 0.3019960471451429, 'epoch': 10.0})

# Analysis

In [None]:
# Score the trainer's model on the test split; the returned metrics dict is
# the cell's last expression and is therefore displayed as its output.
trainer.evaluate(test_dataset)


 31%|███       | 66052/215670 [7:54:12<17:32:41,  2.37it/s]

{'eval_loss': 0.19428153336048126, 'eval_macro_f1': 0.9271287626651207, 'eval_macro_precision': 0.9325099078535887, 'eval_macro_recall': 0.9221597530877496, 'eval_accuracy': 0.9421782836254881, 'eval_runtime': 44.1897, 'eval_samples_per_second': 544.788, 'eval_steps_per_second': 8.531, 'epoch': 3.06}


{'eval_loss': 0.19428153336048126,
 'eval_macro_f1': 0.9271287626651207,
 'eval_macro_precision': 0.9325099078535887,
 'eval_macro_recall': 0.9221597530877496,
 'eval_accuracy': 0.9421782836254881,
 'eval_runtime': 44.1897,
 'eval_samples_per_second': 544.788,
 'eval_steps_per_second': 8.531,
 'epoch': 3.06}

# Save & load

In [14]:
if save_model_after_finish:
    # Evaluate once, as its own statement, so this slow step is not hidden inside
    # the path expression. With no dataset argument the Trainer uses its
    # eval_dataset, so the accuracy in the folder name is the validation accuracy.
    final_accuracy = trainer.evaluate()["eval_accuracy"]

    # Last component of the run's output folder, e.g. "<date>-train";
    # rstrip guards against a trailing slash producing an empty name.
    run_name = trainer.args.output_dir.rstrip("/").split("/")[-1]

    # Save under the same root that `pretrained_classifier_path` is read from,
    # so a model written here can be loaded back whatever `code_folder_path` is.
    trainer.save_model(join(code_folder_path, "models", "finalized_models", f"{run_name}-{final_accuracy}"))


 31%|███       | 66052/215670 [7:21:13<17:32:41,  2.37it/s]

{'eval_loss': 0.19872522354125977, 'eval_macro_f1': 0.9268216135235413, 'eval_macro_precision': 0.9336408905837104, 'eval_macro_recall': 0.9205536321367577, 'eval_accuracy': 0.943115234375, 'eval_runtime': 7.3076, 'eval_samples_per_second': 560.514, 'eval_steps_per_second': 8.758, 'epoch': 3.06}


# Hyperparameter search

In [None]:
# def objective(trial: optuna.Trial):     
    
#     training_args = generate_training_args(search_path, epochs=None, do_warmup=False, save_model=False, eval_while_training=False)
#     training_args.learning_rate= trial.suggest_loguniform('learning_rate', low=4e-5, high=0.01)
#     training_args.weight_decay= trial.suggest_loguniform('weight_decay', 4e-5, 0.01)
#     training_args.num_train_epochs= trial.suggest_int('num_train_epochs', low=2, high=5)
    
#     trainer = Trainer(
#         model_init=lambda:model_init(model_name, len(classes), label2id=class_to_index, id2label=index_to_class),
#         args=training_args,
#         train_dataset=train_dataset,
#         eval_dataset=validate_dataset,
#         compute_metrics=compute_metrics,
#         callbacks=[TensorBoardCallback()]
#     )
    
#     result = trainer.train()     
    
#     return result.training_loss # Or result.training_loss["metric_name"] ps: change direction in study if necessary
   
# We want to minimize the loss! 
# study = optuna.create_study(study_name='hyper-parameter-search', direction='minimize') 
# study.optimize(func=objective, n_trials=15)

# print(study.best_value) 
# print(study.best_params) 
# print(study.best_trial)

# trainer.hyperparameter_search(n_trials=10)
