In [74]:
import os
import logging
from glob import glob
from datetime import datetime
from collections import namedtuple
from pickle import dump, load

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
import transformers
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, EarlyStoppingCallback, BatchEncoding
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer, TrainingArguments
from transformers.integrations import TensorBoardCallback
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
import optuna 

from arabert.preprocess import ArabertPreprocessor
import tensorboard_analysis 

# Parameters

In [2]:
# Config
# Use the GPU when one is visible, otherwise fall back to the CPU so this cell
# still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger = logging.getLogger(__name__)
logger.setLevel(logging.ERROR)
# Globally suppress log records at WARNING level and below.
logging.disable(logging.WARNING)

# Data params
validation_size = 4096  # number of rows carved out for the validation split

# Model params
model_name = "aubmindlab/bert-base-arabertv2"

# Preprocessing params
sequence_length = 128  # padding / truncation length used when tokenizing
arabert_prep = ArabertPreprocessor(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Etc
# Read the date once so both output directories carry the same stamp even if
# the cell happens to execute across midnight.
run_date = datetime.today().date()
train_path = f"./models/{run_date}-train"
search_path = f"./models/{run_date}-search"
dataset_string = "{}_dataset-seqlen" + str(sequence_length)

# Functions

In [3]:
def get_SMADC_folder_data():
    """Load every SMADC text file into one dataframe with Text and Region columns.

    Requires a tree like data/SMADC/*.txt with one sample per line. The region
    code is the three characters that precede the ``.txt`` extension of each
    file name.

    Returns:
        pandas.DataFrame: columns ``Text`` and ``Region``. Each file keeps its
        own 0-based index, so index values repeat across files.

    Raises:
        FileNotFoundError: if no file matches ``data/SMADC/*.txt``.
    """
    # glob order depends on the filesystem; sorting keeps the row order (and
    # anything derived from the order in which regions first appear) stable.
    files = sorted(glob("data/SMADC/*.txt"))
    if not files:
        raise FileNotFoundError("No files matching data/SMADC/*.txt")

    dataframes = []
    for file in files:
        region = file[-7:-4]
        # Read the lines directly instead of going through read_csv: recent
        # pandas releases reject "\n" as a delimiter, and CSV parsing would
        # also turn lines such as "NA" into NaN and interpret quote characters.
        with open(file, encoding="utf8") as handle:
            lines = [line.rstrip("\n") for line in handle]
        # Blank lines carry no sample and are dropped.
        temp_df = pd.DataFrame({"Text": [line for line in lines if line]})
        temp_df["Region"] = region
        dataframes.append(temp_df)

    return pd.concat(dataframes)


def get_music_df():
    """Load the four region files from ../extra_data/d7_data into one dataframe.

    One file per region code (GLF, LEV, NOR, IRQ) is read, one sample per line.

    Returns:
        pandas.DataFrame: columns ``Text`` and ``Region``. Each file keeps its
        own 0-based index, so index values repeat across files.

    Raises:
        FileNotFoundError: if any of the four region files is missing.
    """
    files = ["GLF", "LEV", "NOR", "IRQ"]
    dataframes = []

    for file in files:
        # Read the lines directly instead of going through read_csv: recent
        # pandas releases reject "\n" as a delimiter, and CSV parsing would
        # also turn lines such as "NA" into NaN and interpret quote characters.
        with open(f'../extra_data/d7_data/{file}.txt', encoding="utf8") as handle:
            lines = [line.rstrip("\n") for line in handle]
        # Blank lines carry no sample and are dropped.
        temp_df = pd.DataFrame({"Text": [line for line in lines if line]})
        temp_df["Region"] = file
        dataframes.append(temp_df)

    return pd.concat(dataframes)


def tokenize(batch):
    """
    Encode a list of strings with the notebook-level ``tokenizer``.

    Every text is padded or truncated to ``sequence_length`` tokens. The
    result is requested as PyTorch tensors with an attention mask and without
    token type ids.
    """
    encode_options = dict(
        add_special_tokens=True,
        padding="max_length",
        max_length=sequence_length,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=False,
    )
    return tokenizer.batch_encode_plus(batch, **encode_options)


def batch_tokenize_iter(data, batch_size):
    """Yield the tokenization of ``data`` in consecutive chunks of ``batch_size`` rows.

    Every row is tokenized exactly once and in its original order; the last
    chunk is shorter when ``len(data)`` is not a multiple of ``batch_size``.
    Nothing is yielded for empty input.

    Args:
        data: pandas Series of strings (``.iloc`` and ``.to_list()`` are used).
        batch_size: number of rows per chunk; must be positive.

    Raises:
        ValueError: if ``batch_size`` is not positive (raised on first iteration).
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    for start in range(0, len(data), batch_size):
        # .iloc makes the positional slicing explicit regardless of the index.
        yield tokenize(data.iloc[start:start + batch_size].to_list())


def batch_tokenize(data, batch_size):
    """Tokenize ``data`` chunk by chunk and merge the chunks into one encoding.

    The encoding of the first chunk is reused as the result object; its
    ``input_ids`` and ``attention_mask`` tensors are replaced by the
    concatenation of all chunks along the first dimension.

    Args:
        data: pandas Series of strings, forwarded to ``batch_tokenize_iter``.
        batch_size: forwarded to ``batch_tokenize_iter``.

    Returns:
        The merged encoding (same type as returned by ``tokenize``).

    Raises:
        ValueError: if no chunk was produced (e.g. ``data`` is empty).
    """
    chunks = list(batch_tokenize_iter(data, batch_size))
    if not chunks:
        raise ValueError("batch_tokenize received no data to tokenize")

    encoding = chunks[0]
    if len(chunks) > 1:
        # Concatenate once per key instead of growing the tensors chunk by
        # chunk, which would re-copy the accumulated data on every step.
        for key in ("input_ids", "attention_mask"):
            encoding[key] = torch.cat([chunk[key] for chunk in chunks])
    return encoding


def preprocess_sample(sample):
    """Run the AraBERT preprocessor over each string in ``sample``, then tokenize the results."""
    cleaned_texts = [arabert_prep.preprocess(text) for text in sample]
    return tokenize(cleaned_texts)


def save_preprocessed_data(dataset, dataset_name):
    """Pickle ``dataset`` to ``preprocessed_data/<dataset_name>.pkl``.

    The target directory is created when it does not exist yet, so the first
    save on a fresh checkout does not fail.

    Args:
        dataset: any picklable object.
        dataset_name: file name without the ``.pkl`` extension.
    """
    target = f"preprocessed_data/{dataset_name}.pkl"
    os.makedirs(os.path.dirname(target), exist_ok=True)
    with open(target, "wb") as file:
        dump(dataset, file)
        
def load_preprocessed_data(dataset_name):
    """Return the object unpickled from ``preprocessed_data/<dataset_name>.pkl``.

    NOTE: unpickling can execute arbitrary code; only load files that were
    written by a trusted run of this notebook.
    """
    source = f"preprocessed_data/{dataset_name}.pkl"
    with open(source, "rb") as handle:
        return load(handle)


def compute_metrics(p):
    """Compute macro F1 / precision / recall and accuracy for a prediction object.

    ``p`` must expose ``predictions`` (scores, arg-maxed over axis 1) and
    ``label_ids`` (the reference labels). Returns a dict keyed by metric name.
    """
    labels = p.label_ids
    preds = np.argmax(p.predictions, axis=1)
    assert len(preds) == len(labels)

    # The three macro-averaged scores share one call signature.
    macro_scorers = {
        'macro_f1': f1_score,
        'macro_precision': precision_score,
        'macro_recall': recall_score,
    }
    metrics = {
        metric_name: scorer(labels, preds, average='macro')
        for metric_name, scorer in macro_scorers.items()
    }
    metrics['accuracy'] = accuracy_score(labels, preds)
    return metrics


def model_init(model_name, num_labels, label2id, id2label):
    """Load a pretrained sequence-classification model configured with the given label mappings."""
    head_config = {
        "return_dict": True,
        "num_labels": num_labels,
        "label2id": label2id,
        "id2label": id2label,
    }
    return AutoModelForSequenceClassification.from_pretrained(model_name, **head_config)

In [4]:
%%time
# Dataset class
class Dialect_dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenized inputs with their labels.

    Attributes:
        X: mapping with "input_ids" and "attention_mask" entries, each
           indexable by sample position.
        Y: sequence of labels, indexable by the same positions as X.
    """

    def __init__(self, X, Y):
        # Zero-argument super() actually runs the parent initializer.
        super().__init__()
        self.X = X
        self.Y = Y

    def __getitem__(self, key):
        """Return the sample at ``key`` wrapped in an ``InputFeatures`` object."""
        return InputFeatures(
            input_ids=self.X["input_ids"][key],
            attention_mask=self.X["attention_mask"][key],
            label=self.Y[key],
        )

    def __len__(self):
        """Number of samples, taken from the length of ``X["input_ids"]``."""
        return len(self.X["input_ids"])

Wall time: 0 ns


# Data (loading, preprocessing, tokenizing)

In [5]:
%%time
# Data to dataframe (2.9 s)
df = get_SMADC_folder_data()

# Encode Y (307 ms)
# Label ids follow the order in which the regions first appear in the dataframe.
classes = df["Region"].unique()
num_labels = len(classes)
index_to_class = dict(enumerate(classes))
class_to_index = {class_: index for index, class_ in index_to_class.items()}
df["Labels"] = df["Region"].apply(class_to_index.get)

Wall time: 3.14 s


### Further preprocessing (to load previously saved data instead, skip ahead to the "Save & load preprocessed data" section)

In [6]:
%%time 
# Preprocess X (16min 22s)
df["Text"] = df["Text"].apply(arabert_prep.preprocess)

# Split (323ms): 90% train; the held-out 10% is divided into
# `validation_size` rows for validation and the remainder for test.
train, holdout = train_test_split(df, test_size=0.1, random_state=1)
validate, test = train_test_split(
    holdout,
    test_size=len(holdout) - validation_size,
    random_state=1,
)
# Give every split a fresh 0..n-1 index.
train, validate, test = (
    part.reset_index(drop=True) for part in (train, validate, test)
)

Wall time: 16min 39s


##### Tokenize everything at once

In [7]:
%%time
# Encode each split with a single tokenizer call, in the order
# validation -> test -> train.
validate_encoding, test_encoding, train_encoding = (
    tokenize(split["Text"].to_list()) for split in (validate, test, train)
)

Wall time: 2min 35s


##### Tokenize in batches

In [14]:
%%time
# validate_encoding = batch_tokenize(validate["Text"], 10)
# test_encoding = batch_tokenize(test["Text"], 100)
# train_encoding = batch_tokenize(train["Text"], 500)

Wall time: 9min 19s


In [8]:
%%time    
# Make Dataset
# Pair every encoding with the label column of the split it came from.
validate_dataset, test_dataset, train_dataset = (
    Dialect_dataset(encoding, frame["Labels"].to_list())
    for encoding, frame in (
        (validate_encoding, validate),
        (test_encoding, test),
        (train_encoding, train),
    )
)

Wall time: 50 ms


# Save & load preprocessed data

In [26]:
%%time
# save_preprocessed_data(validate_dataset, "preprocessed_validation")
# save_preprocessed_data(test_dataset, "preprocessed_test")
# save_preprocessed_data(train_dataset, "preprocessed_train")

Wall time: 9.89 s


In [15]:
%%time
# validate_dataset = load_preprocessed_data("preprocessed_validation")
# test_dataset = load_preprocessed_data("preprocessed_test")
# train_dataset = load_preprocessed_data("preprocessed_train")

Wall time: 5.1 s


# Training

In [9]:
def generate_training_args(output_dir, epochs=5, warmup=True, save_model=True, eval_while_training=True):
    """Build the ``TrainingArguments`` used for training and hyperparameter search.

    The arguments are created with library defaults and then overridden
    attribute by attribute. The step count used for the warmup length is
    derived from the notebook-level ``train_dataset``.

    Args:
        output_dir: directory handed to ``TrainingArguments``.
        epochs: number of training epochs; a falsy value keeps the library default.
        warmup: when true, warm up over 5% of the estimated total steps.
        save_model: when true, checkpoint every 1024 steps, keeping at most 10.
        eval_while_training: when true, evaluate every 1024 steps and load the
            best model (by ``eval_loss``) at the end.

    Returns:
        The configured ``TrainingArguments`` instance.
    """
    step_interval = 1024  # shared cadence for logging, evaluation and saving

    training_args = TrainingArguments(output_dir)

    training_args.adam_epsilon = 1e-8
    training_args.learning_rate = 5e-5

    training_args.fp16 = True

    training_args.per_device_train_batch_size = 32
    training_args.per_device_eval_batch_size = 32

    training_args.gradient_accumulation_steps = 1

    if epochs:
        training_args.num_train_epochs = epochs

    # NOTE(review): this estimate assumes a single device; confirm if several
    # GPUs are used.
    steps_per_epoch = len(train_dataset) // (
        training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
    )
    total_steps = steps_per_epoch * training_args.num_train_epochs

    if warmup:
        warmup_ratio = 0.05
        # warmup_steps is a step count, so truncate the product to an integer.
        training_args.warmup_steps = int(total_steps * warmup_ratio)

    training_args.logging_steps = step_interval

    if eval_while_training:
        # Several spellings of the same setting are assigned -- presumably to
        # cover different transformers versions; TODO confirm which one the
        # installed version reads.
        training_args.evaluation_strategy = EvaluationStrategy.STEPS
        training_args.eval_strategy = EvaluationStrategy.STEPS
        training_args.evaluate_during_training = True
        training_args.load_best_model_at_end = True
        training_args.eval_steps = step_interval  # defaults to logging_steps
        training_args.metric_for_best_model = "eval_loss"

    if save_model:
        training_args.save_steps = step_interval
        training_args.save_total_limit = 10
        training_args.save_strategy = EvaluationStrategy.STEPS

    training_args.seed = 1

    training_args.lr_scheduler_type = 'cosine'

    return training_args

training_args = generate_training_args(train_path)

# Begin train

In [15]:
search_path

'./models/2021-10-01-search'

In [19]:
# Early stopping after 3 evaluations without improvement, plus TensorBoard logging.
# NOTE(review): depending on the transformers version, Trainer may already
# register a TensorBoard callback on its own -- confirm it is not added twice.
trainer_callbacks = [EarlyStoppingCallback(early_stopping_patience=3), TensorBoardCallback()]

trainer = Trainer(
    # A zero-argument factory so the Trainer can build a fresh model itself.
    model_init=lambda: model_init(
        model_name,
        len(classes),
        label2id=class_to_index,
        id2label=index_to_class,
    ),
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
    callbacks=trainer_callbacks,
)

In [20]:
class EvaluatePleaseCallback(transformers.TrainerCallback):
    """Trainer callback that runs an evaluation from its ``on_save`` hook.

    NOTE(review): the hook calls the notebook-level ``trainer`` variable, not
    a trainer passed in by the framework, so it only makes sense when attached
    to that same object -- confirm before reusing it elsewhere.
    """

    def on_save(self, args, state, control, model, **kwargs):
        # Presumably invoked by the Trainer after a checkpoint is written;
        # the hook arguments are ignored and a full evaluation is triggered.
        trainer.evaluate()

# Register the callback on the notebook-level trainer, then start training.
trainer.add_callback(EvaluatePleaseCallback())
trainer.train()

Step,Training Loss,Validation Loss,Macro F1,Macro Precision,Macro Recall,Accuracy
2482,0.7278,0.606918,0.750767,0.782908,0.73401,0.793457
2482,0.7278,0.606918,0.750767,0.782908,0.73401,0.793457


  nn.utils.clip_grad_norm_(


KeyboardInterrupt: 

# Analysis

In [68]:
trainer.evaluate()

# Save & load

In [18]:
# trainer.save_model(f'models/finalized_models/{trainer.args.output_dir.split("/")[-1]}-{trainer.evaluate()["eval_accuracy"]}')

In [19]:
# loaded_model = AutoModelForSequenceClassification.from_pretrained("models/finalized_models/2021-09-30-train-0.8921535648994515")

# Hyperparameter search

In [None]:
def objective(trial: optuna.Trial):
    """Optuna objective: fine-tune once with sampled hyperparameters and score the run.

    Samples the learning rate and weight decay (both log-uniform in
    [4e-5, 0.01]) and the number of epochs (2 to 5), trains a fresh model, and
    returns the loss on the validation set. The score is the validation loss
    rather than the training loss, because minimizing the training loss would
    simply reward the configurations that fit the training data hardest.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        float: ``eval_loss`` of the trained model on ``validate_dataset``
        (lower is better, so the study direction must be "minimize").
    """
    training_args = generate_training_args(
        search_path, epochs=None, warmup=False, save_model=False, eval_while_training=False
    )
    # suggest_float(..., log=True) is the non-deprecated form of suggest_loguniform.
    training_args.learning_rate = trial.suggest_float('learning_rate', 4e-5, 0.01, log=True)
    training_args.weight_decay = trial.suggest_float('weight_decay', 4e-5, 0.01, log=True)
    training_args.num_train_epochs = trial.suggest_int('num_train_epochs', low=2, high=5)

    trainer = Trainer(
        model_init=lambda: model_init(model_name, len(classes), label2id=class_to_index, id2label=index_to_class),
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validate_dataset,
        compute_metrics=compute_metrics,
        callbacks=[TensorBoardCallback()]
    )

    trainer.train()

    # Score the trial on held-out data.
    return trainer.evaluate()["eval_loss"]
   
# The objective returns a loss, so lower is better: run a minimizing study.
study = optuna.create_study(direction='minimize', study_name='hyper-parameter-search')
study.optimize(objective, n_trials=15)

In [None]:
# Best objective value, the hyperparameters that produced it, and the full trial record.
for study_summary in (study.best_value, study.best_params, study.best_trial):
    print(study_summary)

In [None]:
#trainer.hyperparameter_search(n_trials=100)