In [1]:
from glob import glob

import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, precision_score , recall_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from transformers import AutoModel, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer
from transformers.data.processors import SingleSentenceClassificationProcessor
from transformers import Trainer, TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from transformers.data.processors.utils import InputFeatures
import optuna 

from arabert.preprocess import ArabertPreprocessor


In [2]:
# Params
# Fall back to the CPU when no GPU is visible so every later `.to(device)`
# still works on a CPU-only machine.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "aubmindlab/bert-base-arabertv2"

# Base encoder with all weights frozen (no gradients are tracked for it).
# NOTE(review): `model` is not referenced again in the visible cells; the
# classifier that gets trained is created separately through `model_init`.
model = AutoModel.from_pretrained(model_name)
for param in model.parameters():
    param.requires_grad = False

arabert_prep = ArabertPreprocessor(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
sequence_length = 100  # `max_length` handed to the tokenizer in `tokenize`

Some weights of the model checkpoint at aubmindlab/bert-base-arabertv2 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def get_SMADC_folder_data(pattern="data/SMADC/*.txt"):
    """Load the SMADC corpus into a dataframe with ``Text`` and ``Region`` columns.

    Every file matched by ``pattern`` is read as UTF-8 with one sample per
    non-blank line. Lines are kept verbatim (no CSV quoting or numeric
    inference is applied). The region label of a file is the three characters
    that precede its four-character extension, e.g. ``..._EGY.txt`` -> ``EGY``.

    Args:
        pattern: glob pattern selecting the corpus files. The default expects
            a tree like ``data/SMADC/*.txt`` relative to the working directory.

    Returns:
        pandas.DataFrame with columns ``Text`` and ``Region`` and a fresh
        ``0..n-1`` index. Files are processed in sorted order so the row
        order is reproducible.

    Raises:
        ValueError: if no file matches ``pattern``.
    """
    # glob() returns files in arbitrary order; sort for reproducible output.
    files = sorted(glob(pattern))
    if not files:
        raise ValueError(f"No SMADC files matched {pattern!r}")

    dataframes = []
    for file in files:
        region = file[-7:-4]
        # Read the lines directly: pandas refuses "\n" as a read_csv delimiter,
        # and CSV parsing would mangle quotes inside the text.
        with open(file, encoding="utf8") as handle:
            lines = [line.rstrip("\n") for line in handle if line.strip()]
        temp_df = pd.DataFrame({"Text": lines})
        temp_df["Region"] = region
        dataframes.append(temp_df)

    # ignore_index avoids repeated row labels coming from the per-file frames.
    return pd.concat(dataframes, ignore_index=True)


def tokenize(batch, tokenizer):
    """Encode a list of strings into padded, truncated PyTorch tensors.

    Args:
        batch: list of strings to encode.
        tokenizer: Hugging Face tokenizer used for the encoding. It is used
            as given; it is no longer replaced by a freshly loaded tokenizer
            on every call.

    Returns:
        The tokenizer's batch encoding holding ``input_ids`` and
        ``attention_mask`` tensors (no ``token_type_ids``), padded to the
        longest sample and truncated to ``sequence_length`` tokens.
    """
    # The tensors are intentionally left where the tokenizer creates them.
    # The earlier `batch[...].to(device)` calls discarded their result
    # (Tensor.to returns a new tensor), so nothing was ever moved; the Trainer
    # takes care of moving each mini-batch to the training device.
    return tokenizer(
        batch,
        add_special_tokens=True,
        padding=True,
        max_length=sequence_length,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True,
        return_token_type_ids=False,
    )


def preprocess_sample(sample, tokenizer):
    """Run the AraBERT preprocessor over each string in ``sample``, then tokenize the batch."""
    cleaned = [arabert_prep.preprocess(text) for text in sample]
    return tokenize(cleaned, tokenizer)


def compute_metrics(p):
    """Compute macro F1 / precision / recall and accuracy for a prediction object.

    ``p`` must expose ``predictions`` (scores, reduced with argmax over axis 1)
    and ``label_ids`` (the reference labels).
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    assert len(y_pred) == len(y_true)

    return {
        'macro_f1': f1_score(y_true, y_pred, average='macro'),
        'macro_precision': precision_score(y_true, y_pred, average='macro'),
        'macro_recall': recall_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }


def model_init(model_name, num_labels):
    """Load ``model_name`` as a sequence classifier with ``num_labels`` output classes."""
    classifier = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        return_dict=True,
    )
    return classifier

In [None]:
%%time 
# Data logic (load, arabic_preprocess, tokenize, dataset)

# Data to dataframe (2.9 s)
df = get_SMADC_folder_data()

# Encode Y (307 ms)
# Sort the region names so a label id always means the same region, regardless
# of the order in which the corpus files happened to be listed.
classes = sorted(df["Region"].unique())
num_labels = len(classes)
class_to_index = {class_: index for index, class_ in enumerate(classes)}
index_to_class = {index: class_ for index, class_ in enumerate(classes)}
# Labels stay on the CPU. The former `all_labels.to(device)` discarded its
# result (Tensor.to is not in-place), so it never moved anything.
all_labels = torch.tensor(df["Region"].map(class_to_index).values)

# Preprocess X (16min 22s)
df["Text"] = df["Text"].apply(arabert_prep.preprocess)

# tokenize and split (2min 26s)
x_train, x_test, y_train, y_test = train_test_split(df["Text"], all_labels, random_state=1)
x_train, x_test = tokenize(x_train.to_list(), tokenizer), tokenize(x_test.to_list(), tokenizer)

# Dataset class
class Dialect_dataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized inputs with their labels.

    Args:
        X: mapping with ``"input_ids"`` and ``"attention_mask"`` entries, each
            indexable by sample position.
        Y: sequence of labels aligned with the entries of ``X``; its length
            defines the dataset length.
    """

    def __init__(self, X, Y):
        # Zero-argument super() runs the base-class initializer. The previous
        # `super(Dialect_dataset)` built an unbound proxy and skipped it.
        super().__init__()
        self.X = X
        self.Y = Y

    def __getitem__(self, key):
        """Return the sample at position ``key`` as an ``InputFeatures`` record."""
        return InputFeatures(
            input_ids=self.X["input_ids"][key],
            attention_mask=self.X["attention_mask"][key],
            label=self.Y[key],
        )

    def __len__(self):
        """Return the number of samples (one per label)."""
        return len(self.Y)
    
# Wrap each tokenized split together with its labels in a dataset object
train_data = Dialect_dataset(X=x_train, Y=y_train)
test_data = Dialect_dataset(X=x_test, Y=y_test)

In [None]:
# Build the sequence classifier with one output per region, then place it on the target device
dialect_classifier = model_init(model_name, num_labels=len(classes))
dialect_classifier.to(device)

In [None]:
# Every setting goes through the constructor so TrainingArguments can validate
# it and derive dependent values. Attributes assigned after construction skip
# that step, and misspelled ones are silently ignored.
training_args = TrainingArguments(
    output_dir="./train",
    # Replaces `evaluate_during_training = True`, which has no effect when set
    # after construction; evaluation runs every `eval_steps` training steps.
    evaluation_strategy=EvaluationStrategy.STEPS,
    # adam_epsilon=1e-8,
    # learning_rate=5e-5,
    # fp16=True,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # gradient_accumulation_steps=2,
    num_train_epochs=5,
    logging_steps=200,
    save_steps=10000,
    save_total_limit=10,  # was `save_total`, which is not a TrainingArguments field
    seed=1,
    lr_scheduler_type="cosine",
)

# `train_data` is the training dataset; the previous `training_data` was never
# defined and raised a NameError.
steps_per_epoch = len(train_data) // (training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps)
total_steps = steps_per_epoch * training_args.num_train_epochs
print(steps_per_epoch)
print(total_steps)
# Warmup_ratio
# warmup_ratio = 0.1
# training_args.warmup_steps = total_steps*warmup_ratio # or you can set the warmup steps directly

In [None]:
# Wire the classifier, its training arguments, both splits and the metric callback into a Trainer
trainer = Trainer(
    args=training_args,
    model=dialect_classifier,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=test_data,
)

In [None]:
# Run the training loop configured on `trainer`
trainer.train()

In [None]:
# Evaluate the model held by `trainer`; called without arguments, this presumably
# scores the `eval_dataset` handed to the Trainer (`test_data`) using `compute_metrics`.
trainer.evaluate()