In [1]:
import json
import pandas as pd
import os
from datetime import datetime
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from train_seq import tokenize_sequence_classification
import numpy as np
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_path = '/home/lgiordano/LUCA/checkthat_GITHUB/data/formatted/train_sentences.json'
with open(train_data_path, 'r', encoding='utf8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

In [3]:
### This code balances positive and negative samples for each language by down-sampling the larger group ###
date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

langs = set(sample['data']['lang'] for sample in data)

sampled_dfs = []
for lang in langs:
    df_lang = df[df['data'].apply(lambda x: x['lang'] == lang)]
    df_pos_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 1)]
    df_neg_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 0)]
    if len(df_neg_lang) > len(df_pos_lang):
        df_neg_lang = df_neg_lang.sample(len(df_pos_lang))
    df_lang_sampled = pd.concat([df_pos_lang, df_neg_lang])
    sampled_dfs.append(df_lang_sampled)
df_sampled = pd.concat(sampled_dfs, ignore_index=True)
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

dataset = Dataset.from_pandas(df_sampled)

In [4]:
### This code splits the balanced dataset in train/test splits and tokenizes both with dynamic padding

#model_name = 'bert-base-multilingual-cased'
#model_name = 'xlm-roberta-base'
model_name = 'microsoft/mdeberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

split_ratio = 0.2
split_seed = 42
batch_size = 16

datadict = dataset.train_test_split(split_ratio, seed=split_seed)
datadict = datadict.map(lambda x: tokenize_sequence_classification(x, tokenizer),
                            batch_size=batch_size,
                            batched=True
                            )

columns = [
            'input_ids',
            'token_type_ids', #non per xlm-roberta
            'attention_mask',
            'labels'
            ]
datadict.set_format('torch', columns = columns)

train_data = datadict['train']
val_data = datadict['test']

collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

Map:   0%|          | 0/33345 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 33345/33345 [00:11<00:00, 2784.50 examples/s]
Map: 100%|██████████| 8337/8337 [00:03<00:00, 2772.95 examples/s]


In [5]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(output_dir='/home/lgiordano/LUCA/checkthat_GITHUB/models/M1/mdeberta-v3-base-NEW_2nd',
                                  save_total_limit=2,
                                  save_strategy='epoch',
                                  load_best_model_at_end=True,
                                  save_only_model=True,
                                  metric_for_best_model='eval_macro-f1',
                                  logging_strategy='epoch',
                                  evaluation_strategy='epoch',
                                  learning_rate=5e-5,
                                  optim='adamw_torch',
                                  num_train_epochs=10)

early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    results = classification_report(labels, preds, output_dict=True)
    results['macro-f1'] = results['macro avg']['f1-score']

    models_dir = '/home/lgiordano/LUCA/checkthat_GITHUB/models/M1'
    model_name_simple = model_name.split('/')[-1]
    model_save_name = f'{model_name_simple}_{date_time}'
    model_save_dir = os.path.join(models_dir, model_save_name)
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    with open(os.path.join(model_save_dir, 'results.json'), 'w', encoding='utf8') as f:
        json.dump(results, f, ensure_ascii = False)

    return results

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,0,1,Accuracy,Macro avg,Weighted avg,Macro-f1
1,0.5995,0.586325,"{'precision': 0.6880593756821655, 'recall': 0.7840796019900498, 'f1-score': 0.7329380304615741, 'support': 4020.0}","{'precision': 0.7689030883919062, 'recall': 0.6689830901088719, 'f1-score': 0.7154713241669763, 'support': 4317.0}",0.724481,"{'precision': 0.7284812320370359, 'recall': 0.7265313460494609, 'f1-score': 0.7242046773142752, 'support': 8337.0}","{'precision': 0.729921233396925, 'recall': 0.7244812282595658, 'f1-score': 0.7238935575008234, 'support': 8337.0}",0.724205
2,0.539,0.562402,"{'precision': 0.7642919964819701, 'recall': 0.6485074626865671, 'f1-score': 0.7016552280985063, 'support': 4020.0}","{'precision': 0.7131546894031668, 'recall': 0.8137595552466991, 'f1-score': 0.7601428107757222, 'support': 4317.0}",0.734077,"{'precision': 0.7387233429425685, 'recall': 0.7311335089666331, 'f1-score': 0.7308990194371143, 'support': 8337.0}","{'precision': 0.7378124769114779, 'recall': 0.7340770061173084, 'f1-score': 0.7319408097726746, 'support': 8337.0}",0.730899
3,0.514,0.600969,"{'precision': 0.7872619829284307, 'recall': 0.5965174129353233, 'f1-score': 0.6787432776677045, 'support': 4020.0}","{'precision': 0.6934416934416935, 'recall': 0.8498957609451008, 'f1-score': 0.7637385512073274, 'support': 4317.0}",0.72772,"{'precision': 0.7403518381850621, 'recall': 0.723206586940212, 'f1-score': 0.7212409144375159, 'support': 8337.0}","{'precision': 0.738680695929001, 'recall': 0.727719803286554, 'f1-score': 0.7227548640741519, 'support': 8337.0}",0.721241
4,0.4887,0.64086,"{'precision': 0.6888504753673294, 'recall': 0.7930348258706468, 'f1-score': 0.7372802960222017, 'support': 4020.0}","{'precision': 0.7756807764896199, 'recall': 0.6664350243224462, 'f1-score': 0.7169200099676054, 'support': 4317.0}",0.72748,"{'precision': 0.7322656259284746, 'recall': 0.7297349250965465, 'f1-score': 0.7271001529949035, 'support': 8337.0}","{'precision': 0.7338122613748774, 'recall': 0.7274799088401104, 'f1-score': 0.7267374922681304, 'support': 8337.0}",0.7271


TrainOutput(global_step=16676, training_loss=0.5353185077684621, metrics={'train_runtime': 2296.5062, 'train_samples_per_second': 145.199, 'train_steps_per_second': 18.154, 'total_flos': 1.0078943670885072e+16, 'train_loss': 0.5353185077684621, 'epoch': 4.0})

In [8]:
trainer.evaluate()

{'eval_loss': 0.5624021887779236,
 'eval_0': {'precision': 0.7642919964819701,
  'recall': 0.6485074626865671,
  'f1-score': 0.7016552280985063,
  'support': 4020.0},
 'eval_1': {'precision': 0.7131546894031668,
  'recall': 0.8137595552466991,
  'f1-score': 0.7601428107757222,
  'support': 4317.0},
 'eval_accuracy': 0.7340770061173084,
 'eval_macro avg': {'precision': 0.7387233429425685,
  'recall': 0.7311335089666331,
  'f1-score': 0.7308990194371143,
  'support': 8337.0},
 'eval_weighted avg': {'precision': 0.7378124769114779,
  'recall': 0.7340770061173084,
  'f1-score': 0.7319408097726746,
  'support': 8337.0},
 'eval_macro-f1': 0.7308990194371143,
 'eval_runtime': 20.7636,
 'eval_samples_per_second': 401.52,
 'eval_steps_per_second': 50.232,
 'epoch': 4.0}