In [1]:
import json
import pandas as pd
import os
from datetime import datetime
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from train_seq import tokenize_sequence_classification
import numpy as np
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_path = '/home/lgiordano/LUCA/checkthat_GITHUB/data/formatted/train_sentences.json'
with open(train_data_path, 'r', encoding='utf8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

In [3]:
### This code balances positive and negative samples for each language by down-sampling the larger group ###
date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

langs = set(sample['data']['lang'] for sample in data)

sampled_dfs = []
for lang in langs:
    df_lang = df[df['data'].apply(lambda x: x['lang'] == lang)]
    df_pos_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 1)]
    df_neg_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 0)]
    if len(df_neg_lang) > len(df_pos_lang):
        df_neg_lang = df_neg_lang.sample(len(df_pos_lang))
    df_lang_sampled = pd.concat([df_pos_lang, df_neg_lang])
    sampled_dfs.append(df_lang_sampled)
df_sampled = pd.concat(sampled_dfs, ignore_index=True)
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

dataset = Dataset.from_pandas(df_sampled)

In [4]:
### This code splits the balanced dataset in train/test splits and tokenizes both with dynamic padding

#model_name = 'bert-base-multilingual-cased'
#model_name = 'xlm-roberta-base'
model_name = 'microsoft/mdeberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

split_ratio = 0.2
split_seed = 42
batch_size = 16

datadict = dataset.train_test_split(split_ratio, seed=split_seed)
datadict = datadict.map(lambda x: tokenize_sequence_classification(x, tokenizer),
                            batch_size=batch_size,
                            batched=True
                            )

columns = [
            'input_ids',
            'token_type_ids', #non per xlm-roberta
            'attention_mask',
            'labels'
            ]
datadict.set_format('torch', columns = columns)

train_data = datadict['train']
val_data = datadict['test']

collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

Map:   0%|          | 0/33345 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 33345/33345 [00:10<00:00, 3052.07 examples/s]
Map: 100%|██████████| 8337/8337 [00:02<00:00, 2969.26 examples/s]


In [5]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(output_dir='/home/lgiordano/LUCA/checkthat_GITHUB/models/M1/RUN_OTTOBRE/3rd_run',
                                  save_total_limit=2,
                                  save_strategy='epoch',
                                  load_best_model_at_end=True,
                                  save_only_model=True,
                                  metric_for_best_model='eval_macro-f1',
                                  logging_strategy='epoch',
                                  evaluation_strategy='epoch',
                                  learning_rate=2e-5, #5e-5,
                                  optim='adamw_torch',
                                  num_train_epochs=10)

early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    results = classification_report(labels, preds, output_dict=True)
    results['macro-f1'] = results['macro avg']['f1-score']

    models_dir = '/home/lgiordano/LUCA/checkthat_GITHUB/models/M1/RUN_OTTOBRE/3rd_run'
    #model_name_simple = model_name.split('/')[-1]
    model_save_name = f'{date_time}'
    model_save_dir = os.path.join(models_dir, model_save_name)
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    with open(os.path.join(model_save_dir, 'results.json'), 'w', encoding='utf8') as f:
        json.dump(results, f, ensure_ascii = False)

    return results

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,0,1,Accuracy,Macro avg,Weighted avg,Macro-f1
1,0.5265,0.492101,"{'precision': 0.8329952670723462, 'recall': 0.6089965397923875, 'f1-score': 0.7035979440319817, 'support': 4046.0}","{'precision': 0.7058932887153746, 'recall': 0.8848753204381263, 'f1-score': 0.7853154084798345, 'support': 4291.0}",0.75099,"{'precision': 0.7694442778938604, 'recall': 0.7469359301152569, 'f1-score': 0.7444566762559082, 'support': 8337.0}","{'precision': 0.7675767005460459, 'recall': 0.7509895645915797, 'f1-score': 0.7456573946671906, 'support': 8337.0}",0.744457
2,0.4147,0.545269,"{'precision': 0.7997214484679666, 'recall': 0.7095897182402373, 'f1-score': 0.751964379256155, 'support': 4046.0}","{'precision': 0.7524752475247525, 'recall': 0.8324399906781635, 'f1-score': 0.7904403629121486, 'support': 4291.0}",0.77282,"{'precision': 0.7760983479963595, 'recall': 0.7710148544592004, 'f1-score': 0.7712023710841518, 'support': 8337.0}","{'precision': 0.7754041342965223, 'recall': 0.7728199592179441, 'f1-score': 0.7717677192906841, 'support': 8337.0}",0.771202
3,0.3477,0.768344,"{'precision': 0.7851772287862513, 'recall': 0.7226890756302521, 'f1-score': 0.7526383526383525, 'support': 4046.0}","{'precision': 0.7567743334055929, 'recall': 0.813563271964577, 'f1-score': 0.7841419586702606, 'support': 4291.0}",0.769461,"{'precision': 0.7709757810959221, 'recall': 0.7681261737974145, 'f1-score': 0.7683901556543066, 'support': 8337.0}","{'precision': 0.7705584421629569, 'recall': 0.7694614369677342, 'f1-score': 0.7688530549872692, 'support': 8337.0}",0.76839
4,0.2766,0.943653,"{'precision': 0.8024193548387096, 'recall': 0.6885813148788927, 'f1-score': 0.7411545623836127, 'support': 4046.0}","{'precision': 0.7410071942446043, 'recall': 0.8401305057096248, 'f1-score': 0.7874617737003058, 'support': 4291.0}",0.766583,"{'precision': 0.771713274541657, 'recall': 0.7643559102942588, 'f1-score': 0.7643081680419592, 'support': 8337.0}","{'precision': 0.7708109128200811, 'recall': 0.7665827036104114, 'f1-score': 0.7649885846650004, 'support': 8337.0}",0.764308


TrainOutput(global_step=16676, training_loss=0.3913612868980871, metrics={'train_runtime': 3115.9491, 'train_samples_per_second': 107.014, 'train_steps_per_second': 13.38, 'total_flos': 1.0137960744541704e+16, 'train_loss': 0.3913612868980871, 'epoch': 4.0})

In [8]:
trainer.evaluate()

{'eval_loss': 0.5452693700790405,
 'eval_0': {'precision': 0.7997214484679666,
  'recall': 0.7095897182402373,
  'f1-score': 0.751964379256155,
  'support': 4046.0},
 'eval_1': {'precision': 0.7524752475247525,
  'recall': 0.8324399906781635,
  'f1-score': 0.7904403629121486,
  'support': 4291.0},
 'eval_accuracy': 0.7728199592179441,
 'eval_macro avg': {'precision': 0.7760983479963595,
  'recall': 0.7710148544592004,
  'f1-score': 0.7712023710841518,
  'support': 8337.0},
 'eval_weighted avg': {'precision': 0.7754041342965223,
  'recall': 0.7728199592179441,
  'f1-score': 0.7717677192906841,
  'support': 8337.0},
 'eval_macro-f1': 0.7712023710841518,
 'eval_runtime': 34.1259,
 'eval_samples_per_second': 244.301,
 'eval_steps_per_second': 30.563,
 'epoch': 4.0}