In [1]:
import json
import pandas as pd
import os
from datetime import datetime
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from train_seq import tokenize_sequence_classification
import numpy as np
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Gold training sentences, stored as one JSON document.
train_data_path = '/home/lgiordano/LUCA/checkthat_GITHUB/data/formatted/train_sentences.json'
with open(train_data_path, mode='r', encoding='utf8') as train_file:
    data = json.loads(train_file.read())

# `data` keeps the raw records (the balancing cell reads the languages from it);
# `df` is the working DataFrame built from those same records.
df = pd.DataFrame(data)

In [3]:
# Language code -> JSON file with the augmentation samples for that language.
# The loading loop below reads `text_en`, `text_<code>` and `labels` from each
# record's `data` dict. Judging by the file names these are NLLB-200 translations
# of the English gold sentences -- TODO confirm.
data_path_dict = {
'sl': '/home/lgiordano/LUCA/checkthat_GITHUB/data/train_sent_mt/sl/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-slv_Latn_tok_regex_en-sl/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-slv_Latn_tok_regex_en-sl_mdeberta-v3-base_mdeberta_xlwa_en-sl_ME3_2024-05-04-12-12-14_ls.json',
'ru': '/home/lgiordano/LUCA/checkthat_GITHUB/data/train_sent_mt/ru/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-rus_Cyrl_tok_regex_en-ru/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-rus_Cyrl_tok_regex_en-ru_mdeberta-v3-base_mdeberta_xlwa_en-ru_ME3_2024-05-04-12-09-20_ls.json',
'pt': '/home/lgiordano/LUCA/checkthat_GITHUB/data/train_sent_mt/pt/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-por_Latn_tok_regex_en-pt/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-por_Latn_tok_regex_en-pt_mdeberta-v3-base_mdeberta_xlwa_en-pt_ME3_2024-05-04-12-07-45_ls.json',
'it': '/home/lgiordano/LUCA/checkthat_GITHUB/data/train_sent_mt/it/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-ita_Latn_tok_regex_en-it/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-ita_Latn_tok_regex_en-it_mdeberta-v3-base_mdeberta_xlwa_en-it_ME3_2024-05-04-12-05-00_ls.json',
'es': '/home/lgiordano/LUCA/checkthat_GITHUB/data/train_sent_mt/es/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-spa_Latn_tok_regex_en-es/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-spa_Latn_tok_regex_en-es_mdeberta-v3-base_mdeberta_xlwa_en-es_ME3_2024-05-04-12-01-43_ls.json',
'bg': '/home/lgiordano/LUCA/checkthat_GITHUB/data/train_sent_mt/bg/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-bul_Cyrl_tok_regex_en-bg/train_gold_sentences_translated_nllb-200-3.3B_eng_Latn-bul_Cyrl_tok_regex_en-bg_mdeberta-v3-base_mdeberta_xlwa_en-bg_ME3_2024-05-04-11-58-52_ls.json',
}

# Read every augmentation file and bring its records to the same `data` layout
# the rest of the notebook uses ('text', 'lang', 'label').
dataset_aug = []
for lang_code, aug_path in data_path_dict.items():
    with open(aug_path, 'r', encoding='utf8') as aug_file:
        translated_samples = json.load(aug_file)

    for record in translated_samples:
        fields = record['data']
        fields.pop('text_en')                              # drop the English source text
        fields['text'] = fields.pop(f'text_{lang_code}')   # keep only the target-language text
        fields['lang'] = lang_code
        fields['label'] = fields.pop('labels')             # rename 'labels' -> 'label'

    dataset_aug.extend(translated_samples)

df_aug = pd.DataFrame(dataset_aug)

In [4]:
# Combine the gold sentences with the augmentation samples.
# The gold frame is rebuilt from `data` instead of reusing `df`, so re-running this
# cell does not append `df_aug` a second time (the previous `df = concat([df, ...])`
# grew `df` on every execution). ignore_index gives the combined frame unique row labels.
df = pd.concat([pd.DataFrame(data), df_aug], ignore_index=True)

In [5]:
### This code balances positive and negative samples for each language by down-sampling the larger group ###
# Timestamp of this run; compute_metrics() uses it to name the results directory.
date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

# Seed for the per-language down-sampling so the balanced set is identical on every run.
balance_seed = 42

# Languages present in the gold data plus the augmentation languages.
# Sorted: iterating a set of strings has no stable order across kernel sessions, which
# would change the concatenation order and therefore the seeded shuffle/split below.
langs = sorted({sample['data']['lang'] for sample in data} | set(data_path_dict), key=str)

sampled_dfs = []
for lang in langs:
    df_lang = df[df['data'].apply(lambda x: x['lang'] == lang)]
    df_pos_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 1)]
    df_neg_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 0)]

    # Down-sample whichever class is larger (previously only the negatives were
    # reduced, leaving languages with more positives than negatives unbalanced).
    # NOTE: a language that has no samples of one class contributes no rows.
    n_per_class = min(len(df_pos_lang), len(df_neg_lang))
    if len(df_neg_lang) > n_per_class:
        df_neg_lang = df_neg_lang.sample(n_per_class, random_state=balance_seed)
    if len(df_pos_lang) > n_per_class:
        df_pos_lang = df_pos_lang.sample(n_per_class, random_state=balance_seed)

    df_lang_sampled = pd.concat([df_pos_lang, df_neg_lang])
    sampled_dfs.append(df_lang_sampled)

# Merge all languages, then shuffle the rows reproducibly.
df_sampled = pd.concat(sampled_dfs, ignore_index=True)
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

dataset = Dataset.from_pandas(df_sampled)

In [6]:
### This code splits the balanced dataset in train/test splits and tokenizes both with dynamic padding

#model_name = 'bert-base-multilingual-cased'
#model_name = 'xlm-roberta-base'
model_name = 'microsoft/mdeberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

split_ratio = 0.2   # fraction of the balanced data held out for evaluation
split_seed = 42
# Number of examples handed to the tokenization function per map() call.
# NOTE: this value is not passed to TrainingArguments, so it does not set the
# training batch size.
batch_size = 16

datadict = dataset.train_test_split(test_size=split_ratio, seed=split_seed)
datadict = datadict.map(
    lambda x: tokenize_sequence_classification(x, tokenizer),
    batched=True,
    batch_size=batch_size,
)

# Columns exposed to the model as torch tensors. 'token_type_ids' is only included
# when the tokenizer actually produced it (e.g. not for xlm-roberta), so switching
# `model_name` above no longer requires editing this list by hand.
columns = ['input_ids', 'attention_mask', 'labels']
if 'token_type_ids' in datadict['train'].column_names:
    columns.insert(1, 'token_type_ids')
datadict.set_format('torch', columns=columns)

train_data = datadict['train'] # no aug: 33,345 samples, with aug: 90,720
val_data = datadict['test'] # no aug: 8,337 samples, with aug: 22,681

# Dynamic padding: each batch is padded to the length of its longest sequence.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

Map:   0%|          | 0/90720 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 90720/90720 [00:35<00:00, 2561.46 examples/s]
Map: 100%|██████████| 22681/22681 [00:08<00:00, 2541.28 examples/s]


In [7]:
# Pretrained encoder with a 2-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# NOTE(review): in the run recorded in this notebook the model predicts a single
# class at every epoch (macro-F1 ~0.33) with this configuration; the previously
# used learning rate is kept in the trailing comment below.
training_args = TrainingArguments(
    # Checkpoint location
    output_dir='/home/lgiordano/LUCA/checkthat_GITHUB/models/M1/RUN_OTTOBRE/aug/aug, lr 5e-5',
    # Optimisation
    optim='adamw_torch',
    learning_rate=5e-5, #2e-5,
    num_train_epochs=10,
    # Log, evaluate and checkpoint once per epoch
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
    save_only_model=True,
    save_total_limit=1000,
    # Model selection: restore the checkpoint with the best validation macro-F1
    metric_for_best_model='eval_macro-f1',
    load_best_model_at_end=True,
)

# Stop training once the selection metric has not improved for 2 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

def compute_metrics(eval_preds):
    """Compute the classification report for one evaluation pass and save it to disk.

    Args:
        eval_preds: pair ``(logits, labels)`` as passed by the Trainer; the predicted
            class is the argmax of ``logits`` over the last axis.

    Returns:
        The ``classification_report`` dict (per-class, accuracy, macro/weighted
        averages) with an extra top-level ``'macro-f1'`` entry, which is the value
        ``metric_for_best_model`` refers to (as ``eval_macro-f1``).

    Side effects:
        Writes the same dict to ``<output_dir>/<date_time>_aug/results.json``. The
        file is opened in 'w' mode, so every evaluation overwrites the previous one.
    """
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)

    # zero_division=0 yields the same 0.0 scores as the default for a class that is
    # never predicted, but without raising an UndefinedMetricWarning per evaluation.
    results = classification_report(labels, preds, output_dict=True, zero_division=0)
    results['macro-f1'] = results['macro avg']['f1-score']

    # Save next to the checkpoints: take the directory from the training arguments
    # instead of repeating the path literal, so the two cannot drift apart.
    models_dir = training_args.output_dir
    model_save_name = f'{date_time}_aug'
    model_save_dir = os.path.join(models_dir, model_save_name)
    os.makedirs(model_save_dir, exist_ok=True)
    with open(os.path.join(model_save_dir, 'results.json'), 'w', encoding='utf8') as f:
        json.dump(results, f, ensure_ascii = False)

    return results

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [8]:
# Wire model, data, collator, metrics and early stopping into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [9]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output

Epoch,Training Loss,Validation Loss,0,1,Accuracy,Macro avg,Weighted avg,Macro-f1
1,0.6258,0.695137,"{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11260.0}","{'precision': 0.5035492262245933, 'recall': 1.0, 'f1-score': 0.6698140871503138, 'support': 11421.0}",0.503549,"{'precision': 0.25177461311229665, 'recall': 0.5, 'f1-score': 0.3349070435751569, 'support': 22681.0}","{'precision': 0.25356182323138665, 'recall': 0.5035492262245933, 'f1-score': 0.3372843652988728, 'support': 22681.0}",0.334907
2,0.667,0.95634,"{'precision': 0.49645077377540675, 'recall': 1.0, 'f1-score': 0.6635043163136031, 'support': 11260.0}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11421.0}",0.496451,"{'precision': 0.24822538688770338, 'recall': 0.5, 'f1-score': 0.33175215815680154, 'support': 22681.0}","{'precision': 0.2464633707822001, 'recall': 0.49645077377540675, 'f1-score': 0.32939723123721043, 'support': 22681.0}",0.331752
3,0.6527,0.977164,"{'precision': 0.49645077377540675, 'recall': 1.0, 'f1-score': 0.6635043163136031, 'support': 11260.0}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 11421.0}",0.496451,"{'precision': 0.24822538688770338, 'recall': 0.5, 'f1-score': 0.33175215815680154, 'support': 22681.0}","{'precision': 0.2464633707822001, 'recall': 0.49645077377540675, 'f1-score': 0.32939723123721043, 'support': 22681.0}",0.331752


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=34020, training_loss=0.6485080869249706, metrics={'train_runtime': 3874.4686, 'train_samples_per_second': 234.148, 'train_steps_per_second': 29.269, 'total_flos': 2.253763384809504e+16, 'train_loss': 0.6485080869249706, 'epoch': 3.0})

In [10]:
# Evaluate on the validation split. With load_best_model_at_end=True this scores the
# restored best checkpoint (the eval_loss below matches epoch 1 of the training log).
eval_metrics = trainer.evaluate()
eval_metrics

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.6951372027397156,
 'eval_0': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 11260.0},
 'eval_1': {'precision': 0.5035492262245933,
  'recall': 1.0,
  'f1-score': 0.6698140871503138,
  'support': 11421.0},
 'eval_accuracy': 0.5035492262245933,
 'eval_macro avg': {'precision': 0.25177461311229665,
  'recall': 0.5,
  'f1-score': 0.3349070435751569,
  'support': 22681.0},
 'eval_weighted avg': {'precision': 0.25356182323138665,
  'recall': 0.5035492262245933,
  'f1-score': 0.3372843652988728,
  'support': 22681.0},
 'eval_macro-f1': 0.3349070435751569,
 'eval_runtime': 38.4416,
 'eval_samples_per_second': 590.012,
 'eval_steps_per_second': 73.774,
 'epoch': 3.0}