In [1]:
import json
import pandas as pd
import os
from datetime import datetime
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from train_seq import tokenize_sequence_classification
import numpy as np
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_data_path = '/home/lgiordano/LUCA/checkthat_GITHUB/data/formatted/train_sentences.json'
with open(train_data_path, 'r', encoding='utf8') as f:
    data = json.load(f)
df = pd.DataFrame(data)

In [3]:
### This code balances positive and negative samples for each language by down-sampling the larger group ###
date_time = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

langs = set(sample['data']['lang'] for sample in data)

sampled_dfs = []
for lang in langs:
    df_lang = df[df['data'].apply(lambda x: x['lang'] == lang)]
    df_pos_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 1)]
    df_neg_lang = df_lang[df_lang['data'].apply(lambda x: x['label'] == 0)]
    if len(df_neg_lang) > len(df_pos_lang):
        df_neg_lang = df_neg_lang.sample(len(df_pos_lang))
    df_lang_sampled = pd.concat([df_pos_lang, df_neg_lang])
    sampled_dfs.append(df_lang_sampled)
df_sampled = pd.concat(sampled_dfs, ignore_index=True)
df_sampled = df_sampled.sample(frac=1, random_state=42).reset_index(drop=True)

dataset = Dataset.from_pandas(df_sampled)

In [4]:
### This code splits the balanced dataset in train/test splits and tokenizes both with dynamic padding

#model_name = 'bert-base-multilingual-cased'
#model_name = 'xlm-roberta-base'
model_name = 'microsoft/mdeberta-v3-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)

split_ratio = 0.2
split_seed = 42
batch_size = 16

datadict = dataset.train_test_split(split_ratio, seed=split_seed)
datadict = datadict.map(lambda x: tokenize_sequence_classification(x, tokenizer),
                            batch_size=batch_size,
                            batched=True
                            )

columns = [
            'input_ids',
            'token_type_ids', #non per xlm-roberta
            'attention_mask',
            'labels'
            ]
datadict.set_format('torch', columns = columns)

train_data = datadict['train']
val_data = datadict['test']

collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, padding='longest')

Map:   0%|          | 0/33345 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 33345/33345 [00:10<00:00, 3044.53 examples/s]
Map: 100%|██████████| 8337/8337 [00:02<00:00, 3046.51 examples/s]


In [5]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

training_args = TrainingArguments(output_dir=f'/home/lgiordano/LUCA/checkthat_GITHUB/models/M1/RUN_OTTOBRE/no aug, lr 2e-5/{date_time}',
                                  save_total_limit=1000,
                                  save_strategy='epoch',
                                  load_best_model_at_end=True,
                                  save_only_model=True,
                                  metric_for_best_model='eval_macro-f1',
                                  logging_strategy='epoch',
                                  evaluation_strategy='epoch',
                                  learning_rate=2e-5,
                                  optim='adamw_torch',
                                  num_train_epochs=10)

early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    preds = np.argmax(logits, axis=-1)
    results = classification_report(labels, preds, output_dict=True)
    results['macro-f1'] = results['macro avg']['f1-score']

    models_dir = '/home/lgiordano/LUCA/checkthat_GITHUB/models/M1/RUN_OTTOBRE/no aug, lr 2e-5'
    #model_name_simple = model_name.split('/')[-1]
    model_save_name = f'{date_time}'
    model_save_dir = os.path.join(models_dir, model_save_name)
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)
    with open(os.path.join(model_save_dir, 'results.json'), 'w', encoding='utf8') as f:
        json.dump(results, f, ensure_ascii = False)

    return results

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/mdeberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=collate_fn,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping]
)

In [7]:
trainer.train()

Epoch,Training Loss,Validation Loss,0,1,Accuracy,Macro avg,Weighted avg,Macro-f1
1,0.5325,0.493489,"{'precision': 0.8237003551824346, 'recall': 0.6398294456985202, 'f1-score': 0.720214568040655, 'support': 3987.0}","{'precision': 0.7259541984732825, 'recall': 0.8744827586206897, 'f1-score': 0.7933263816475495, 'support': 4350.0}",0.762265,"{'precision': 0.7748272768278586, 'recall': 0.7571561021596049, 'f1-score': 0.7567704748441022, 'support': 8337.0}","{'precision': 0.7726993018437263, 'recall': 0.7622646035744273, 'f1-score': 0.7583621498074766, 'support': 8337.0}",0.75677
2,0.4296,0.503855,"{'precision': 0.7771084337349398, 'recall': 0.744168547780286, 'f1-score': 0.7602818705957719, 'support': 3987.0}","{'precision': 0.7742863465368445, 'recall': 0.8043678160919541, 'f1-score': 0.7890404780696809, 'support': 4350.0}",0.775579,"{'precision': 0.7756973901358921, 'recall': 0.77426818193612, 'f1-score': 0.7746611743327264, 'support': 8337.0}","{'precision': 0.7756359521094492, 'recall': 0.7755787453520451, 'f1-score': 0.7752872613252314, 'support': 8337.0}",0.774661
3,0.354,0.86375,"{'precision': 0.7129310344827586, 'recall': 0.8296965136694257, 'f1-score': 0.7668946331285499, 'support': 3987.0}","{'precision': 0.8163375710035163, 'recall': 0.6937931034482758, 'f1-score': 0.7500932024356903, 'support': 4350.0}",0.758786,"{'precision': 0.7646343027431375, 'recall': 0.7617448085588507, 'f1-score': 0.7584939177821202, 'support': 8337.0}","{'precision': 0.7668855065788718, 'recall': 0.7587861341009956, 'f1-score': 0.7581281435622864, 'support': 8337.0}",0.758494


KeyboardInterrupt: 