# Author: Seunghee Kim
- Created on: 2024-12-02

In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
import torch

In [2]:
class CFG:
    """Central place for every experiment setting used by this notebook."""

    # --- optimisation hyper-parameters ---
    SEED = 1
    EPOCH = 3
    LR = 5e-5
    WEIGHT_DECAY = 1e-2
    BATCH_SIZE = 16

    # --- checkpointing / evaluation schedule ---
    SAVE_STRATEGY = "epoch"
    EVALUATION_STRATEGY = "epoch"
    METRIC = "f1"  # metric name used to pick the best checkpoint
    EXP_NUM = 1    # experiment index, embedded in the output paths below

    # --- pretrained checkpoint to fine-tune ---
    model = "microsoft/deberta-v3-xsmall"

    # --- input CSV locations ---
    train_df_path = "./df_final_train.csv"
    valid_df_path = "./df_final_valid.csv"
    test_df_path = "./df_final_test.csv"

    # --- output locations (suffixed with the experiment index) ---
    model_output_path = "./results_{}".format(EXP_NUM)
    test_inference_path = "test_predictions_{}".format(EXP_NUM)


In [None]:
# Read the three CSV splits (train / validation / test) configured in CFG,
# in that order.
train_df, valid_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (CFG.train_df_path, CFG.valid_df_path, CFG.test_df_path)
)

# Debug toggle: uncomment to run the whole pipeline on the first 5 rows only.
# train_df = train_df.head()
# valid_df = valid_df.head()
# test_df = test_df.head()




def preprocess_data(df):
    """Convert a DataFrame into a `Dataset` holding only 'text' and 'label'.

    Args:
        df: DataFrame that contains at least the 'text' and 'label' columns.

    Returns:
        A `datasets.Dataset` built from those two columns.
    """
    text_and_label = df[['text', 'label']]
    return Dataset.from_pandas(text_and_label)

# Wrap each DataFrame split in a Dataset (same train / valid / test order).
train_dataset, valid_dataset, test_dataset = (
    preprocess_data(split_df) for split_df in (train_df, valid_df, test_df)
)

# Load the tokenizer that matches the checkpoint selected in CFG.
model_name = CFG.model
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Every sequence is truncated and padded to exactly 512 tokens.

    Args:
        examples: Mapping with a 'text' entry (as passed by `Dataset.map`).

    Returns:
        The tokenizer output for the given texts.
    """
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(examples['text'], **encoding_options)

# Tokenize every split in batches.
train_dataset = train_dataset.map(tokenize_function, batched=True)
valid_dataset = valid_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Rename the target column to "labels" on all three splits.
# The test split is included so that it is handled exactly like train/valid:
# the test evaluation further down reads `test_predictions.label_ids`, which
# is only populated when the labels are actually part of the batches.
train_dataset = train_dataset.rename_column("label", "labels")
valid_dataset = valid_dataset.rename_column("label", "labels")
test_dataset = test_dataset.rename_column("label", "labels")

# `set_format(columns=...)` restricts which columns are returned, so "labels"
# must be listed for every split whose ground truth is needed downstream.
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
valid_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Binary classifier initialised from the pretrained checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a prediction object.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the arg-max over the last axis is taken as the
            predicted class).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The fourth returned element (support) is not reported.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='binary')
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }

# All values come from CFG; the keywords are grouped by purpose.
training_args = TrainingArguments(
    # checkpoint location and retention
    output_dir=CFG.model_output_path,
    save_strategy=CFG.SAVE_STRATEGY,
    save_total_limit=1,
    # evaluation schedule and best-checkpoint selection
    evaluation_strategy=CFG.EVALUATION_STRATEGY,
    load_best_model_at_end=True,
    metric_for_best_model=CFG.METRIC,
    greater_is_better=True,
    # optimisation
    num_train_epochs=CFG.EPOCH,
    learning_rate=CFG.LR,
    weight_decay=CFG.WEIGHT_DECAY,
    per_device_train_batch_size=CFG.BATCH_SIZE,
    per_device_eval_batch_size=CFG.BATCH_SIZE,
    # reproducibility
    seed=CFG.SEED,
)

# Trainer is fitted on the train split and evaluated on the validation split
# with `compute_metrics`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning.
trainer.train()

# Evaluate on the validation split and report the metrics.
eval_results = trainer.evaluate()
print("Validation Results:\n", eval_results)

# Run inference on the test split.
test_predictions = trainer.predict(test_dataset)

# Predicted class = arg-max over the last axis of the raw predictions.
test_preds = torch.tensor(test_predictions.predictions).argmax(dim=-1).numpy()
# Labels as returned by `predict` for the test split.
test_labels = test_predictions.label_ids


def compute_metrics_for_test(labels, preds):
    """Compute accuracy, precision, recall and F1 from label/prediction arrays.

    Args:
        labels: True class labels.
        preds: Predicted class labels.

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    # The fourth returned element (support) is not reported.
    prf_scores = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": prf_scores[0],
        "recall": prf_scores[1],
        "f1": prf_scores[2],
    }

# Score the test predictions against the test labels and report them.
test_metrics = compute_metrics_for_test(labels=test_labels, preds=test_preds)
print("Test Inference Results:", test_metrics)

# Attach the predicted class to the test DataFrame and write it out as CSV
# (without the index column) to the path configured in CFG.
test_df['predicted_label'] = test_preds
test_df.to_csv(CFG.test_inference_path, index=False)



Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-xsmall and are newly initialized: ['classifier.weight', 'pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mgyg9325[0m ([33munig[0m). Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/1689 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.0867, 'learning_rate': 3.5198342214328006e-05, 'epoch': 0.89}


  0%|          | 0/57 [00:00<?, ?it/s]

{'eval_loss': 0.008164696395397186, 'eval_accuracy': 0.9988888888888889, 'eval_precision': 1.0, 'eval_recall': 0.9977777777777778, 'eval_f1': 0.9988876529477196, 'eval_runtime': 11.6954, 'eval_samples_per_second': 76.953, 'eval_steps_per_second': 4.874, 'epoch': 1.0}
{'loss': 0.0097, 'learning_rate': 2.039668442865601e-05, 'epoch': 1.78}
