In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer


In [None]:
# Load the tokenizer, which splits raw text into model tokens and inserts the
# special markers (e.g. [CLS], [SEP]).
# The checkpoint is the one this notebook fine-tunes further down; loading a
# different model's tokenizer here would only be thrown away again before
# `preprocess` is first called, and would mismatch the model if cells were run
# out of order.
model_dir = 'microsoft/deberta-v3-large'
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Lookup tables for the multiple-choice labels:
# 'A'-'E' map to 0-4, and 0-4 map back to 'A'-'E'.
options = 'ABCDE'
indices = list(range(len(options)))
option_to_index = {option: index for index, option in enumerate(options)}
index_to_option = dict(enumerate(options))

def preprocess(example):
    """Tokenize one multiple-choice record as five (prompt, option) pairs.

    Args:
        example: mapping that provides the keys 'prompt', 'A', 'B', 'C', 'D',
            'E' and 'answer'; 'answer' must be one of the option letters.

    Returns:
        The output of the module-level ``tokenizer`` for the five pairs, with
        an extra 'label' entry holding the index (0-4) of the correct option.

    Truncation is enabled so over-long pairs are cut by the tokenizer; no
    padding is applied here, so the five encodings may differ in length.
    """
    # The same prompt is paired with every candidate answer.
    repeated_prompt = [example['prompt']] * 5
    # Candidate answers in option order: A, B, C, D, E.
    candidate_answers = [example[letter] for letter in options]

    encoded = tokenizer(repeated_prompt, candidate_answers, truncation=True)
    # Convert the answer letter (e.g. 'A') into its class index (e.g. 0).
    encoded['label'] = option_to_index[example['answer']]
    return encoded

In [None]:
# Read the competition training set and the additional STEM question set.
stem_1k_df = pd.read_csv("kaggle-llm-science-exam/stem_1k_v1.csv")
train_df = pd.read_csv('kaggle-llm-science-exam/train.csv')

# Stack both sources into one frame with a fresh 0..n-1 index, and renumber
# the `id` column to match. Bracket assignment is used so the column is
# written (or created) even if `id` is not already present; attribute
# assignment would silently set a plain Python attribute in that case.
new_train_df = pd.concat([train_df, stem_1k_df], ignore_index=True)
new_train_df['id'] = list(range(len(new_train_df)))

# Hold out 10% of the rows for evaluation and train on the remaining 90%
# (per the original note: 1000 + 200 rows -> 1080 training rows).
# The fixed random_state keeps the split reproducible.
eval_sampled_df = new_train_df.sample(frac=0.1, random_state=42)
train_sampled_df = new_train_df.drop(eval_sampled_df.index)

# After sampling/dropping, the pandas index is no longer contiguous;
# preserve_index=False keeps it from becoming an extra dataset column.
train_ds = Dataset.from_pandas(train_sampled_df, preserve_index=False)
eval_ds = Dataset.from_pandas(eval_sampled_df, preserve_index=False)

# Encode the data. `preprocess` reads the module-level `tokenizer`, so it is
# (re)bound here to the checkpoint that will be fine-tuned.
model_dir = "microsoft/deberta-v3-large"
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# Raw text columns are dropped once they have been tokenized.
raw_columns = ['prompt', 'A', 'B', 'C', 'D', 'E', 'answer']
tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=raw_columns)
tokenized_eval_ds = eval_ds.map(preprocess, batched=False, remove_columns=raw_columns)

In [None]:
@dataclass
class DataCollatorForMultipleChoice:
    """Pad a list of multiple-choice features into one batch of tensors.

    Each feature is expected to hold, per tokenizer key (e.g. 'input_ids'), a
    list with one entry per answer choice, plus a 'label' (or 'labels') entry.

    Attributes:
        tokenizer: tokenizer whose ``pad`` method is used for padding.
        padding: padding strategy forwarded to ``tokenizer.pad``.
        max_length: optional maximum length forwarded to ``tokenizer.pad``.
        pad_to_multiple_of: optional multiple forwarded to ``tokenizer.pad``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Return a dict of (batch, num_choices, seq_len) tensors plus 'labels'."""
        label_key = 'label' if 'label' in features[0] else 'labels'
        # Read the labels without mutating the caller's feature dicts.
        labels = [feature[label_key] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])

        # Flatten to one record per (example, choice) so that all sequences
        # can be padded to a common length in a single call.
        flattened = [
            {key: values[choice] for key, values in feature.items() if key != label_key}
            for feature in features
            for choice in range(num_choices)
        ]
        batch = self.tokenizer.pad(
            flattened,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        # Restore the (batch, choice, sequence) layout.
        batch = {key: tensor.view(batch_size, num_choices, -1) for key, tensor in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch


model = AutoModelForMultipleChoice.from_pretrained(model_dir)

output_dir = 'finetuned_bert'
# Evaluation and checkpointing share one cadence so that
# load_best_model_at_end always has a checkpoint for each evaluated step.
eval_and_save_steps = 50
training_args = TrainingArguments(
    output_dir=output_dir,
    # NOTE(review): newer transformers releases may expect `eval_strategy`
    # instead of `evaluation_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=eval_and_save_steps,  # evaluate every 50 steps
    save_steps=eval_and_save_steps,  # save a checkpoint every 50 steps
    save_total_limit=3,  # keep only the 3 most recent checkpoints
    logging_steps=1,
    load_best_model_at_end=True,
    learning_rate=3e-6,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=4,
    warmup_steps=50,
    report_to='wandb',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_eval_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

trainer.train()