In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer


In [2]:
# train dataset
# 一个问题，五个可能的答案(A-E),最终一个正确答案
train_df = pd.read_csv('kaggle-llm-science-exam/train.csv')
train_ds = Dataset.from_pandas(train_df)

In [3]:
# 加载分词器
# 连续文本字符串分解成token的过程，eg. [CLS], [SEP]
model_dir = 'bert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [4]:
# 为一个使用BERT模型进行多项选择任务的数据集进行预处理
# 这里A-E映射为0-4, 0-4映射为A-E
options = 'ABCDE'
indices = list(range(5))
option_to_index = {option: index for option, index in zip(options, indices)}
index_to_option = {index: option for option, index in zip(options, indices)}

def preprocess(example):
    first_sentence = [example['prompt']] * 5
    second_sentence = []
    # A B C D E, 每一个遍历一个 加入second_sentence这个list
    for option in options:
        second_sentence.append(example[option])
    
    # 使用tokenizer对每对first_sentence和second_sentence进行分词，并设置截断（truncation=True）以确保长度一致。
    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)
    # 将正确答案的字符（如'A'）转换为其对应的索引（如0）
    tokenized_example['label'] = option_to_index[example['answer']]
    return tokenized_example

# batched=False表示每次处理一个样本。remove_columns参数用于删除原数据集中不再需要的列。
tokenized_train_ds = train_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [5]:
# 检查传入BERT的数据结构
print(tokenized_train_ds[0])
# input_ids: 五个列表每个列表代表一种选项（A到E）与问题的组合的分词后的数字ID。
# 每个ID代表一个特定的单词或子词单元。例如，101代表特殊的开始标记[CLS]，102代表特殊的结束标记[SEP]。

# token_type_ids：这个列表也包含5个列表，每个列表对应于input_ids中的一个列表。
# 这些ID标识输入中的哪些部分是问题，哪些部分是选项。通常0代表第一部分（问题），1代表第二部分（选项）

# attention_mask：包含5个列表，每个列表对应于input_ids中的一个列表。它们都是由1组成的，表示在处理时应该“注意”所有的token。

# label: 这是一个整数，表示正确答案选项的索引。在这个示例中，label为3，这意味着正确答案是选项D

{'id': 0, 'input_ids': [[101, 5979, 1104, 1103, 1378, 8477, 14702, 4856, 1103, 3772, 1104, 12556, 22293, 8102, 1811, 25082, 113, 150, 11414, 2137, 114, 1113, 1103, 4379, 107, 3764, 2927, 15136, 1596, 3367, 107, 6187, 1874, 10224, 3457, 1107, 15593, 13687, 136, 102, 150, 11414, 2137, 1110, 170, 2749, 1115, 13822, 1103, 4379, 3764, 2927, 15136, 1596, 3367, 1107, 15593, 13687, 1118, 2112, 10164, 1103, 3796, 1104, 170, 1207, 1532, 1104, 2187, 1270, 107, 22520, 1843, 2187, 119, 107, 102], [101, 5979, 1104, 1103, 1378, 8477, 14702, 4856, 1103, 3772, 1104, 12556, 22293, 8102, 1811, 25082, 113, 150, 11414, 2137, 114, 1113, 1103, 4379, 107, 3764, 2927, 15136, 1596, 3367, 107, 6187, 1874, 10224, 3457, 1107, 15593, 13687, 136, 102, 150, 11414, 2137, 1110, 170, 2749, 1115, 6986, 1103, 6187, 1874, 10224, 3457, 1206, 1103, 4379, 3764, 2927, 15136, 1596, 3367, 1107, 15593, 13687, 1105, 1103, 7140, 10537, 4267, 20623, 14971, 1121, 170, 5318, 1104, 1213, 1275, 1106, 170, 5318, 1104, 1164, 1406, 119, 10

In [6]:
# DataCollator
# DataCollator相当于Transform，处理数据，为了让数据更好的传入模型
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    
    def __call__(self, features):
        label_name = "label" if 'label' in features[0].keys() else 'labels'
        # 将标签和输入数据分开处理
        labels = [feature.pop(label_name) for feature in features] 
        batch_size = len(features)
        num_choices = len(features[0]['input_ids'])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])
        
        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch['labels'] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [7]:
# 模型训练
model = AutoModelForMultipleChoice.from_pretrained(model_dir)
output_dir = 'finetuned_bert'
training_args = TrainingArguments(
    output_dir=output_dir,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none')

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_train_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer))

trainer.train()

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,1.554691
2,No log,1.305511
3,No log,1.026481


TrainOutput(global_step=150, training_loss=1.4502632649739584, metrics={'train_runtime': 187.7864, 'train_samples_per_second': 3.195, 'train_steps_per_second': 0.799, 'total_flos': 134627373808680.0, 'train_loss': 1.4502632649739584, 'epoch': 3.0})

In [8]:
# 保证test与train的数据结构一样
test_df = pd.read_csv('kaggle-llm-science-exam/test.csv')
test_df['answer'] = 'A'  
test_ds = Dataset.from_pandas(test_df)
tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])
test_df.head()

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Unnamed: 0,id,prompt,A,B,C,D,E,answer
0,0,Which of the following statements accurately d...,MOND is a theory that reduces the observed mis...,MOND is a theory that increases the discrepanc...,MOND is a theory that explains the missing bar...,MOND is a theory that reduces the discrepancy ...,MOND is a theory that eliminates the observed ...,A
1,1,Which of the following is an accurate definiti...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,Dynamic scaling refers to the non-evolution of...,Dynamic scaling refers to the evolution of sel...,A
2,2,Which of the following statements accurately d...,The triskeles symbol was reconstructed as a fe...,The triskeles symbol is a representation of th...,The triskeles symbol is a representation of a ...,The triskeles symbol represents three interloc...,The triskeles symbol is a representation of th...,A
3,3,What is the significance of regularization in ...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,Regularizing the mass-energy of an electron wi...,A
4,4,Which of the following statements accurately d...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,The angular spacing of features in the diffrac...,A


In [9]:
# 进行预测
test_predictions = trainer.predict(tokenized_test_ds)

In [10]:
# 获取可能性最大的前三个output

def predictions_to_map_output(predictions):
    top_answer_indices = np.argsort(-predictions)[:,:3]
    top_answers = [' '.join([index_to_option[idx] for idx in row]) for row in top_answer_indices]
    return top_answers

submission_df = test_df[['id']] 
submission_df.loc[:, 'prediction'] = predictions_to_map_output(test_predictions.predictions)
submission_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df.loc[:, 'prediction'] = predictions_to_map_output(test_predictions.predictions)


Unnamed: 0,id,prediction
0,0,D B A
1,1,C D A
2,2,A C D
3,3,A C D
4,4,D E A
