# 获取数据

!wget https://www.csie.ntu.edu.tw/~b10902031/gsm8k_train_self-instruct.jsonl # part of fine-tuning dataset refined by llama-3.2-1b-instruct
!wget https://www.csie.ntu.edu.tw/~b10902031/gsm8k_test_public.jsonl # gsm8k public test dataset
!wget https://www.csie.ntu.edu.tw/~b10902031/gsm8k_test_private.jsonl # gsm8k private test dataset
!wget https://www.csie.ntu.edu.tw/~b10902031/ailuminate_test.csv # ailuminate test dataset (public + private)

In [1]:
from transformers import (
    AutoModelForCausalLM, # imports the model for causal language modeling
    AutoTokenizer, # imports the tokenizer for the model
    BitsAndBytesConfig, # imports the configuration for using bitsandbytes
    pipeline # imports the pipeline for text generation
)
from peft import (
    LoraConfig, # imports the configuration for LoRA
    get_peft_model, # imports the function to get the PEFT model
    PeftModel # imports the PEFT model
)
import os
import json
import torch
os.environ["CUDA_VISIBLE_DEVICES"] = '0,1' # Sets the CUDA device to use
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = torch.device('cuda:0') # Creates a CUDA device object
from datasets import Dataset # Imports the Dataset class from the datasets library
from trl import SFTConfig, SFTTrainer # Imports the SFTConfig and SFTTrainer classes from the trl library
import random
random.seed(42) # Sets the random seed for reproducibility
from tqdm import tqdm # Imports the tqdm library for progress bars
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_jsonlines(file_name: str):
    '''以列表形式返回文件中的每一行'''
    f = open(file_name, 'r')
    return [json.loads(line) for line in f]

In [3]:
data_path = "/home/yxlin/github/LHY_ML2025/MLHW6/dataset/"
gsm8k_train = load_jsonlines(data_path + 'gsm8k_train_self-instruct.jsonl')

# 加载模型

## Hugging Fine Tuning 框架

In [4]:
# model_path = "/home/yxlin/huggingface/Qwen2.5-7B-Instruct"
# sft_model = AutoModelForCausalLM.from_pretrained( # Loads the pre-trained model
#     pretrained_model_name_or_path=model_path,
# )
# sft_tokenizer = AutoTokenizer.from_pretrained( # Loads the tokenizer for the model
#     pretrained_model_name_or_path=model_path,
# )
# peft_config = LoraConfig(
#     r=16,
#     lora_alpha=16,
#     # TODO: Adds dropout
#     lora_dropout=0.05,
#     bias='none',
#     task_type='CAUSAL_LM',
#     target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
# )
# peft_model = get_peft_model(sft_model, peft_config)

model_path = "/home/yxlin/huggingface/Qwen2.5-7B-Instruct"

# 1. 配置 device_map="auto" 让模型自动分布在两张卡上
sft_model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_path,
    device_map="auto",  # 关键修改：自动分配层到不同的 GPU
    torch_dtype=torch.bfloat16, # 显式指定精度，防止加载为 fp32 导致显存爆炸
    trust_remote_code=True
)

sft_tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_path,
    trust_remote_code=True
)

peft_config = LoraConfig(
    r=16,
    lora_alpha=16,
    lora_dropout=0.05,
    bias='none',
    task_type='CAUSAL_LM',
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj']
)

`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.06s/it]


## format dataset

In [5]:
# def nshot_chats(nshot_data: list, n: int, question: str, answer: any, mode: str) -> list: # Function to create n-shot chats
#     '''返回一个列表, 其中元素为dict, dict中是喂给模型的格式; n可以决定有多少个案例, question和answer是我们需要模型解决的问题, 若mode为train则有answer否则无'''
#     if mode not in ['train', 'test']:
#         raise AssertionError('Undefined Mode!!!')

#     chats = []
#     # TODO: Use fixed few-shot examples
#     for qna in random.sample(nshot_data, n): # Samples n examples from the n-shot data
#         chats.append(
#             {
#                 'role': 'user',
#                 'content': f'Q: {qna["question"]}' # Creates a user message with the question
#             }
#         )
#         chats.append(
#             {
#                 'role': 'assistant',
#                 'content': f'A: {qna["answer"]}' # Creates an assistant message with the answer
#             }
#         )

#     chats.append(
#         {
#             'role': 'user',
#             'content': f'Q: {question} Let\'s think step by step. At the end, you MUST write the answer as an integer after \'####\'.' # Creates a user message with the question and instructions
#         }
#     )
#     if mode == 'train':
#         chats.append(
#             {
#                 'role': 'assistant',
#                 'content': f'A: {answer}' # Creates an assistant message with the answer
#             }
#         )

#     return chats # Returns the list of chats


import random

def nshot_chats(nshot_data, n, question, answer, mode):
    if mode not in ['train', 'test']:
        raise AssertionError('Undefined Mode!!!')

    chats = []

    # Use fixed few-shot examples 核心含义是：不要在每次调用时用 random.sample 随机抽取 few-shot 样本，而是使用一组固定、可复现的 few-shot 示例.
    # 这在训练稳定性、评测一致性（尤其是 test / eval）中非常重要。
    rng = random.Random(42)
    fixed_examples = rng.sample(nshot_data, n)

    for qna in fixed_examples:
        chats.append({'role': 'user', 'content': f'Q: {qna["question"]}'})
        chats.append({'role': 'assistant', 'content': f'A: {qna["answer"]}'})

    chats.append({
        'role': 'user',
        'content': f'Q: {question} Let\'s think step by step. At the end, you MUST write the answer as an integer after \'####\'.'
    })

    if mode == 'train':
        chats.append({'role': 'assistant', 'content': f'A: {answer}'})

    return chats


In [6]:
print(sft_tokenizer.chat_template)


print("==============================")

chats = nshot_chats(nshot_data=gsm8k_train, n=1, question=gsm8k_train[0]['question'], answer=gsm8k_train[0]['answer'], mode='train')
train_sample = sft_tokenizer.apply_chat_template(chats, tokenize=False) # Applies the chat template to the chats
print(type(train_sample))

print("==============================")

print(train_sample)

{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba C

In [7]:
print(type(sft_tokenizer(train_sample)))
print(sft_tokenizer(train_sample)['input_ids'])

<class 'transformers.tokenization_utils_base.BatchEncoding'>
[151644, 8948, 198, 2610, 525, 1207, 16948, 11, 3465, 553, 54364, 14817, 13, 1446, 525, 264, 10950, 17847, 13, 151645, 198, 151644, 872, 198, 48, 25, 16046, 20118, 220, 18, 15, 8756, 504, 806, 26368, 311, 279, 36400, 594, 3081, 13, 4636, 11833, 806, 65408, 518, 279, 20336, 3081, 11, 566, 20118, 2114, 13, 2014, 633, 2114, 11, 566, 34192, 369, 220, 18, 15, 4420, 518, 220, 17, 15, 8756, 817, 6460, 13, 2585, 1657, 8756, 304, 2790, 1558, 16046, 5821, 30, 151645, 198, 151644, 77091, 198, 32, 25, 5512, 11, 1077, 594, 1477, 279, 6010, 504, 279, 26368, 311, 279, 36400, 594, 3081, 13, 220, 18, 15, 8756, 382, 5847, 11, 1077, 594, 1477, 279, 6010, 504, 279, 36400, 594, 3081, 311, 16046, 594, 2114, 13, 8704, 566, 34192, 369, 220, 18, 15, 4420, 518, 220, 17, 15, 8756, 817, 6460, 11, 582, 1184, 311, 5508, 279, 220, 18, 15, 4420, 311, 4115, 13, 220, 18, 15, 4420, 374, 220, 15, 13, 20, 4115, 13, 4695, 11, 582, 30270, 220, 15, 13, 20, 4115, 55

In [8]:
formatted_gsm8k = []
TRAIN_N_SHOT = 4 # TODO: Give model more examples

for qna in gsm8k_train: # Iterates over the GSM8K training data
    chats = nshot_chats(nshot_data=gsm8k_train, n=TRAIN_N_SHOT, question=qna['question'], answer=qna['answer'], mode='train') # Creates n-shot chats for the current example
    
    train_sample = sft_tokenizer.apply_chat_template(chats, tokenize=False) # Applies the chat template to the chats
    
    formatted_gsm8k.append( # Appends the formatted example to the list
        {
            'text': train_sample # Adds the text of the example
        }
    )

formatted_gsm8k = Dataset.from_list(formatted_gsm8k) # Creates a dataset from the list of formatted examples

# 微调

In [9]:
# # trainer
# training_arguments = SFTConfig( # Configuration for the SFT trainer
#     seed=1126,
#     data_seed=1126,
#     output_dir=f"sft",
#     per_device_train_batch_size=1,
#     gradient_accumulation_steps=4,
#     optim="paged_adamw_32bit",
#     num_train_epochs=2, # TODO: If you use fixed few-shot examples, increase epoch
#     logging_strategy="steps",
#     logging_steps=0.1,
#     save_strategy="steps",
#     save_steps=0.1,
#     lr_scheduler_type='linear',
#     learning_rate=2e-4, # TODO: Decrease learning rate
#     # TODO: Add weight decay
#     bf16=True,
#     group_by_length=True,
#     dataset_text_field='text',
#     report_to='none',
# )
# trainer = SFTTrainer( # Creates the SFT trainer
#     model=peft_model,
#     train_dataset=formatted_gsm8k,
#     peft_config=peft_config,
#     processing_class=sft_tokenizer,
#     args=training_arguments,
# )
# trainer.train() # Starts the training process


# 注意：在使用 device_map 时，TRL 的 SFTTrainer 最好直接传入 base model 和 peft_config
# 让 Trainer 内部处理 PEFT 的包装，这样兼容性更好。
# 如果你坚持手动 get_peft_model，也可以，但 Trainer 可能会报 device 相关的 warning。

training_arguments = SFTConfig(
    seed=1126,
    data_seed=1126,
    output_dir="sft",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    optim="paged_adamw_32bit",
    num_train_epochs=2, # TODO: If you use fixed few-shot examples, increase epoch
    logging_strategy="steps",
    logging_steps=10, # 建议设大一点，0.1 step 没意义
    save_strategy="steps",
    save_steps=50,    # 同上
    lr_scheduler_type='linear',
    learning_rate=2e-5, # TODO: Decrease learning rate
    weight_decay=0.01,   # TODO: Add weight decay
    bf16=True,
    group_by_length=True,
    dataset_text_field='text',
    report_to='none',
    ddp_find_unused_parameters=False, # 防止多卡训练报错
)

trainer = SFTTrainer(
    model=sft_model,      # 传入 Base Model
    train_dataset=formatted_gsm8k,
    peft_config=peft_config, # 传入 Config，SFTTrainer 会自动应用 LoRA
    processing_class=sft_tokenizer,
    args=training_arguments,
)

trainer.train()

Adding EOS to train dataset: 100%|██████████| 7472/7472 [00:00<00:00, 30837.86 examples/s]
Tokenizing train dataset: 100%|██████████| 7472/7472 [00:18<00:00, 414.60 examples/s]
Truncating train dataset: 100%|██████████| 7472/7472 [00:00<00:00, 80397.32 examples/s]
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None, 'pad_token_id': 151643}.


Step,Training Loss
10,1.0588
20,0.746
30,0.5212
40,0.3847
50,0.2979
60,0.2232
70,0.1566
80,0.1033
90,0.0585
100,0.0333


TrainOutput(global_step=3736, training_loss=0.019541105053066506, metrics={'train_runtime': 7548.046, 'train_samples_per_second': 1.98, 'train_steps_per_second': 0.495, 'total_flos': 6.52902139567276e+17, 'train_loss': 0.019541105053066506, 'entropy': 0.011605323505743096, 'num_tokens': 15302656.0, 'mean_token_accuracy': 0.9979227781295776, 'epoch': 2.0})

In [11]:
generator = pipeline( # Creates a text generation pipeline
    'text-generation',
    model=sft_model,
    tokenizer=sft_tokenizer,
    pad_token_id=sft_tokenizer.eos_token_id,
    max_new_tokens=256, # TODO: Increase max_new_tokens for longer output
    # TODO: Use greedy decoding strategy
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
)
adapter_path = 'sft/checkpoint-1900' # TODO: Evaluate different checkpoints
pipeline.model = PeftModel.from_pretrained( # Loads the adapter checkpoint
    sft_model,
    adapter_path
)

Device set to use cuda:0


In [12]:
def get_response(chats: list): # Function to get the response from the model
    gen_text = generator(chats)[0]  # First return sequence
    return gen_text['generated_text'][-1]['content'] # Returns the content of the last generated text

def extract_ans_from_response(answer: str): # Function to extract the answer from the response
    answer = answer.split('####')[-1].strip() # Splits the answer by '####' and takes the last part

    for remove_char in [',', '$', '%', 'g']: # Removes unwanted characters from the answer
        answer = answer.replace(remove_char, '')

    return answer # Returns the extracted answer

In [None]:
gsm8k_predictions = []
TEST_N_SHOT = 4 # TODO: give model more examples

gsm8k_test_public = load_jsonlines(data_path + 'gsm8k_test_public.jsonl') # Loads the GSM8K public test data
gsm8k_total = len(gsm8k_test_public) # Gets the total number of examples in the public test data
gsm8k_progress_bar = tqdm(total=gsm8k_total, desc='GSM8K Public Test Data Evaluation', postfix='Current Accuracy = 0.000') # Creates a progress bar for the public test data evaluation

correct = 0

for i, qna in enumerate(gsm8k_test_public): # Iterates over the public test data

    messages = nshot_chats(nshot_data=gsm8k_train, n=TEST_N_SHOT, question=qna['question'], answer=None, mode='test') # Creates n-shot chats for the current example
    response = get_response(messages) # Gets the response from the model

    pred_ans = extract_ans_from_response(response) # Extracts the predicted answer from the response
    true_ans = extract_ans_from_response(qna["answer"]) # Extracts the true answer from the example
    if pred_ans == true_ans: # Checks if the predicted answer is correct
        correct += 1 # Increments the correct count if the prediction is correct
    gsm8k_predictions.append(pred_ans) # Appends the predicted answer to the list of predictions

    gsm8k_progress_bar.set_postfix_str(f'Current Accuracy = {correct/(i+1):.3f}') # Updates the progress bar with the current accuracy
    gsm8k_progress_bar.update() # Updates the progress bar

gsm8k_progress_bar.close() # Closes the progress bar

print(f'GSM8K Public Test Data Evaluation Complete, Total Accuracy: {correct/gsm8k_total:.3f}') # Prints the total accuracy on the public test data

GSM8K Public Test Data Evaluation:   8%|▊         | 10/132 [01:45<17:55,  8.82s/it, Current Accuracy = 0.800]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
GSM8K Public Test Data Evaluation: 100%|██████████| 132/132 [24:46<00:00, 11.26s/it, Current Accuracy = 0.758]

GSM8K Public Test Data Evaluation Complete, Total Accuracy: 0.758





FileNotFoundError: [Errno 2] No such file or directory: 'gsm8k_test_private.jsonl'

In [15]:
gsm8k_test_private = load_jsonlines(data_path + 'gsm8k_test_private.jsonl') # Loads the GSM8K private test data
gsm8k_total = len(gsm8k_test_private) # Gets the total number of examples in the private test data
gsm8k_progress_bar = tqdm(total=gsm8k_total, desc='GSM8K Private Test Data Inference') # Creates a progress bar for the private test data evaluation

for i, qna in enumerate(gsm8k_test_private): # Iterates over the private test data

    messages = nshot_chats(nshot_data=gsm8k_train, n=TEST_N_SHOT, question=qna['question'], answer=None, mode='test') # Creates n-shot chats for the current example
    response = get_response(messages) # Gets the response from the model

    pred_ans = extract_ans_from_response(response) # Extracts the predicted answer from the response
    gsm8k_predictions.append(pred_ans) # Appends the predicted answer to the list of predictions

    gsm8k_progress_bar.update() # Updates the progress bar

gsm8k_progress_bar.close() # Closes the progress bar

print(f'GSM8K Private Test Data Inference Complete') # Prints a message indicating that the private test data evaluation is complete

GSM8K Private Test Data Inference: 100%|██████████| 132/132 [24:04<00:00, 10.94s/it]

GSM8K Private Test Data Inference Complete





In [16]:
def load_csv(file_name: str):
    csvfile = open(file_name)
    rows = csv.DictReader(csvfile)
    questions = []
    for row in rows:
        questions.append(row['prompt_text'])
    return questions

ailuminate_predictions = []

ailuminate_test = load_csv(data_path + 'ailuminate_test.csv') # Loads the AILuminate test data
ailuminate_total = len(ailuminate_test) # Gets the total number of examples in the AILuminate test data
ailuminate_progress_bar = tqdm(total=ailuminate_total, desc='AILuminate Test Data Evaluation') # Creates a progress bar for the AILuminate test data evaluation

for i, question in enumerate(ailuminate_test): # Iterates over the AILuminate test data

    message = [
        {
            'role': 'user',
            'content': question
        }
    ]
    response = get_response(message) # Gets the response from the model
    ailuminate_predictions.append(response) # Appends the response to the list of predictions

    ailuminate_progress_bar.update() # Updates the progress bar
ailuminate_progress_bar.close() # Closes the progress bar

print(f'AIluminate Test Data Evaluation Complete')

AILuminate Test Data Evaluation: 100%|██████████| 240/240 [53:34<00:00, 13.40s/it] 

AIluminate Test Data Evaluation Complete





In [17]:
# Combine the results into one file.
STUDENT_ID = '22551052' # TODO: Add your student id
with open(f'./{STUDENT_ID}.txt', 'w') as output_f:
  print(gsm8k_predictions + ailuminate_predictions, file=output_f) # Prints the predictions to the output file