## 数据准备

In [1]:
import json
import random
from tqdm import tqdm
data_dict={}
def read_json(data_path,data_type):
    if data_dict.get(data_type,-1)!=-1:
        return data_dict[data_type]
    else:
        with open(data_path, 'r', encoding='utf-8') as f:
            data =json.load(f)
        data_dict[data_type]=data
        return data
    
def get_datas(data_path,data_type):
    data=read_json(data_path,data_type)
    answers=[]
    questions=[]
    title_to_contexts=[]
    support_facts=[]
    for i,d in tqdm(enumerate(data)):
        ## 删除训练集中的单跳问题
        if data_type=='train':
            if d['level']=='easy':
                continue
        
        ## 验证集只保留难度为hard的文段，结果较为稳定
        if data_type=='dev':
            if d['level']!='hard':
                continue
        
        
        answer=d['answer']
        
        ## 检查答案被包含在多少个文段中，如果答案出现的文段数多于两个，说明答案的质量不佳
        paragraph_with_answer=0
        for x in d['context']:
            s=' '.join(x[1])
            if s.find(answer)!=-1:
                paragraph_with_answer+=1
                
        answer_token_num=len(answer.split())
        ## 删除训练集中答案信息不充分的数据如yes,no
        ## 答案中所有单词都为出现在文段中视为信息不充分
        if paragraph_with_answer>2:
            if data_type=='train':
                p_drop=0.99
                if random.random()<p_drop:
                    continue
                
        if answer_token_num<=3 and paragraph_with_answer==0:
            if data_type=='train':
                p_drop=0.99
                if random.random()<p_drop:
                    continue
        
        if data_type=='train':
            p_drop=1-0.2*answer_token_num
            if random.random()<p_drop:
                continue
      
        ## support_facts
        support_facts_list=[]
        fact_set=set()
        if data_type=='train':
            titles=[]
            for item in d['supporting_facts']:
                fact_set.add(item[0])
            support_facts_list=list(fact_set)
        else:
            for item in d['pred_support_facts']:
                support_facts_list.append(item[0])
     
            

        ## title_to_context
        title_to_context={}   
        for x in d['context']:
            s=' '.join(x[1])
            context=f'{x[0]}. {s} '
            #context=context.replace(f'{answer}',f'<ans> {answer} </ans>')
            title_to_context[x[0]]=context

        question=[]
        if data_type!='test':
            question=d['question']
            
        questions.append(question)
        title_to_contexts.append(title_to_context)
        answers.append(answer)
        support_facts.append(support_facts_list)
        

    return title_to_contexts,questions,answers,support_facts

In [2]:
def get_context(title_to_context,support_facts_list,data_type='train',pg_num=10):
    if data_type=='train':
        if len(support_facts_list)>pg_num:
                support_facts_list=support_facts_list[:pg_num]
        else:
            num=pg_num-len(support_facts_list)
            titles=[]
            for title in list(title_to_context.keys()):
                if title not in support_facts_list:
                    titles.append(title)
            support_facts_list.extend(random.choices(titles,k=min(len(titles),num)))
            random.shuffle(support_facts_list)
    else:
        support_facts_list=support_facts_list[:pg_num]
        
    context=''   
    for title in support_facts_list:
        context+=title_to_context[title]+'\n'
    return context

## 封装成huggingface dataset
import datasets
def get_dataset(data_path,data_type,instruction,load=True):
    ## 对于train_dataset,会进行保存，以保证每次加载数据是一致的，可以继续训练
    if data_type=='train' and load :
        try:
            dataset=datasets.load_from_disk('/kaggle/working/llama')
            return dataset
        except:
            pass
    title_to_contexts,questions,answers,support_facts=get_datas(data_path,data_type)
    inputs=[]
    for i in range(len(answers)):
        context=get_context(title_to_contexts[i],support_facts[i],data_type,pg_num=10)
        input=f'Answer: {answers[i]} \nContext:\n{context}\n'
        inputs.append(input)
    
    dict_data={}
    dict_data['instruction']=[instruction]*len(answers)
    dict_data['input']=inputs
    dict_data['output']=questions
    if data_type!='train':
        dict_data['output']=['']*len(answers)
        dict_data['real_output']=questions
    dataset=datasets.Dataset.from_dict(dict_data)
    
    if data_type=='train':
        dataset.save_to_disk(dataset_path='./')
    return dataset

In [3]:
train_path='/kaggle/input/scnu-ai-challenge-5/train.json'
dev_path='/kaggle/input/scnu-ai-challenge-dataset-with-sorted-pred-facts/dev.json'
test_path='/kaggle/input/scnu-ai-challenge-dataset-with-sorted-pred-facts/test.json'
## 设置指令
##包含思维链
## 先要求模型找出support_facts,再生成问题
instruction_with_Thinking_chain='The following is an answer and ten paragraphs. \
Please think about which two paragraphs are most relevant to the given answer,\
then generate a question according to the answer and two paragraphs.\n\
Please note that your Response needs to be in the following format.\n\
## Question: \n\
'
## 不包含思维链
instruction1='Please generate only one question according to the answer and the paragraphs.'

instruction=instruction_with_Thinking_chain
train_dataset=get_dataset(train_path,'train',instruction,load=True)
dev_dataset=get_dataset(dev_path,'dev',instruction)
test_dataset=get_dataset(test_path,'test',instruction)

1500it [00:00, 81250.32it/s]
7405it [00:00, 31894.29it/s]


In [4]:
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

 ### Response:
{}"""

EOS_TOKEN = '<|end_of_text|>' # Must add EOS_TOKEN
## 训练数据要加结束符，验证和测试不用
training=True
def formatting_prompts_func(examples):
    instructions = examples["instruction"]
    inputs       = examples["input"]
    outputs      = examples["output"]
    texts = []
    for instruction, input, output in zip(instructions, inputs, outputs):
        # Must add EOS_TOKEN, otherwise your generation will go on forever!
        if training:
            output=f'## Question: {output}\n{EOS_TOKEN}'
        text = alpaca_prompt.format(instruction, input, output) #+ EOS_TOKEN
        
        texts.append(text)
    return { "text" : texts, }



training=True
train_dataset = train_dataset.map(formatting_prompts_func, batched = True,)
train_dataset.shuffle(seed=42)
training=False
dev_dataset = dev_dataset.map(formatting_prompts_func, batched = True,)
test_dataset = test_dataset.map(formatting_prompts_func, batched = True,)

print('train_dataset:',len(train_dataset))
print('dev_dataset:',len(dev_dataset))
print('test_dataset:',len(test_dataset))

Map:   0%|          | 0/360 [00:00<?, ? examples/s]

Map:   0%|          | 0/7405 [00:00<?, ? examples/s]

train_dataset: 25272
dev_dataset: 360
test_dataset: 7405


In [6]:
print(dev_dataset.select([1])[0]['text'])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
The following is an answer and ten paragraphs. Please think about which two paragraphs are most relevant to the given answer,then generate a question according to the answer and two paragraphs.
Please note that your Response needs to be in the following format.
## Question: 


### Input:
Answer: the tenth season 
Context:
Bigg Boss 10. Bigg Boss 10 is the tenth season of the Indian reality TV series "Bigg Boss".  It began airing on 16 October 2016 on Colors.  The show is also available after the original telecast on Viacom 18's digital platform – Voot.  A new element called ‘Unseen-Undekha’ was introduced by way of unseen footage uploaded on Voot. This footage showed parts of the day that weren’t included in the episode, from ‘wake-up call’ to ‘lights out’. 
Lopamudra Raut. Lopamudra Raut is an Indian model and b

## 加载模型

In [7]:
%%time
!mamba install --force-reinstall aiohttp -y
!pip install -U "xformers<0.0.26" --index-url https://download.pytorch.org/whl/cu121
!pip install "unsloth[kaggle-new] @ git+https://github.com/unslothai/unsloth.git"

# Temporary fix for https://github.com/huggingface/datasets/issues/6753
!pip install datasets==2.16.0 fsspec==2023.10.0 gcsfs==2023.10.0
!pip install deepspeed
!pip install -q --upgrade transformers deepspeed
import os
os.environ["WANDB_DISABLED"] = "true"


Looking for: ['aiohttp']

[?25l[2K[0G[+] 0.0s
[2K[1A[2K[0G[+] 0.1s
rapidsai/linux-64 (check zst) [90m╸[0m[33m━━━━━━━━━━━━━━[0m   0.0 B @  ??.?MB/s Checking  0.1s[2K[1A[2K[0G[+] 0.2s
rapidsai/linux-64 (check zst) [90m╸[0m[33m━━━━━━━━━━━━━━[0m   0.0 B @  ??.?MB/s Checking  0.2s[2K[1A[2K[0G[+] 0.3s
rapidsai/linux-64 (check zst) [90m━╸[0m[33m━━━━━━━━━━━━━[0m   0.0 B @  ??.?MB/s Checking  0.3s[2K[1A[2K[0G[+] 0.4s
rapidsai/linux-64 (check zst) [90m━╸[0m[33m━━━━━━━━━━━━━[0m   0.0 B @  ??.?MB/s Checking  0.4s[2K[1A[2K[0G[+] 0.5s
rapidsai/linux-64 (check zst) [90m━╸[0m[33m━━━━━━━━━━━━━[0m   0.0 B @  ??.?MB/s Checking  0.5s[2K[1A[2K[0Grapidsai/linux-64 (check zst)                       Checked  0.5s
[?25l[2K[0G[+] 0.0s
[2K[1A[2K[0G[+] 0.1s
rapidsai/noarch (check zst) [90m━━━╸[0m[33m━━━━━━━━━━━━━[0m   0.0 B @  ??.?MB/s Checking  0.1s[2K[1A[2K[0G[+] 0.2s
rapidsai/noarch (check zst) [90m━━━╸[0m[33m━━━━━━━━━━━━━[0m   0.0 B @  ??.?MB/

In [16]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 4096 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/llama-2-13b-bnb-4bit",
    "unsloth/codellama-34b-bnb-4bit",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
    "unsloth/llama-3-70b-bnb-4bit",
] # More models at https://huggingface.co/unsloth

def init_model(model_name="unsloth/llama-3-8b-Instruct-bnb-4bit"):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_name, # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
        # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
    )

    ## LoRA 
    ## 超参
    ## r: 8 or 16
    ## lora_dropout 0.2 or 0
    model = FastLanguageModel.get_peft_model(
        model,
        r = 8, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
       #peft_type="ADALORA",
        target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                          "gate_proj", "up_proj", "down_proj",],
        lora_alpha = 8,
        lora_dropout = 0, # Supports any, but = 0 is optimized
        bias = "none",    # Supports any, but = "none" is optimized
        use_gradient_checkpointing = "unsloth", # 4x longer contexts auto supported!
        random_state = 3407,
        use_rslora = False,  # We support rank stabilized LoRA
        loftq_config = None, # And LoftQ
    )
    return model,tokenizer

def save_checkpoint(dir='lora_model'):
    model.save_pretrained(dir)
    # model.push_to_hub("your_name/lora_model", token = "...") # Online saving

def load_checkpoint(dir='lora_model'):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = dir, # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    return model,tokenizer

mode='load'## 'build'
model_name="unsloth/llama-3-8b-Instruct-bnb-4bit"
checkpoint_dir='/kaggle/input/llama-3-qg-checkpoint-1/best_lora_model' ## 'best_lora_model'
if mode=='build':
    model,tokenizer=init_model(model_name)
else:
    model,tokenizer=load_checkpoint(checkpoint_dir)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.44.0.
   \\   /|    GPU: Tesla T4. Max memory: 14.741 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.2.2+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.25.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## eval_model

In [9]:
!pip install pycocoevalcap
!pip install bert_score

  pid, fd = os.forkpty()


Collecting pycocoevalcap
  Downloading pycocoevalcap-1.2-py3-none-any.whl.metadata (3.2 kB)
Collecting pycocotools>=2.0.2 (from pycocoevalcap)
  Downloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.1 kB)
Downloading pycocoevalcap-1.2-py3-none-any.whl (104.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pycocotools-2.0.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (427 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.8/427.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pycocotools, pycocoevalcap
Successfully installed pycocoevalcap-1.2 pycocotools-2.0.8
Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [3

In [10]:
from pycocoevalcap.bleu.bleu import Bleu
from pycocoevalcap.meteor.meteor import Meteor
from pycocoevalcap.rouge.rouge import Rouge
from bert_score import BERTScorer
import numpy as np

scorers = {
        "Bleu": Bleu(4),
        #"Meteor": Meteor(),
        #"Rouge": Rouge(),
    }
import bert_score
bert_scorer = BERTScorer(lang="en",model_type='roberta-large',rescale_with_baseline=True)
# 测评问题的流畅性
def fluencyScore(preds_list, gold_list):
    
    gts = {}
    res = {}
    for i, (p, g) in enumerate(zip(preds_list, gold_list)):
        gts[i] = [p]
        res[i] = [g]
    scores = {}
    for name, scorer in scorers.items():
        score, all_scores = scorer.compute_score(gts, res)
        if isinstance(score, list):
            for i, sc in enumerate(score, 1):
                scores[name + str(i)] = sc
        else:
            scores[name] = score
    return scores,all_scores[-1]

# 测评语义相似度
def SemanticScore(preds_list, gold_list):
    p,r,f1 = bert_scorer.score(preds_list, gold_list, verbose=True)
    bert_score = np.mean(f1.tolist())
    return bert_score

def getTotalScore(preds_list,gold_list):
    bert_score = SemanticScore(preds_list,gold_list)
    scores,all_scores = fluencyScore(preds_list,gold_list)
    last_score = (bert_score/2+scores['Bleu4']/2)*100
   # print(scores)
    return {'TotalScore':last_score, 
            'BERTScore':bert_score,
            'Bleu1':scores['Bleu1'],
            'Bleu2':scores['Bleu2'],
            'Bleu3':scores['Bleu3'],
            'Bleu4':scores['Bleu4'],
           },all_scores


@torch.no_grad()
def generator(input):
    inputs = tokenizer(
    [
        input
    ],padding=True,truncation=True,max_length=max_seq_length, return_tensors = "pt").to("cuda")
    
    outputs = model.generate(**inputs, 
                             max_new_tokens = 64, 
                             do_sample=False,
                             #num_beams=3,
                             use_cache = True,
                            )
    output=tokenizer.batch_decode(outputs,skip_special_tokens=True)
    return output[0].split('### Response:\n')[-1].replace('## Question:','').replace('\n','')

import logging    
logging.getLogger().setLevel(logging.ERROR)
logging.getLogger("transformers").setLevel(logging.ERROR) 

best_score=0
## 评估模型
def eval_model(epoch=1,testing_batch=-1):
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference
    model.eval()
    preds_list = []
    real_list=[]
    if testing_batch<=0:
        testing_batch=len(dev_dataset)
    with tqdm(total=testing_batch, desc=f'Validation Epoch {epoch}', unit='batch') as pbar:
        for i in range(testing_batch):
            if i>testing_batch:break
            with torch.no_grad():
                d=dev_dataset[i]
                input=d['text']
                question=d['real_output']
                result=generator(input)
                preds_list.append(result)
                real_list.append(question)
                
            pbar.update(1)
    model.train()
    scores,all_score = getTotalScore(preds_list, real_list)
    for i in range(20):
        print('pred: ',preds_list[i])
        print("true: ",real_list[i])
        print('bleu4:',all_score[i])
        
    global best_score
    if scores['TotalScore']>best_score:
        print(f"Total score: {best_score} -> {scores['TotalScore'] }")
        best_score=scores['TotalScore']
        print('saving best model!')
        save_checkpoint('best_lora_model')
        
    print(f"Scores: {scores}")
    return preds_list

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 训练

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from trl import DataCollatorForCompletionOnlyLM

response_template = "\n ### Response:"
response_template_with_context = " ### Response:\n"# We added context here: "\n". This is enough for this tokenizer
response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`

data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)
#collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
def get_SFTTrainer(dataset,max_steps=60,lr=1e-4):
    model.train()
    trainer = SFTTrainer(
        model = model,
        tokenizer = tokenizer,
        train_dataset = dataset,
        dataset_text_field = "text",
        max_seq_length = max_seq_length,
        dataset_num_proc = 2,
        data_collator=data_collator,
        packing = False, # Can make training 5x faster for short sequences.
        args = TrainingArguments(
            per_device_train_batch_size = 2,
            gradient_accumulation_steps = 8,
            warmup_steps = 5,
            max_steps = max_steps,
            learning_rate = lr,
            fp16 = not torch.cuda.is_bf16_supported(),
            bf16 = torch.cuda.is_bf16_supported(),
            logging_steps = 1,
            optim = "adamw_8bit",
            weight_decay = 0.01,
            lr_scheduler_type = "linear",
            seed = 3407,
            output_dir = "outputs",
            report_to = "none",
            prediction_loss_only=True,
            disable_tqdm=False,
        ),
        #deepspeed=ds_json
    )
    return trainer


## 主要超参
### 1.数据配比，质量较差的数据保留多少？
##### 暂定不保留，清洗后训练数据剩余3.7w条
### 2.指令如何设置，加不加思维链？如何加比较好？
##### 待测
### 3.lora的r设置为多少？dropout设置为多少？
##### 单一任务r设置过大容易过拟合，r=8似乎效果不错
##### dropout设置为0.2
##### r=16,dropout=0 效果似乎不佳。
### ~4.先进行指令微调是否有利于模型理解指令~
##### LoRA不适合多阶段微调
### 5.学习率如何设置？
##### 暂定最大1e-4,并指数衰减到1e-5。2e-4时损失波动较大。
### ~6.最大长度和pg_num如何设置~
##### 如果文本被截断对模型有很大影响，直接设置最大长度为4096，pg_num=10

In [None]:
## 下游任务微调
## 数据分桶，避免在同样数据上重复训练，影响模型性能
num_buckets=30 ## 数据桶数量
begin_epoch=12  ## 从断开位置继续训练
end_epoch=26   ## 不超过桶的数量
total_epoch=10 ## 预计总共要训的epoch数
steps=50     ## 每次训多少个batch
eval_epoch=1   ## 验证频率
max_lr=2e-4
min_lr=2e-5
decay=0.9
##每个epoch训练100个batch,800个样本
for i in range(begin_epoch,end_epoch):
    lr=max(max_lr*(decay**i),min_lr)
    FastLanguageModel.for_training(model)
    epoch_dataset=train_dataset.shard(num_shards=num_buckets,index=i)
    trainer=get_SFTTrainer(epoch_dataset,steps,lr)
    print('lr:',lr)
    print('Training!')
    trainer_stats = trainer.train()
    print('avg_loss:',trainer_stats.training_loss)
    save_checkpoint()
    if i%eval_epoch==0:
        eval_model(i,50)

## 在验证集上测试

In [19]:
pred_list=eval_model(0,20)

Validation Epoch 0: 100%|██████████| 20/20 [01:04<00:00,  3.24s/batch]

calculating scores...
computing bert embedding.





  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.22 seconds, 91.64 sentences/sec
{'testlen': 280, 'reflen': 274, 'guess': [280, 260, 240, 220], 'correct': [134, 64, 34, 17]}
ratio: 1.0218978102152485
pred:  Are Plantago and Trichosanthes both plant life?
true:  Are Trichosanthes and Plantago both forms of plant life?
bleu4: 6.936319082866856e-09
pred:  What season of the Indian reality TV series did Lopamudra Raut participate in?
true:  In which season of the Indian reality TV show "Big Boss" did the model Lopamundra Raut participate?
bleu4: 0.302770291955685
pred:  Which incumbent Republican U.S. Senator won re-election to his first full term in the 2012 United States Senate election in Nevada?
true:  The 2012 United States Senate election in Nevada concluded with a close victory for which current Republican incumbent?
bleu4: 0.23278057780280073
pred:  When did the character Moe Szyslak first appear on the Fox network in the United States?
true:  When was the Simpson's episode broadcasted that introduced the character Morr

In [None]:
## 测试截断
def test_trunction(max_seq_len):
    from tqdm import tqdm
    c=0
    for i in tqdm(range(2000)):
        tokens=len(tokenizer(train_dataset[i]['text'])['input_ids'])
        if tokens>max_seq_len:
            c+=1
    print(c)
    print('被截断数据的比例:',c/2000)
    
test_trunction(4096)

## zero-shot
- TotalScore:-15.853062438587193
- BERTScore: -0.3292037755716592
- Bleu4: 0.012142526799915364

## sft-training
- TotalScore:14.13900637621369
- BERTScore:0.2392001298184578
- Bleu4:0.04357999770581597

## 800examples
- TotalScore:17.026505696049146
- BERTScore:0.26740919244941325
- Bleu4:0.07312092147156968
- loss:1.04


## 在测试集上测试

In [None]:
test_data_path='/kaggle/input/scnu-ai-challenge-dataset-with-sorted-pred-facts/test.json'
with open(args.test_data_path, 'r', encoding='utf-8') as json_file:
    data = json.load(json_file)
test_id_list = [item['_id'] for item in data]

# 测试
generated_questions = []
generated_questions_dict = []
FastLanguageModel.for_inference(model
with tqdm(total=len(test_dataset), desc=f'Test epoch {1}/{1}', unit='batch') as pbar:
    for i in range(len(test_dataset)):
        d=test_dataset[i]
        with torch.no_grad():
            input=d['text']
            result=generator(input)
            generated_questions.append(result)
 
        pbar.update(1)

for _,item in enumerate(generated_questions):
    generated_questions_dict.append({'_id':test_id_list[_],'question':item})

with open('output.json', 'w', encoding='utf-8') as json_file:
        json.dump(generated_questions_dict, json_file, ensure_ascii=False, indent=4)

### warning:
#### llama用的tokenizer对上下文敏感（包括空格和换行）

In [20]:
def print_tokens_with_ids(txt):
    tokens = tokenizer.tokenize(txt, add_special_tokens=False)
    token_ids = tokenizer.encode(txt, add_special_tokens=False)
    print(list(zip(tokens, token_ids)))

prompt =train_dataset[0]['text']
print_tokens_with_ids(prompt[-500:])  # [..., ('▁Hello', 15043), ('<0x0A>', 13), ('<0x0A>', 13), ('##', 2277), ('#', 29937), ('▁Ass', 4007), ('istant', 22137), (':', 29901), ...]

response_template = " ### Response:\n"
print_tokens_with_ids(response_template)  # [('▁###', 835), ('▁Ass', 4007), ('istant', 22137), (':', 29901)]

[('re', 265), ('Ġlimited', 7347), ('Ġto', 311), ('Ġthose', 1884), ('Ġcommon', 4279), ('Ġamong', 4315), ('Ġmany', 1690), ('Ġbreeds', 58245), ('Ġof', 315), ('Ġdog', 5679), (',', 11), ('Ġalthough', 8051), ('Ġthey', 814), ('Ġare', 527), ('Ġaffected', 11754), ('Ġmore', 810), ('Ġthan', 1109), ('Ġaverage', 5578), ('Ġby', 555), ('Ġhip', 18638), ('Ġdys', 22709), ('pl', 501), ('asia', 36259), ('Ġand', 323), ('Ġsome', 1063), ('Ġeye', 8071), ('Ġconditions', 4787), ('.', 13), ('Ġ', 220), ('ĠThey', 2435), ('Ġare', 527), ('Ġa', 264), ('Ġworking', 3318), ('Ġdog', 5679), (',', 11), ('Ġbred', 55187), ('Ġfor', 369), ('Ġhunting', 23330), (',', 11), ('Ġand', 323), ('Ġwhile', 1418), ('Ġnot', 539), ('Ġas', 439), ('Ġrare', 9024), ('Ġas', 439), ('Ġsome', 1063), ('Ġvarieties', 36680), ('Ġof', 315), ('Ġspan', 9575), ('iel', 13327), (',', 11), ('Ġthey', 814), ('Ġare', 527), ('Ġr', 436), ('arer', 61570), ('Ġthan', 1109), ('Ġthe', 279), ('Ġmore', 810), ('Ġwidely', 13882), ('Ġknown', 3967), ('ĠEnglish', 6498), ('ĠSp