In [3]:
import random
import numpy as np
import torch
import bert_score
from tqdm import tqdm

from datasets import DatasetDict, load_dataset, concatenate_datasets,load_from_disk
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel


def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
ds = load_from_disk('hf_dataset_2024-12-06')
ds
# ds = ds.train_test_split(test_size=0.1)
test_set = ds['test']

In [None]:
def combine_fields_v1(example):
    text = f"###context:\n{example['Context']}\n\n###cell_sentences_data:\n{example['cell_sentences_data']}\n\n###Question:\n{example['Question']}\n\n###Answer:\n{example['Answer']}"
    example["text"] = text
    return example

test_set = test_set.map(combine_fields_v1)


In [None]:
model_name = "vandijklab/C2S-Pythia-410m-diverse-single-and-multi-cell-tasks"
adapter_id = 'sft_output/checkpoint-2800'
model = AutoModelForCausalLM.from_pretrained(model_name)


tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.add_tokens(["<|Question|>", "<|Answer|>"]) 

model.resize_token_embeddings(len(tokenizer))
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
if tokenizer.padding_side == 'right':
    tokenizer.padding_side = 'left'
model = PeftModel.from_pretrained(model, adapter_id)
model = model.merge_and_unload()

In [None]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device='cuda:0')

In [None]:
all_cans = []

In [4]:
for i in tqdm(range(len(test_set))):
    question = test_set['text'][i]
    # Store original question by extracting text between <|Question|> and <|Answer|>
    original_question = question.split('###Question:')[-1].split('###Answer:')[0].strip()
    # Generate 5 answers for the same question
    cans = generator(question, max_new_tokens=512, num_return_sequences=5, do_sample=True)
    # Extract just the answer portion after <|Answer|> tag
    answers = []
    for c in cans:
        text = c['generated_text']
        answer = text.split('###Answer:')[-1].strip()
        answers.append({
            'question': original_question,
            'answer': answer
        })
    all_cans.append(answers)
    
    # Save all_cans after each iteration
    with open('all_cans.json', 'w') as f:
        json.dump(all_cans, f)

NameError: name 'test_set' is not defined

In [None]:
all_cans

In [1]:
from bert_score import score
import numpy as np
import json

# Get keywords and ground truth answers
keywords_list = test_set['Keyword']
ground_truth_answers = test_set['Answer']

# Calculate BERT scores against ground truth
final_scores = []
dpo_pairs = []

for i in range(len(all_cans)):
    answers = all_cans[i]
    # Remove brackets from keywords string and split
    keywords = keywords_list[i].strip('[]').split(',')
    keywords = [k.strip() for k in keywords]
    question = test_set['text'][i]
    ground_truth = ground_truth_answers[i]
    
    # Calculate scores for each answer
    scores = []
    for ans in answers:
        P, R, F1 = score([ans], [ground_truth], lang='en', verbose=False)
        bert_score = F1.item()
        
        # Calculate keyword presence ratio
        ans_lower = ans.lower()
        present_keywords = sum(1 for kw in keywords if kw.lower() in ans_lower)
        keyword_ratio = present_keywords / len(keywords)
        if keyword_ratio == 0:
            keyword_ratio = 0.01
        # Adjust bert_score based on keyword presence ratio
        adjusted_score = bert_score * keyword_ratio
        
        scores.append({
            'answer': ans,
            'score': adjusted_score
        })
    
    # Sort answers by adjusted score
    scores.sort(key=lambda x: x['score'], reverse=True)
    
    # Create DPO pairs for all possible combinations of 2 answers
    # where the higher scored answer is chosen and lower scored is rejected
    for j in range(len(answers)):
        for k in range(j+1, len(answers)):
            if scores[j]['score'] > scores[k]['score']:
                chosen = scores[j]['answer']
                rejected = scores[k]['answer']
                chosen_score = scores[j]['score']
                rejected_score = scores[k]['score']
            else:
                chosen = scores[k]['answer']
                rejected = scores[j]['answer']
                chosen_score = scores[k]['score']
                rejected_score = scores[j]['score']
                
            dpo_pair = {
                'question': question,
                'chosen': chosen,
                'rejected': rejected,
                'ground_truth': ground_truth,
                'chosen_score': float(chosen_score),
                'rejected_score': float(rejected_score),
                'keywords': keywords
            }
            dpo_pairs.append(dpo_pair)
    
    # Calculate average score for this question
    avg_score = np.mean([s['score'] for s in scores])
    final_scores.append(avg_score)

# Convert to numpy arrays
final_scores = np.array(final_scores)

# Save DPO dataset as JSON file
with open('dpo_dataset_with_score.json', 'w') as f:
    json.dump(dpo_pairs, f, indent=2)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'test_set' is not defined

In [19]:
import json
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOTrainer, DPOConfig
from datasets import Dataset
# Load and prepare DPO dataset
with open('dpo_dataset_with_scorev2.json', 'r') as f:
    dpo_data = json.load(f)
train_dataset = Dataset.from_list(dpo_data)
# Load model and tokenizer
model_name = "vandijklab/C2S-Pythia-410m-diverse-single-and-multi-cell-tasks"
adapter_id = 'sft_output/checkpoint-2800'
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Remove model_init_kwargs from the config
dpo_config = DPOConfig(
    # model_init_kwargs={},  # Remove this line
    output_dir="dpo_output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=10,
    save_strategy="epoch",
    evaluation_strategy="no"
)

trainer = DPOTrainer(
    model=model,
    args=dpo_config,
    tokenizer=tokenizer,
    train_dataset=train_dataset,  # Use the Dataset object, not the list
    beta=0.1
)

# Proceed with training
trainer.train()

# Save the final model
trainer.save_model("dpo_final_model")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Extracting prompt from train dataset: 100%|██████████| 750/750 [00:00<00:00, 9842.98 examples/s]
Applying chat template to train dataset: 100%|██████████| 750/750 [00:00<00:00, 1845.20 examples/s]
Tokenizing train dataset: 100%|██████████| 750/750 [00:01<00:00, 491.39 examples/s]
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelis

Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss
10,8.0037
20,52.4666
30,10.5916
40,1.8563
50,5.2982
60,3.7986
70,3.1325
80,2.1503
90,1.2182
100,2.6608


In [20]:
# Load test data
with open('test_set.json', 'r') as f:
    test_data = json.load(f)
test_set = Dataset.from_list(test_data)

# Load the trained DPO model
dpo_model = AutoModelForCausalLM.from_pretrained("dpo_final_model")
dpo_model.eval()  # Set to evaluation mode

all_cans = []  # Store all generated answers

for i in tqdm(range(len(test_set))):
    question = test_set[i]['text']
    
    # Prepare input for model
    inputs = tokenizer(question + " ###Answer:", return_tensors="pt", truncation=True, max_length=512)
    
    # Generate answer
    with torch.no_grad():
        outputs = dpo_model.generate(
            inputs.input_ids,
            max_length=512,
            num_return_sequences=1,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode and clean up the generated answer
    generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Extract answer from after ###Answer: token
    if "###Answer:" in generated_answer:
        answer = generated_answer.split("###Answer:")[1].strip()
    else:
        answer = generated_answer.strip()
        
    all_cans.append(answer)


FileNotFoundError: [Errno 2] No such file or directory: 'test_set.json'

In [None]:
## Depose！！！

In [10]:
import json

json.dump(all_cans, open('answer_generated_1124.json', 'w'))

In [None]:
def chain_of_thought_analysis(refs, cans):
    """
    Analyze the chain of thought in the answers by breaking them down into key components
    and comparing with reference text.
    """
    analysis_results = []
    
    for ref, can in zip(refs, cans):
        # Split candidate answer into reasoning steps (based on sentences/paragraphs)
        steps = [s.strip() for s in can.split('\n\n') if s.strip()]
        
        # Analyze each reasoning step
        step_analysis = []
        for i, step in enumerate(steps):
            step_info = {
                'step_number': i+1,
                'content': step,
                'type': 'initial_claim' if i==0 else 'supporting_evidence' if i<len(steps)-1 else 'conclusion'
            }
            step_analysis.append(step_info)
            
        # Overall analysis
        analysis = {
            'reference': ref,
            'candidate_answer': can,
            'num_reasoning_steps': len(steps),
            'reasoning_chain': step_analysis,
            'has_conclusion': any(s['type']=='conclusion' for s in step_analysis),
            'has_evidence': any(s['type']=='supporting_evidence' for s in step_analysis)
        }
        analysis_results.append(analysis)
    
    return analysis_results

# Analyze the chain of thought
cot_analysis = chain_of_thought_analysis(refs, cans)

# Print analysis results
for i, analysis in enumerate(cot_analysis):
    print(f"\nAnalysis for Answer {i+1}:")
    print(f"Number of reasoning steps: {analysis['num_reasoning_steps']}")
    print("\nReasoning chain:")
    for step in analysis['reasoning_chain']:
        print(f"\nStep {step['step_number']} ({step['type']}):")
        print(step['content'])
    print("\nHas conclusion:", analysis['has_conclusion'])
    print("Has supporting evidence:", analysis['has_evidence'])


In [None]:
from bert_score import score

refs = ['Our preliminary results indicate that SAH leads to an increase in NO-M in CSF. This increase of NO-M significantly correlates with the flow velocities in TCDS measurement suggesting that NO plays an important role in the pathogenesis of cerebral vasospasm.',
 'A lack of decline in the AMH level in early pregnancy can be used to identify women with a high probability for preterm birth, especially when MSAFP levels are >1 multiple of the median. Monitoring changes in the AMH level between the first and second trimesters of pregnancy may help identify women who would benefit from interventional therapies such as supplemental progesterone.']
cans = ["Yes, the study found a significant correlation between nitric oxide metabolites in cisternal CSF and cerebral vasospasm in patients with a subarachnoid haemorrhage. The patients who developed clinically symptomatic vasospasm showed significantly higher levels of nitric oxide metabolites in CSF compared to the patients with an uncomplicated follow-up. This suggests that nitric oxide metabolites may play a role in the development of cerebral vasospasm in patients with subarachnoid haemorrhage.\n\nThe study's findings are consistent with the idea that nitric oxide (NO) is involved in the pathogenesis of cerebral vasospasm. NO is a key molecule in the regulation of vascular tone, and its metabolites can be measured in cisternal CSF as a marker of NO production. The fact that patients with cerebral vasospasm showed higher levels of NO metabolites in CSF suggests that NO production is increased in these patients, which may contribute to the development of vasospasm.\n\nThe study's results also have implications for the treatment of cerebral vasospasm in patients with subarachnoid haemorrhage. The use of nitric oxide donors or other NO-releasing agents may be beneficial in preventing or treating cerebral vasospasm. Additionally, the study's findings suggest that monitoring NO metabolites in cisternal CSF may be a useful tool in identifying patients at risk of developing cerebral vasospasm.\n\nOverall, the study provides evidence that nitric oxide metabolites in cisternal CSF are associated with cerebral vasospasm in patients with subarachnoid haemorrhage. Further studies are needed to confirm these findings and to explore the potential therapeutic applications of NO-releasing agents in the treatment of cerebral vasospasm.",
 'Yes, changes in antimüllerian hormone (AMH) levels in early pregnancy are associated with preterm birth. This association was found after adjusting for other markers of fetoplacental health, such as maternal serum α-fetoprotein (MSAFP) and maternal weight change between the first and second trimesters. Specifically, women with a stable or rising AMH level in early pregnancy and an MSAFP >1 multiple of the median were at higher risk for preterm birth.\n\nThe study found that second trimester AMH levels were not associated with preterm birth, either independently or after controlling for other markers of fetoplacental health. However, a statistically significant association was found after adjusting for MSAFP and maternal weight change. This suggests that AMH levels may be a useful marker for identifying women at higher risk for preterm birth, particularly when combined with other markers of fetoplacental health.\n\nThe study also found that most of the risk for preterm birth was identified in women with an MSAFP >1 multiple of the median and who had a stable or rising AMH level in early pregnancy. This suggests that women with high MSAFP levels and stable or rising AMH levels may be at higher risk for preterm birth, and that these women may benefit from closer monitoring and interventions to reduce the risk of preterm birth.\n\nOverall, the study suggests that changes in AMH levels in early pregnancy are associated with preterm birth, and that AMH levels may be a useful marker for identifying women at higher risk for preterm birth. However, further research is needed to confirm these findings and to determine the clinical utility of AMH levels as a marker for preterm birth.']

# 计算 BERTScore
P, R, F1 = score(cans, refs, lang="zh")

print(f"Precision: {P}")
print(f"Recall: {R}") 
print(f"F1: {F1}")