# **Fine-tuned LLM Comparison Notebook 4**

This notebook compares results of the LoRA (Low Rank Adaptation) fine-tuned alpaca-native (fine-tuned on custom text summarisation data) to the base pretrained Flan-T5-small, Flan-T5-base, Alpaca-native and the Vicuna-13b-v1.3 LLMs.

Few-shots prompting is used for all LLMs in this notebook to generate summaries

In [1]:
import os, warnings, torch, gc
import pandas as pd
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer, 
    LlamaTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    AutoModelForSeq2SeqLM, 
    AutoConfig,
    pipeline
)
from datasets import load_dataset
from datasets.dataset_dict import DatasetDict
from peft import PeftConfig, PeftModel
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable, Union, Any, Optional

warnings.filterwarnings("ignore")

os.environ["EMBEDDINGS_MODEL"] = "all-MiniLM-L12-v2"
os.environ["MAX_TOKENS"] = "2048"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["ALPACA_NATIVE_ID"] = "alpaca-native_finetuned_results"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["FLAN_T5_SMALL_LLM"] = "google/flan-t5-small"
os.environ["FLAN_T5_BASE_LLM"] = "google/flan-t5-base"
os.environ["VICUNA_LLM"] = "lmsys/vicuna-13b-v1.3"

In [2]:
def run_on_test_data(
    model: Union[AutoModelForSeq2SeqLM, AutoModelForCausalLM, LlamaForCausalLM], 
    tokenizer: Union[AutoTokenizer, LlamaTokenizer], 
    dataset_key: str="test",
    task="text2text-generation",
    n_docs: int = 5,
    log_summary: bool=False,
    log_metrics: bool=False,
    summary_chain_kwargs: Dict[str, Any]={},
    **kwargs):
    
    # switch model to eval mode
    model.eval()
    
    # Define model pipeline for inference with langchain
    kwargs = {**dict(temperature=0.1, top_p=0.15, top_k=0, repetition_penalty=1.1), **kwargs}

    # define model pipeline
    hgf_pipeline = pipeline(
        task=task, 
        model=model, 
        tokenizer=tokenizer,
        max_length=int(os.environ["MAX_TOKENS"]),
        **kwargs
    )
    llm = HuggingFacePipeline(pipeline=hgf_pipeline)
    
    # Define Summary chain
    summary_chain_kwargs = {"chain_type": "map_reduce", **summary_chain_kwargs}
    summary_chain = load_summarize_chain(llm, **summary_chain_kwargs)
    
    # Load dataset
    dataset: DatasetDict = load_dataset(path=os.environ["DATASET_PATH"])
    
    # Generate Summaries and Measure Performance (Rouge Metric and Cosine Similarity Metric)
    rouge = Rouge()  # rouge metric object
    embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"]) # embeddings model object
    embeddings_model.to(os.environ["DEVICE"])
    document_ids = dataset[dataset_key]["id"][:n_docs]
    documents = dataset[dataset_key]["document"][:n_docs]
    target_summaries = dataset[dataset_key]["summary"][:n_docs]
    _zipped = zip(document_ids, documents, target_summaries)
    metrics_values: Iterable[Dict[str, Any]] = []

    for i, (document_id, document, target_summary) in enumerate(_zipped):
        document = Document(page_content=document)
        try:
            generated_summary = summary_chain.run([document])
        except ValueError as e:
            print(f"Error summarizing document-{i+1}: {e}")
            continue
            
        if log_summary:
            print(f"DOCUMENT {document_id}: {document}\n")
            print(f"GENERATED SUMARY: {generated_summary}\n")
            print(f"TARGET SUMARY: {target_summary}\n")
        
        generated_summary_embeddings, target_summary_embeddings = (
            embeddings_model.encode(generated_summary).reshape(1, -1),
            embeddings_model.encode(target_summary).reshape(1, -1)
        )
        try:
            cos_similarity = cosine_similarity(target_summary_embeddings, generated_summary_embeddings)
            rouge_scores = rouge.get_scores(generated_summary, target_summary)
        except ValueError as e:
            print(e)
            cos_similarity = [[0.0]]
            rouge_scores = [{
                'rouge-1': {
                    'r': 0.0, 
                    'p': 0.0, 
                    'f': 0.0,
                }, 
                'rouge-2': {
                    'r': 0.0, 
                    'p': 0.0, 
                    'f': 0.0,
                }, 
                'rouge-l': {
                    'r': 0.0,
                    'p': 0.0,
                    'f': 0.0,
                }
            }] 
        if log_metrics:
            print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
            print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n")
            
        if log_metrics or log_summary:
            print("-"*120)
            print("\n")
        _metric = dict(semantic_similarity=cos_similarity, rouge_scores=rouge_scores)
        metrics_values.append(_metric)
        
    return metrics_values

In [3]:
INFERENCE_DATASET_KEY = "test"
N_INFERENCE_DOCS = 10
LOG_SUMMARY = True
LOG_METRICS = True
DELETE_LLM_AFTER_USE = True

## Define Few Shots Prompts

In [4]:
PROMPT_TEMPLATE = """:

Using the following document and summarisation pair examples below:

EXAMPLES:

Example 1:
-----------------------------------
"document": "Medical Center-Lakeside, Lubbock, Texas, 79410, United States|Texas Tech University Health Sciences \
Center-Lubbock, Lubbock, Texas, 79430, United States|MD Anderson Regional Care Center-Bay Area, Nassau Bay, Texas, \
77058, United States|University of Texas Health Science Center at San Antonio, San Antonio, Texas, 78229, United States|MD\
Anderson Regional Care Center-Sugar Land, Sugar Land, Texas, 77478, United States|MD Anderson Regional Care Center-The \
Woodlands, The Woodlands, Texas, 77384, United States|American Fork Hospital / Huntsman Intermountain Cancer Center, \
American Fork, Utah, 84003, United States|Sandra L Maxwell Cancer Center, Cedar City, Utah, 84720, United States|Logan\
Regional Hospital, Logan, Utah, 84321, United States|Intermountain Medical Center, Murray, Utah, 84107, United \
States|McKay-Dee Hospital Center, Ogden, Utah, 84403, United States|Utah Valley Regional Medical Center, Provo, Utah, \
84604, United States|Dixie Medical Center Regional Cancer Center, Saint George, Utah, 84770, United",

"summary": "There are medical and cancer centers in Texas and Utah in the US."


Example 2:
-----------------------------------
"document": "Wisconsin, 54449, United States|Marshfield Medical Center, Marshfield, Wisconsin, 54449, United \
States|Community Memorial Hospital, Menomonee Falls, Wisconsin, 53051, United States|Aurora Cancer Care-Milwaukee, \
Milwaukee, Wisconsin, 53209, United States|Aurora Saint Luke's Medical Center, Milwaukee, Wisconsin, 53215, United \
States|Froedtert and the Medical College of Wisconsin, Milwaukee, Wisconsin, 53226, United States|Aurora Sinai Medical Center, \
Milwaukee, Wisconsin, 53233, United States|Marshfield Clinic-Minocqua Center, Minocqua, Wisconsin, 54548, United \
States|ProHealth D N Greenwald Center, Mukwonago, Wisconsin, 53149, United States|Cancer Center of Western Wisconsin, \
New Richmond, Wisconsin, 54017, United States|ProHealth Oconomowoc Memorial Hospital, Oconomowoc, Wisconsin, 53066, \
United States|Vince Lombardi Cancer Clinic - Oshkosh, Oshkosh, Wisconsin, 54904, United States|Aurora Cancer Care-Racine, \
Racine, Wisconsin, 53406, United States|Marshfield Clinic at James Beck Cancer Center, Rhinelander,",

"summary": "The list provides information about medical centers and cancer clinics in different cities in Wisconsin, USA."


Generate a concise summary of the text below:

"{text}"


SUMMARY:"""

PROMPT = PromptTemplate.from_template(PROMPT_TEMPLATE)

## Evaluate Base Pretrained Alpaca Native

In [5]:
alpaca_config = PeftConfig.from_pretrained(os.environ["ALPACA_NATIVE_ID"])

# load alpaca base model and tokenizer
alpaca_model = LlamaForCausalLM.from_pretrained(
    alpaca_config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)

alpaca_tokenizer = LlamaTokenizer.from_pretrained(
    alpaca_config.base_model_name_or_path,
    max_length=os.environ["MAX_TOKENS"],
)

lora_alpaca_native_performance = run_on_test_data(
    alpaca_model, 
    alpaca_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Token indices sequence length is longer than the specified maximum sequence length for this model (887 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 1398: page_content='NCT Number: NCT03049449\nStudy Title: T Cells Expressing a Fully-Human Anti-CD30 Chimeric Antigen Receptor for Treating CD30-Expressing Lymphomas\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03049449\nAcronym: unknown\nStudy Status: COMPLETED\nBrief Summary: Background:' metadata={}

GENERATED SUMARY:  This study is a Phase I clinical trial evaluating the safety and efficacy of T cells expressing a fully-human anti-CD30 chimeric antigen receptor (CAR) in patients with CD30-expressing lymphoma. The study will assess response rate, duration of response, and safety.

TARGET SUMARY: A study was conducted to explore the use of T cells with a specific receptor for treating lymphomas. The study has been completed, but no additional details are available.

Cosine similarity for summary 1: 0.7231908 

Rouge scores for summary 1: {'rouge-1': {'r': 0.32142857142857145, 'p': 0.2571428571428571, 'f': 0.2857142807760142}, 'rouge-2': {'r': 0.10714285714285714, 'p'

## Evaluate LoRA Fine-tuned Alpaca Native

In [6]:
# load LoRA peft alpaca model
alpaca_model = PeftModel.from_pretrained(
    alpaca_model, 
    os.environ["ALPACA_NATIVE_ID"],
    device_map="auto"
)

alpaca_native_performance = run_on_test_data(
    alpaca_model, 
    alpaca_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del alpaca_model, alpaca_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'ReformerModelWithLMHead', 'RemBertForCausal

DOCUMENT 1398: page_content='NCT Number: NCT03049449\nStudy Title: T Cells Expressing a Fully-Human Anti-CD30 Chimeric Antigen Receptor for Treating CD30-Expressing Lymphomas\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03049449\nAcronym: unknown\nStudy Status: COMPLETED\nBrief Summary: Background:' metadata={}

GENERATED SUMARY:  Don

TARGET SUMARY: A study was conducted to explore the use of T cells with a specific receptor for treating lymphomas. The study has been completed, but no additional details are available.

Cosine similarity for summary 1: 0.12988631 

Rouge scores for summary 1: {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}} 

------------------------------------------------------------------------------------------------------------------------


DOCUMENT 2909: page_content='Interventions: DRUG: Ibrutinib|OTHER: Laboratory Biomarker Analysis\nPrimary Outcome Measures: Incidence of 

Token indices sequence length is longer than the specified maximum sequence length for this model (2523 > 1024). Running this sequence through the model will result in indexing errors
Input length of input_ids is 2701, but `max_length` is set to 2048. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


DOCUMENT 4698: page_content='54476, United States|Aspirus UW Cancer Center, Wisconsin Rapids, Wisconsin, 54494, United States|Marshfield Clinic - Wisconsin Rapids Center, Wisconsin Rapids, Wisconsin, 54494, United States|Cheyenne Regional Medical Center-West, Cheyenne, Wyoming, 82001, United States|Billings Clinic-Cody, Cody, Wyoming, 82414, United States|Welch Cancer Center, Sheridan, Wyoming, 82801, United States' metadata={}

GENERATED SUMARY:  

TARGET SUMARY: This summary provides a list of medical centers in various cities in the United States, including Wisconsin Rapids, Wisconsin, and several cities in Wyoming. The medical centers mentioned include cancer centers and general medical facilities.

Cosine similarity for summary 3: -0.040348034 

Rouge scores for summary 3: {'rouge-1': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.0, 'p': 0.0, 'f': 0.0}} 

-------------------------------------------------------------------------------

Token indices sequence length is longer than the specified maximum sequence length for this model (2054 > 1024). Running this sequence through the model will result in indexing errors
Input length of input_ids is 2462, but `max_length` is set to 2048. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


DOCUMENT 3405: page_content='Other Outcome Measures: unknown\nSponsor: Emory University\nCollaborators: Bristol-Myers Squibb|Oncolytics Biotech|University of Utah|City of Hope Medical Center|Phylogeny|National Cancer Institute (NCI)|National Institutes of Health (NIH)\nSex: ALL\nAge: ADULT, OLDER_ADULT\nPhases: PHASE1\nEnrollment: 23\nFunder Type: OTHER\nStudy Type: INTERVENTIONAL\nStudy Design: Allocation: NON_RANDOMIZED|Intervention Model: PARALLEL|Masking: NONE|Primary Purpose: TREATMENT\nOther IDs: IRB00104234|NCI-2018-01217|Winship4398-18|P30CA138292\nStart Date: 2018-10-24\nPrimary Completion Date: 2024-05-05\nCompletion Date: 2024-05-05\nFirst Posted: 2018-07-30\nResults First Posted: unknown\nLast Update Posted: 2023-05-31\nLocations: Emory University Hospital/Winship Cancer Institute, Atlanta, Georgia, 30322, United States|Huntsman Cancer Institute/University of Utah, Salt Lake City, Utah, 84112, United States\nStudy Documents: Informed Consent Form' metadata={}

GENERATED SUM

Token indices sequence length is longer than the specified maximum sequence length for this model (2391 > 1024). Running this sequence through the model will result in indexing errors
Input length of input_ids is 2576, but `max_length` is set to 2048. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


DOCUMENT 330: page_content='10 months|Pharmacokinetic Profile of SX-682 as a Single Agent and in Combination, Plasma and serum samples will be drawn and may be analyzed by a validated immunoassay to quantitate SX-682 as a single agent and in combination or by a validated electrochemiluminescence immunoassay to detect the presence of M7824 concentration., Predose, 30 minutes, 60 minutes, 120 minutes, <6 hours of 2nd dose, and end of infusion (EOI)|Pharmacodynamic Profile of SX-682 as a Single Agent and in Combination, Plasma and serum samples will be drawn and may be analyzed by a validated immunoassay to quantitate SX-682 as a single agent and in combination or by a validated electrochemiluminescence immunoassay to detect the presence of M7824 concentration., Predose, 30 minutes, 60 minutes, 120 minutes, <6 hours of 2nd dose, and end of infusion (EOI)' metadata={}

GENERATED SUMARY:  Ne

TARGET SUMARY: This study will analyze the effects of SX-682 alone and in combination with M7824 by

## Evaluate Base Pre-trained FLAN-T5-SMALL

In [7]:
flan_t5_small_model = AutoModelForSeq2SeqLM.from_pretrained(
    os.environ["FLAN_T5_SMALL_LLM"],  
    load_in_8bit=True,  
    device_map="auto",
)
flan_t5_small_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["FLAN_T5_SMALL_LLM"], 
    max_length=os.environ["MAX_TOKENS"],
)

flan_t5_small_performance = run_on_test_data(
    flan_t5_small_model, 
    flan_t5_small_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text2text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del flan_t5_small_model, flan_t5_small_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Token indices sequence length is longer than the specified maximum sequence length for this model (792 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 1398: page_content='NCT Number: NCT03049449\nStudy Title: T Cells Expressing a Fully-Human Anti-CD30 Chimeric Antigen Receptor for Treating CD30-Expressing Lymphomas\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03049449\nAcronym: unknown\nStudy Status: COMPLETED\nBrief Summary: Background:' metadata={}

GENERATED SUMARY: The study titled "NCT Number: NCT03049449 Study Title: T Cells Expressing a Fully-Human Anti-CD30 Chimeric Antigen Receptor for Treating CD30-Expressing Lymphomas Study URL: https://beta.clinicaltrials.gov/study/NCT03049449"

TARGET SUMARY: A study was conducted to explore the use of T cells with a specific receptor for treating lymphomas. The study has been completed, but no additional details are available.

Cosine similarity for summary 1: 0.6818962 

Rouge scores for summary 1: {'rouge-1': {'r': 0.17857142857142858, 'p': 0.2, 'f': 0.18867924029903896}, 'rouge-2': {'r': 0.03571428571428571, 'p': 0.04, 'f': 0.037735844072624376}, 'rouge-l': {'r': 0.1

Token indices sequence length is longer than the specified maximum sequence length for this model (2279 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 1949: page_content="Cancer Center at Cooper-Voorhees, Voorhees, New Jersey, 08043, United States|University of New Mexico Cancer Center, Albuquerque, New Mexico, 87102, United States|Southwest Gynecologic Oncology Associates Inc, Albuquerque, New Mexico, 87106, United States|Lovelace Radiation Oncology, Albuquerque, New Mexico, 87109, United States|Memorial Medical Center - Las Cruces, Las Cruces, New Mexico, 88011, United States|Women's Cancer Care Associates LLC, Albany, New York, 12208, United States|Montefiore Medical Center-Einstein Campus, Bronx, New York, 10461, United States|State University of New York Downstate Medical Center, Brooklyn, New York, 11203, United States|New York-Presbyterian/Brooklyn Methodist Hospital, Brooklyn, New York, 11215, United States|Roswell Park Cancer Institute, Buffalo, New York, 14263, United States|Memorial Sloan Kettering Commack, Commack, New York, 11725, United States|New York Hospital Medical Center of Queens, Fresh Meadows, New York,

Token indices sequence length is longer than the specified maximum sequence length for this model (1658 > 1024). Running this sequence through the model will result in indexing errors


DOCUMENT 3655: page_content='* Complete response (CR): Disappearance of all target lesions. Any pathological lymph nodes (whether target or non-target) must have reduction in short axis to \\<10 mm. Disappearance of all non-target lesions and normalization of tumor marker level.\n* Partial response (PR): At least a 30% decrease in the sum of the diameters of target lesions, taking as reference the baseline sum diameters.' metadata={}

GENERATED SUMARY: The list includes the following: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list includes: "The list 

## Evaluate Base Pretrained FLAN-T5-BASE

In [8]:
flan_t5_base_model = AutoModelForSeq2SeqLM.from_pretrained(
    os.environ["FLAN_T5_BASE_LLM"], 
    load_in_8bit=True,  
    device_map="auto",
)

flan_t5_base_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["FLAN_T5_BASE_LLM"],
    max_length=os.environ["MAX_TOKENS"],
)

flan_t5_base_performance = run_on_test_data(
    flan_t5_base_model, 
    flan_t5_base_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text2text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del flan_t5_base_model, flan_t5_base_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Token indices sequence length is longer than the specified maximum sequence length for this model (745 > 512). Running this sequence through the model will result in indexing errors


DOCUMENT 1398: page_content='NCT Number: NCT03049449\nStudy Title: T Cells Expressing a Fully-Human Anti-CD30 Chimeric Antigen Receptor for Treating CD30-Expressing Lymphomas\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03049449\nAcronym: unknown\nStudy Status: COMPLETED\nBrief Summary: Background:' metadata={}

GENERATED SUMARY: A new antigen receptor can be used to treat cancer.

TARGET SUMARY: A study was conducted to explore the use of T cells with a specific receptor for treating lymphomas. The study has been completed, but no additional details are available.

Cosine similarity for summary 1: 0.5890186 

Rouge scores for summary 1: {'rouge-1': {'r': 0.10714285714285714, 'p': 0.3, 'f': 0.157894732963989}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.07142857142857142, 'p': 0.2, 'f': 0.10526315401662063}} 

------------------------------------------------------------------------------------------------------------------------


DOCUMENT 2909: page_content='

## Evaluate Base Pretrained Vicuna 13b v1.3

In [9]:
vicuna_model = AutoModelForCausalLM.from_pretrained(
    os.environ["VICUNA_LLM"],
    load_in_8bit=True,
    device_map="auto",
)

vicuna_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["VICUNA_LLM"], 
    max_length=os.environ["MAX_TOKENS"],
)

vicuna_13b_performance = run_on_test_data(
    vicuna_model, 
    vicuna_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
    summary_chain_kwargs={"combine_prompt": PROMPT},
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del vicuna_model, vicuna_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

DOCUMENT 1398: page_content='NCT Number: NCT03049449\nStudy Title: T Cells Expressing a Fully-Human Anti-CD30 Chimeric Antigen Receptor for Treating CD30-Expressing Lymphomas\nStudy URL: https://beta.clinicaltrials.gov/study/NCT03049449\nAcronym: unknown\nStudy Status: COMPLETED\nBrief Summary: Background:' metadata={}

GENERATED SUMARY: 
This study is evaluating the safety and effectiveness of using T cells with a human anti-CD30 CAR for treating patients with CD30-expressing lymphomas. Results have not been released yet.

TARGET SUMARY: A study was conducted to explore the use of T cells with a specific receptor for treating lymphomas. The study has been completed, but no additional details are available.

Cosine similarity for summary 1: 0.68136454 

Rouge scores for summary 1: {'rouge-1': {'r': 0.39285714285714285, 'p': 0.39285714285714285, 'f': 0.3928571378571429}, 'rouge-2': {'r': 0.14285714285714285, 'p': 0.14285714285714285, 'f': 0.14285713785714302}, 'rouge-l': {'r': 0.3928571

In [10]:
def compile_metrics(performance: Iterable[Dict[str, Any]], outer_key: Optional[str]=None) -> Dict[str, Iterable]:
    performance_table_dict = dict()
    performance_dict = dict()
    join_keys = lambda inner_key : (outer_key, inner_key) if outer_key else inner_key
    
    for i, item in enumerate(performance):
        similarity_score = item["semantic_similarity"][0][0]
        rouge_scores = item["rouge_scores"][0]
        if i == 0:
            performance_dict[join_keys("similarity_score")]=[similarity_score]
        else:
            performance_dict[join_keys("similarity_score")].append(similarity_score)
        for key in rouge_scores.keys():
            if i == 0:
                performance_dict = {
                    **performance_dict, 
                    **{join_keys(f"{key}_{k}"):[v] for k, v in rouge_scores[key].items()}
                }
            else:
                for k, v in rouge_scores[key].items():
                    performance_dict[join_keys(f"{key}_{k}")].append(v)
    return performance_dict

In [11]:
performances = {
    "alpaca-7b-native": alpaca_native_performance,
    "lora-alpaca-7b-native": lora_alpaca_native_performance,
    "flan-t5-small": flan_t5_small_performance,
    "flan-t5-base": flan_t5_base_performance,
    "vicuna-13b-v1.3": vicuna_13b_performance,
}
metrics_dict = dict()
for key, performance_values in performances.items():
    metrics_dict = {**metrics_dict, **compile_metrics(performance_values, outer_key=key)}
    
outer_cols = [k for k in performances.keys()]
inner_cols = [k[1] for k in metrics_dict.keys() if k[0]=="flan-t5-small"]
columns = pd.MultiIndex.from_product([outer_cols, inner_cols])
metrics_df = pd.DataFrame(metrics_dict, columns=columns).round(2)

## Performance Comparison Table

In [12]:
pd.set_option('display.max_columns', None)
metrics_df

Unnamed: 0_level_0,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3
Unnamed: 0_level_1,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.72,0.32,0.26,0.29,0.11,0.08,0.09,0.29,0.23,0.25,0.68,0.18,0.2,0.19,0.04,0.04,0.04,0.18,0.2,0.19,0.59,0.11,0.3,0.16,0.0,0.0,0.0,0.07,0.2,0.11,0.68,0.39,0.39,0.39,0.14,0.14,0.14,0.39,0.39,0.39
1,0.64,0.29,0.19,0.23,0.11,0.06,0.08,0.29,0.19,0.23,0.84,0.45,0.29,0.35,0.23,0.15,0.18,0.45,0.29,0.35,0.8,0.16,0.29,0.21,0.03,0.06,0.04,0.13,0.24,0.17,0.78,0.19,0.35,0.25,0.06,0.12,0.08,0.16,0.29,0.21,0.97,0.71,0.45,0.55,0.51,0.31,0.39,0.68,0.43,0.52
2,-0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.65,0.33,0.24,0.28,0.09,0.06,0.08,0.3,0.22,0.25,0.58,0.33,0.29,0.31,0.09,0.08,0.08,0.26,0.23,0.24,0.58,0.48,0.57,0.52,0.19,0.22,0.2,0.44,0.52,0.48,0.65,0.26,0.5,0.34,0.06,0.15,0.09,0.26,0.5,0.34
3,0.78,0.48,0.24,0.32,0.26,0.11,0.15,0.48,0.24,0.32,0.77,0.19,0.36,0.25,0.04,0.09,0.06,0.19,0.36,0.25,0.7,0.24,0.17,0.2,0.09,0.05,0.07,0.19,0.13,0.16,0.5,0.14,0.15,0.15,0.0,0.0,0.0,0.14,0.15,0.15,0.79,0.33,0.47,0.39,0.09,0.14,0.11,0.33,0.47,0.39
4,0.59,0.38,0.23,0.29,0.15,0.08,0.11,0.36,0.21,0.27,0.96,0.41,0.76,0.53,0.31,0.67,0.42,0.41,0.76,0.53,0.12,0.13,0.38,0.19,0.0,0.0,0.0,0.1,0.31,0.15,0.64,0.08,0.38,0.13,0.0,0.0,0.0,0.08,0.38,0.13,0.89,0.54,0.48,0.51,0.29,0.28,0.29,0.54,0.48,0.51
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.28,0.39,0.33,0.08,0.12,0.09,0.25,0.35,0.29,0.16,0.01,0.12,0.02,0.0,0.0,0.0,0.01,0.12,0.02,0.36,0.05,0.24,0.09,0.0,0.0,0.0,0.05,0.24,0.09,0.53,0.25,0.37,0.3,0.06,0.1,0.07,0.23,0.33,0.27
6,0.59,0.69,0.13,0.22,0.46,0.07,0.12,0.62,0.12,0.2,0.67,0.38,0.22,0.28,0.0,0.0,0.0,0.31,0.17,0.22,0.68,0.38,0.19,0.26,0.0,0.0,0.0,0.31,0.15,0.21,0.72,0.15,0.22,0.18,0.0,0.0,0.0,0.15,0.22,0.18,0.85,0.46,0.43,0.44,0.15,0.14,0.15,0.38,0.36,0.37
7,0.12,0.11,0.15,0.13,0.02,0.03,0.02,0.11,0.15,0.13,0.68,0.29,0.34,0.31,0.09,0.13,0.11,0.26,0.31,0.28,0.09,0.03,0.14,0.05,0.0,0.0,0.0,0.03,0.14,0.05,0.29,0.11,0.4,0.18,0.02,0.11,0.04,0.09,0.3,0.13,0.83,0.46,0.42,0.44,0.11,0.12,0.11,0.37,0.34,0.36
8,0.13,0.19,0.21,0.2,0.02,0.02,0.02,0.19,0.21,0.2,0.96,0.56,0.71,0.62,0.3,0.41,0.35,0.53,0.68,0.59,0.9,0.39,0.45,0.42,0.22,0.29,0.25,0.33,0.39,0.36,0.69,0.14,0.56,0.22,0.08,0.38,0.12,0.14,0.56,0.22,0.85,0.25,0.31,0.28,0.12,0.16,0.14,0.25,0.31,0.28
9,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,0.5,0.52,0.51,0.19,0.25,0.22,0.42,0.44,0.43,0.79,0.35,0.53,0.42,0.19,0.35,0.25,0.35,0.53,0.42,0.83,0.5,0.54,0.52,0.19,0.22,0.21,0.42,0.46,0.44,0.92,0.5,0.52,0.51,0.19,0.23,0.21,0.46,0.48,0.47


## Mean Performance Comparison Table

In [13]:
metrics_mean = metrics_df.mean(axis=0)
mean_metrics_df = pd.DataFrame(metrics_mean.values.reshape(1, -1), columns=metrics_mean.index)
mean_metrics_df

Unnamed: 0_level_0,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,lora-alpaca-7b-native,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-small,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,flan-t5-base,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3,vicuna-13b-v1.3
Unnamed: 0_level_1,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f,similarity_score,rouge-1_r,rouge-1_p,rouge-1_f,rouge-2_r,rouge-2_p,rouge-2_f,rouge-l_r,rouge-l_p,rouge-l_f
0,0.289,0.214,0.115,0.139,0.102,0.037,0.05,0.205,0.112,0.135,0.755,0.371,0.409,0.375,0.144,0.196,0.16,0.341,0.381,0.344,0.55,0.22,0.276,0.227,0.066,0.087,0.073,0.189,0.244,0.197,0.598,0.195,0.371,0.24,0.054,0.105,0.065,0.174,0.332,0.214,0.796,0.415,0.434,0.415,0.172,0.177,0.17,0.389,0.409,0.39
