In [1]:
import os, warnings, torch, json, random, gc
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer, 
    LlamaTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    AutoModelForSeq2SeqLM, 
    AutoConfig,
    LlamaConfig,
    pipeline
)
from datasets import load_dataset, concatenate_datasets
from datasets.dataset_dict import DatasetDict
from peft import PeftConfig
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable, Union, Any

warnings.filterwarnings("ignore")

os.environ["EMBEDDINGS_MODEL"] = "all-MiniLM-L12-v2"
os.environ["MAX_TOKENS"] = "2048"
os.environ["DEVICE"] = "cuda:0" if torch.cuda.is_available() else "cpu"
os.environ["DATASET_PATH"] = "data/doc_summary_data"
os.environ["FINETUNED_FLAN_T5_ID"] = "flan-t5-small_finetuned_results"
os.environ["ALPACA_LLM"] = "chavinlo/alpaca-native"
os.environ["VICUNA_LLM"] = "lmsys/vicuna-13b-v1.3"

In [2]:
def run_on_test_data(
    model: Union[AutoModelForSeq2SeqLM, AutoModelForCausalLM, LlamaForCausalLM], 
    tokenizer: Union[AutoTokenizer, LlamaTokenizer], 
    dataset_key: str="test",
    task="text2text-generation",
    n_docs: int = 5,
    log_summary: bool=False,
    log_metrics: bool=False,
    **kwargs):
    
    # switch model to eval mode
    model.eval()
    
    # Define model pipeline for inference with langchain
    kwargs = {**dict(temperature=0.1, top_p=0.15, top_k=0, repetition_penalty=1.1), **kwargs}

    # define model pipeline
    hgf_pipeline = pipeline(
        task=task, 
        model=model, 
        tokenizer=tokenizer,
        max_length=int(os.environ["MAX_TOKENS"]),
        **kwargs
    )
    llm = HuggingFacePipeline(pipeline=hgf_pipeline)
    
    # Define Summary chain
    summary_chain = load_summarize_chain(llm, chain_type="map_reduce")
    
    # Load dataset
    dataset = load_dataset(path=os.environ["DATASET_PATH"])
    
    # Generate Summaries and Measure Performance (Rouge Metric and Cosine Similarity Metric)
    rouge = Rouge()  # rouge metric object
    embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"]) # embeddings model object
    embeddings_model.to(os.environ["DEVICE"])
    documents = dataset[dataset_key]["document"][:n_docs]
    target_summaries = dataset[dataset_key]["summary"][:n_docs]
    _zipped = zip(documents, target_summaries)
    metrics_values: Iterable[Dict[str, Any]] = []

    for i, (document, target_summary) in enumerate(_zipped):
        document = Document(page_content=document)
        try:
            generated_summary = summary_chain.run([document])
        except ValueError as e:
            print(f"Error summarizing document-{i+1}: {e}")
            continue
            
        if log_summary:
            print(f"GENERATED SUMARY: {generated_summary}\n")
            print(f"TARGET SUMARY: {target_summary}\n")

        generated_summary_embeddings, target_summary_embeddings = (
            embeddings_model.encode(generated_summary).reshape(1, -1),
            embeddings_model.encode(target_summary).reshape(1, -1)
        )
        cos_similarity = cosine_similarity(target_summary_embeddings, generated_summary_embeddings)
        rouge_scores = rouge.get_scores(generated_summary, target_summary)
        if log_metrics:
            print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
            print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n")
            
        if log_metrics or log_summary:
            print("\n")
        _metric = dict(semantic_similarity=cos_similarity, rouge_scores=rouge_scores)
        metrics_values.append(_metric)
        
    return metrics_values

In [3]:
INFERENCE_DATASET_KEY = "test"
N_INFERENCE_DOCS = 5
LOG_SUMMARY = True
LOG_METRICS = True
DELETE_LLM_AFTER_USE = True

In [4]:
# Load peft config for pre-trained checkpoint etc. 
config = PeftConfig.from_pretrained(os.environ["FINETUNED_FLAN_T5_ID"])

# load base LLM model and tokenizer
flan_t5_model = AutoModelForSeq2SeqLM.from_pretrained(
    config.base_model_name_or_path,  
    load_in_8bit=True,  
    device_map="auto",
)
flan_t5_tokenizer = AutoTokenizer.from_pretrained(
    config.base_model_name_or_path, 
    max_length=os.environ["MAX_TOKENS"],
)

flan_t5_performance = run_on_test_data(
    flan_t5_model, 
    flan_t5_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text2text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del flan_t5_model, flan_t5_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/json/doc_summary_data-786ffbbe80ba07a0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/3 [00:00<?, ?it/s]

GENERATED SUMARY: Study of Gemcitabine, Abraxane® Plus Placebo Versus Gemcitabine, Abraxane® Plus 1 or 2 Truncated Courses of Demcizumab in Subjects With 1st-Line Metastatic Pancreatic Ductal Adenocarcinoma Study

TARGET SUMARY: The completed study, known as YOSEMITE, investigated the treatment of 1st-line metastatic pancreatic ductal adenocarcinoma. It was a randomized, double-blind study with three arms. The study has the NCT Number NCT02289898.

Cosine similarity for summary 1: 0.57218146 

Rouge scores for summary 1: {'rouge-1': {'r': 0.03571428571428571, 'p': 0.047619047619047616, 'f': 0.04081632163265365}, 'rouge-2': {'r': 0.0, 'p': 0.0, 'f': 0.0}, 'rouge-l': {'r': 0.03571428571428571, 'p': 0.047619047619047616, 'f': 0.04081632163265365}} 





Token indices sequence length is longer than the specified maximum sequence length for this model (993 > 512). Running this sequence through the model will result in indexing errors


GENERATED SUMARY: Demcizumab is a standard treatment for patients with metastatic pancreatic ductal adenocarcinoma.

TARGET SUMARY: This study evaluated the effectiveness and safety of demcizumab in combination with gemcitabine and Abraxane® for treating metastatic pancreatic ductal adenocarcinoma. The study, sponsored by OncoMed Pharmaceuticals, Inc., included 207 participants. The primary outcome measures were the progression-free survival rates in the placebo and demcizumab arms. The results were initially published in May 2018.

Cosine similarity for summary 2: 0.86372316 

Rouge scores for summary 2: {'rouge-1': {'r': 0.13333333333333333, 'p': 0.5, 'f': 0.210526312465374}, 'rouge-2': {'r': 0.05555555555555555, 'p': 0.2727272727272727, 'f': 0.09230768949585806}, 'rouge-l': {'r': 0.1111111111111111, 'p': 0.4166666666666667, 'f': 0.17543859316712837}} 



GENERATED SUMARY: The United States has a population of 74,769, and a population of 87,769, in the United States.

TARGET SUMARY: 

In [5]:
alpaca_config = AutoConfig.from_pretrained(os.environ["ALPACA_LLM"])

alpaca_model = LlamaForCausalLM.from_pretrained(
    os.environ["ALPACA_LLM"],
    load_in_8bit=True,
    device_map="auto",
)

alpaca_tokenizer = LlamaTokenizer.from_pretrained(
    os.environ["ALPACA_LLM"], 
    max_length=alpaca_config.max_sequence_length,
)

alpaca_performance = run_on_test_data(
    alpaca_model, 
    alpaca_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del alpaca_model, alpaca_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565
Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/json/doc_summary_data-786ffbbe80ba07a0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/3 [00:00<?, ?it/s]

GENERATED SUMARY: 
This study was a randomized, double-blind, 3-arm (1:1:1) trial in subjects with 1st-line metastatic pancreatic ductal adenocarcinoma. The study compared gemcitabine plus Abraxane® plus placebo, gemcitabine plus Abraxane® plus 1 truncated course of demcizumab, and gemcitabine plus Abraxane® plus 2 truncated courses of demcizumab. The primary endpoint was overall survival. Secondary endpoints included progression-free survival, objective response rate, and safety.

TARGET SUMARY: The completed study, known as YOSEMITE, investigated the treatment of 1st-line metastatic pancreatic ductal adenocarcinoma. It was a randomized, double-blind study with three arms. The study has the NCT Number NCT02289898.

Cosine similarity for summary 1: 0.64328766 

Rouge scores for summary 1: {'rouge-1': {'r': 0.42857142857142855, 'p': 0.26666666666666666, 'f': 0.3287671185588291}, 'rouge-2': {'r': 0.23333333333333334, 'p': 0.1320754716981132, 'f': 0.16867469417912626}, 'rouge-l': {'r': 0.

Token indices sequence length is longer than the specified maximum sequence length for this model (1192 > 512). Running this sequence through the model will result in indexing errors


GENERATED SUMARY:  
This clinical trial is studying the efficacy and safety of demcizumab, when given in combination with gemcitabine and Abraxane®, compared to placebo in patients with metastatic pancreatic ductal adenocarcinoma. The primary outcome measure is progression-free survival, and secondary outcome measures include overall survival and safety.

TARGET SUMARY: This study evaluated the effectiveness and safety of demcizumab in combination with gemcitabine and Abraxane® for treating metastatic pancreatic ductal adenocarcinoma. The study, sponsored by OncoMed Pharmaceuticals, Inc., included 207 participants. The primary outcome measures were the progression-free survival rates in the placebo and demcizumab arms. The results were initially published in May 2018.

Cosine similarity for summary 2: 0.9663737 

Rouge scores for summary 2: {'rouge-1': {'r': 0.4444444444444444, 'p': 0.5405405405405406, 'f': 0.4878048730963712}, 'rouge-2': {'r': 0.24074074074074073, 'p': 0.3023255813953

In [6]:
vicuna_model = AutoModelForCausalLM.from_pretrained(
    os.environ["VICUNA_LLM"],
    load_in_8bit=True,
    device_map="auto",
)

vicuna_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["VICUNA_LLM"], 
    max_length=os.environ["MAX_TOKENS"],
)

vicuna_performance = run_on_test_data(
    vicuna_model, 
    vicuna_tokenizer, 
    INFERENCE_DATASET_KEY, 
    task="text-generation", 
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    #model.to(torch.device("cpu"))
    del vicuna_model, vicuna_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Found cached dataset json (/home/ubuntu/.cache/huggingface/datasets/json/doc_summary_data-786ffbbe80ba07a0/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


  0%|          | 0/3 [00:00<?, ?it/s]

GENERATED SUMARY: 
The YOSEMITE clinical trial involved three arms for patients with first-line metastatic pancreatic ductal adenocarcinoma. The study compared gemcitabine, Abraxane® plus placebo to gemcitabine, Abraxane® plus one or two truncated courses of demcizumab.

TARGET SUMARY: The completed study, known as YOSEMITE, investigated the treatment of 1st-line metastatic pancreatic ductal adenocarcinoma. It was a randomized, double-blind study with three arms. The study has the NCT Number NCT02289898.

Cosine similarity for summary 1: 0.768512 

Rouge scores for summary 1: {'rouge-1': {'r': 0.35714285714285715, 'p': 0.3448275862068966, 'f': 0.35087718798399514}, 'rouge-2': {'r': 0.16666666666666666, 'p': 0.16666666666666666, 'f': 0.16666666166666683}, 'rouge-l': {'r': 0.32142857142857145, 'p': 0.3103448275862069, 'f': 0.31578946868574953}} 



GENERATED SUMARY: 
A clinical trial evaluated the effectiveness and safety of demcizumab, a drug used in combination with gemcitabine and Abr