In [1]:
import os, warnings, torch, json, random, gc
from tqdm.notebook import tqdm
from transformers import (
    AutoTokenizer, 
    LlamaTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
    AutoModelForSeq2SeqLM, 
    AutoConfig,
    LlamaConfig,
    pipeline
)
from datasets import load_dataset, concatenate_datasets
from datasets.dataset_dict import DatasetDict
from peft import PeftConfig, PeftModel
from rouge import Rouge
from sentence_transformers import SentenceTransformer
from langchain.llms import HuggingFacePipeline
from langchain.chains.summarize import load_summarize_chain
from langchain.schema import Document
from sklearn.metrics.pairwise import cosine_similarity
from typing import Dict, Iterable, Union, Any

# Silence library warnings so the cell outputs stay readable.
warnings.filterwarnings("ignore")

# Notebook-wide settings are published as environment variables, so every
# value has to be a string; later cells read them back via os.environ[...].
os.environ.update(
    {
        # sentence-transformers model used to embed summaries for cosine similarity
        "EMBEDDINGS_MODEL": "all-MiniLM-L12-v2",
        # token budget handed to the generation pipeline and the tokenizers
        "MAX_TOKENS": "2048",
        # first GPU when CUDA is available, CPU otherwise
        "DEVICE": "cuda:0" if torch.cuda.is_available() else "cpu",
        # path handed to datasets.load_dataset
        "DATASET_PATH": "data/doc_summary_data",
        # fine-tuned PEFT adapter checkpoints
        "FINETUNED_FLAN_T5_SMALL_ID": "flan-t5-small_finetuned_results",
        "FINETUNED_FLAN_T5_BASE_ID": "flan-t5-base_finetuned_results",
        # causal LLMs evaluated alongside the fine-tuned models
        "ALPACA_LLM": "chavinlo/alpaca-native",
        "VICUNA_LLM": "lmsys/vicuna-13b-v1.3",
    }
)

In [2]:
def run_on_test_data(
    model: Union[AutoModelForSeq2SeqLM, AutoModelForCausalLM, LlamaForCausalLM], 
    tokenizer: Union[AutoTokenizer, LlamaTokenizer], 
    dataset_key: str="test",
    task: str="text2text-generation",
    n_docs: int = 5,
    log_summary: bool=False,
    log_metrics: bool=False,
    **kwargs) -> Iterable[Dict[str, Any]]:
    """Summarize the first ``n_docs`` documents of a dataset split and score the results.

    The model is wrapped in a Hugging Face ``pipeline``, exposed to LangChain via
    ``HuggingFacePipeline`` and run through a ``map_reduce`` summarize chain. Each
    generated summary is compared with the dataset's target summary using ROUGE and
    the cosine similarity of sentence-transformer embeddings.

    Args:
        model: Model handed to ``transformers.pipeline``; it is switched to eval mode.
        tokenizer: Tokenizer handed to ``transformers.pipeline``.
        dataset_key: Split of the dataset at ``os.environ["DATASET_PATH"]`` to read.
            The split must expose ``document`` and ``summary`` columns.
        task: Pipeline task name (the notebook uses ``"text2text-generation"`` and
            ``"text-generation"``).
        n_docs: Number of leading rows of the split to evaluate.
        log_summary: Print the generated and target summary of every document.
        log_metrics: Print the cosine similarity and ROUGE scores of every document.
        **kwargs: Extra keyword arguments forwarded to ``transformers.pipeline``.
            They override the defaults ``temperature=0.1, top_p=0.15, top_k=0,
            repetition_penalty=1.1``.

    Returns:
        A list with one dict per successfully scored document, holding
        ``semantic_similarity`` (the array returned by ``cosine_similarity``; the
        score itself is at ``[0][0]``) and ``rouge_scores`` (the list returned by
        ``Rouge.get_scores``). Documents whose summarization or ROUGE scoring raises
        ``ValueError`` are reported on stdout and skipped, so the list can be
        shorter than ``n_docs``.
    """
    # switch model to eval mode
    model.eval()

    # Caller-supplied generation settings win over the defaults.
    generation_kwargs = {
        **dict(temperature=0.1, top_p=0.15, top_k=0, repetition_penalty=1.1),
        **kwargs,
    }

    # Model pipeline for inference, wrapped for use with langchain
    hgf_pipeline = pipeline(
        task=task,
        model=model,
        tokenizer=tokenizer,
        max_length=int(os.environ["MAX_TOKENS"]),
        **generation_kwargs
    )
    llm = HuggingFacePipeline(pipeline=hgf_pipeline)
    summary_chain = load_summarize_chain(llm, chain_type="map_reduce")

    # Slice the rows first so only the first n_docs documents are materialized,
    # instead of pulling the full columns and discarding most of them.
    dataset = load_dataset(path=os.environ["DATASET_PATH"])
    rows = dataset[dataset_key][:n_docs]
    documents = rows["document"]
    target_summaries = rows["summary"]

    # Metric helpers: ROUGE plus an embeddings model for semantic similarity
    rouge = Rouge()
    embeddings_model = SentenceTransformer(os.environ["EMBEDDINGS_MODEL"])
    embeddings_model.to(os.environ["DEVICE"])

    metrics_values = []

    for i, (document, target_summary) in enumerate(zip(documents, target_summaries)):
        document = Document(page_content=document)
        try:
            generated_summary = summary_chain.run([document])
        except ValueError as e:
            print(f"Error summarizing document-{i+1}: {e}")
            continue

        if log_summary:
            print(f"GENERATED SUMARY: {generated_summary}\n")
            print(f"TARGET SUMARY: {target_summary}\n")

        # The rouge package raises ValueError when the hypothesis (or reference)
        # is empty; treat that like a failed summarization instead of aborting
        # the whole evaluation run.
        try:
            rouge_scores = rouge.get_scores(generated_summary, target_summary)
        except ValueError as e:
            print(f"Error scoring summary-{i+1}: {e}")
            continue

        # encode() output is reshaped to (1, -1) because cosine_similarity
        # expects 2-D inputs.
        generated_summary_embeddings = embeddings_model.encode(generated_summary).reshape(1, -1)
        target_summary_embeddings = embeddings_model.encode(target_summary).reshape(1, -1)
        cos_similarity = cosine_similarity(target_summary_embeddings, generated_summary_embeddings)

        if log_metrics:
            print(f"Cosine similarity for summary {i+1}:", cos_similarity[0][0], "\n")
            print(f"Rouge scores for summary {i+1}:", rouge_scores[0], "\n")

        if log_metrics or log_summary:
            print("\n")

        metrics_values.append(
            dict(semantic_similarity=cos_similarity, rouge_scores=rouge_scores)
        )

    return metrics_values

In [3]:
# Evaluation settings passed to run_on_test_data by every model cell.
INFERENCE_DATASET_KEY: str = "test"   # dataset split to evaluate on
N_INFERENCE_DOCS: int = 5             # number of leading documents to summarize

# Console logging toggles.
LOG_SUMMARY: bool = True              # print generated vs. target summaries
LOG_METRICS: bool = True              # print cosine similarity and ROUGE scores

# Drop each model/tokenizer and clear the CUDA cache once it has been evaluated.
DELETE_LLM_AFTER_USE: bool = True

In [4]:
# Evaluate the fine-tuned FLAN-T5-small adapter on the inference split.

# The PEFT config records which base checkpoint the adapter was trained on.
flan_t5_small_config = PeftConfig.from_pretrained(os.environ["FINETUNED_FLAN_T5_SMALL_ID"])

# Load the 8-bit base model, then attach the fine-tuned adapter weights.
flan_t5_small_model = AutoModelForSeq2SeqLM.from_pretrained(
    flan_t5_small_config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)
flan_t5_small_model = PeftModel.from_pretrained(
    flan_t5_small_model,
    os.environ["FINETUNED_FLAN_T5_SMALL_ID"],
    device_map="auto"
)

# `model_max_length` is the tokenizer's length option and expects an int; the
# environment variable is a string, so it is converted here.
flan_t5_small_tokenizer = AutoTokenizer.from_pretrained(
    flan_t5_small_config.base_model_name_or_path,
    model_max_length=int(os.environ["MAX_TOKENS"]),
)

# Stored under a model-specific name so the FLAN-T5-base cell cannot overwrite it.
flan_t5_small_performance = run_on_test_data(
    flan_t5_small_model,
    flan_t5_small_tokenizer,
    INFERENCE_DATASET_KEY,
    task="text2text-generation",
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)
# Backward-compatible alias for code that still reads the original name.
flan_t5_performance = flan_t5_small_performance

if DELETE_LLM_AFTER_USE:
    # Drop the references first, then collect garbage and clear the CUDA cache.
    del flan_t5_small_model, flan_t5_small_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].
Token indices sequence length is longer than the specified maximum sequence length for this model (1938 > 1024

GENERATED SUMARY: The summary provides a list of cancer centers and hospitals in different cities in Florida, including Jacksonville, Delaware, and Newnan.

TARGET SUMARY: The summary provides a list of cancer centers and hospitals in various states in the US, such as Delaware, Florida, and Georgia, that provide cancer treatment and care services.

Cosine similarity for summary 1: 0.8069508 

Rouge scores for summary 1: {'rouge-1': {'r': 0.52, 'p': 0.7222222222222222, 'f': 0.6046511579232018}, 'rouge-2': {'r': 0.35714285714285715, 'p': 0.5263157894736842, 'f': 0.42553191007695795}, 'rouge-l': {'r': 0.48, 'p': 0.6666666666666666, 'f': 0.5581395300162251}} 



GENERATED SUMARY: This study includes medical centers and hospitals in different cities in the United States, including Georgia, Hawaii, and Illinois.

TARGET SUMARY: The summary provides a list of medical centers and hospitals across multiple states in the US that specialize in oncology and cancer treatment.

Cosine similarity for

In [5]:
# Evaluate the fine-tuned FLAN-T5-base adapter on the inference split.

# The PEFT config records which base checkpoint the adapter was trained on.
flan_t5_base_config = PeftConfig.from_pretrained(os.environ["FINETUNED_FLAN_T5_BASE_ID"])

# Load the 8-bit base model, then attach the fine-tuned adapter weights.
flan_t5_base_model = AutoModelForSeq2SeqLM.from_pretrained(
    flan_t5_base_config.base_model_name_or_path,
    load_in_8bit=True,
    device_map="auto",
)
flan_t5_base_model = PeftModel.from_pretrained(
    flan_t5_base_model,
    os.environ["FINETUNED_FLAN_T5_BASE_ID"],
    device_map="auto"
)

# `model_max_length` is the tokenizer's length option and expects an int; the
# environment variable is a string, so it is converted here.
flan_t5_base_tokenizer = AutoTokenizer.from_pretrained(
    flan_t5_base_config.base_model_name_or_path,
    model_max_length=int(os.environ["MAX_TOKENS"]),
)

# Stored under a model-specific name so it does not clobber the results of the
# FLAN-T5-small run.
flan_t5_base_performance = run_on_test_data(
    flan_t5_base_model,
    flan_t5_base_tokenizer,
    INFERENCE_DATASET_KEY,
    task="text2text-generation",
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)
# Backward-compatible alias for code that still reads the original name.
flan_t5_performance = flan_t5_base_performance

if DELETE_LLM_AFTER_USE:
    # Drop the references first, then collect garbage and clear the CUDA cache.
    del flan_t5_base_model, flan_t5_base_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Some weights of T5ForConditionalGeneration were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The model 'PeftModelForSeq2SeqLM' is not supported for text2text-generation. Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditio

GENERATED SUMARY: The summary provides a list of cancer centers and hospitals in different states in the US, including Delaware, Florida, and Georgia.

TARGET SUMARY: The summary provides a list of cancer centers and hospitals in various states in the US, such as Delaware, Florida, and Georgia, that provide cancer treatment and care services.

Cosine similarity for summary 1: 0.95705646 

Rouge scores for summary 1: {'rouge-1': {'r': 0.64, 'p': 0.8421052631578947, 'f': 0.7272727223657025}, 'rouge-2': {'r': 0.5357142857142857, 'p': 0.75, 'f': 0.624999995138889}, 'rouge-l': {'r': 0.64, 'p': 0.8421052631578947, 'f': 0.7272727223657025}} 



GENERATED SUMARY: The summary provides a list of medical centers and hospitals in various cities across the United States, including Georgia, Hawaii, and Illinois.

TARGET SUMARY: The summary provides a list of medical centers and hospitals across multiple states in the US that specialize in oncology and cancer treatment.

Cosine similarity for summary

In [6]:
# Evaluate the Alpaca causal LM on the inference split.

# The model config is loaded only to read the sequence length it declares.
alpaca_config = AutoConfig.from_pretrained(os.environ["ALPACA_LLM"])

alpaca_model = LlamaForCausalLM.from_pretrained(
    os.environ["ALPACA_LLM"],
    load_in_8bit=True,
    device_map="auto",
)

# `model_max_length` is the tokenizer's length option; a plain `max_length`
# keyword is not one of the tokenizer's init arguments.
alpaca_tokenizer = LlamaTokenizer.from_pretrained(
    os.environ["ALPACA_LLM"],
    model_max_length=alpaca_config.max_sequence_length,
)

alpaca_performance = run_on_test_data(
    alpaca_model,
    alpaca_tokenizer,
    INFERENCE_DATASET_KEY,
    task="text-generation",
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    # Drop the references first, then collect garbage and clear the CUDA cache.
    del alpaca_model, alpaca_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

You are using the legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


GENERATED SUMARY: 

The Helen F. Graham Cancer Center is located in Newark, Delaware and provides comprehensive cancer care services. The University of Florida Health Science Center is located in Gainesville, Florida and offers a range of specialized cancer treatments and clinical trials. Memorial Regional Hospital/Joe DiMaggio Children's Hospital is located in Hollywood, Florida and provides pediatric oncology services. The Mayo Clinic in Florida is located in Jacksonville, Florida and specializes in medical and surgical oncology. The Miami Cancer Institute is located in Miami, Florida and provides advanced treatment options for cancer patients. The Orlando Health Cancer Institute is located in Orlando, Florida and provides comprehensive cancer care services. Memorial Hospital West is located in Pembroke Pines, Florida and provides radiation therapy and chemotherapy services. Emory University Hospital Midtown is located in Atlanta, Georgia and provides a range of specialized cancer tr

In [7]:
# Evaluate the Vicuna causal LM on the inference split.
vicuna_model = AutoModelForCausalLM.from_pretrained(
    os.environ["VICUNA_LLM"],
    load_in_8bit=True,
    device_map="auto",
)

# `model_max_length` is the tokenizer's length option and expects an int; the
# environment variable is a string, so it is converted here.
vicuna_tokenizer = AutoTokenizer.from_pretrained(
    os.environ["VICUNA_LLM"],
    model_max_length=int(os.environ["MAX_TOKENS"]),
)

vicuna_performance = run_on_test_data(
    vicuna_model,
    vicuna_tokenizer,
    INFERENCE_DATASET_KEY,
    task="text-generation",
    n_docs=N_INFERENCE_DOCS,
    log_summary=LOG_SUMMARY,
    log_metrics=LOG_METRICS,
)

if DELETE_LLM_AFTER_USE:
    # Drop the references first, then collect garbage and clear the CUDA cache.
    del vicuna_model, vicuna_tokenizer
    gc.collect()
    torch.cuda.empty_cache()

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

GENERATED SUMARY: 

The text provides a list of various cancer centers and hospitals across the United States, including locations in Delaware, Florida, Georgia, and Atlanta. Some of these institutions are affiliated with universities or have multiple campuses, such as the Mayo Clinic and Emory University Hospital.

TARGET SUMARY: The summary provides a list of cancer centers and hospitals in various states in the US, such as Delaware, Florida, and Georgia, that provide cancer treatment and care services.

Cosine similarity for summary 1: 0.8590759 

Rouge scores for summary 1: {'rouge-1': {'r': 0.68, 'p': 0.425, 'f': 0.5230769183431954}, 'rouge-2': {'r': 0.2857142857142857, 'p': 0.18604651162790697, 'f': 0.22535210789922644}, 'rouge-l': {'r': 0.6, 'p': 0.375, 'f': 0.4615384568047337}} 



GENERATED SUMARY: 
The passage provides a list of various medical centers and hospitals located throughout the United States, including Southeastern Regional Medical Center in Newnan, Georgia; Lewis 