In [None]:
# NOTE(review): this upgrades to whatever deepeval release is newest at run time;
# pin an exact version (and prefer the explicit `%pip` magic) so reruns are reproducible.
pip install -U deepeval

# DeepEval Summarization

In [None]:
import os
import pandas as pd
import time
import math
from deepeval import evaluate
from deepeval.metrics import SummarizationMetric
from deepeval.test_case import LLMTestCase
from langchain.chat_models import AzureChatOpenAI
# from langchain_openai import AzureChatOpenAI
from deepeval.models.base_model import DeepEvalBaseLLM
from datetime import datetime

# Azure OpenAI connection settings, exported as environment variables
# (presumably picked up by the AzureChatOpenAI client created further down).
from getpass import getpass

os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2024-02-15-preview"
os.environ["AZURE_OPENAI_ENDPOINT"] = "https://qmed.openai.azure.com/"

# SECURITY: the API key must never be written into the notebook. The key that
# used to be hardcoded on this line should be considered exposed and rotated.
# The key is now read from the environment; if it is absent the user is
# prompted for it without echoing the input.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("Azure OpenAI API key: ")

def round_up(n, decimals = 0):
    """Round ``n`` up (towards +infinity) to ``decimals`` decimal places.

    The rounding is carried out in decimal arithmetic on ``str(n)``. The
    binary-float form ``math.ceil(n * 10**decimals) / 10**decimals`` overshoots
    whenever the product picks up representation error: ``1.1 * 100`` is
    ``110.00000000000001``, so ``round_up(1.1, 2)`` used to return ``1.11``.

    Args:
        n: Number to round (int or float).
        decimals: Integer number of decimal places to keep. Negative values
            round to tens, hundreds, and so on.

    Returns:
        The rounded value as a float.
    """
    from decimal import Decimal, ROUND_CEILING, localcontext

    value = Decimal(str(n))
    quantum = Decimal(1).scaleb(-decimals)
    with localcontext() as ctx:
        # quantize() fails if the result needs more digits than the context
        # precision, so allow for every integer digit plus the kept decimals.
        ctx.prec = max(ctx.prec, value.adjusted() + decimals + 2)
        rounded = value.quantize(quantum, rounding=ROUND_CEILING)
    return float(rounded)

class AzureOpenAI(DeepEvalBaseLLM):
    """Adapter that exposes a chat model through the ``DeepEvalBaseLLM`` interface.

    The wrapped object only has to provide ``invoke`` and ``ainvoke`` methods
    whose result carries a ``content`` attribute.
    """

    def __init__(self, model):
        # Keep a reference to the already-constructed chat model; nothing is
        # created here.
        self.model = model

    def load_model(self):
        """Return the chat model that was passed to the constructor."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the chat model synchronously and return its ``content``."""
        response = self.load_model().invoke(prompt)
        return response.content

    async def a_generate(self, prompt: str) -> str:
        """Asynchronous counterpart of ``generate``, built on ``ainvoke``."""
        response = await self.load_model().ainvoke(prompt)
        return response.content

    def get_model_name(self):
        """Return the fixed display name of this model wrapper."""
        return "Custom Azure OpenAI Model"

# Chat client for the "gpt-4-turbo" Azure deployment, with temperature 0.
custom_model = AzureChatOpenAI(
    azure_deployment="gpt-4-turbo",
    model_version="turbo-2024-04-09",
    openai_api_version="2024-02-15-preview",
    temperature=0,
)

# Wrap the client in the DeepEvalBaseLLM adapter so it can be handed to a
# DeepEval metric as its `model`.
azure_openai = AzureOpenAI(model=custom_model)

# Input data: one spreadsheet row per sample to evaluate.
#
# Alternative input kept for reference (a CSV instead of the Excel sheet):
#   base_path = '/home/qmed-intel/Desktop/Notebook/Shamus/ipex+tcm+bf16'
#   file_name = 'inference_ipex+tcm+bf16.csv'
#   inference_results_path = os.path.join(base_path, file_name)
#   df = pd.read_csv(inference_results_path)

base_path = '/home/qmed-intel/Desktop/Notebook/ZiYu/'
file_name = 'test.xlsx'

# Build the full path once and load the sheet into `df`.
inference_results_path = os.path.join(base_path, file_name)
df = pd.read_excel(inference_results_path)

# Evaluate every row of `df` with DeepEval's SummarizationMetric, timing each
# call and collecting one record per sample in `results`.

# Questions handed to the metric as `assessment_questions`; defined once
# instead of being rebuilt on every iteration.
ASSESSMENT_QUESTIONS = [
    "Does this summary clearly state the age and gender of the patient?",
    "Does it cover the main symptom that concerns the patient the most (Chief Complaint) and the duration it has been present?",
    "Does it cover the elaboration of the main symptom, additional symptoms associated with the chief complaint and important negative history?",
    "Does the summary include relevant past medical history, such as chronic conditions, allergy history or previous significant illnesses and information on any past surgical procedures the patient has undergone?",
    "Does the summary include relevant family medical history and social history on smoking and alcohol consumption?",
]
METRIC_THRESHOLD = 0.5       # `threshold` argument of SummarizationMetric
SECONDS_BETWEEN_QUERIES = 3  # pause after each sample; presumably for rate limiting - confirm

results = []        # one dict per evaluated row, turned into a DataFrame afterwards
total_exe_time = 0  # sum of the rounded per-sample execution times
query = 0           # number of rows processed so far

for index, row in df.iterrows():
    query += 1
    print("Query:", query, "\n")

    # The original text that was summarized.
    input_query = row['Input Query']
    # The summary produced by the LLM application under evaluation.
    actual_output = row['Generated Output']

    test_case = LLMTestCase(input=input_query, actual_output=actual_output)
    # A new metric object is still created for every sample, as before; it
    # receives its own copy of the question list.
    metric = SummarizationMetric(
        threshold=METRIC_THRESHOLD,
        model=azure_openai,
        assessment_questions=list(ASSESSMENT_QUESTIONS),
    )

    # Time only the metric evaluation itself.
    start_time = time.time()
    metric.measure(test_case)
    end_time = time.time()

    exe_time = round_up((end_time - start_time), 4)
    total_exe_time += exe_time

    print("Execution time:", exe_time, "\n")

    print(metric.score)
    print(metric.reason)
    print(metric.score_breakdown)
    print("\n")

    score_breakdown = metric.score_breakdown
    results.append({
        'Input Query': input_query,
        'Generated Summary': actual_output,
        'Score': metric.score,
        'Reason': metric.reason,
        'Alignment Score': score_breakdown['Alignment'],
        'Coverage Score': score_breakdown['Coverage'],
        'Execution Time (s)': exe_time,
    })

    time.sleep(SECONDS_BETWEEN_QUERIES)

# An empty input sheet leaves `query` at 0; report 0.0 instead of raising
# ZeroDivisionError.
avg_exe_time = total_exe_time / query if query else 0.0

print("Overall Average Execution Time:", avg_exe_time)


# Persist the per-sample records collected in `results` as a CSV file.
df = pd.DataFrame(results)

# NOTE(review): this timestamp is computed but not used in the file name below.
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

base_path = '/home/qmed-intel/Desktop/Notebook/Shamus/'
file_name = 'deepeval_n=50.csv'

# Create the target directory if needed, then build the full output path.
os.makedirs(base_path, exist_ok=True)
deepeval_path = os.path.join(base_path, file_name)

# Write without the index column.
df.to_csv(deepeval_path, index=False)
print(f"Results successfully saved to {deepeval_path}")



In [32]:
import os

# Location of a previously saved evaluation CSV.
base_path = '/home/qmed-intel/Desktop/Notebook/Shamus/ipex+tcm+AMX+bf16'
file_name = 'deepeval_n=14.csv'
file_path = os.path.join(base_path, file_name)

# Load the file only when it is present; otherwise just report the miss and
# leave `df` as it was.
if not os.path.exists(file_path):
    print(f"File not found: {file_path}")
else:
    print(f"The file exists at: {file_path}")
    df = pd.read_csv(file_path)



File not found: /home/qmed-intel/Desktop/Notebook/Shamus/ipex+tcm+AMX+bf16/deepeval_n=14.csv


In [11]:
import os

# Dump the collected `results` to a timestamped CSV; the name has no directory
# part, so the file is written relative to the current working directory.
df = pd.DataFrame(results)
current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

filename = f"deepeval_llama_no_FT_pipeline_50_samples{current_datetime}.csv"
csv_file_path = filename  # both names hold the same path string

df.to_csv(csv_file_path, index=False)
print(f"Results successfully saved to {csv_file_path}")

Results successfully saved to HEREEEdeepeval_llama_no_FT_pipeline_50_samples2024-05-28_12-54-33.csv


In [21]:

# Export the `good_results` records to a timestamped CSV.
# NOTE(review): `good_results` is not defined anywhere in the visible notebook -
# confirm which cell is expected to create it before running this one.
df_good_results = pd.DataFrame(data=good_results)

current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = f"deepeval_llama_instruct_raw_50_samples_extracted_less_than_0.8_{current_datetime}.csv"

# Write without the index column.
df_good_results.to_csv(path_or_buf=filename, index=False)

In [11]:
# Pull the low-scoring rows out of an earlier evaluation CSV and save them to
# a new timestamped CSV.
existing_csv_file = "deepeval_colon_rows_2024-05-14_12-32-06.csv"
df = pd.read_csv(existing_csv_file)

SCORE_CUTOFF = 0.8  # rows with a Score strictly below this value are exported
REPORT_COLUMNS = [
    'Input Query',
    'Output Response',
    'Score',
    'Reason',
    'Alignment Score',
    'Coverage Score',
    'Execution Time (s)',
]

# Select rows and columns with a boolean mask instead of rebuilding each record
# by hand inside an iterrows() loop. Because the columns are kept, an empty
# selection still writes the CSV header row.
df_bad_results = (
    df.loc[df['Score'] < SCORE_CUTOFF, REPORT_COLUMNS]
    .reset_index(drop=True)
)

# Same per-row progress output as before.
for low_score in df_bad_results['Score']:
    print("row['Score']:", low_score)

# `bad_results` stays a list of dicts, one per selected row.
bad_results = df_bad_results.to_dict('records')

current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = f"deepeval_colon_rows_lower_than_80_{current_datetime}.csv"

df_bad_results.to_csv(filename, index=False)

row['Score']: 0.75
row['Score']: 0.6
row['Score']: 0.4
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.7647058823529411
row['Score']: 0.75
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.4
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.6
row['Score']: 0.75
row['Score']: 0.6
row['Score']: 0.6


In [44]:
# Save the current `results` records to a timestamped CSV in the working directory.
df_results = pd.DataFrame(data=results)

current_datetime = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = f"deepeval_mistral_instruct_qlora_{current_datetime}.csv"

# Write without the index column.
df_results.to_csv(path_or_buf=filename, index=False)