# <font color="orange">Baseline</font> --> <font color="teal">with GPT-3.5-Turbo</font>

---

In [3]:
########### Basic Imports ##########

# import tensorflow as tf
import torch
# print(torch.__version__)
import requests
import numpy as np
import pandas as pd
from dotenv.main import load_dotenv
from datasets import load_dataset, load_metric

In [4]:
############ Display & Logging ##############
# Notebook-wide noise control: quiets Python logging and warnings and prints
# which interpreter the kernel is running. Statement order matters here
# (the environment variable is set before the later imports), so keep it.

import os
import sys

# Set before the library imports below. Presumably this limits TensorFlow's
# native (C++) logging to errors -- TODO confirm it is still needed, since the
# tensorflow import in the first cell is commented out.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import logging
# Root logger: only ERROR and above.
logging.basicConfig(level=logging.ERROR)
logging.getLogger('ipynb').setLevel(60) # 60 is above CRITICAL (50), so this logger emits nothing
logging.getLogger().setLevel(logging.ERROR)
# Disables every logging call of severity CRITICAL and below process-wide,
# i.e. effectively all standard logging; the level settings above then have
# no visible effect while this is active.
logging.disable(logging.CRITICAL)

    
# Imported only for the verbosity call below, which is currently commented out.
from transformers import logging as transformers_logging
# transformers_logging.set_verbosity_error()

# Set to display full (non-truncated) dataframe information
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_rows', None)

# Show which Python interpreter (and therefore which environment) is in use.
print(sys.executable)

import warnings
# Blanket filter: ignore every warning from every module.
warnings.filterwarnings("ignore")
# warnings.filterwarnings("ignore", message="*weights of the model*")
# Narrower filter for one specific checkpoint-loading message.
# NOTE(review): appears redundant given the blanket "ignore" above -- confirm before removing.
warnings.filterwarnings("ignore", message="Some weights of the model checkpoint at microsoft/deberta-xlarge-mnli were not used when initializing DebertaModel")

/home/ubuntu/anaconda3/envs/20231203_MediQA/bin/python


In [5]:
########## Huggingface / W&B / OpenAI credentials ###########
# Authenticates the three external services used by this notebook.
#
# SECURITY: secrets are read from the environment (optionally populated from a
# local, git-ignored .env file) instead of being written into the notebook.
# Any key that was ever committed in an earlier version of this cell must be
# treated as leaked and revoked/rotated with the provider.
#
# Required variables: HF_TOKEN, WANDB_API_KEY, OPENAI_API_KEY

import os
from huggingface_hub import HfFolder
import openai
import wandb
from dotenv.main import load_dotenv


def _require_secret(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty, so a missing
            credential fails here with a clear message rather than later
            inside a library call.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; export it or add it to a local .env file."
        )
    return value


# Load KEY=value pairs from a .env file, if one is found, into os.environ.
load_dotenv()

# Persist the Hugging Face token so huggingface_hub can reuse it.
HfFolder.save_token(_require_secret("HF_TOKEN"))

# Login to wandb
wandb.login(key=_require_secret("WANDB_API_KEY"))

# Key used by the `openai` module for API calls made later in the notebook.
openai.api_key = _require_secret("OPENAI_API_KEY")

[34m[1mwandb[0m: Currently logged in as: [33mgaurav-narasimhan[0m ([33mzibajoon[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/gaurav_narasimhan/.netrc


In [6]:
########## Pandas/DataSci-Core ###########

import pandas as pd
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.utils import class_weight
import numpy as np
from datetime import datetime

---

## <font color = grey> Core Computation Process

In [16]:
import openai
import pandas as pd
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from bleurt import score as bleurt_score

# Function to create GPT-3 prompt
def create_gpt3_prompt(dialogue):
    """Build the classify-and-summarise instruction for one conversation.

    The prompt lists the 20 candidate section classes, asks for a reply of the
    form ``<class> # <clinical-note summary>`` and ends with the dialogue text.

    Args:
        dialogue: Conversation text appended verbatim to the instruction.

    Returns:
        The complete prompt string.
    """
    section_classes = (
        "FAMILY HISTORY/SOCIAL HISTORY",
        "HISTORY of PRESENT ILLNESS",
        "PAST MEDICAL HISTORY",
        "CHIEF COMPLAINT",
        "PAST SURGICAL HISTORY",
        "Allergy",
        "REVIEW OF SYSTEMS",
        "Medications",
        "Assessment",
        "Exam",
        "Diagnosis",
        "Disposition",
        "Plan",
        "EMERGENCY DEPARTMENT COURSE",
        "Immunizations",
        "Imaging",
        "GYNECOLOGIC HISTORY",
        "Procedures",
        "Other history",
        "Labs",
    )
    instruction = (
        "Classify the conversation into one of these 20 classes: "
        + ", ".join(section_classes)
        + ". The response should start with the selected class, followed by # then the summary of the conversation "
        "in a clinical note style. The conversation is: \n\n"
    )
    return instruction + dialogue

# Function to call GPT-3.5-turbo API
def call_gpt3_turbo(prompt, model="gpt-3.5-turbo-1106", system_message="You are a medical assistant."):
    """Send one prompt to the OpenAI chat-completion endpoint and return the reply text.

    Uses the module-level `openai.ChatCompletion` interface; the API key must
    already be configured on the `openai` module.

    Args:
        prompt: Text sent as the user message.
        model: Chat model identifier. Defaults to the model this baseline was
            run with, so existing one-argument calls behave as before.
        system_message: Content of the system message that precedes the prompt.

    Returns:
        The `content` field of the first choice in the response.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": prompt},
        ],
    )
    return response['choices'][0]['message']['content']

# Function to extract the section header from the GPT-3 response
def extract_section_header(gpt3_summary):
    """Return the predicted section class from a model reply.

    Looks for the first line that contains a ``#`` and returns the text before
    that line's first ``#``, with surrounding whitespace removed.

    Args:
        gpt3_summary: Raw reply text from the model.

    Returns:
        The header text, or ``"Unknown"`` when no line contains ``#``.
    """
    lines_with_marker = (line for line in gpt3_summary.split('\n') if '#' in line)
    first_marked = next(lines_with_marker, None)
    if first_marked is None:
        return "Unknown"
    head, _, _ = first_marked.partition('#')
    return head.strip()

# Function to extract the section text from the GPT-3 response
def extract_section_text(gpt3_summary):
    """Return the summary part of a model reply.

    Everything after the first ``#`` in the reply is returned with surrounding
    whitespace removed.

    Args:
        gpt3_summary: Raw reply text from the model.

    Returns:
        The text after the first ``#``, or ``"Unknown"`` when the reply
        contains no ``#``.
    """
    _, marker, remainder = gpt3_summary.partition('#')
    if not marker:
        return "Unknown"
    return remainder.strip()

# Function to compute accuracy
def compute_accuracy(predictions, references):
    """Return the fraction of predictions that equal their reference.

    Items are compared pairwise with ``==`` and the number of matches is
    divided by ``len(predictions)``.

    Args:
        predictions: Sequence of predicted labels.
        references: Sequence of true labels, aligned with `predictions`.

    Returns:
        Accuracy in [0, 1]; ``0`` when `predictions` is empty (previously this
        raised ZeroDivisionError), matching `compute_overall_accuracy`.
    """
    total = len(predictions)
    if total == 0:
        return 0
    correct = sum(int(pred == ref) for pred, ref in zip(predictions, references))
    return correct / total

# Empty per-record results table with its final column layout.
baseline_detail = pd.DataFrame(
    columns=[
        'ID',
        'Predicted_Section_Header',
        'True_Section_Header',
        'Predicted_Section_Text',
        'True_Section_Text',
        'ROUGE-1',
        'BERTScore-F1',
        'BLEURT',
        'Agg',
    ]
)

# Accumulators filled by the inference loop: one entry per processed record.
predicted_section_headers = []
predicted_section_texts = []
rouge_scores = {variant: [] for variant in ('rouge1', 'rouge2', 'rougeL', 'rougeLsum')}
bert_scores = {part: [] for part in ('precision', 'recall', 'f1')}
bleurt_scores = []

# Scorer objects.
# NOTE(review): the assignment below rebinds the name `rouge_scorer` from the
# imported module to a scorer instance; it works because the import sits at
# the top of this same cell and is re-run with it.
rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
bleurt_scorer = bleurt_score.BleurtScorer("/home/gaurav_narasimhan/05.gn_benchmarks/01.BLUERT/BLEURT-20")

# Test set, limited to the first `n` records.
test_data = pd.read_csv('/home/gaurav_narasimhan/03.gn_projects/03.MediQA_2023/11.Source_Data/MTS-Dialog-TestSet-1-MEDIQA-Chat-2023.csv')
n = 200
test_data_subset = test_data.iloc[:n]

# Run inference record by record and score ROUGE immediately; BERTScore and
# BLEURT are scored once, in batch, after all predictions are collected.
for index, row in test_data_subset.iterrows():
    dialogue = row['dialogue']
    prompt = create_gpt3_prompt(dialogue)

    gpt3_summary = call_gpt3_turbo(prompt)

    # Split the reply into "<class>" and "<summary>" around the '#' marker.
    predicted_header = extract_section_header(gpt3_summary)
    predicted_text = extract_section_text(gpt3_summary)

    predicted_section_headers.append(predicted_header)
    predicted_section_texts.append(predicted_text)

    # Compute ROUGE scores (reference first, prediction second)
    rouge_result = rouge_scorer.score(row['section_text'], predicted_text)
    for key in rouge_scores:
        rouge_scores[key].append(rouge_result[key].fmeasure)

# Batched model-based metrics. Calling bert_score() once per record sets the
# scoring model up again on every call (the bert_score docs recommend batching
# or a reusable scorer), so all pairs are scored in a single call instead.
# Per-pair values are expected to match the per-record calls up to
# floating-point noise -- NOTE(review): confirm on a small sample.
if predicted_section_texts:
    reference_texts = test_data_subset['section_text'].tolist()

    # Compute BERT scores: each returned tensor holds one value per pair.
    P, R, F1 = bert_score(predicted_section_texts, reference_texts, lang="en", model_type="bert-base-uncased", rescale_with_baseline=False)
    bert_scores['precision'].extend(P.tolist())
    bert_scores['recall'].extend(R.tolist())
    bert_scores['f1'].extend(F1.tolist())

    # Compute BLEURT scores: one score per (reference, candidate) pair.
    bleurt_scores.extend(
        bleurt_scorer.score(references=reference_texts, candidates=predicted_section_texts)
    )

# Corpus-level figures derived from the per-record results.
# NOTE(review): header accuracy here compares the model's raw class names with
# the dataset's `section_header` values as-is; the saved cell output shows
# those are short codes (e.g. GENHX), so this figure is only meaningful after
# the names are mapped to codes.
section_header_accuracy = compute_accuracy(
    predicted_section_headers,
    test_data_subset['section_header'].tolist(),
)
overall_rouge_scores = dict((name, np.mean(scores)) for name, scores in rouge_scores.items())
overall_bert_scores = dict((name, np.mean(scores)) for name, scores in bert_scores.items())
overall_bleurt_score = np.mean(bleurt_scores)

# Print overall scores for verification
for label, value in (
    ("Section Header Accuracy:", section_header_accuracy),
    ("Overall ROUGE Scores:", overall_rouge_scores),
    ("Overall BERT Scores:", overall_bert_scores),
    ("Overall BLEURT Score:", overall_bleurt_score),
):
    print(label, value)

Section Header Accuracy: 0.005
Overall ROUGE Scores: {'rouge1': 0.33273161865787587, 'rouge2': 0.13606609767323136, 'rougeL': 0.2622863830961412, 'rougeLsum': 0.26347581492707933}
Overall BERT Scores: {'precision': 0.5819932270050049, 'recall': 0.6302992513775826, 'f1': 0.599636298418045}
Overall BLEURT Score: 0.5368788198381662


### <font color = grey> Baseline Detail

In [18]:
# Column layout of the per-record baseline table.
detail_columns = [
    'ID', 'Predicted_Section_Header', 'True_Section_Header',
    'Predicted_Section_Text', 'True_Section_Text',
    'ROUGE-1', 'BERTScore-F1', 'BLEURT', 'Agg'
]

# Collect one plain dict per record and build the DataFrame once at the end.
# Growing a frame with pd.concat inside the loop copies the whole frame on
# every iteration, and concatenating onto an empty frame leaves the columns
# as object dtype.
detail_records = []
for i in range(len(predicted_section_headers)):
    source_row = test_data_subset.iloc[i]
    rouge1 = rouge_scores['rouge1'][i]
    bert_f1 = bert_scores['f1'][i]
    bleurt = bleurt_scores[i]

    detail_records.append({
        'ID': source_row['ID'],
        'Predicted_Section_Header': predicted_section_headers[i],
        'True_Section_Header': source_row['section_header'],
        'Predicted_Section_Text': predicted_section_texts[i],
        'True_Section_Text': source_row['section_text'],
        'ROUGE-1': rouge1,
        'BERTScore-F1': bert_f1,
        'BLEURT': bleurt,
        # Aggregated score: unweighted mean of the three metrics for this record.
        'Agg': np.mean([rouge1, bert_f1, bleurt]),
    })

# Passing `columns` fixes the column order and keeps the layout even when
# there are no records.
baseline_detail = pd.DataFrame(detail_records, columns=detail_columns)

# Display the DataFrame
baseline_detail

Unnamed: 0,ID,Predicted_Section_Header,True_Section_Header,Predicted_Section_Text,True_Section_Text,ROUGE-1,BERTScore-F1,BLEURT,Agg
0,0,HISTORY of PRESENT ILLNESS,GENHX,The doctor inquired about the patient's recent...,The patient is a 55-year-old African-American ...,0.365385,0.604478,0.574911,0.514925
1,1,FAMILY HISTORY/SOCIAL HISTORY,FAM/SOCHX,Patient reports a family history of stroke and...,Positive for stroke and sleep apnea.,0.500000,0.753286,0.604451,0.619246
2,2,REVIEW OF SYSTEMS,ROS,"Patient denies any muscle pain, joint pain, st...","MSK: Negative myalgia, negative joint pain, ne...",0.480000,0.676757,0.582900,0.579886
3,3,FAMILY HISTORY/SOCIAL HISTORY,FAM/SOCHX,The patient denies any family history of blood...,Noncontributory.,0.000000,0.294915,0.154629,0.149848
4,4,FAMILY HISTORY/SOCIAL HISTORY,FAM/SOCHX,The patient has a family history of medical pr...,"Father died of a thoracic aortic aneurysm, age...",0.566038,0.733069,0.758237,0.685781
...,...,...,...,...,...,...,...,...,...
195,195,HISTORY of PRESENT ILLNESS,GENHX,"The patient, a 72-year-old with a history of h...","Briefly, the patient is a very pleasant 72-yea...",0.502564,0.703613,0.535004,0.580394
196,196,HISTORY of PRESENT ILLNESS,ROS,The patient denies any uncomfortable feeling o...,"NOSE AND THROAT: Negative postnasal drip, nega...",0.137931,0.485025,0.360505,0.327820
197,197,HISTORY OF PRESENT ILLNESS,GENHX,The patient reports no new symptoms since June...,The patient is alert and oriented x3 and sitti...,0.285714,0.595152,0.442872,0.441246
198,198,PAST MEDICAL HISTORY,PASTMEDICALHX,The patient denies any major medical condition...,Past medical history is unremarkable.,0.125000,0.458521,0.329374,0.304298


---

## <font color="grey">Section_Text --> ROUGE, BERTScore and BLEURT</font>

In [19]:
# Metric columns that must be numeric before they can be averaged.
numeric_cols = ['ROUGE-1', 'BERTScore-F1', 'BLEURT', 'Agg']

# Coerce each metric column in turn; values that cannot be parsed become NaN.
for metric_col in numeric_cols:
    baseline_detail[metric_col] = pd.to_numeric(baseline_detail[metric_col], errors='coerce')

# One-row summary: the column means, with the row labelled 'Average'.
baseline_summary = baseline_detail[numeric_cols].mean().to_frame(name='Average').T

# Display the baseline_summary DataFrame
baseline_summary

Unnamed: 0,ROUGE-1,BERTScore-F1,BLEURT,Agg
Average,0.332732,0.599636,0.536879,0.489749


In [20]:
# Output directory for the metric tables.
directory_path = '/home/gaurav_narasimhan/03.gn_projects/03.MediQA_2023/10.TaskA_Summarization_Metrics'

# Target file for each table.
baseline_detail_csv_path = f"{directory_path}/baseline_detail.csv"
baseline_summary_csv_path = f"{directory_path}/baseline_summary.csv"

# Write both tables without the DataFrame index column.
baseline_detail.to_csv(baseline_detail_csv_path, index=False)
baseline_summary.to_csv(baseline_summary_csv_path, index=False)

# Report where the files were written.
print(f"Saved baseline_detail to {baseline_detail_csv_path}")
print(f"Saved baseline_summary to {baseline_summary_csv_path}")

Saved baseline_detail to /home/gaurav_narasimhan/03.gn_projects/03.MediQA_2023/10.TaskA_Summarization_Metrics/baseline_detail.csv
Saved baseline_summary to /home/gaurav_narasimhan/03.gn_projects/03.MediQA_2023/10.TaskA_Summarization_Metrics/baseline_summary.csv


---

## <font color = grey> Section_Header --> Accuracy

In [21]:
# Mapping dictionary
# Long-form class names used in the prompt -> section codes used in the
# `True_Section_Header` column. Keys are spelled exactly as in the prompt,
# so lookups against this dict are case-sensitive.
header_mapping = dict([
    ("FAMILY HISTORY/SOCIAL HISTORY", "FAM/SOCHX"),
    ("HISTORY of PRESENT ILLNESS", "GENHX"),
    ("PAST MEDICAL HISTORY", "PASTMEDICALHX"),
    ("CHIEF COMPLAINT", "CC"),
    ("PAST SURGICAL HISTORY", "PASTSURGICAL"),
    ("Allergy", "ALLERGY"),
    ("REVIEW OF SYSTEMS", "ROS"),
    ("Medications", "MEDICATIONS"),
    ("Assessment", "ASSESSMENT"),
    ("Exam", "EXAM"),
    ("Diagnosis", "DIAGNOSIS"),
    ("Disposition", "DISPOSITION"),
    ("Plan", "PLAN"),
    ("EMERGENCY DEPARTMENT COURSE", "EDCOURSE"),
    ("Immunizations", "IMMUNIZATIONS"),
    ("Imaging", "IMAGING"),
    ("GYNECOLOGIC HISTORY", "GYNHX"),
    ("Procedures", "PROCEDURES"),
    ("Other history", "OTHER_HISTORY"),
    ("Labs", "LABS"),
])

# Function to compute overall accuracy
def compute_overall_accuracy(df, mapping):
    """Return header accuracy after translating predictions through `mapping`.

    Each value in ``Predicted_Section_Header`` is looked up in `mapping`
    (exact, case-sensitive key match) and the result is compared with the
    row's ``True_Section_Header``. Predictions missing from `mapping` count as
    incorrect.

    Args:
        df: DataFrame with ``Predicted_Section_Header`` and
            ``True_Section_Header`` columns.
        mapping: Dict from predicted header text to the true-header code.

    Returns:
        Fraction of rows whose mapped prediction equals the true header;
        ``0`` for an empty frame.
    """
    total = len(df)
    if total == 0:
        return 0
    header_pairs = zip(df['Predicted_Section_Header'], df['True_Section_Header'])
    hits = sum(1 for predicted_raw, truth in header_pairs if mapping.get(predicted_raw) == truth)
    return hits / total

# Score the predictions after translating them to the dataset's section codes
overall_accuracy = compute_overall_accuracy(df=baseline_detail, mapping=header_mapping)

print(f"Overall Accuracy: {overall_accuracy}")


Overall Accuracy: 0.57


In [22]:
# Function to compute overall accuracy with case insensitivity
def compute_overall_accuracy_case_insensitive(df, mapping):
    """Return header accuracy, ignoring letter case on both sides.

    Each ``Predicted_Section_Header`` is lower-cased and looked up in
    `mapping`; the mapped code is then compared, case-insensitively, with the
    row's ``True_Section_Header``.

    The mapping keys are lower-cased inside this function, so it gives the
    same result whether the caller passes the original mixed-case mapping or
    an already lower-cased copy.

    Args:
        df: DataFrame with ``Predicted_Section_Header`` and
            ``True_Section_Header`` columns.
        mapping: Dict from predicted header text to the true-header code.

    Returns:
        Fraction of rows counted as correct; ``0`` for an empty frame. Rows
        whose predicted or true header is not a string (for example NaN) are
        counted as incorrect instead of raising AttributeError.
    """
    total = len(df)
    if total == 0:
        return 0

    # Normalise the keys once so lookups do not depend on the caller's casing.
    lowered_mapping = {str(key).lower(): value for key, value in mapping.items()}

    correct = 0
    for predicted_raw, truth in zip(df['Predicted_Section_Header'], df['True_Section_Header']):
        if not isinstance(predicted_raw, str) or not isinstance(truth, str):
            continue  # missing / non-text cell: cannot match
        mapped = lowered_mapping.get(predicted_raw.lower())
        # A falsy mapped value (None, '') never counts as a match.
        if mapped and isinstance(mapped, str) and mapped.lower() == truth.lower():
            correct += 1
    return correct / total

# Lower-cased copy of the mapping: same codes, keys folded to lowercase
header_mapping_lower = dict((label.lower(), code) for label, code in header_mapping.items())

# Score again, this time ignoring letter case in the predicted headers
overall_accuracy = compute_overall_accuracy_case_insensitive(df=baseline_detail, mapping=header_mapping_lower)

print(f"Overall Accuracy (Case Insensitive): {overall_accuracy}")


Overall Accuracy (Case Insensitive): 0.6
