In [None]:
import ast
import copy
import os

import numpy as np
import pandas as pd
from datasets import load_dataset

### Download the MMLU datasets

In [None]:
# Fetch the MMLU subject configurations used below from HuggingFace ("cais/mmlu").
# Disabled subjects, kept for reference:
#clinical_knowledge = load_dataset("cais/mmlu", "clinical_knowledge")
#college_medicine = load_dataset("cais/mmlu", "college_medicine")

# One load per subject, evaluated left to right and bound to the matching name.
anatomy, college_biology, medical_genetics, professional_medicine = (
    load_dataset("cais/mmlu", subject)
    for subject in ("anatomy", "college_biology", "medical_genetics", "professional_medicine")
)

In [None]:
# Load the same four subjects from dataset directories under ../.cache.
# Running this cell rebinds the names assigned by the download cell above.
# Disabled subjects, kept for reference:
#clinical_knowledge = load_dataset("../.cache/huggingface/datasets/cais___mmlu/clinical_knowledge")
#college_medicine = load_dataset("../.cache/huggingface/datasets/cais___mmlu/college_medicine")

anatomy, college_biology, medical_genetics, professional_medicine = map(
    load_dataset,
    (
        "../.cache/huggingface/datasets/cais___mmlu/anatomy",
        "../.cache/huggingface/datasets/cais___mmlu/college_biology",
        "../.cache/huggingface/datasets/cais___mmlu/medical_genetics",
        "../.cache/huggingface/datasets/cais___mmlu/professional_medicine",
    ),
)

### Add the processed columns

In [None]:
# Flatten one MMLU subject into a single CSV with distractor and proposal columns
def process_and_save_subset(subset, filename):
    """Merge the 'test' and 'validation' splits of ``subset`` and write them to ``filename``.

    Columns added on top of the split's own columns:
      * ``section`` -- name of the split the row came from.
      * ``correct_answer_str`` -- ``choices[answer]``.
      * ``wrong_answer_str_1..3`` -- the three remaining choices, in their original order.
      * ``propose_correct_answer`` -- 1.0 on ``len(df) // 2`` randomly placed rows, 0.0 elsewhere.
      * ``proposed_answer`` -- the correct string where the flag is 1, otherwise one
        randomly drawn wrong string.

    The NumPy global RNG is re-seeded with 42 on every call. Nothing is returned;
    the result is only written to ``filename`` (without the index).
    Assumes every row has exactly four choices and an integer ``answer`` in 0..3 -- TODO confirm.
    """
    split_frames = []
    for split_name in ('test', 'validation'):
        split_frame = pd.DataFrame(subset[split_name])
        split_frame['section'] = [split_name] * len(split_frame)
        split_frames.append(split_frame)
    df = pd.concat(split_frames, ignore_index=True)

    def correct_text(row):
        return row['choices'][row['answer']]

    def first_wrong(row):
        # Position 0 unless that is the correct one, in which case position 1.
        if row['answer'] == 0:
            return row['choices'][1]
        return row['choices'][0]

    def second_wrong(row):
        # Position 1 unless the correct answer sits at 0 or 1, in which case position 2.
        if row['answer'] in (0, 1):
            return row['choices'][2]
        return row['choices'][1]

    def third_wrong(row):
        # Position 3 unless that is the correct one, in which case position 2.
        if row['answer'] == 3:
            return row['choices'][2]
        return row['choices'][3]

    df['correct_answer_str'] = df.apply(correct_text, axis=1)
    df['wrong_answer_str_1'] = df.apply(first_wrong, axis=1)
    df['wrong_answer_str_2'] = df.apply(second_wrong, axis=1)
    df['wrong_answer_str_3'] = df.apply(third_wrong, axis=1)

    # Half of the rows (rounded down) get the correct answer proposed; the
    # positions of those rows are shuffled with a fixed seed.
    np.random.seed(42)
    flags = np.zeros(len(df))
    flags[:len(df) // 2] = 1
    np.random.shuffle(flags)
    df['propose_correct_answer'] = flags

    def proposed_text(row):
        if row['propose_correct_answer'] == 1:
            return row['correct_answer_str']
        distractors = [row['wrong_answer_str_1'], row['wrong_answer_str_2'], row['wrong_answer_str_3']]
        return np.random.choice(distractors)

    df['proposed_answer'] = df.apply(proposed_text, axis=1)

    df.to_csv(filename, index=False)

In [None]:
# Export the four active MMLU subjects, one CSV per subject.
# Disabled subjects, kept for reference:
#process_and_save_subset(clinical_knowledge, 'clinical_knowledge.csv')
#process_and_save_subset(college_medicine, 'college_medicine.csv')

for mmlu_subset, mmlu_csv in (
    (anatomy, 'anatomy.csv'),
    (college_biology, 'college_biology.csv'),
    (medical_genetics, 'medical_genetics.csv'),
    (professional_medicine, 'professional_medicine.csv'),
):
    process_and_save_subset(mmlu_subset, mmlu_csv)

print("Datasets processed and saved as CSV files.")

### Load the MedQA datasets

In [None]:
# Load the English and Chinese MedQA source configurations from dataset
# directories under ../.cache.
medQA_en, medQA_zh = (
    load_dataset(cache_dir)
    for cache_dir in (
        "../.cache/huggingface/datasets/bigbio___med_qa/med_qa_en_source",
        "../.cache/huggingface/datasets/bigbio___med_qa/med_qa_zh_source",
    )
)

In [None]:
# Flatten a MedQA dataset into a single CSV with distractor and proposal columns.
# NOTE: this definition replaces the MMLU function of the same name defined earlier.
def process_and_save_subset(dataframe, filename):
    """Merge the 'test', 'validation' and 'train' splits and write them to ``filename``.

    Each row is read through three fields: ``answer`` (the correct answer text),
    ``answer_idx`` (compared against the letters 'A'..'E') and ``options`` (a
    sequence whose items expose a ``'value'`` entry).

    Columns added:
      * ``section`` -- name of the split the row came from.
      * ``correct_answer_str`` -- copy of ``answer``.
      * ``wrong_answer_str_1..4`` -- the option values other than the correct one,
        in their original order.
      * ``propose_correct_answer`` -- 1.0 on ``len(df) // 2`` randomly placed rows, 0.0 elsewhere.
      * ``proposed_answer`` -- the correct string where the flag is 1, otherwise one
        randomly drawn wrong string.

    The NumPy global RNG is re-seeded with 42 on every call. Nothing is returned.
    Assumes exactly five options ordered A..E per row -- TODO confirm.
    """
    split_frames = []
    for split_name in ('test', 'validation', 'train'):
        split_frame = pd.DataFrame(dataframe[split_name])
        split_frame['section'] = [split_name] * len(split_frame)
        split_frames.append(split_frame)
    df = pd.concat(split_frames, ignore_index=True)

    def answer_text(row):
        return row['answer']

    df['correct_answer_str'] = df.apply(answer_text, axis=1)

    # (column number, answer letters that trigger the alternate position,
    #  alternate position, default position)
    slot_rules = (
        (1, ('A',), 1, 0),
        (2, ('A', 'B'), 2, 1),
        (3, ('D', 'E'), 2, 3),
        (4, ('E',), 3, 4),
    )
    for slot, trigger_letters, alternate_pos, default_pos in slot_rules:
        def option_text(row):
            position = alternate_pos if row['answer_idx'] in trigger_letters else default_pos
            return row['options'][position]['value']

        df[f'wrong_answer_str_{slot}'] = df.apply(option_text, axis=1)

    # Half of the rows (rounded down) get the correct answer proposed; the
    # positions of those rows are shuffled with a fixed seed.
    np.random.seed(42)
    flags = np.zeros(len(df))
    flags[:len(df) // 2] = 1
    np.random.shuffle(flags)
    df['propose_correct_answer'] = flags

    def proposed_text(row):
        if row['propose_correct_answer'] == 1:
            return row['correct_answer_str']
        distractors = [
            row['wrong_answer_str_1'],
            row['wrong_answer_str_2'],
            row['wrong_answer_str_3'],
            row['wrong_answer_str_4'],
        ]
        return np.random.choice(distractors)

    df['proposed_answer'] = df.apply(proposed_text, axis=1)

    df.to_csv(filename, index=False)

In [None]:
# Export MedQA: only medQA_en is processed here, and it is written to 'USMLE.csv'.
# Disabled alternatives (other output names and the medQA_zh set), kept for reference:
#process_and_save_subset(medQA_en, 'medQA_en.csv')
#process_and_save_subset(medQA_zh, 'medQA_zh.csv')

process_and_save_subset(medQA_en, 'USMLE.csv')
#process_and_save_subset(medQA_zh, 'MCMLE.csv')

print("Datasets processed and saved as CSV files.")

In [None]:
# Read the exported MedQA CSV back from disk and show it as the cell output.
medQA_en_df = pd.read_csv('USMLE.csv')
medQA_en_df

### Process the SDoH dataset

In [None]:
# Answer options for each SDoH question file, keyed by the CSV base name (without extension).
choices_dict = {"SDoH_Q1": ["Single", "Widowed", "Divorced", "Married", "Not mentioned"],
                "SDoH_Q2": ["0", "1", "2", "3", "4", "5 or more", "Not mentioned"],
                "SDoH_Q3": ["Yes", "No", "Not mentioned"],
                "SDoH_Q4": ["Yes", "No", "Not mentioned"],
                "SDoH_Q5": ["Yes", "No", "Not mentioned"],
                "SDoH_Q6": ["Yes", "No", "Not mentioned"],
                "SDoH_Q7": ["Employed", "Jobless", "Retired", "Not mentioned"],
                "SDoH_Q8": ["Elementary school", "Middle school", "High school", "College", "Graduate School", "Not mentioned"],
                "SDoH_Q9": ["Yes", "No", "In the past", "Not mentioned"]}

# NOTE: this definition replaces the earlier functions of the same name.
def process_and_save_subset(filename):
    """Augment one SDoH question CSV in place with choice, question and proposal columns.

    The file is read from ``filename``, extended, and written back to the same path.
    It must contain the columns ``Text``, ``Raw Question`` and ``correct_answer_str``
    and exactly 200 rows.

    Columns added:
      * ``choices`` -- the option list from ``choices_dict`` for this file.
      * ``question`` -- prompt built from ``Text`` and ``Raw Question``.
      * ``wrong_choices`` -- ``choices`` without the correct answer.
      * ``wrong_answer_str_0..k`` -- the first ``len(choices) - 1`` entries of ``wrong_choices``
        (numbering starts at 0 here).
      * ``propose_correct_answer`` -- 1.0 on half of the rows (randomly placed), 0.0 elsewhere.
      * ``proposed_answer`` -- the correct string where the flag is 1, otherwise one
        randomly drawn wrong choice.

    Args:
        filename: path to the CSV; the base name up to its first '.' must be a key
            of ``choices_dict`` (e.g. ``'SDoH_Q2.csv'`` or ``'data/SDoH_Q2.csv'``).

    Raises:
        AssertionError: if the CSV does not have exactly 200 rows.
        KeyError: if the file's base name is not a key of ``choices_dict``.
    """
    # Read the answers as text. The options in choices_dict are strings, and
    # without this an all-digit answer column (possible for SDoH_Q2) is parsed
    # as integers, so no option would ever compare equal to the correct answer
    # and it would end up among the "wrong" choices.
    df = pd.read_csv(filename, dtype={'correct_answer_str': str})
    assert len(df) == 200, "The length of the SDoH CSV files must be 200"

    # Strip any directory part before looking up the options, so the file does
    # not have to live in the current working directory.
    question_key = os.path.basename(filename).split(".")[0]
    choices_list = choices_dict[question_key]

    df['choices'] = [choices_list] * len(df)
    df['question'] = df.apply(lambda row: f"Given the following context about a patient's socioeconomic document:\n{row['Text']}\n{row['Raw Question']}", axis=1)
    df['wrong_choices'] = df.apply(lambda row: [element for element in row['choices'] if element != row['correct_answer_str']], axis=1)
    for i in range(0, len(choices_list) - 1):
        df[f'wrong_answer_str_{i}'] = df.apply(lambda row: row['wrong_choices'][i], axis=1)

    # Split data into two groups and assign propose_correct_answer and proposed_answer
    np.random.seed(42)
    half_size = len(df) // 2
    propose_correct = np.zeros(len(df))
    propose_correct[:half_size] = 1
    np.random.shuffle(propose_correct)  # Randomly shuffle the array to assign 1s and 0s

    df['propose_correct_answer'] = propose_correct
    df['proposed_answer'] = df.apply(lambda row: row['correct_answer_str'] if row['propose_correct_answer'] == 1 else np.random.choice(row['wrong_choices']), axis=1)

    df.to_csv(filename, index=False)

In [None]:
def combine(filenames):
    """Stack a fixed set of columns from several CSVs and write them to 'SDoH.csv'.

    Every file in ``filenames`` is read in order and reduced to the columns
    ``question``, ``correct_answer_str``, ``choices``, ``propose_correct_answer``
    and ``proposed_answer``. The pieces are concatenated with a fresh index and
    saved (without the index) to 'SDoH.csv' in the current working directory.
    Nothing is returned.
    """
    kept_columns = ['question', 'correct_answer_str', 'choices', 'propose_correct_answer', 'proposed_answer']

    # One trimmed frame per input file, in the order given.
    pieces = [pd.read_csv(path)[kept_columns] for path in filenames]

    stacked = pd.concat(pieces, ignore_index=True)
    stacked.to_csv('SDoH.csv', index=False)

### Add the A/B choices

In [None]:
def add_A_B_choices(filename):
    """Turn each row of ``filename`` into a two-option (A/B) item and save it in place.

    For every row a random letter decides whether the correct answer is shown as
    choice A or choice B; the other slot is filled with one value drawn at random
    from the row's ``wrong_answer_str_*`` columns.

    Columns added: ``correct_choice`` ('A' or 'B'), ``choice_A_str``,
    ``choice_B_str`` and ``correct_choice_id`` (1 when the correct choice is 'A',
    0 otherwise). The NumPy global RNG is re-seeded with 42 on every call, and the
    file is overwritten. Nothing is returned.
    """
    df = pd.read_csv(filename)

    # Fixed seed so that repeated runs give the same layout.
    np.random.seed(42)
    df['correct_choice'] = np.random.choice(['A', 'B'], size=len(df))

    # Placeholder columns, filled row by row below.
    df['choice_A_str'] = ''
    df['choice_B_str'] = ''

    # The set of distractor columns is the same for every row.
    distractor_cols = [name for name in df.columns if name.startswith('wrong_answer_str_')]

    for idx, record in df.iterrows():
        distractor = np.random.choice([record[name] for name in distractor_cols])
        truth = record['correct_answer_str']
        if record['correct_choice'] == 'A':
            a_text, b_text = truth, distractor
        else:
            a_text, b_text = distractor, truth
        df.at[idx, 'choice_A_str'] = a_text
        df.at[idx, 'choice_B_str'] = b_text

    def choice_id(record):
        return 1 if record['correct_choice'] == 'A' else 0

    df['correct_choice_id'] = df.apply(choice_id, axis=1)

    df.to_csv(filename, index=False)

In [None]:
def add_A_B_choices_SDoH(filename):
    """Variant of the A/B builder that takes distractors from the ``choices`` column.

    ``choices`` is expected to hold the text form of a Python list; it is parsed
    with ``ast.literal_eval`` and every entry different from
    ``correct_answer_str`` is a candidate distractor. A random letter decides
    whether the correct answer is shown as choice A or choice B, and the other
    slot receives one randomly drawn distractor.

    Columns added: ``correct_choice`` ('A' or 'B'), ``choice_A_str``,
    ``choice_B_str`` and ``correct_choice_id`` (1 when the correct choice is 'A',
    0 otherwise). The NumPy global RNG is re-seeded with 42 on every call, and the
    file is overwritten. Nothing is returned.
    """
    df = pd.read_csv(filename)

    # Fixed seed so that repeated runs give the same layout.
    np.random.seed(42)
    df['correct_choice'] = np.random.choice(['A', 'B'], size=len(df))

    # Placeholder columns, filled row by row below.
    df['choice_A_str'] = ''
    df['choice_B_str'] = ''

    for idx, record in df.iterrows():
        truth = record['correct_answer_str']
        candidates = []
        for option in ast.literal_eval(record['choices']):
            if option != truth:
                candidates.append(option)
        distractor = np.random.choice(candidates)

        if record['correct_choice'] == 'A':
            a_text, b_text = truth, distractor
        else:
            a_text, b_text = distractor, truth
        df.at[idx, 'choice_A_str'] = a_text
        df.at[idx, 'choice_B_str'] = b_text

    def choice_id(record):
        return 1 if record['correct_choice'] == 'A' else 0

    df['correct_choice_id'] = df.apply(choice_id, axis=1)

    df.to_csv(filename, index=False)

In [None]:
# Add the A/B columns to previously prepared CSVs, then to the combined SDoH file.
# NOTE(review): none of the cells shown in this notebook writes clinical_knowledge.csv,
# college_medicine.csv, medQA_en.csv or medQA_zh.csv (those export calls are commented
# out), and the SDoH helpers are defined but never called here -- confirm that these
# files already exist on disk before running this cell.
for prepared_csv in ('clinical_knowledge.csv', 'college_medicine.csv', 'medQA_en.csv', 'medQA_zh.csv'):
    add_A_B_choices(prepared_csv)

# SDoH_Q1.csv .. SDoH_Q9.csv
for i in range(1, 10):
    add_A_B_choices(f'SDoH_Q{i}.csv')

add_A_B_choices_SDoH('SDoH.csv')

In [None]:
# Add the A/B columns to the four MMLU CSVs exported earlier in this notebook.
for mmlu_csv_name in (
    'anatomy.csv',
    'college_biology.csv',
    'medical_genetics.csv',
    'professional_medicine.csv',
):
    add_A_B_choices(mmlu_csv_name)

In [None]:
# Add the A/B columns to the MedQA export written earlier as 'USMLE.csv'.
add_A_B_choices('USMLE.csv')

In [None]:
# Preview: re-read anatomy.csv from disk and show its first 10 rows as the cell output.
df = pd.read_csv('anatomy.csv')
df.head(10)

In [None]:
# Preview: re-read college_biology.csv from disk and show its first 10 rows (rebinds `df`).
df = pd.read_csv('college_biology.csv')
df.head(10)

In [None]:
# Preview: re-read medical_genetics.csv from disk and show its first 10 rows (rebinds `df`).
df = pd.read_csv('medical_genetics.csv')
df.head(10)

In [None]:
# Preview: re-read professional_medicine.csv from disk and show its first 10 rows (rebinds `df`).
df = pd.read_csv('professional_medicine.csv')
df.head(10)

### Make imbalanced dataset

In [None]:
def reweight_correct_choices(filename, target_ratio):
    """Add A/B columns to ``filename`` in which 'A' is correct for a chosen share of rows.

    The existing ``correct_choice``, ``choice_A_str``, ``choice_B_str`` and
    ``correct_choice_id`` columns are left untouched. Four copies are added, each
    named ``'<original name> (<target_ratio as a whole percent>%)'``, e.g.
    ``'correct_choice (5%)'``. In the copies, a fixed-seed random sample of rows
    has its A and B options swapped, so that the number of rows whose correct
    choice is 'A' equals ``floor(target_ratio * number_of_rows)``.

    Args:
        filename: CSV to read and overwrite; must already contain the four
            A/B columns listed above.
        target_ratio: desired fraction (0..1) of rows whose correct choice is 'A'.

    Returns:
        The augmented DataFrame (the same data that is written back to ``filename``).

    Note:
        ``DataFrame.sample`` is used without replacement, so it raises if more rows
        must be flipped than exist with the opposite label (e.g. ``target_ratio > 1``).
    """
    # Read the CSV file
    df = pd.read_csv(filename)
    total_rows = len(df)

    # Count the 'A' rows directly. Rebuilding the count as int(ratio * total) can
    # truncate to one less than the true value (29/100 -> 0.29 * 100 ==
    # 28.999999999999996 -> 28), and looking 'A' up in value_counts() fails when
    # no row is labelled 'A'.
    current_num_A = int((df['correct_choice'] == 'A').sum())
    current_ratio = current_num_A / total_rows if total_rows else 0.0
    print(f"Current ratio of A being correct: {current_ratio*100:.2f}%")

    # Floor of target_ratio * total_rows. The small epsilon keeps a product that
    # is an integer up to floating-point error from being truncated one too low.
    target_num_A = int(target_ratio * total_rows + 1e-9)
    num_to_switch = abs(target_num_A - current_num_A)

    # Select rows to switch
    if target_num_A > current_num_A:
        # Need to increase A correct choices
        rows_to_switch = df[df['correct_choice'] == 'B'].sample(num_to_switch, random_state=42).index
    else:
        # Need to increase B correct choices
        rows_to_switch = df[df['correct_choice'] == 'A'].sample(num_to_switch, random_state=42).index

    # Start the new columns as plain copies of the balanced ones
    suffix = f'({target_ratio*100:.0f}%)'
    for base_column in ('correct_choice', 'choice_A_str', 'choice_B_str', 'correct_choice_id'):
        df[f'{base_column} {suffix}'] = df[base_column]

    # Swap label, option texts and id for the selected rows (always read from the
    # original columns, so the swap does not depend on assignment order)
    df.loc[rows_to_switch, f'correct_choice {suffix}'] = df.loc[rows_to_switch, 'correct_choice'].apply(lambda x: 'A' if x == 'B' else 'B')
    df.loc[rows_to_switch, f'choice_A_str {suffix}'] = df.loc[rows_to_switch, 'choice_B_str']
    df.loc[rows_to_switch, f'choice_B_str {suffix}'] = df.loc[rows_to_switch, 'choice_A_str']
    df.loc[rows_to_switch, f'correct_choice_id {suffix}'] = df.loc[rows_to_switch, 'correct_choice_id'].apply(lambda x: 1 if x == 0 else 0)

    df.to_csv(filename, index=False)
    print(f"Successfully add imbalanced data into {filename}")

    return df

In [None]:
# CSVs that receive the additional imbalanced-ratio columns.
# NOTE(review): this list does not contain anatomy.csv, college_biology.csv,
# medical_genetics.csv, professional_medicine.csv or USMLE.csv, which are the files
# exported by the cells shown earlier in this notebook -- confirm this is intended.
filename_list = [
    'clinical_knowledge.csv', 'college_medicine.csv', 'medQA_en.csv', 'medQA_zh.csv',
    'SDoH_Q1.csv', 'SDoH_Q2.csv', 'SDoH_Q3.csv', 'SDoH_Q4.csv', 'SDoH_Q5.csv',
    'SDoH_Q6.csv', 'SDoH_Q7.csv', 'SDoH_Q8.csv', 'SDoH_Q9.csv',
    'SDoH.csv',
]
#target_ratio_list = [0.1, 0.3, 0.5, 0.7, 0.9]
target_ratio_list = [0.05]  # fraction of rows in which choice A should be the correct one

# Every file is reweighted once per target ratio, files in the outer order.
for filename, target_ratio in (
    (name, ratio) for name in filename_list for ratio in target_ratio_list
):
    reweight_correct_choices(filename, target_ratio)