In [1]:
import google.generativeai as genai
import json
import os
import pandas as pd
import re

# Download the dataset

In [2]:
# Download the input dataset
GOOGLE_DRIVE_DATASET_URL = (
    "https://drive.google.com/file/d/1vXyLOFRc98f097x4CrK9gOOb3JsKvPmN/view?usp=sharing"
)
FILE_ID = re.search("/file/d/(.*)/view", GOOGLE_DRIVE_DATASET_URL).group(1)
DOWNLOAD_URL = f"https://drive.usercontent.google.com/download?id={FILE_ID}&export=download&authuser=0&confirm=t"
original_dataset = pd.read_csv(DOWNLOAD_URL)

In [3]:
original_dataset.head()

Unnamed: 0,question,answer
0,What is (are) Glaucoma ?,Glaucoma is a group of diseases that can damag...
1,What is (are) Glaucoma ?,The optic nerve is a bundle of more than 1 mil...
2,What is (are) Glaucoma ?,Open-angle glaucoma is the most common form of...
3,Who is at risk for Glaucoma? ?,Anyone can develop glaucoma. Some people are a...
4,How to prevent Glaucoma ?,"At this time, we do not know how to prevent gl..."


# Count questions

Count number of occurrences of each question

In [4]:
original_dataset.question.value_counts().value_counts().sort_index()

count
1     14092
2       691
3        93
4        40
5        18
6        19
7         6
8         4
9         4
10        3
11        2
12        4
13        2
14        1
19        1
20        1
Name: count, dtype: int64

### Eliminate duplicates

Let's eliminate rows with duplicated questions and answers.  
The reason is that it would be hard to deal with duplicate questions in the evaluation.  
Even if we control the training / test split to avoid that a given question is on both datasets, to avoid data leakage, we would still have different answers to the same questions, which is potentially a problem for the scoring of model responses.

In [5]:
print(original_dataset.shape)
print(original_dataset.drop_duplicates().shape)

(16406, 2)
(16358, 2)


In [6]:
original_dataset = original_dataset.drop_duplicates()

In [7]:
question_occurence_count = original_dataset.question.value_counts()
print(
    f"{(question_occurence_count > 1).sum() / len(question_occurence_count) * 100:.2f}% of questions appear more than once after removing duplicates"
)

5.83% of questions appear more than once after removing duplicates


In [8]:
print(f"Total unique questions: {len(original_dataset.question.unique())}")

Total unique questions: 14981


# Preprocess correct answers

We would like each question to have only one answer.  
If we have more than one answer to a question, we will use a language model to combine the answers into one.

In [9]:
genai.configure(api_key=os.environ["GEMINI_API_KEY"])
model = genai.GenerativeModel("gemini-1.5-flash")

In [10]:
SUMMARIZE_ANSWERS_PROMPT = """
You are a chatbot programmed to help users with medical issues.
I have a question from a user and a couple of texts that are related to the question.
I want you to summarize the texts producing a concise but detailed response to the user question.
Every important piece of information contained in the texts should be included in the summary.
You cannot use any external data source, only the data that is provided in the prompt.
In the end please add a disclaimer that you can not provide direct medical diagnoses.
The format of the response should be plain text, with no title and avoid using bullet points.

QUESTION:
{question}

CONTEXT:
{context}"""

DISCLAIMER = "Disclaimer: This information is for educational purposes only and should not be considered medical advice.  I cannot provide diagnoses.  Always consult with a qualified healthcare professional for any health concerns or before making any decisions related to your medical care."


def produce_synthetic_summary(question, answers):
    prompt = SUMMARIZE_ANSWERS_PROMPT.format(
        question=question, context="\n".join(answers)
    )
    response = model.generate_content(prompt)
    return response.text

In [11]:
def save_partial(result, filename):
    with open(filename, "a") as f:
        json.dump(result, f)
        f.write("\n")

In [12]:
correct_answers_data = []
for question, question_data in original_dataset.groupby("question"):
    try:
        if len(question_data) > 1:
            response = produce_synthetic_summary(question, question_data.answer)
            result = {"question": question, "answer": response}
        else:
            response = str(question_data.answer.iloc[0]) + "\n" + DISCLAIMER
            result = {"question": question, "answer": response}
        save_partial(result, "correct_answers_partial.jsonl")
        correct_answers_data.append(result)
    except Exception as e:
        print(f"Error processing question:{question}\n{e}")

Error processing question:What are the treatments for Heart Failure ?
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Error processing question:What are the treatments for Sarcoidosis ?
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that the model was reciting from copyrighted material.
Error processing question:What causes Hemochromatosis ?
Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. The candidate's [finish_reason](https://ai.google.dev/api/generate-content#finishreason) is 4. Meaning that t

In [13]:
correct_answers = pd.DataFrame(correct_answers_data)

# Generate incorrect answers

To perform policy optimization in the model we must provide a correct answer and an incorrect answer for each question.  
We are going to use a language model to generate the incorrect answers.

In [14]:
INCORRECT_ANSWER_PROMPT = """
You are a data annotation expert working on an initiative to build a preference optimization dataset.
I have a question from a user and a couple of texts that are related to the question.
I want you to generate a wrong response, with incorrect facts, so we can use it as the rejected answer.
The answer is supposed to be wrong, but realistic.
You cannot use any external data source, only the data that is provided in the prompt.
The format of the response should be plain text, with no title and avoid using bullet points.
Do not add any note saying that the response is incorrect, this is training data for a model, not a response to a human user.

QUESTION:
{question}

CONTEXT:
{context}"""


def produce_incorrect_answer(question, answers):
    prompt = INCORRECT_ANSWER_PROMPT.format(
        question=question, context="\n".join(answers)
    )
    response = model.generate_content(prompt)
    return response.text

In [15]:
incorrect_answers_data = []
for question, question_data in original_dataset.groupby("question"):
    try:
        response = produce_incorrect_answer(question, question_data.answer)
        result = {"question": question, "incorrect_answer": response}
        save_partial(result, "incorrect_answers_partial.jsonl")
        incorrect_answers_data.append(result)
    except Exception as e:
        print(f"Error processing question:{question}\n{e}")

Error processing question:What is (are) Emery-Dreifuss muscular dystrophy, X-linked ?
sequence item 0: expected str instance, float found
Error processing question:What is (are) Emery-Dreifuss muscular dystrophy, dominant type ?
sequence item 0: expected str instance, float found
Error processing question:What is (are) Familial HDL deficiency ?
sequence item 0: expected str instance, float found
Error processing question:What is (are) HELLP syndrome ?
sequence item 0: expected str instance, float found
Error processing question:What is (are) X-linked lymphoproliferative syndrome ?
sequence item 0: expected str instance, float found


In [16]:
incorrect_answers = pd.DataFrame(incorrect_answers_data)

In [17]:
result_df = pd.concat(
    [correct_answers.set_index("question"), incorrect_answers.set_index("question")],
    axis=1,
).reset_index()

In [18]:
result_df.to_csv("preprocessed_data.csv")

# Upload results to Google Drive

Results uploaded to Google Drive:  
[https://drive.google.com/file/d/1jlG-__9zwHB_USs5q2D7ipb4SV0TJFBe/view?usp=sharing](https://drive.google.com/file/d/1jlG-__9zwHB_USs5q2D7ipb4SV0TJFBe/view?usp=sharing)