### Preprocess data

In [1]:
import pandas as pd
import numpy as np

In [5]:
data = pd.read_csv("..\\datasets_file\\output\\synthetic_queries.tsv", sep="\t")

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237 entries, 0 to 236
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   document_index             237 non-null    int64 
 1   document                   237 non-null    object
 2   synthetic_query            237 non-null    object
 3   Context_Relevance_Label    237 non-null    object
 4   generated_answer           144 non-null    object
 5   Answer_Faithfulness_Label  144 non-null    object
 6   Answer_Relevance_Label     144 non-null    object
dtypes: int64(1), object(6)
memory usage: 13.1+ KB


In [7]:
data.head()

Unnamed: 0,document_index,document,synthetic_query,Context_Relevance_Label,generated_answer,Answer_Faithfulness_Label,Answer_Relevance_Label
0,47,b'%PDF-1.7\r%\xe2\xe3\xcf\xd3\r\n114 0 obj\r<<...,What are the clinical findings in this case?,Yes,The clinical syndromic diagnosis is acute flac...,No,No
1,17,b'%PDF-1.7\r%\xe2\xe3\xcf\xd3\r\n106 0 obj\r<<...,What is the most likely diagnosis for the 28-y...,Yes,Sickle cell disease.,Yes,Yes
2,67,b'%PDF-1.7\r%\xe2\xe3\xcf\xd3\r\n121 0 obj\r<<...,What is the most important differential diagno...,No,,,
3,80,b'%PDF-1.7\r%\xe2\xe3\xcf\xd3\r\n115 0 obj\r<<...,What are the key clinical findings in the 31-y...,Yes,Sickle cell disease.,No,No
4,6,b'%PDF-1.7\r%\xe2\xe3\xcf\xd3\r\n101 0 obj\r<<...,What are the most common pathogens associated ...,Yes,The most common pathogens are Staphylococcus a...,Yes,Yes


### Generate strong negative Context_Relevance_Label

In [17]:
import ast
from PyPDF2 import PdfReader
from io import BytesIO
from rank_bm25 import BM25Okapi
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

# Turn bytes of string to bytes
def string_to_bytes(s: str) -> bytes:
    try:
        return ast.literal_eval(s)
    except Exception as e:
        print(f"Error converting to bytes: {e}")
        return b""
    
# Extract text from PDF bytes
def extract_text_from_pdf_bytes(pdf_bytes_string: str) -> str:
    try:
        pdf_bytes = ast.literal_eval(pdf_bytes_string)
        reader = PdfReader(BytesIO(pdf_bytes))
        return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
    except Exception as e:
        print(f"[Error] Failed to extract text: {e}")
        return ""


In [18]:
data['document'] = data['document'].apply(extract_text_from_pdf_bytes)
data['document']

0      82\nA 31-Year-Old Man from Guatemala\nWith Acu...
1      62\nA 28-Year-Old Man from Ghana\nWith a Chron...
2      92\nA 42-Year-Old Traveller Returning\nfrom Th...
3      68\nA 31-Year-Old Woman from\nMalawi With a Ge...
4      15\nA 3-Year-Old Boy from Laos With Right\nSup...
                             ...                        
232    39\nA 30-Year-Old Male Chinese Trader\nWith Fe...
233    60\nA 6-Year-Old Boy from Malawi\nWith Proptos...
234    17\nA 34-Year-Old Man from Thailand\nWith Feve...
235    77\nA 51-Year-Old Female Traveller\nReturning ...
236    31\nA 6-Year-Old Boy from Malawi With\nFever, ...
Name: document, Length: 237, dtype: object

In [7]:
# Tokenize the corpus
tokenized_corpus = [word_tokenize(doc.lower()) for doc in data['document'].unique()]

# Initialize BM25
bm25 = BM25Okapi(tokenized_corpus)

In [8]:
# Count number of positive and weak negative samples (as integers)
num_of_positives = data[data['Context_Relevance_Label'] == 'Yes'].count()
num_of_weak_negatives = data['Context_Relevance_Label'].count() - num_of_positives

# Sample the difference from the positive set
n_to_sample = num_of_positives - num_of_weak_negatives

# Only sample if the number is > 0
if n_to_sample[0] > 0:
    strong_negatives_data = data[data['Context_Relevance_Label'] == "Yes"].sample(n=n_to_sample[0])
else:
    strong_negatives_data = pd.DataFrame()

  if n_to_sample[0] > 0:
  strong_negatives_data = data[data['Context_Relevance_Label'] == "Yes"].sample(n=n_to_sample[0])


In [9]:
strong_negatives_data.iloc[0]

document_index                                                              51
document                     43\nA 35-Year-Old Malawian Woman With a\nPainf...
synthetic_query              What is the most likely diagnosis for the 35-y...
Context_Relevance_Label                                                    Yes
generated_answer             The macroscopic presentation and localization ...
Answer_Faithfulness_Label                                                   No
Answer_Relevance_Label                                                      No
Name: 123, dtype: object

In [10]:
strong_negatives_data = strong_negatives_data[['document_index', 'document', 'synthetic_query', 'Context_Relevance_Label']]

In [11]:
def replace_with_bm25_neighbors(df, text_column: str) -> pd.DataFrame:
    """
    For each row in a DataFrame, replace the document in `text_column` 
    with the document that has the second-highest BM25 similarity 
    (excluding itself) from the entire corpus.

    Args:
        df (pd.DataFrame): The input DataFrame.
        text_column (str): Column name containing the document text.

    Returns:
        pd.DataFrame: A copy of the DataFrame with updated documents.
    """
    df = df.copy()
    
    # Tokenize corpus
    corpus = df[text_column].tolist()
    tokenized_corpus = [word_tokenize(doc.lower()) for doc in corpus]

    # Create BM25 model
    bm25 = BM25Okapi(tokenized_corpus)

    # Replace each row with the second-highest scoring document
    new_documents = []
    for i, query_doc in enumerate(corpus):
        tokenized_query = word_tokenize(query_doc.lower())
        scores = bm25.get_scores(tokenized_query)

        # Exclude the current document itself by setting its score to -inf
        scores[i] = -np.inf

        # Get index of document with second-highest score (after self)
        second_best_idx = np.argmax(scores)

        new_documents.append(corpus[second_best_idx])

    df[text_column] = new_documents
    return df

In [12]:
strong_negatives_data = replace_with_bm25_neighbors(strong_negatives_data, 'document')
strong_negatives_data['Context_Relevance_Label'] = "No"

In [13]:
context_relevance_data = data[['document_index', 'document', 'synthetic_query', 'Context_Relevance_Label']]
context_relevance_data = pd.concat([context_relevance_data, strong_negatives_data], ignore_index=True)

In [14]:
context_relevance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 288 entries, 0 to 287
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   document_index           288 non-null    int64 
 1   document                 288 non-null    object
 2   synthetic_query          288 non-null    object
 3   Context_Relevance_Label  288 non-null    object
dtypes: int64(1), object(3)
memory usage: 9.1+ KB


In [17]:
train_context_relevance_data, test_context_relevance_data = train_test_split(context_relevance_data, test_size=0.2)

In [99]:
context_relevance_data.to_csv("..\\datasets_file\\output\\Context_Relevance_Data.tsv", sep="\t")

In [22]:
train_context_relevance_data.to_csv("..\\datasets_file\\output\\Train_Context_Relevance_Data.tsv", sep="\t")

In [23]:
test_context_relevance_data.to_csv("..\\datasets_file\\output\\Test_Context_Relevance_Data.tsv", sep="\t")

### Generate Answer Faithfulness

In [19]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237 entries, 0 to 236
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   document_index             237 non-null    int64 
 1   document                   237 non-null    object
 2   synthetic_query            237 non-null    object
 3   Context_Relevance_Label    237 non-null    object
 4   generated_answer           144 non-null    object
 5   Answer_Faithfulness_Label  144 non-null    object
 6   Answer_Relevance_Label     144 non-null    object
dtypes: int64(1), object(6)
memory usage: 13.1+ KB


In [31]:
(data["Answer_Faithfulness_Label"] == "Yes").sum() - (data["Answer_Faithfulness_Label"] == "No").sum()

16

In [None]:
def create_negative_answer_data(data: pd.DataFrame, column_name: str) -> pd.DataFrame:
    # Compute how many more "Yes" than "No"
    num_of_data = (data[column_name] == "Yes").sum() - (data[column_name] == "No").sum()
    # Check that enough unlabeled rows exist
    if num_of_data > data[column_name].isna().sum():
        raise ValueError("Not enough unlabeled data to create the required negative samples.")
    # Sample negative rows from unlabeled data
    negative_data = data[data[column_name].isna()].sample(num_of_data, random_state=29).copy()
    # Sample answers from positive examples
    sampled_answer = data[data[column_name] == "Yes"]["generated_answer"].dropna().sample(
        n=num_of_data, random_state=29
    ).reset_index(drop=True)
    # Assign sampled answers and "No" label
    negative_data["generated_answer"] = sampled_answer.values
    negative_data[column_name] = "No"
    

    return negative_data


In [None]:
negative_answer_faithfulness_data = create_negative_answer_data(data, "Answer_Faithfulness_Label")
negative_answer_faithfulness_data.info()
answer_faithfulness_data = data[~data["Answer_Faithfulness_Label"].isna()]

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 50 to 23
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   document_index             16 non-null     int64 
 1   document                   16 non-null     object
 2   synthetic_query            16 non-null     object
 3   Context_Relevance_Label    16 non-null     object
 4   generated_answer           16 non-null     object
 5   Answer_Faithfulness_Label  16 non-null     object
 6   Answer_Relevance_Label     0 non-null      object
dtypes: int64(1), object(6)
memory usage: 1.0+ KB


In [40]:
answer_faithfulness_data = data[~data["Answer_Faithfulness_Label"].isna()]
answer_faithfulness_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 0 to 236
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   document_index             144 non-null    int64 
 1   document                   144 non-null    object
 2   synthetic_query            144 non-null    object
 3   Context_Relevance_Label    144 non-null    object
 4   generated_answer           144 non-null    object
 5   Answer_Faithfulness_Label  144 non-null    object
 6   Answer_Relevance_Label     144 non-null    object
dtypes: int64(1), object(6)
memory usage: 9.0+ KB


In [41]:
answer_faithfulness_data = pd.concat([answer_faithfulness_data, negative_answer_faithfulness_data], ignore_index=True)
answer_faithfulness_data = answer_faithfulness_data[["document", "generated_answer", "Answer_Faithfulness_Label"]]
answer_faithfulness_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 3 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   document                   160 non-null    object
 1   generated_answer           160 non-null    object
 2   Answer_Faithfulness_Label  160 non-null    object
dtypes: object(3)
memory usage: 3.9+ KB


In [42]:
train_answer_faithfulness_data, test_answer_faithfulness_data = train_test_split(answer_faithfulness_data, test_size=0.2)

In [43]:
train_answer_faithfulness_data.to_csv("..\\datasets_file\\output\\Train_Answer_Faithfulness_Data.tsv", sep="\t")
test_answer_faithfulness_data.to_csv("..\\datasets_file\\output\\Test_Answer_Faithfulness_Data.tsv", sep="\t")

### Generate Answer Relevance

In [None]:
negative_answer_relevance_data = create_negative_answer_data(data, "Answer_Relevance_Label")
negative_answer_relevance_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 16 entries, 50 to 23
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   document_index             16 non-null     int64 
 1   document                   16 non-null     object
 2   synthetic_query            16 non-null     object
 3   Context_Relevance_Label    16 non-null     object
 4   generated_answer           16 non-null     object
 5   Answer_Faithfulness_Label  0 non-null      object
 6   Answer_Relevance_Label     16 non-null     object
dtypes: int64(1), object(6)
memory usage: 1.0+ KB


In [45]:
answer_relevance_data = data[~data["Answer_Relevance_Label"].isna()]
answer_relevance_data = data[~data["Answer_Relevance_Label"].isna()]
answer_relevance_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 144 entries, 0 to 236
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   document_index             144 non-null    int64 
 1   document                   144 non-null    object
 2   synthetic_query            144 non-null    object
 3   Context_Relevance_Label    144 non-null    object
 4   generated_answer           144 non-null    object
 5   Answer_Faithfulness_Label  144 non-null    object
 6   Answer_Relevance_Label     144 non-null    object
dtypes: int64(1), object(6)
memory usage: 9.0+ KB


In [47]:
answer_relevance_data = pd.concat([answer_relevance_data, negative_answer_relevance_data], ignore_index=True)
answer_relevance_data = answer_relevance_data[["synthetic_query", "generated_answer", "Answer_Relevance_Label"]]
answer_relevance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176 entries, 0 to 175
Data columns (total 3 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   synthetic_query         176 non-null    object
 1   generated_answer        176 non-null    object
 2   Answer_Relevance_Label  176 non-null    object
dtypes: object(3)
memory usage: 4.2+ KB


In [48]:
train_answer_relevance_data, test_answer_relevance_data = train_test_split(answer_relevance_data, test_size=0.2)
train_answer_relevance_data.to_csv("..\\datasets_file\\output\\Train_Answer_Relevance_Data.tsv", sep="\t")
test_answer_relevance_data.to_csv("..\\datasets_file\\output\\Test_Answer_Relevance_Data.tsv", sep="\t")