# RAG System - Baseline and Transformation Results

## Libraries and Conifguration

In [1]:
from dotenv import load_dotenv, find_dotenv
import openai
import os
import nltk
import logging
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
import faiss
import time
from rouge import Rouge
from typing import List, Optional, Union, Dict
from enum import Enum
import numpy as np
import tiktoken
from llmlingua import PromptCompressor
import re

  from tqdm.autonotebook import tqdm, trange


In [1]:
import torch
print("CUDA Available:", torch.cuda.is_available())
print("CUDA Version:", torch.version.cuda) 

CUDA Available: True
CUDA Version: 12.4


In [4]:
load_dotenv(find_dotenv())

# Get OpenAI Key
SECRET_KEY = os.environ.get("OPENAI_KEY")
os.environ['OPENAI_API_KEY'] = SECRET_KEY

In [3]:
# Suppress httpx INFO logs
# logging.getLogger("httpx").setLevel(logging.WARNING)

# Ensure NLTK's punkt tokenizer is downloaded
nltk.download('punkt')

# Base directory paths
BASE_DATA_PATH = os.path.join('..', 'data', 'cleaned')
BASE_INDEX_PATH = os.path.join('..', 'models', 'embeddings')

# Sector-specific configurations
SECTORS = {
    'education': {
        'dataset_path': os.path.join(BASE_DATA_PATH, 'squad', 'squad_train-v2.0_with_topics.json'),
        'dataset_test': os.path.join('data', 'education', 'splits', 'education_test.csv'),
        'default_index_path': os.path.join(BASE_INDEX_PATH, 'squad_faiss_index_new.idx'),
        'embedding_model': 'all-mpnet-base-v2',
        'prompt_prefix': "Answer the following educational question based on the provided context. Provide only a single, concise word or short phrase as the answer without any additional explanation or context.\n\nQuestion: What was the name of Shen Fu's memoir?\nAnswer: Six Chapters of a Floating Life\n\nQuestion: What became more common during the Baroque era?\nAnswer: vocal forms",
        'prompt_eval_prefix': "As an expert evaluator, compare the following generated answer with the ground truth",
        'context_column': 'context',
        'question_category': 'question_class',
        'answer_truth': 'answers',
        'file_type': 'json',
        'hyperparameters': {
            'batch_size': 5,
            'max_retries': 5,
            'initial_delay': 1,
            'temperature': 1.0,
            'k': 7,
            'top_p': 0.86
        }
    },
    'healthcare': {
        'dataset_path': os.path.join(BASE_DATA_PATH, 'pubmedqa', 'pubmedqa_cleaned_topics_subset.csv'),
        'dataset_test': os.path.join('data', 'healthcare', 'splits', 'healthcare_test.csv'),
        'default_index_path': os.path.join(BASE_INDEX_PATH, 'pubmedqa_faiss_index_subset.idx'),
        'embedding_model': 'pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb',
        'prompt_prefix': "Answer the following medical question based on the provided context. Please provide a concise and direct answer.",
        'prompt_eval_prefix': "You are a healthcare expert in evaluating AI-generated healthcare answers compared with their ground truth",
        'context_column': 'context',
        'answer_truth': 'answer',
        'file_type': 'csv',
        'hyperparameters': {
            'batch_size': 5,
            'max_retries': 5,
            'initial_delay': 1,
            'temperature': 0.59,
            'k': 7,
            'top_p': 0.57
        }
    }
}

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Emir\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Create folders
required_folders = [
    'data/education/processed/',
    'data/healthcare/processed/'
]

for folder in required_folders:
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)
        print(f"Created missing folder: {folder}")
    else:
        print(f"Folder exists: {folder}")

Created missing folder: data/education/processed/
Created missing folder: data/healthcare/processed/


## RAG Setup

In [7]:
class QueryTransformationType(Enum):
    NONE = "none"
    BASELINE = "baseline"
    COOKBOOK = "cookbook"
    HYDE = "hyde"
    COMPRESSION = "compression"

class RAGSystem:
    def __init__(self, sector_config, hyperparameters, questions_df, faiss_index_path=None, transformation_type=QueryTransformationType.NONE, model_name='gpt-4o'):
        self.dataset_test = sector_config['dataset_test']
        self.dataset_path = sector_config['dataset_path']
        self.file_type = sector_config.get('file_type', 'json')
        self.context_column = sector_config.get('context_column', 'context')
        self.question_column = sector_config.get('question_column', 'question')
        self.answer_column = 'generated_answer'
        self.answer_truth = sector_config.get('answer_truth', 'answer')
        self.index_path = faiss_index_path if faiss_index_path else sector_config['default_index_path']
        self.embedding_model_name = sector_config['embedding_model']
        self.prompt_prefix = sector_config['prompt_prefix']
        self.prompt_eval_prefix = sector_config['prompt_eval_prefix']

        # Hyperparameters
        self.temperature = hyperparameters.get('temperature', 0.7)
        self.k = hyperparameters.get('k', 5)
        self.top_p = hyperparameters.get('top_p', 0.5)
        self.batch_size = hyperparameters.get('batch_size', 5)
        self.max_retries = hyperparameters.get('max_retries', 5)
        self.initial_delay = hyperparameters.get('initial_delay', 1)

        # Query transformation settings
        if isinstance(transformation_type, str):
            transformation_type = QueryTransformationType[transformation_type.upper()]
        self.transformation_type = transformation_type
        
        # Initialize models
        self.embedding_model = SentenceTransformer(self.embedding_model_name)
        self.contexts = self._load_contexts()
        self.index = self._load_faiss_index()
        self.questions_df = questions_df.copy()
        self.model_llm = model_name
        
        # Initialize query transformer if needed
        self.query_transformer = self._initialize_transformer()

    def _initialize_transformer(self):
        """Initialize the appropriate query transformer based on transformation type."""
        if self.transformation_type == QueryTransformationType.NONE:
            return None

        # Initialize model that is used for transformation
        client = openai.OpenAI()
        return QueryTransformer(client)

       
    def _load_contexts(self):
        if not os.path.exists(self.dataset_path):
            raise FileNotFoundError(f"Dataset not found at {self.dataset_path}")

        if self.file_type.lower() == 'json':
            data = pd.read_json(self.dataset_path, orient='records', lines=True)
        elif self.file_type.lower() == 'csv':
            data = pd.read_csv(self.dataset_path)
        else:
            raise ValueError(f"Unsupported file type: {self.file_type}")

        if self.context_column not in data.columns:
            raise ValueError(f"Context column '{self.context_column}' not found in the dataset.")

        contexts = data[self.context_column].dropna().unique().tolist()
        return contexts

    def _load_faiss_index(self):
        if not os.path.exists(self.index_path):
            raise FileNotFoundError(f"FAISS index not found at {self.index_path}. Please ensure the index is created.")
        
        print(f"Loading FAISS index from {self.index_path}...")
        index = faiss.read_index(self.index_path)
        return index

    def transform_query(self, query: str):
        """Apply the selected transformation strategy to the query."""
        if not self.query_transformer or self.transformation_type == QueryTransformationType.NONE:
            return query, 0

        # Start timing for query transformation
        start_time = time.time()

        if self.transformation_type == QueryTransformationType.COOKBOOK:
            transformed = self.query_transformer.cookbook_rewrite(query, n_variations=3)
            transformed_query = transformed[0]  # Use the first variation
            
        elif self.transformation_type == QueryTransformationType.HYDE:
            transformed_query = self.query_transformer.hyde_expansion(query)
            
        elif self.transformation_type == QueryTransformationType.COMPRESSION:
            transformed_query = self.query_transformer.llm_compression(query)
            
        else:
            transformed_query = query

        # End timing for query transformation
        transformation_latency = time.time() - start_time

        return transformed_query, transformation_latency

    def retrieve_relevant_contexts(self, query):
        # Transform the query before retrieval
        transformed_query, transformation_latency = self.transform_query(query)

        # Retrieve the reference answer's contexts for evaluation
        relevant_contexts = self.questions_df.iloc[self.current_idx][self.context_column]

       # Standard single query processing
        query_embedding = self.embedding_model.encode([transformed_query], convert_to_numpy=True).astype('float32')
        distances, indices = self.index.search(query_embedding, self.k)
        retrieved_contexts = [self.contexts[idx] for idx in indices[0] if idx < len(self.contexts)]

        # Calculate metrics
        self.questions_df.loc[self.current_idx, 'query'] = transformed_query
        transformed_query_token = self.calculate_token_count(transformed_query)
        self.questions_df.loc[self.current_idx, 'token_query'] = transformed_query_token
        context_text_token = "\n\n".join(retrieved_contexts)
        full_prompt = f"{self.prompt_prefix}\n\nContext:\n{context_text_token}\n\nQuestion:\n{transformed_query}\n\nAnswer:"
        full_prompt_token_count = self.calculate_token_count(full_prompt)
        self.questions_df.loc[self.current_idx, 'token_query_full_prompt'] = full_prompt_token_count
      
        # Calculate metrics if ground truth contexts are available
        if relevant_contexts:
            recall_k = self.recall_at_k(retrieved_contexts, relevant_contexts, self.k)
            mrr = self.mean_reciprocal_rank(retrieved_contexts, relevant_contexts)
            ndcg_k = self.normalized_discounted_cumulative_gain(retrieved_contexts, relevant_contexts, self.k)
            
            # Store metric values in the questions_df for the current question
            self.questions_df.loc[self.current_idx, 'Recall@k'] = recall_k
            self.questions_df.loc[self.current_idx, 'MRR'] = mrr
            self.questions_df.loc[self.current_idx, 'nDCG@k'] = ndcg_k

        return retrieved_contexts, transformation_latency

    def generate_answer(self, question, retrieved_contexts, reference_answer):
         # Start timing for retrieval and generation
        start_time = time.time()
        
        delay = self.initial_delay
        for attempt in range(self.max_retries):
            try:
                # Prepare context
                context_text = "\n\n".join(retrieved_contexts)

                client = openai.OpenAI()
                response = client.chat.completions.create(
                messages=[
                    {"role": "system", "content": f"{self.prompt_prefix}"},
                    {
                        "role": "user",
                        "content": f"Context:\n{context_text}\n\nQuestion:\n{question}\n\nAnswer:"
                    }
                ],
                model=self.model_llm,
                temperature=self.temperature,
                top_p=self.top_p
                )
                answer = response.choices[0].message.content

                # End timing for retrieval and generation
                retrieval_generation_latency = time.time() - start_time

                self.questions_df.loc[self.current_idx, 'retrieval_generation_latency'] = retrieval_generation_latency

                
                # Calculate gpt_score if reference_answer is provided
                score = self.gpt_score(answer, reference_answer) if reference_answer else None
                
                return answer, score 

            except openai.OpenAIError as e:
                print(f"OpenAI API error on attempt {attempt + 1}: {e}")
                time.sleep(delay)
                delay *= 2  # Exponential backoff
            except Exception as e:
                print(f"Unexpected error: {e}")
                return "NO RESPONSE", None 
        print("Max retries exceeded. Returning 'NO RESPONSE'.")
        return "NO RESPONSE", None 

    def calculate_token_count(self, prompt):
        """Calculate the number of tokens in a given prompt for a specified model."""
        encoding = tiktoken.encoding_for_model(self.model_llm)
        tokens = encoding.encode(prompt)
        return len(tokens)

    def gpt_score(self, generated_answer, reference_answer):
        """Evaluates the quality of the generated answer using gpt_score.
        
        Args:
            generated_answer: The answer generated by the model.
            reference_answer: The reference (ground truth) answer.
            
        Returns:
            A score indicating the quality of the generated answer.
        """
        prompt = f"""
        {self.prompt_eval_prefix}.
        Provide a score between 0.0 and 1.0, where 1.0 indicates perfect alignment in terms of correctness, relevance, and completeness.
    
        Ground Truth:
        {reference_answer}
        
        Generated Answer:
        {generated_answer}
        
        Score (just the number):
        """
        client = openai.OpenAI()
        response = client.chat.completions.create(
            model=self.model_llm,
            messages=[{"role": "user", "content": prompt}],
            temperature=0
        )
        score = response.choices[0].message.content.strip()
        try:
            return float(score)
        except ValueError:
            return None

    def process_questions(self, start_index=0, output_path='output.csv'):
        total_questions = len(self.questions_df)
        for idx in range(start_index, total_questions):
            self.current_idx = idx 

            # Start total processing time
            total_start_time = time.time()

            # Obtain question along with context and ground-truth answer
            question = self.questions_df.iloc[idx][self.question_column]
            retrieved_contexts, transformation_latency = self.retrieve_relevant_contexts(question)
            reference_answer = self.questions_df.iloc[idx].get(self.answer_truth) 

            self.questions_df.loc[self.current_idx, 'transformation_latency'] = transformation_latency

            # Generate answer
            answer, score = self.generate_answer(question, retrieved_contexts, reference_answer)
    
            # End total processing time and store it
            total_latency = time.time() - total_start_time
            self.questions_df.loc[idx, 'total_latency'] = total_latency
                        
            # Use .loc[] to avoid the SettingWithCopyWarning
            self.questions_df.loc[idx, self.answer_column] = answer
            self.questions_df.loc[idx, 'gpt_score'] = score

            # Add tokens
            # Insert 'total_token_size' after 'token_query_full_prompt'     
            if 'total_token_size' not in self.questions_df.columns:
                token_query_index = self.questions_df.columns.get_loc('token_query_full_prompt')
                self.questions_df.insert(token_query_index + 1, 'total_token_size', 0)

            token_size_full_prompt = self.questions_df.loc[idx, 'token_query_full_prompt']
            self.questions_df.loc[idx, 'total_token_size'] = token_size_full_prompt + self.calculate_token_count(answer)

            # Incorporate additional token size from the transformation queries
            if self.transformation_type == QueryTransformationType.HYDE:
                # Create the hyde transformation prompt
                hyde_prompt = f"""Write a detailed hypothetical passage that would perfectly answer this query:
        
                Query: {self.questions_df.loc[idx, self.question_column]}
        
                Write in a natural, informative style. Focus on key facts and details that would be relevant 
                for retrieving similar passages."""
                token_transform = self.calculate_token_count(hyde_prompt)
                token_query = self.calculate_token_count(self.questions_df.loc[idx, 'query'])
        
                # Calculate total token size for cookbook transformation
                self.questions_df.loc[idx, 'total_token_size'] += token_transform + token_query
                
            elif self.transformation_type == QueryTransformationType.COOKBOOK:
                # Create the cookbook transformation prompt
                cookbook_prompt = f"""Given the search query below, generate 1 different version that:
                1. Use different relevant synonyms and terms
                2. Maintain the core intent but vary length and style
                3. Include both shorter and longer versions
                
                Original query: {self.questions_df.loc[idx, self.question_column]}
        
                Return just the rewritten queries, one per line."""
                
                token_transform = self.calculate_token_count(cookbook_prompt)
                token_query = self.calculate_token_count( self.questions_df.loc[idx, 'query'])
        
                # Calculate total token size for cookbook transformation
                self.questions_df.loc[idx, 'total_token_size'] += token_transform + token_query

            # Save progress every 'batch_size' iterations
            if (idx + 1) % self.batch_size == 0 or idx == total_questions - 1:
                self.questions_df.to_csv(output_path, index=False)
                print(f"Progress saved at question {idx + 1}/{total_questions}.")
        print("Processing complete.")

    # Retrieval Performance Metrics
    def recall_at_k(self, retrieved, relevant, k):
        """Compute Recall@k for a single query."""

        relevant = relevant if isinstance(relevant, list) else [relevant]
        retrieved_at_k = retrieved[:k]
        relevant_retrieved = [doc for doc in retrieved_at_k if doc in relevant]
        return len(relevant_retrieved) / len(relevant) if relevant else 0.0

    def mean_reciprocal_rank(self, retrieved, relevant):
        """Compute Mean Reciprocal Rank (MRR) for a single query."""
        for idx, doc in enumerate(retrieved):
            if doc in relevant:
                return 1 / (idx + 1)
        return 0.0

    def discounted_cumulative_gain(self, retrieved, relevant):
        """Compute Discounted Cumulative Gain (DCG) for a single query."""
        dcg = 0.0
        for i, doc in enumerate(retrieved):
            if doc in relevant:
                dcg += 1 / np.log2(i + 2)  # DCG formula with log base 2
        return dcg

    def normalized_discounted_cumulative_gain(self, retrieved, relevant, k):
        """Compute nDCG@k for a single query with one relevant document."""
        dcg_k = self.discounted_cumulative_gain(retrieved[:k], relevant)
        ideal_dcg_k = 1 / np.log2(2)  # iDCG when relevant doc is ideally ranked first
        return dcg_k / ideal_dcg_k if ideal_dcg_k > 0 else 0.0

In [8]:
class QueryTransformer:
    def __init__(self, model):
        """Initialize with an LLM that implements a generate() method.
        
        Args:
            model: LLM instance with generate() method returning string completion
        """
        self.model = model
        self.model_llm = 'gpt-4o'
        self.prompt_prefix = "You are a helpful assistant."

    def generate_response(self, content: str) -> str:
        """Helper to send a standardized request to the model.
        
        Args:
            content: Main content for the 'user' role in the message.
            
        Returns:
            Model-generated completion as a string.
        """
        response = self.model.chat.completions.create(
            messages=[
                {"role": "system", "content": self.prompt_prefix},
                {"role": "user", "content": content}
            ],
            model=self.model_llm
        )
        return response.choices[0].message.content.strip()
        

    def cookbook_rewrite(self, query: str, n_variations: int = 3) -> List[str]:
        """Implements Microsoft AI Cookbook query rewriting strategy.
        
        Args:
            query: Original user query
            n_variations: Number of rewrites to generate
            
        Returns:
            List of rewritten queries
        """
        prompt = f"""Given the search query below, generate {n_variations} different versions that:
        1. Use different relevant synonyms and terms
        2. Maintain the core intent but vary length and style
        3. Include both shorter and longer versions
        
        Original query: {query}
        
        Return just the rewritten queries, one per line."""

        response = self.generate_response(prompt)
        variations = [q.strip() for q in response.split('\n') if q.strip()]
        return variations[:n_variations]

    def hyde_expansion(self, query: str) -> str:
        """Implements HyDE (Hypothetical Document Embedding) expansion.
        
        Args:
            query: Original user query
            
        Returns:
            Expanded query with hypothetical answer context
        """
        prompt = f"""Write a detailed hypothetical passage that would perfectly answer this query:
        
        Query: {query}
        
        Write in a natural, informative style. Focus on key facts and details that would be relevant 
        for retrieving similar passages."""

        hypothetical_doc = self.generate_response(prompt)
        
        # Combine original query with hypothetical document
        expanded_query = f"{query}\n\nRelevant context: {hypothetical_doc}"
        return expanded_query


    def llm_compression(self, query, comp_rate=0.5):
        """Implements LLM-based query compression using LLMLingua.
        
        Args:
            query: Original user query
            comp_rate: Compression rate
            
        Returns:
            Compressed query maintaining key intent
        """

        # Load compressor
        compressor = PromptCompressor(
              model_name="microsoft/llmlingua-2-bert-base-multilingual-cased-meetingbank",
              use_llmlingua2=True, # Use llmlingua-2
        )

        # Compress prompt
        compressed_output = compressor.compress_prompt(
            query,
            rate=comp_rate
        )

        return compressed_output["compressed_prompt"]
        
    def transform_query(self, 
                       query: str,
                       strategies: List[str],
                       **kwargs) -> dict:
        """Apply multiple transformation strategies to a query.
        
        Args:
            query: Original user query
            strategies: List of strategies to apply ('rewrite', 'hyde', 'compress', 'chunk')
            **kwargs: Strategy-specific parameters
            
        Returns:
            Dictionary mapping strategy names to transformed queries
        """
        results = {}
        
        for strategy in strategies:
            if strategy == 'rewrite':
                n = kwargs.get('n_variations', 3)
                results['rewrite'] = self.cookbook_rewrite(query, n)
            elif strategy == 'hyde':
                results['hyde'] = self.hyde_expansion(query)
            elif strategy == 'compress':
                max_tokens = kwargs.get('max_tokens', 50)
                results['compress'] = self.llm_compression(query, max_tokens)
                
        return results

In [9]:
def populate_answers(sector, df, transformation_type = QueryTransformationType.NONE, output_suffix: str = "baseline"):
    """
    Populate baseline answers using specified query transformation strategy.
    
    Args:
        sector: Sector identifier
        df: Input DataFrame with questions
        transformation_type: Query transformation strategy to use
        output_suffix: Suffix for output filename
    """
    # Select sector configuration
    sector_config = SECTORS[sector]
    
    # Extract hyperparameters from sector config
    hyperparameters = {
        'top_p': sector_config['hyperparameters']['top_p'],
        'temperature': sector_config['hyperparameters']['temperature'],
        'k': sector_config['hyperparameters']['k'],
        'batch_size': sector_config['hyperparameters']['batch_size'],
        'max_retries': sector_config['hyperparameters']['max_retries'],
        'initial_delay': sector_config['hyperparameters']['initial_delay']
    }
    
    # Initialize RAGSystem with transformation strategy
    rag_system = RAGSystem(
        sector_config=sector_config,
        hyperparameters=hyperparameters,
        questions_df=df,
        faiss_index_path=None,
        transformation_type=transformation_type,
        model_name='gpt-4o'
    )
    
    # Create output filename based on transformation type
    transform_name = transformation_type.value if isinstance(transformation_type, QueryTransformationType) else transformation_type
    if output_suffix == "baseline": 
        output_suffix = transform_name
    
    output_path = f'data/{sector}/processed/2_{sector}_{output_suffix}.csv'
    
    # Process questions and save to output file
    rag_system.process_questions(start_index=0, output_path=output_path)
    
    print(f'Output saved in: {output_path}')

## Populate answers

In [19]:
def stratified_sampling(sector, sample_size=10, random_state=42):
    sector_config = SECTORS[sector]

    # Read the file
    df = pd.read_csv(sector_config['dataset_test'])

    # Define required columns
    required_columns = [
        sector_config['context_column'],
        'question',
        sector_config['answer_truth'],
        'topic_lda'
    ]
    
    # Conditionally add `question_category` if available
    question_category = sector_config.get('question_category', None)
    if question_category and question_category in df.columns:
        required_columns.append(question_category)
        has_question_category = True
    else:
        has_question_category = False

    # Ensure required columns exist in the DataFrame
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in the DataFrame.")

    # Convert 'answer_truth' column from list to string (select the first answer if it's a list)
    df[sector_config['answer_truth']] = df[sector_config['answer_truth']].apply(
        lambda x: x[0] if isinstance(x, list) else x
    )
    
    # Drop duplicates and keep only required columns
    df = df[required_columns].drop_duplicates()

    # Perform stratified sampling
    if has_question_category:
        # Stratify based on both `topic_lda` and `question_category`
        sampled_df = df.groupby(['topic_lda', question_category], group_keys=False).apply(
            lambda x: x.sample(n=min(sample_size, len(x)), random_state=random_state)
        ).reset_index(drop=True)
    else:
        # Stratify based on `topic_lda` only
        sampled_df = df.groupby('topic_lda', group_keys=False).apply(
            lambda x: x.sample(n=min(sample_size, len(x)), random_state=random_state)
        ).reset_index(drop=True)

    return sampled_df

### Education

In [12]:
# Perform stratified sampling from testing dataset
edu_sampled_df = stratified_sampling('education', sample_size=40)

  sampled_df = df.groupby(['topic_lda', question_category], group_keys=False).apply(


In [13]:
edu_sampled_df.shape

(929, 5)

In [14]:
edu_sampled_df['question_class'].value_counts()

question_class
DESC    160
ENTY    160
HUM     160
LOC     160
NUM     160
ABBR    129
Name: count, dtype: int64

In [15]:
edu_sampled_df['topic_lda'].value_counts()

topic_lda
1    240
2    240
0    225
3    224
Name: count, dtype: int64

In [16]:
edu_sampled_df.to_csv('data/education/processed/1_education_noanswer.csv', index=False)

#### Few-shot prompting (education)

In [78]:
# Extract unique contexts from the testing dataset (edu_sampled_df)
testing_contexts = edu_sampled_df['context'].unique()

# Read the entire dataset
df = pd.read_csv(SECTORS['education']['dataset_test'])

# Filter rows where the context is not in the testing contexts
filtered_df = df[~df['context'].isin(testing_contexts)]

# Display the filtered DataFrame containing only questions with unique contexts not in the testing dataset
filtered_df.to_csv('data/education/processed/1_education_few-shot.csv', index=False)

filtered_df.head()

Unnamed: 0,context,question,answers,topic_lda,question_class
0,The majority of Greek Cypriots identify as Gre...,What religion do most Turkish Cypriots identif...,Sunni Islam,0,ENTY
1,Guinea-Bissau's GDP per capita is one of the l...,What are Guinea-Bissau's major exports?,"fish, cashew nuts and ground nuts",2,DESC
2,Autodidacticism (also autodidactism) is a cont...,Which famous inventor was a Autodidact?,Thomas Alva Edison,1,HUM
3,The Lacey Act of 1900 was the first federal la...,What did the first federal wildlife commerce l...,interstate commerce of animals killed in viola...,2,ENTY
4,"John Forbes Kerry was born on December 11, 194...",What was Kerry's mother's religion?,Episcopalian,3,ENTY


In [79]:
# What was the name of Shen Fu's memoir?
# Six Chapters of a Floating Life

# What became more common during the Baroque era?
# vocal forms

#### Baseline

In [14]:
# Read the education testing dataset
edu_sampled_df = pd.read_csv('data/education/processed/1_education_noanswer.csv')

In [18]:
# Baseline
populate_answers("education", edu_sampled_df, QueryTransformationType.BASELINE)

Loading FAISS index from ..\models\embeddings\squad_faiss_index_new.idx...
Progress saved at question 5/929.
Progress saved at question 10/929.
Progress saved at question 15/929.
Progress saved at question 20/929.
Progress saved at question 25/929.
Progress saved at question 30/929.
Progress saved at question 35/929.
Progress saved at question 40/929.
Progress saved at question 45/929.
Progress saved at question 50/929.
Progress saved at question 55/929.
Progress saved at question 60/929.
Progress saved at question 65/929.
Progress saved at question 70/929.
Progress saved at question 75/929.
Progress saved at question 80/929.
Progress saved at question 85/929.
Progress saved at question 90/929.
Progress saved at question 95/929.
Progress saved at question 100/929.
Progress saved at question 105/929.
Progress saved at question 110/929.
Progress saved at question 115/929.
Progress saved at question 120/929.
Progress saved at question 125/929.
Progress saved at question 130/929.
Progress 

#### Query transformation

In [17]:
# Using cookbook rewriting
populate_answers("education", edu_sampled_df, QueryTransformationType.COOKBOOK)

Loading FAISS index from ..\models\embeddings\squad_faiss_index_new.idx...
Progress saved at question 5/929.
Progress saved at question 10/929.
Progress saved at question 15/929.
Progress saved at question 20/929.
Progress saved at question 25/929.
Progress saved at question 30/929.
Progress saved at question 35/929.
Progress saved at question 40/929.
Progress saved at question 45/929.
Progress saved at question 50/929.
Progress saved at question 55/929.
Progress saved at question 60/929.
Progress saved at question 65/929.
Progress saved at question 70/929.
Progress saved at question 75/929.
Progress saved at question 80/929.
Progress saved at question 85/929.
Progress saved at question 90/929.
Progress saved at question 95/929.
Progress saved at question 100/929.
Progress saved at question 105/929.
Progress saved at question 110/929.
Progress saved at question 115/929.
Progress saved at question 120/929.
Progress saved at question 125/929.
Progress saved at question 130/929.
Progress 

In [15]:
# Using HyDE
populate_answers("education", edu_sampled_df, QueryTransformationType.HYDE)

Loading FAISS index from ..\models\embeddings\squad_faiss_index_new.idx...
Progress saved at question 5/929.
Progress saved at question 10/929.
Progress saved at question 15/929.
Progress saved at question 20/929.
Progress saved at question 25/929.
Progress saved at question 30/929.
Progress saved at question 35/929.
Progress saved at question 40/929.
Progress saved at question 45/929.
Progress saved at question 50/929.
Progress saved at question 55/929.
Progress saved at question 60/929.
Progress saved at question 65/929.
Progress saved at question 70/929.
Progress saved at question 75/929.
Progress saved at question 80/929.
Progress saved at question 85/929.
Progress saved at question 90/929.
Progress saved at question 95/929.
Progress saved at question 100/929.
Progress saved at question 105/929.
Progress saved at question 110/929.
Progress saved at question 115/929.
Progress saved at question 120/929.
Progress saved at question 125/929.
Progress saved at question 130/929.
Progress 

In [16]:
# Using compression
populate_answers("education", edu_sampled_df, QueryTransformationType.COMPRESSION)

Loading FAISS index from ..\models\embeddings\squad_faiss_index_new.idx...
Progress saved at question 5/929.
Progress saved at question 10/929.
Progress saved at question 15/929.
Progress saved at question 20/929.
Progress saved at question 25/929.
Progress saved at question 30/929.
Progress saved at question 35/929.
Progress saved at question 40/929.
Progress saved at question 45/929.
Progress saved at question 50/929.
Progress saved at question 55/929.
Progress saved at question 60/929.
Progress saved at question 65/929.
Progress saved at question 70/929.
Progress saved at question 75/929.
Progress saved at question 80/929.
Progress saved at question 85/929.
Progress saved at question 90/929.
Progress saved at question 95/929.
Progress saved at question 100/929.
Progress saved at question 105/929.
Progress saved at question 110/929.
Progress saved at question 115/929.
Progress saved at question 120/929.
Progress saved at question 125/929.
Progress saved at question 130/929.
Progress 

### Healthcare

In [22]:
# Perform stratified sampling from testing dataset
hc_sampled_df = stratified_sampling('healthcare', sample_size=465)

  sampled_df = df.groupby('topic_lda', group_keys=False).apply(


In [23]:
hc_sampled_df.shape

(930, 4)

In [28]:
hc_sampled_df.head()

Unnamed: 0,context,question,answer,topic_lda
0,The etiology of benign prostatic hypertrophy (...,Does prostate-specific antigen induce prolifer...,These findings show that PSA is able to induce...,0
1,Platelet-rich plasma is characterized by conta...,Does platelet-rich plasma suppress osteoclasto...,"Under our experimental conditions, platelet-ri...",0
2,To investigate the effects of intravenous pent...,Does tilidine affect human sphincter of Oddi m...,"In contrast to 30 mg of pentazocine, 50 mg of ...",0
3,Animals respond to inflammation by suppressing...,Is expression of myeloid differentiation facto...,Sickness behavior is mediated by MyD88 and is ...,0
4,To investigate the role of Rho A and Rho-kinas...,Does inhibition of Rho-kinase protect the hear...,These results suggest that Rho-kinase plays a ...,0


In [30]:
hc_sampled_df['topic_lda'].value_counts()

topic_lda
0    465
1    465
Name: count, dtype: int64

In [31]:
hc_sampled_df.to_csv('data/healthcare/processed/1_healthcare_noanswer.csv', index=False)

#### Baseline

In [17]:
# Read the healthcare testing dataset
hc_sampled_df = pd.read_csv('data/healthcare/processed/1_healthcare_noanswer.csv', nrows=5)

#### Query transformation

In [52]:
# Baseline
populate_answers("healthcare", hc_sampled_df, QueryTransformationType.BASELINE)

Loading FAISS index from ..\models\embeddings\pubmedqa_faiss_index_subset.idx...
Progress saved at question 5/5.
Processing complete.
Output saved in: data/healthcare/processed/2_healthcare_baseline.csv


In [57]:
# Using cookbook rewriting
populate_answers("healthcare", hc_sampled_df, QueryTransformationType.COOKBOOK)

Loading FAISS index from ..\models\embeddings\pubmedqa_faiss_index_subset.idx...
Progress saved at question 5/930.
Progress saved at question 10/930.
Progress saved at question 15/930.
Progress saved at question 20/930.
Progress saved at question 25/930.
Progress saved at question 30/930.
Progress saved at question 35/930.
Progress saved at question 40/930.
Progress saved at question 45/930.
Progress saved at question 50/930.
Progress saved at question 55/930.
Progress saved at question 60/930.
Progress saved at question 65/930.
Progress saved at question 70/930.
Progress saved at question 75/930.
Progress saved at question 80/930.
Progress saved at question 85/930.
Progress saved at question 90/930.
Progress saved at question 95/930.
Progress saved at question 100/930.
Progress saved at question 105/930.
Progress saved at question 110/930.
Progress saved at question 115/930.
Progress saved at question 120/930.
Progress saved at question 125/930.
Progress saved at question 130/930.
Pro

In [58]:
# Using HyDE
populate_answers("healthcare", hc_sampled_df, QueryTransformationType.HYDE)

Loading FAISS index from ..\models\embeddings\pubmedqa_faiss_index_subset.idx...
Progress saved at question 5/930.
Progress saved at question 10/930.
Progress saved at question 15/930.
Progress saved at question 20/930.
Progress saved at question 25/930.
Progress saved at question 30/930.
Progress saved at question 35/930.
Progress saved at question 40/930.
Progress saved at question 45/930.
Progress saved at question 50/930.
Progress saved at question 55/930.
Progress saved at question 60/930.
Progress saved at question 65/930.
Progress saved at question 70/930.
Progress saved at question 75/930.
Progress saved at question 80/930.
Progress saved at question 85/930.
Progress saved at question 90/930.
Progress saved at question 95/930.
Progress saved at question 100/930.
Progress saved at question 105/930.
Progress saved at question 110/930.
Progress saved at question 115/930.
Progress saved at question 120/930.
Progress saved at question 125/930.
Progress saved at question 130/930.
Pro

In [55]:
# Using compression
populate_answers("healthcare", hc_sampled_df, QueryTransformationType.COMPRESSION)

Loading FAISS index from ..\models\embeddings\pubmedqa_faiss_index_subset.idx...
Progress saved at question 5/5.
Processing complete.
Output saved in: data/healthcare/processed/2_healthcare_compression.csv
