In [4]:
import copy
import os
import time

import pandas as pd
import yaml
from tqdm.notebook import tqdm

from HyDE import HyDEConfig, Promptor, HyDE

def process_all_domains(input_parquet, output_folder,
                        domain_to_mainfolder=None, config_path='Hconfig.yaml'):
    """Run HyDE retrieval and answering for every question of every domain.

    For each domain, the base configuration is loaded from ``config_path``,
    its ``config.main_folder`` entry is set to that domain's folder, and a
    fresh ``HyDEConfig`` / ``Promptor`` / ``HyDE`` trio is built. Every row of
    the input frame whose ``domain`` column matches is then sent through
    ``e2e_search`` and ``answer``. Results are written to one parquet file
    per domain and to a merged ``domain_all.parquet``.

    Args:
        input_parquet: Path of the parquet file with the questions. The frame
            must contain the columns ``domain`` and ``question_text``.
        output_folder: Directory receiving ``domain_<name>.parquet`` for each
            domain and ``domain_all.parquet``. Created if it does not exist.
        domain_to_mainfolder: Optional mapping ``domain -> main_folder``.
            Domains are processed in the mapping's iteration order. Defaults
            to the four built-in domains (mix, physics, agriculture, cs).
        config_path: Path of the YAML file holding the base configuration.
            It must contain a top-level ``config`` mapping.

    Returns:
        tuple: ``(domain_dfs, df_merged)`` where ``domain_dfs`` maps each
        domain name to its processed DataFrame and ``df_merged`` is the
        concatenation of those frames with a fresh index.
    """
    if domain_to_mainfolder is None:
        # Default main_folder path for each domain.
        domain_to_mainfolder = {
            "mix":          "/Users/chengze/Desktop/mix",
            "physics":      "/Users/chengze/Desktop/physics",
            "agriculture":  "/Users/chengze/Desktop/agriculture",
            "cs":           "/Users/chengze/Desktop/cs"
        }

    # Read the base configuration shared by all domains.
    with open(config_path, 'r', encoding='utf-8') as f:
        base_config = yaml.safe_load(f)

    # Use the function argument, not a notebook-level global, so the function
    # works regardless of what the calling cell named its variables.
    df_all = pd.read_parquet(input_parquet)

    # Create the output folder once, before the retrieval loop, so an
    # unwritable destination is reported before any domain is processed.
    os.makedirs(output_folder, exist_ok=True)

    # Processed DataFrame of each domain, keyed by domain name.
    domain_dfs = {}
    for domain, main_folder in domain_to_mainfolder.items():
        print(f"\n=== Processing domain: {domain} ===")

        # Deep copy: a shallow copy would share the nested 'config' dict, so
        # writing main_folder would also modify base_config and the configs
        # built for previously processed domains.
        config_domain = copy.deepcopy(base_config)
        config_domain['config']['main_folder'] = main_folder

        # Initialize HyDE for this domain.
        hyde_config = HyDEConfig(config_domain)
        promptor = Promptor(hyde_config)
        hyde_obj = HyDE(hyde_config, promptor)

        df_domain = df_all[df_all['domain'] == domain].copy()
        # df_domain = df_domain.head(10)

        # Make sure the result columns exist before filling them row by row.
        for column in ('processing time', 'retrieval_document', 'response'):
            if column not in df_domain.columns:
                df_domain[column] = None

        for idx, row in tqdm(df_domain.iterrows(),
                             total=len(df_domain),
                             desc=f"Processing {domain}"):
            question = row['question_text']

            # Only the retrieval call is timed; answering is not included.
            start_time = time.time()
            retrieval_document = hyde_obj.e2e_search(question)
            end_time = time.time()

            # If the retrieval result is a non-empty list, answer from its
            # first element only; otherwise answer from an empty list.
            if retrieval_document and isinstance(retrieval_document, list):
                best_hit = [retrieval_document[0]]
            else:
                best_hit = []
            response_data = hyde_obj.answer(best_hit, question)

            df_domain.at[idx, 'processing time'] = end_time - start_time
            df_domain.at[idx, 'retrieval_document'] = str(retrieval_document)
            df_domain.at[idx, 'response'] = response_data

        parquet_path = os.path.join(output_folder, f"domain_{domain}.parquet")
        df_domain.to_parquet(parquet_path, index=False)
        print(f"[{domain}] => Parquet saved: {parquet_path}")
        domain_dfs[domain] = df_domain

    df_merged = pd.concat(list(domain_dfs.values()), ignore_index=True)
    merged_parquet = os.path.join(output_folder, "domain_all.parquet")
    df_merged.to_parquet(merged_parquet, index=False)
    print(f"All {len(domain_dfs)} domains merged => Parquet saved: {merged_parquet}")
    return domain_dfs, df_merged

# Entry point used when this code is run from PyCharm or as a script
if __name__ == "__main__":
    # Location of the questions file and of the folder receiving the results.
    input_parquet_path = "/Users/chengze/Desktop/questions.parquet"
    output_folder_path = "/Users/chengze/Desktop/output_parquets"

    # domain_dfs:    dict mapping each domain name to its DataFrame
    # df_all_merged: single DataFrame merging the four domains
    domain_dfs, df_all_merged = process_all_domains(input_parquet_path, output_folder_path)
    print("Done!")


=== Processing domain: mix ===


Processing mix:   0%|          | 0/125 [00:00<?, ?it/s]

In [1]:
from eval import (
    EvalLLM,
    Promptor,
    eval
)
import yaml
# Parse the evaluation settings from EvalConfig.yaml.
with open('EvalConfig.yaml', 'r') as config_file:
    raw_config = yaml.safe_load(config_file)

# Wrap the parsed settings in EvalLLM and build the Promptor from it.
config = EvalLLM(raw_config)
# print(config.main_folder)
promptor = Promptor(config)

# NOTE: `eval` here is the name imported from the project's `eval` module; it
# shadows Python's builtin eval(). 'arena' presumably selects the evaluation
# mode -- confirm against the eval module.
evaluation = eval('arena', config, promptor)





  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the data into the evaluation object; presumably this populates
# evaluation.dataframes, which the next cell reads -- confirm in the eval module.
evaluation.load_data()

In [3]:
# Display the frame stored under the 'graphrag_ultradomain_responses' key.
evaluation.dataframes['graphrag_ultradomain_responses']

Unnamed: 0,domain,user,task,question_text,uid,response,query_method
0,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,How do organic farming principles discussed in...,f0d93b76-26fc-4d17-a1dc-f03b79c58ea4,## Comparison of Organic Farming Principles an...,global
1,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,What role does soil organic matter play in the...,8dc5aa16-dc17-45cf-a62f-669d9613bfc9,## The Role of Soil Organic Matter in Crop Hea...,global
2,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,How do different agricultural innovations affe...,1111a59f-e54c-48bb-b1be-667f27b98415,# Impact of Agricultural Innovations on Enviro...,global
3,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,In what ways do diverse cropping systems enhan...,5cea0bef-4fce-411b-826f-2a2a29103cd4,## Enhancing Yields and Resilience through Div...,global
4,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,How might the integration of livestock into cr...,2e8f6a2a-7ea8-4fe1-be52-db597eba290c,## Integration of Livestock into Crop Systems ...,global
...,...,...,...,...,...,...,...
120,physics,User 5: **Science Journalist**,Task 5: Develop a series of articles exploring...,How can the origins of the universe and cosmic...,0c59a8a0-6378-4007-b3b1-dda8922700eb,# Engaging Narratives of the Universe's Origin...,global
121,physics,User 5: **Science Journalist**,Task 5: Develop a series of articles exploring...,What themes in contemporary astrophysics shoul...,54ad2493-c494-4dc7-acc6-f1bac2876d11,## Key Themes in Contemporary Astrophysics\n\n...,global
122,physics,User 5: **Science Journalist**,Task 5: Develop a series of articles exploring...,"How can the interplay between dark energy, dar...",e354a09b-54c9-457f-956c-2af918db2a1e,## Engaging the Public with Cosmic Mysteries\n...,global
123,physics,User 5: **Science Journalist**,Task 5: Develop a series of articles exploring...,In what ways might historical perspectives of ...,3218640d-60d4-4992-86b5-0ba3905f7436,# Historical Perspectives in Cosmology and The...,global


In [4]:
# Run the evaluation and keep its return value in df (previewed with df.head() below).
df = evaluation.evaluation()

{
  "Comprehensiveness": { 
    "Winner": "Answer 1", 
    "Explanation": "Answer 1 provides a detailed exploration of both organic and conventional farming principles, covering various aspects such as soil health, biodiversity, pest management, and economic viability. It also discusses the short-term versus long-term productivity comparison in depth, making it more comprehensive than Answer 2." 
  }, 
  "Diversity": { 
    "Winner": "Answer 1", 
    "Explanation": "Answer 1 presents a wider range of perspectives, including the ecological benefits of organic farming, the economic implications, and the potential for long-term sustainability. It contrasts the two farming methods in a more nuanced way, while Answer 2, although informative, is more focused on the productivity aspect without as much variety in insights." 
  }, 
  "Empowerment": { 
    "Winner": "Answer 1", 
    "Explanation": "Answer 1 empowers the reader by providing a thorough understanding of the implications of both far

In [6]:
# Preview the first rows of the evaluation result.
df.head()

Unnamed: 0,domain,user,task,question_text,uid,response,query_method,evaluation,answer1,answer2
0,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,How do organic farming principles discussed in...,f0d93b76-26fc-4d17-a1dc-f03b79c58ea4,### Comparing Organic Farming Principles to Co...,hybrid,"{\n ""Comprehensiveness"": { \n ""Winner"": ""A...",lightrag_ultradomain_responses,graphrag_ultradomain_responses
1,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,What role does soil organic matter play in the...,8dc5aa16-dc17-45cf-a62f-669d9613bfc9,### The Role of Soil Organic Matter in Crop He...,hybrid,"{\n ""Comprehensiveness"": { \n ""Winner"": ""A...",lightrag_ultradomain_responses,graphrag_ultradomain_responses
2,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,How do different agricultural innovations affe...,1111a59f-e54c-48bb-b1be-667f27b98415,### Introduction\n\nThe dataset provides rich ...,hybrid,"```json\n{\n ""Comprehensiveness"": { \n ""Wi...",lightrag_ultradomain_responses,graphrag_ultradomain_responses
3,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,In what ways do diverse cropping systems enhan...,5cea0bef-4fce-411b-826f-2a2a29103cd4,### Enhancing Yields and Resilience Through Di...,hybrid,"{\n ""Comprehensiveness"": { \n ""Winner"": ""A...",lightrag_ultradomain_responses,graphrag_ultradomain_responses
4,agriculture,User 1: Agricultural Researcher,Task 1: Analyze the impact of sustainable prac...,How might the integration of livestock into cr...,2e8f6a2a-7ea8-4fe1-be52-db597eba290c,### Enhancing Soil Health Through Livestock In...,hybrid,"{\n ""Comprehensiveness"": { \n ""Winner"": ""A...",lightrag_ultradomain_responses,graphrag_ultradomain_responses
