## 1. Imports and Setup

In [1]:
from pathlib import Path
import sys
import os

# Treat the parent of the current working directory as the project root and make
# it importable, so packages that live there can be imported from this notebook.
project_root = Path.cwd().parent
# Guard the append: re-running this cell must not stack duplicate sys.path entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
# Import required libraries
import pandas as pd
import os
import omegaconf
from easyllm_kit.utils.io_utils import write_to_database
from easyllm_kit.utils import get_logger, read_json
from easyllm_kit.models import LLM
from easyllm_kit.configs.llm_base_config import GenerationArguments
# from famma_runner.runners.base_runner import Runner
from famma_runner.utils import (
    collect_images_from_first_subquestion, 
    generate_response_from_llm, 
    safe_parse_response,
    QuestionPrompt, 
    LANGUAGE_ORDER, 
    DC, 
    order_by_language, 
    ProgramOfThoughtsQuestionPrompt
)

# Initialize logger
logger = get_logger('generation_runner', 'generation_runner.log')

  from .autonotebook import tqdm as notebook_tqdm


## 2. Configuration Management

In [3]:
from omegaconf import OmegaConf

In [21]:
# Despite its name, config_dir is the path of the YAML file itself
# (<project_root>/configs/gen_config_qwen.yaml), not of a directory.
config_dir = project_root / 'configs' / 'gen_config_qwen.yaml'
config = OmegaConf.load(config_dir)
config

{'runner_name': 'generation', 'data': {'data_dir': './hf_data/release_v2406.json', 'question_id': None}, 'model': {'model_name': 'qwen_vl', 'use_api': True, 'api_key': 'xxx', 'api_url': 'xxx', 'model_full_name': 'qwen2.5-vl-72b-instruct', 'use_ocr': False, 'is_reasoning_model': False}, 'generation': {'temperature': 0.0, 'top_p': 0.9, 'max_length': 1024}}

## 3. Dataset Processing

In [5]:
# Load and process dataset
def load_dataset(data_path):
    """Read the JSON dataset at ``data_path`` into a DataFrame.

    The frame is passed through ``order_by_language`` before being returned.
    That call's return value is ignored, so it presumably works in place
    (the ``language_order`` column visible in later cell outputs suggests it
    adds that column) - confirm against the helper.
    """
    records = read_json(data_path)
    frame = pd.DataFrame(records)

    # Language ordering is delegated to the project helper; result is not captured.
    order_by_language(
        frame,
        LANGUAGE_ORDER,
        DC.MAIN_QUESTION_ID,
        DC.SUB_QUESTION_ID,
        DC.LANGUAGE,
    )

    return frame

In [6]:
# Filter dataset by question IDs
def filter_dataset_by_question_id(dataset_df, question_ids):
    """Restrict the dataset to the question groups named by ``question_ids``.

    Each id is split on ``_`` and must have at least three parts; the first is
    read as the language and the second as the (integer) main question id.
    Only that pair is used for filtering, so every row sharing the same
    language and main question id is selected, not just the named sub-question.

    Args:
        dataset_df: DataFrame containing the ``DC.MAIN_QUESTION_ID`` and
            ``DC.LANGUAGE`` columns.
        question_ids: ``None``/empty for "no filter", a single id, or a
            list / tuple / ``omegaconf.ListConfig`` of ids.

    Returns:
        A ``(dataframe, main_question_ids)`` tuple. When no filter is given, or
        when nothing matches, the original ``dataset_df`` is returned together
        with ``None``. Otherwise the concatenated matches are returned with the
        list of unique main question ids found in them.
    """
    if not question_ids:
        return dataset_df, None

    # Normalise a single id to a list so the loop below handles one shape only.
    if not isinstance(question_ids, (list, tuple, omegaconf.ListConfig)):
        question_ids = [question_ids]

    # Matches are collected and concatenated once at the end: concatenating
    # inside the loop copies the accumulated frame on every iteration, and
    # seeding it with an empty DataFrame triggers pandas deprecation warnings.
    matched_frames = []

    # Track language/main_question_id pairs so each group is selected only once.
    processed_pairs = set()

    for question_id in question_ids:
        try:
            parts = question_id.split('_')
            if len(parts) < 3:
                logger.warning(f"Invalid question_id format: {question_id}")
                continue

            language = parts[0]
            main_question_id = int(parts[1])

            pair_key = f"{language}_{main_question_id}"
            if pair_key in processed_pairs:
                continue
            processed_pairs.add(pair_key)

            logger.info(f"Filtering dataset for language: {language}, main_question_id: {main_question_id}")

            current_filtered = dataset_df[
                (dataset_df[DC.MAIN_QUESTION_ID] == main_question_id) &
                (dataset_df[DC.LANGUAGE] == language)
            ]

            if current_filtered.empty:
                logger.warning(f"No matching questions found for {question_id}")
            else:
                logger.info(f"Found {len(current_filtered)} questions matching {question_id}")
                matched_frames.append(current_filtered)
        except Exception as e:
            # Best effort: a malformed id is logged and skipped, the rest still apply.
            logger.error(f"Error parsing question_id {question_id}: {str(e)}")

    # Nothing matched: fall back to the unfiltered dataset.
    if not matched_frames:
        logger.warning("No matching questions found for any of the provided question_ids")
        return dataset_df, None

    filtered_results = pd.concat(matched_frames)
    logger.info(f"Total of {len(filtered_results)} questions matched across all filters")

    filtered_main_question_ids = filtered_results[DC.MAIN_QUESTION_ID].unique().tolist()
    return filtered_results, filtered_main_question_ids


In [7]:
# Load the dataset from the path stored in the config. The configured value is
# relative ('./hf_data/...'), so it is presumably resolved against the kernel's
# working directory rather than against project_root - confirm before running.
dataset_df = load_dataset(config['data']['data_dir'])
dataset_df.head()

Unnamed: 0,idx,question_id,context,question,options,image_1,image_2,image_3,image_4,image_5,...,main_question_id,sub_question_id,ans_image_1,ans_image_2,ans_image_3,ans_image_4,ans_image_5,ans_image_6,release,language_order
0,0,english_1_1_r1,Below image is a sceenshot of the Contract spe...,What is the coupon feature of the Korea Develo...,"[A. Fixed, B. Floating, C. Zero, D. Variable]",images_release_v2406/english_1_1_r1_image_1.jpg,,,,,...,1,1,,,,,,,release_v2406,0
1,1,english_1_2_r1,,"According to the information provided, what is...","[A. AAA, B. AA-, C. A+, D. BBB+]",,,,,,...,1,2,,,,,,,release_v2406,0
2,2,english_1_3_r1,,In which currency is the Korea Development Ban...,"[A. USD, B. EUR, C. JPY, D. GBP]",,,,,,...,1,3,,,,,,,release_v2406,0
3,3,english_1_4_r1,,What is the redemption yield of the Korea Deve...,"[A. 0.572%, B. 2.957%, C. 5.730%, D. 7.375%]",,,,,,...,1,4,,,,,,,release_v2406,0
4,4,english_1_5_r1,,"As of the date on the Bloomberg terminal, what...","[A. It will mature in less than a year, B. It ...",,,,,,...,1,5,,,,,,,release_v2406,0


In [8]:
config['data']['question_id']

In [9]:
# Apply the optional question-id filter from the config. This rebinds dataset_df,
# so the cell is only idempotent while the filter is empty: with no question_id
# configured the function returns the frame unchanged and None for the id list.
dataset_df, filtered_main_question_ids = filter_dataset_by_question_id(
        dataset_df, config['data']['question_id'])

In [10]:
dataset_df.head()

Unnamed: 0,idx,question_id,context,question,options,image_1,image_2,image_3,image_4,image_5,...,main_question_id,sub_question_id,ans_image_1,ans_image_2,ans_image_3,ans_image_4,ans_image_5,ans_image_6,release,language_order
0,0,english_1_1_r1,Below image is a sceenshot of the Contract spe...,What is the coupon feature of the Korea Develo...,"[A. Fixed, B. Floating, C. Zero, D. Variable]",images_release_v2406/english_1_1_r1_image_1.jpg,,,,,...,1,1,,,,,,,release_v2406,0
1,1,english_1_2_r1,,"According to the information provided, what is...","[A. AAA, B. AA-, C. A+, D. BBB+]",,,,,,...,1,2,,,,,,,release_v2406,0
2,2,english_1_3_r1,,In which currency is the Korea Development Ban...,"[A. USD, B. EUR, C. JPY, D. GBP]",,,,,,...,1,3,,,,,,,release_v2406,0
3,3,english_1_4_r1,,What is the redemption yield of the Korea Deve...,"[A. 0.572%, B. 2.957%, C. 5.730%, D. 7.375%]",,,,,,...,1,4,,,,,,,release_v2406,0
4,4,english_1_5_r1,,"As of the date on the Bloomberg terminal, what...","[A. It will mature in less than a year, B. It ...",,,,,,...,1,5,,,,,,,release_v2406,0


In [11]:
len(dataset_df) ,filtered_main_question_ids

(1945, None)

## 4. Model Initialization

### 4.1 Setup the language model

In [12]:
# Setup the language model
def setup_model(model_config, generation_args):
    """Build the LLM described by ``model_config`` and ``generation_args``.

    Args:
        model_config: Mapping-like model settings; only ``model_name`` is read
            here, the rest is forwarded to ``LLM.build_from_config``.
        generation_args: Generation settings. A plain ``dict`` is expanded into
            ``GenerationArguments``; any other object is forwarded unchanged.

    Returns:
        Whatever ``LLM.build_from_config`` returns for the assembled config.
    """
    # Only plain dicts are promoted; other containers pass through as they are.
    generation_config = (
        GenerationArguments(**generation_args)
        if isinstance(generation_args, dict)
        else generation_args
    )

    if model_config.get("model_name") == "custom_llm":
        # The imported name is not used below; presumably importing the module
        # is what makes the custom model available to the builder - confirm.
        from custom_llm import MyCustomModel  # noqa: F401

    return LLM.build_from_config({
        'model_config': model_config,
        'generation_config': generation_config,
    })


### 4.2 Setup OCR if needed

In [13]:
# Setup OCR if needed
def setup_ocr(use_ocr=False):
    """Return a PaddleOCR instance when ``use_ocr`` is truthy, else ``None``.

    PaddleOCR is imported lazily so it is only required when OCR is requested.
    If the import fails, the error is logged and ``None`` is returned.
    """
    if not use_ocr:
        return None

    ocr_model = None
    try:
        from paddleocr import PaddleOCR
    except ImportError:
        logger.error("Failed to import PaddleOCR. Please install with: pip install paddleocr")
        return ocr_model

    try:
        ocr_model = PaddleOCR(use_angle_cls=True)
        logger.info("OCR model initialized successfully")
    except ImportError:
        # An ImportError raised while constructing the model is handled the same way.
        logger.error("Failed to import PaddleOCR. Please install with: pip install paddleocr")
    return ocr_model

In [14]:
# Generation settings from the config (an empty mapping when the section is absent).
# NOTE(review): config comes from OmegaConf.load, so this is presumably an OmegaConf
# container rather than a plain dict; setup_model only converts plain dicts into
# GenerationArguments - confirm that passing it through unchanged is intended.
generation_config = config.get("generation", {})
generation_config

{'temperature': 0.0, 'top_p': 0.9, 'max_length': 1024}

In [15]:
# Model section of the config. It is handed to setup_model and is also where the
# use_ocr / use_pot / is_reasoning_model flags are read from further down.
model_config = config["model"]
model_config

{'model_name': 'qwen_vl', 'use_api': True, 'api_key': 'xxx', 'api_url': 'xxx', 'model_full_name': 'qwen2.5-vl-72b-instruct', 'use_ocr': False, 'is_reasoning_model': False}

In [16]:
# Build the model from the two config sections; the bare name displays its repr.
llm = setup_model(model_config, generation_config)
llm

<easyllm_kit.models.qwen_vl.QwenVL at 0x17bebd9c0>

In [17]:
# Full model name, later used as the prefix of the results database name.
# Falls back to "unknown_model" when the config has no model_full_name.
llm_name = model_config.get("model_full_name", "unknown_model")
llm_name

'qwen2.5-vl-72b-instruct'

In [18]:
# OCR is opt-in via the model config. setup_ocr returns None when use_ocr is
# falsy, and also when PaddleOCR cannot be imported (only an error is logged).
use_ocr = model_config.get('use_ocr', False)
ocr_model = setup_ocr(use_ocr)
ocr_model

## 5. Setup Database

In [22]:
import dictdatabase as DDB
# Keep all dictdatabase files under <project_root>/ddb_storage.
DDB.config.storage_directory = project_root / 'ddb_storage'
def initialize_database(output_db: str):
    """
    Load the database named ``output_db``, creating it first when it is missing.

    Returns the contents read from the database, or a new empty dict when the
    read yields ``None`` and the database therefore has to be created.
    """
    existing = DDB.at(output_db).read()
    if existing is not None:
        logger.info(f"Loaded existing database: {output_db} with {len(existing)} entries.")
        return existing

    # A None read is treated as "no such database yet": create it and start empty.
    DDB.at(output_db).create()
    logger.info(f"Initialized new database: {output_db}")
    return {}

def setup_database(model_name, data_dir):
    """Open (or create) the results database for a model / dataset release.

    The release version is the part of ``data_dir`` after the last ``/`` and
    before the first ``.`` of that file name; the database is named
    ``<model_name>_ans_<release_version>``.

    Returns:
        ``(database_name, database_contents)``.
    """
    file_name = data_dir.rsplit('/', 1)[-1]
    release_version = file_name.partition('.')[0]
    target_db_name = f'{model_name}_ans_{release_version}'

    return target_db_name, initialize_database(output_db=target_db_name)

def save_results(target_db_name, key, subquestion_responses):
    """Store ``subquestion_responses`` under ``key`` in the database ``target_db_name``.

    Thin wrapper: the three arguments are forwarded unchanged to
    ``write_to_database`` and nothing is returned.
    """
    write_to_database(target_db_name, key, subquestion_responses)

In [23]:
config['data']['data_dir']

'./hf_data/release_v2406.json'

In [24]:
# Open (or create) the results database for this model and dataset release.
# target_db is the content as read at this point; the generation loop uses it to
# decide which question groups can be skipped.
# NOTE(review): the bare tuple below echoes the entire database into the cell
# output - consider displaying only target_db_name and len(target_db).
target_db_name, target_db = setup_database(llm_name, config['data']['data_dir'])
target_db_name, target_db

[32m2025-03-29 00:36:14, generation_runner [generation_runner.initialize_database:13] INFO - Loaded existing database: qwen2.5-vl-72b-instruct_ans_release_v2406 with 658 entries.[39m


('qwen2.5-vl-72b-instruct_ans_release_v2406',
 {'chinese_1': {'chinese_1_1_r1': {'ans_image_1': 'None',
    'ans_image_2': 'None',
    'ans_image_3': 'None',
    'ans_image_4': 'None',
    'ans_image_5': 'None',
    'ans_image_6': 'None',
    'answers': '初级市场交易',
    'context': '<image_1> 它显示了美国黄金证券的发行',
    'explanation': 'nan',
    'idx': 1378,
    'image_1': 'images_release_v2406/chinese_1_1_r1_image_1.jpg',
    'image_2': 'None',
    'image_3': 'None',
    'image_4': 'None',
    'image_5': 'None',
    'image_6': 'None',
    'image_7': 'None',
    'image_type': 'screenshot',
    'language': 'chinese',
    'language_order': 1,
    'main_question_id': 1,
    'model_answer': '初级市场',
    'model_explanation': '根据图片中的信息，这是一次新发行的美国黄金证券。在金融市场上，新发行的证券通常属于初级市场交易，因为它们是首次向公众出售的。二级市场则是已发行证券的后续交易场所。因此，这次发行属于初级市场交易。',
    'options': 'None',
    'question': '该发行是初级市场还是二级市场交易?',
    'question_id': 'chinese_1_1_r1',
    'question_type': 'open question',
    'release': 'release_v2406',
    'sub_questi

## 6. Question Processing and Response Generation

In [25]:
def generate_answer_for_one_main_question(sub_question_set_df, llm, use_pot=False,
                                          is_reasoning_model=False, use_ocr=False,
                                          ocr_model=None, parent_dir=""):
    """Ask the model to answer all sub-questions of one main question at once.

    Args:
        sub_question_set_df: Rows of the sub-questions that belong together.
            The shared context is read from the first row.
        llm: Model object handed to ``generate_response_from_llm``.
        use_pot: Use ``ProgramOfThoughtsQuestionPrompt`` instead of ``QuestionPrompt``.
        is_reasoning_model: Forwarded to ``safe_parse_response``.
        use_ocr: Forwarded to ``generate_response_from_llm``.
        ocr_model: Forwarded to ``generate_response_from_llm``.
        parent_dir: Forwarded to ``collect_images_from_first_subquestion``.

    Returns:
        The result of ``safe_parse_response`` for the model output.
    """
    # Context and images both come from the helper / first row, not from every row.
    first_row = sub_question_set_df.iloc[0]
    context = first_row.get("context", "")
    images = collect_images_from_first_subquestion(sub_question_set_df, parent_dir=parent_dir)

    sub_questions = []
    for _, sub_row in sub_question_set_df.iterrows():
        entry = {
            "id": sub_row['question_id'],
            "type": sub_row['question_type'],
            "question": sub_row['question'],
        }
        # Options are only attached to multiple-choice questions.
        if sub_row['question_type'] == 'multiple-choice':
            entry["options"] = sub_row['options']
        sub_questions.append(entry)

    # Ids in the same order as the sub-questions, used when parsing the response.
    question_id_list = [entry["id"] for entry in sub_questions]

    prompt_cls = ProgramOfThoughtsQuestionPrompt if use_pot else QuestionPrompt
    prompt = prompt_cls.init().format(
        context=context,
        sub_questions=sub_questions
    )

    raw_output = generate_response_from_llm(llm, prompt, images, use_ocr=use_ocr, ocr_model=ocr_model)
    return safe_parse_response(raw_output, question_id_list, is_reasoning_model=is_reasoning_model)


## 7. Main Generation Process

In [25]:
# Get additional configuration parameters (both default to False when absent)
use_pot = model_config.get('use_pot', False)
is_reasoning_model = model_config.get('is_reasoning_model', False)
# Work on a copy so the generation loop never touches dataset_df itself
df_copy = dataset_df.copy()
# Image paths are resolved relative to the directory that holds the dataset file
parent_dir = os.path.dirname(config['data']['data_dir'])


In [None]:
# Process each main question group.
# Groups are keyed by (language_order, language, main_question_id); the database
# key for a group is "<language>_<main_question_id>".
for (_, language, main_question_id), group in df_copy.groupby(
        ['language_order', DC.LANGUAGE, DC.MAIN_QUESTION_ID]):

    key = f'{language}_{main_question_id}'

    # A group already in the database is regenerated only when its main question
    # id was explicitly requested through the question-id filter.
    explicitly_requested = (filtered_main_question_ids is not None
                            and main_question_id in filtered_main_question_ids)
    if key in target_db and not explicitly_requested:
        logger.info(f'Skipping {key} - already in database and not specifically requested')
        continue

    try:
        logger.info(f'Start generating answers for {language} -- main_question_id: {main_question_id}')

        # One model call answers every sub-question of the group.
        model_response = generate_answer_for_one_main_question(
            group, llm, use_pot=use_pot, is_reasoning_model=is_reasoning_model,
            use_ocr=use_ocr, ocr_model=ocr_model, parent_dir=parent_dir
        )

        # Pair each sub-question's original row with the model's answer for it.
        subquestion_responses = {}
        for idx in range(len(group)):
            sub_row = group.iloc[idx]
            output_key = sub_row['question_id']
            sub_answer = model_response[output_key]

            input_data_with_response = sub_row.to_dict()
            input_data_with_response.update({
                'model_answer': sub_answer.get('answer', ''),
                'model_explanation': sub_answer.get('explanation', '')
            })
            if is_reasoning_model:
                # The reasoning text is read from the top level of the response,
                # so every sub-question of the group receives the same value.
                input_data_with_response['model_reasoning'] = model_response.get('reasoning_content', '')

            subquestion_responses[output_key] = input_data_with_response

        # The group is written in one go, after all sub-questions were assembled.
        save_results(target_db_name, key, subquestion_responses)
        logger.info(f'Successfully processed and saved {key}')

    except Exception as e:
        # Best effort: a failing group is logged and the run moves on to the next.
        # The key is logged because main_question_id alone repeats across languages.
        logger.error(f"Error processing {key} (main_question_id {main_question_id}): {str(e)}")
