In [1]:
import shutil
import os
from os import path
import itertools
import tempfile
import re
import json
import time
import requests
import logging
import pandas as pd
from transformers import AutoTokenizer
from modules.data_processing import categorize_files_by_template, extract_text_from_pdf
from modules.chunking import get_chunk_size, generate_chunk
from modules.arrange_conf import get_log_directory, get_output_directory, get_data_paths
from modules.model_interaction import process_chunks_token_counts
from config import hf_token, MODEL_llama, few_shot_examples
from multiprocessing import current_process

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Few-shot split directory of the form type under study; swap the active line
# to switch datasets.
directory = 'vrdu2/registration-form/few_shot-splits/'
# directory = 'vrdu2/ad-buy-form/few_shot-splits/'

# Arrange logging and output dirs according to model and form type
model = MODEL_llama  # alternatives noted by the author: MODEL_gpt_3, MODEL_gpt_4 (not imported in this notebook)
log_directory = get_log_directory(directory, model)
base_output_directory = get_output_directory(directory, model)

# Neither logging.FileHandler nor DataFrame.to_csv creates missing parent
# directories, so make sure both exist before anything writes into them.
os.makedirs(log_directory, exist_ok=True)
os.makedirs(base_output_directory, exist_ok=True)

print("log_directory: ", log_directory)
print("base_output_directory: ", base_output_directory)

# Arrange folder and dataset dirs based on the provided main dir
folder_path, dataset_path, dtype = get_data_paths(directory)

print("folder_path:", folder_path)
print("dataset_path:", dataset_path)
print("dtype:", dtype)

# One log file per process id; records go to both that file and the cell output.
log_file_path = path.join(log_directory, f"experiment_{current_process().pid}.log")
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.FileHandler(log_file_path),
                        logging.StreamHandler()
                    ])

# Example logging
logging.info("Logging setup complete.")

2025-02-19 11:10:48,274 - INFO - Logging setup complete.


log_directory:  llama3_70b_outputs/reg/
base_output_directory:  llama3_70b_outputs/reg/
folder_path: VT2/vrdu2/registration-form/main/pdfs
dataset_path: VT2/vrdu2/registration-form/main/dataset.jsonl.gz
dtype: reg


In [3]:
# Fetch the Llama 3 70B tokenizer from the Hugging Face Hub, authenticating
# with the access token imported from config.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-70B", token=hf_token)

print("Tokenizer loaded successfully!")

Tokenizer loaded successfully!


In [4]:
def perform_experiment_token_count(sample_size, use_custom_ocr, experiment_id, prompt_type, chunk_size_category, **kwargs):
    """Count prompt tokens for a sample of PDFs per template type and save them as CSV.

    For each of the three template types, the first ``sample_size`` files
    listed for the requested ``level_type`` are text-extracted, chunked and
    handed to ``process_chunks_token_counts``. One row per successfully
    processed file is collected and written to
    ``<base_output_directory>/token_counts/experiment_<experiment_id>_results.csv``.

    Relies on notebook-level globals: ``directory``, ``folder_path``,
    ``dataset_path``, ``dtype``, ``tokenizer`` and ``base_output_directory``.

    Args:
        sample_size: Maximum number of files taken per template type.
        use_custom_ocr: Forwarded to ``extract_text_from_pdf``.
        experiment_id: Identifier used in log messages, result rows and the
            output file name.
        prompt_type: Forwarded to ``get_chunk_size`` and
            ``process_chunks_token_counts``.
        chunk_size_category: Forwarded to ``get_chunk_size``.
        **kwargs: Optional settings, with defaults ``example_num=None``,
            ``no_schema=True``, ``transformation_method='naive'``,
            ``chunking_method='fixed'``, ``overlap=None``, ``level_type='STL'``.

    Returns:
        pandas.DataFrame: One row per processed file. An empty DataFrame is
        returned when the experiment as a whole fails; the error is logged.
    """
    logging.info(f"Starting experiment with ID {experiment_id}")
    try:
        logging.debug("Experiment parameters: "
                      f"sample_size={sample_size}, use_custom_ocr={use_custom_ocr}, "
                      f"prompt_type={prompt_type}, chunk_size_category={chunk_size_category}, kwargs={kwargs}")

        # Optional settings and their defaults
        example_num = kwargs.get('example_num', None)
        no_schema = kwargs.get('no_schema', True)
        transformation_method = kwargs.get('transformation_method', 'naive')
        chunking_method = kwargs.get('chunking_method', 'fixed')
        overlap = kwargs.get('overlap', None)
        level_type = kwargs.get('level_type', 'STL')

        template_types = ['Amendment', 'Dissemination', 'Short-Form']
        max_retries = 10  # upper bound on rate-limited (HTTP 429) attempts per file
        logging.debug(f"Categorizing files by template for experiment ID {experiment_id}")

        categorized_files = categorize_files_by_template(os.path.join(directory, 'prompts'))
        logging.debug(f"Loaded categorized files for level_type {level_type}")

        output_data = []

        for template_type in template_types:
            print("s---------------Templete type: ", template_type)
            print("s---------------Level type: ", level_type)
            logging.info(f"Processing template type: {template_type}")
            files_list = list(categorized_files[level_type][template_type])
            sample_files = files_list[:sample_size]
            logging.debug(f"Loaded sample files for template type {template_type}: {sample_files}")

            # Examples are looked up only when a schema is in use and example_num
            # is truthy, so both None and 0 mean "no examples".
            examples = few_shot_examples[level_type][template_type][example_num] if not no_schema and example_num else None

            for filename in sample_files:
                print("s---------------Fiename: ", filename)
                if not filename.endswith(".pdf"):
                    continue

                logging.info(f"Experiment {experiment_id}: Processing file {filename}")

                success = False
                retry_count = 0

                while not success and retry_count < max_retries:
                    try:
                        pdf_path = os.path.join(folder_path, filename)
                        # Work on a throw-away copy so extraction never touches the dataset file.
                        with tempfile.TemporaryDirectory() as temp_dir:
                            temp_pdf_path = os.path.join(temp_dir, filename)
                            shutil.copy(pdf_path, temp_pdf_path)

                            logging.debug(f"Extracting text from PDF for file {filename}")
                            extracted_data = extract_text_from_pdf(temp_pdf_path, dataset_path, use_custom_ocr, transformation_method=transformation_method)
                            chunk_size = get_chunk_size(chunk_size_category, prompt_type, example_num)
                            logging.debug(f"Generating chunks for file {filename}")
                            chunks, prompt_token_size = generate_chunk(chunking_method, extracted_data, chunk_size, overlap)

                            if not chunks:
                                # Handled by the generic handler below, which logs and skips the file.
                                raise ValueError(f"Chunk generation failed for file {filename}")

                            logging.debug(f"Processing chunks for file {filename}")
                            token_counts = process_chunks_token_counts(
                                chunks, prompt_type, examples, template_type, level_type,
                                MODEL_llama, tokenizer, dtype,
                                example_num
                            )

                            logging.debug(f"Total count for file: {filename} is {token_counts}")
                            output_data.append({
                                "experiment_id": experiment_id,
                                "model_name": MODEL_llama,
                                "sample_num": len(sample_files),
                                "chunk_size": chunk_size,
                                "prompt_type": prompt_type,
                                "example_num": example_num,
                                "level_type": level_type,
                                "file_name": filename,
                                "token_counts": token_counts
                            })
                        success = True
                    except requests.exceptions.HTTPError as e:
                        if e.response.status_code == 429:
                            retry_count += 1
                            error_details = e.response.json()
                            # NOTE(review): parse_retry_after and adaptive_delay are neither
                            # defined nor imported in the visible notebook cells. Confirm they
                            # exist in the kernel; otherwise a 429 raises NameError here and
                            # aborts the whole experiment via the outer handler.
                            retry_after = parse_retry_after(error_details)

                            if retry_after:
                                logging.warning(f"Rate limit reached. Retrying in {retry_after:.2f} seconds due to rate limit.")
                                print(f"Rate limit reached. Retrying in {retry_after:.2f} seconds due to rate limit.")
                                time.sleep(retry_after)
                                logging.info(f"Waited for {retry_after:.2f} seconds before retrying.")
                                print(f"Waited for {retry_after:.2f} seconds before retrying.")
                            else:
                                delay = adaptive_delay(retry_count)
                                logging.warning(f"No specific retry_after provided. Using adaptive delay of {delay:.2f} seconds.")
                                print(f"No specific retry_after provided. Using adaptive delay of {delay:.2f} seconds.")
                                time.sleep(delay)
                        else:
                            logging.error(f"HTTP error while processing file {filename}: {e}")
                            print(f"HTTP error while processing file {filename}: {e}")
                            break
                    except Exception as e:
                        logging.error(f"An error occurred while processing file {filename}: {e}")
                        print(f"An error occurred while processing file {filename}: {e}")
                        break

                if not success and retry_count >= max_retries:
                    # Previously a file that exhausted its retries was dropped without a trace.
                    logging.error(f"Giving up on file {filename} after {max_retries} rate-limited attempts.")

        experiment_file_path = os.path.join(base_output_directory, 'token_counts')
        # to_csv does not create missing directories; without this the write
        # fails and every collected row is discarded by the handler below.
        os.makedirs(experiment_file_path, exist_ok=True)
        output_file = os.path.join(experiment_file_path, f"experiment_{experiment_id}_results.csv")
        result_df = pd.DataFrame(output_data)
        result_df.to_csv(output_file, index=False)
        logging.info(f"Experiment {experiment_id}: Results written to {output_file}")
        return result_df
    except Exception as e:
        # Include the traceback: this handler is where unexpected failures surface.
        logging.error(f"Error during experiment {experiment_id}: {e}", exc_info=True)
        return pd.DataFrame()

In [5]:
# Settings shared by every experiment.
use_custom_ocr = False
sample_size = 40
chunking_method = 'fixed'

# Dimensions of the experiment grid.
prompt_types = ["no_schema", "few_shot", "chain_of_thought"]
example_num_options = [0, 1, 3, 5]
transformation_methods = ["layout-aware", "naive"]
chunk_size_categories = ["max", "medium", "small"]
level_types = ["STL", "UTL"]

combinations = []
experiment_id = 0

for prompt_type, chunk_size_category, level_type in itertools.product(prompt_types, chunk_size_categories, level_types):
    # "no_schema" prompts get a single example_num of None; every other
    # prompt type is crossed with all example counts.
    no_schema = prompt_type == "no_schema"
    example_choices = [None] if no_schema else example_num_options

    for example_num, transformation_method in itertools.product(example_choices, transformation_methods):
        combination = dict(
            experiment_id=experiment_id,
            sample_size=sample_size,
            use_custom_ocr=use_custom_ocr,
            prompt_type=prompt_type,
            chunk_size_category=chunk_size_category,
            no_schema=no_schema,
            example_num=example_num,
            transformation_method=transformation_method,
            chunking_method=chunking_method,
            level_type=level_type,
        )
        combinations.append(combination)
        experiment_id += 1

# Tabulate the grid and persist it next to the experiment outputs.
df = pd.DataFrame(combinations)
csv_file_path = os.path.join(base_output_directory, 'all_experiment_combinations.csv')
df.to_csv(csv_file_path, index=False)

print(f"CSV file has been saved to {csv_file_path}")

CSV file has been saved to llama3_70b_outputs/reg/all_experiment_combinations.csv


In [None]:
def run_experiment(params):
    """Unpack one experiment tuple and run the token-count experiment for it.

    Args:
        params: A 6-tuple ``(sample_size, use_custom_ocr, prompt_type,
            chunk_size_category, kwargs, experiment_id)``.

    Returns:
        Whatever ``perform_experiment_token_count`` returns, or an empty
        DataFrame if it raises (the error is logged).
    """
    sample_size, use_custom_ocr, prompt_type, chunk_size_category, extra_options, exp_id = params
    try:
        return perform_experiment_token_count(
            sample_size, use_custom_ocr, exp_id, prompt_type, chunk_size_category,
            **extra_options
        )
    except Exception as err:
        logging.error(f"An error occurred in experiment {exp_id}: {err}")
        return pd.DataFrame()
        
def test_experiment_serially():
    for i, params in enumerate(experiment_params):
        try:
            result = run_experiment(params)
            if not result.empty:
                print("Experiment completed successfully.")
            else:
                print("Experiment did not return any results.")
        except Exception as e:
            print(f"Experiment resulted in an exception: {e}")

if __name__ == "__main__":
    # Keys of each combination that are forwarded to the experiment as keyword options.
    option_keys = ('example_num', 'no_schema', 'transformation_method', 'chunking_method', 'level_type')

    # Reshape every combination dict into the positional tuple run_experiment unpacks.
    experiment_params = []
    for exp in combinations:
        experiment_params.append((
            exp['sample_size'],
            exp['use_custom_ocr'],
            exp['prompt_type'],
            exp['chunk_size_category'],
            {key: exp[key] for key in option_keys},
            exp['experiment_id'],
        ))

    test_experiment_serially()

2025-02-19 11:10:48,853 - INFO - Starting experiment with ID 0
2025-02-19 11:10:48,857 - INFO - Processing template type: Amendment
2025-02-19 11:10:48,858 - INFO - Experiment 0: Processing file 20030714_Arnold _ Porter Kaye Scholer, LLP_Amendment_Amendment.pdf


s---------------Templete type:  Amendment
s---------------Level type:  STL
s---------------Fiename:  20030714_Arnold _ Porter Kaye Scholer, LLP_Amendment_Amendment.pdf


2025-02-19 11:11:02,120 - INFO - Text length: 4437, Chunk size: 2958
2025-02-19 11:11:02,122 - INFO - Generated chunk: OMB NO. 1105-0004 U.. Department of Justice Washin... with length 4436
2025-02-19 11:11:02,123 - INFO - Generated 1 chunks with total text length 4436
2025-02-19 11:11:02,124 - INFO - Generated 1 chunks with total prompt token size 694
2025-02-19 11:11:02,134 - INFO - Chunk 1/1: Token Count = 1003
2025-02-19 11:11:02,136 - INFO - Experiment 0: Processing file 20110103_Representative of the Turkish Republic of Northern Cyprus_Amendment_Amendment.pdf


s---------------Fiename:  20110103_Representative of the Turkish Republic of Northern Cyprus_Amendment_Amendment.pdf


2025-02-19 11:11:14,237 - INFO - Text length: 4652, Chunk size: 2958
2025-02-19 11:11:14,239 - INFO - Generated chunk: CO OMB NO. 1124-0003 U.S. Department of Justice Wa... with length 4651
2025-02-19 11:11:14,240 - INFO - Generated 1 chunks with total text length 4651
2025-02-19 11:11:14,241 - INFO - Generated 1 chunks with total prompt token size 716
2025-02-19 11:11:14,245 - INFO - Chunk 1/1: Token Count = 1062
2025-02-19 11:11:14,247 - INFO - Experiment 0: Processing file 19850801_St. Lucia Tourist Board_Amendment_Amendment.pdf


s---------------Fiename:  19850801_St. Lucia Tourist Board_Amendment_Amendment.pdf


2025-02-19 11:11:28,256 - INFO - Text length: 2079, Chunk size: 2958
2025-02-19 11:11:28,258 - INFO - Generated chunk: , Department of Justice · Washington, DC 20530 Ame... with length 2075
2025-02-19 11:11:28,260 - INFO - Generated 1 chunks with total text length 2075
2025-02-19 11:11:28,261 - INFO - Generated 1 chunks with total prompt token size 347
2025-02-19 11:11:28,263 - INFO - Chunk 1/1: Token Count = 607
2025-02-19 11:11:28,265 - INFO - Experiment 0: Processing file 19830401_Bermuda Tourism Authority_Amendment_Amendment.pdf


s---------------Fiename:  19830401_Bermuda Tourism Authority_Amendment_Amendment.pdf


2025-02-19 11:11:39,588 - INFO - Text length: 2150, Chunk size: 2958
2025-02-19 11:11:39,590 - INFO - Generated chunk: OMB No. 43-R226 Approval Expires Oct. 31, 1981 UNI... with length 2148
2025-02-19 11:11:39,591 - INFO - Generated 1 chunks with total text length 2148
2025-02-19 11:11:39,592 - INFO - Generated 1 chunks with total prompt token size 361
2025-02-19 11:11:39,595 - INFO - Chunk 1/1: Token Count = 617
2025-02-19 11:11:39,597 - INFO - Experiment 0: Processing file 20180105_Hill and Knowlton Strategies, LLC_Amendment_Amendment.pdf


s---------------Fiename:  20180105_Hill and Knowlton Strategies, LLC_Amendment_Amendment.pdf


2025-02-19 11:11:53,184 - INFO - Text length: 5192, Chunk size: 2958
2025-02-19 11:11:53,186 - INFO - Generated chunk: Received by NSD/FARA Registration Unit 01/05/2018 ... with length 5191
2025-02-19 11:11:53,188 - INFO - Generated 1 chunks with total text length 5191
2025-02-19 11:11:53,189 - INFO - Generated 1 chunks with total prompt token size 785
2025-02-19 11:11:53,193 - INFO - Chunk 1/1: Token Count = 1166
2025-02-19 11:11:53,194 - INFO - Experiment 0: Processing file 20170224_Alston _ Bird, LLP_Amendment_Amendment.pdf


s---------------Fiename:  20170224_Alston _ Bird, LLP_Amendment_Amendment.pdf
