In [3]:
import os
from os import path
import itertools
import re
import json
import time
import requests
import logging
import pandas as pd
from modules.data_processing import load_dataset
from modules.batch_processing import split_jsonl_file, send_batch_requests, retrieve_and_process_batch_results
from modules.arrange_conf_md import get_log_directory, get_output_directory, get_data_paths_md
from modules.run_pipeline_md import test_experiment_serially, load_experiment_results
from config import MODEL_gpt_3, MODEL_gpt_4, MODEL_llama
from multiprocessing import current_process

### Arrange working dirs

In [4]:
# Dataset split to run; swap in the commented line to use the ad-buy forms.
directory = 'vrdu2/registration-form/few_shot-splits/'
# directory = 'vrdu2/ad-buy-form/few_shot-splits/'

# Model under test (MODEL_gpt_3 and MODEL_gpt_4 are the imported alternatives).
model = MODEL_llama

# Logging and output directories are arranged according to the model and form type.
log_directory = get_log_directory(directory, model)
base_output_directory = get_output_directory(directory, model)
print("log_directory: ", log_directory)
print("base_output_directory: ", base_output_directory)

# Folder and dataset locations are derived from the chosen main directory.
folder_path, dataset_path, dtype = get_data_paths_md(directory)
print("folder_path:", folder_path)
print("dataset_path:", dataset_path)
print("dtype:", dtype)

# One log file per kernel process, plus a stream handler for in-notebook output.
# NOTE(review): logging.basicConfig does nothing if the root logger already has
# handlers, so re-running this cell with another `directory` keeps the old log file.
log_file_path = os.path.join(log_directory, f"experiment_{current_process().pid}.log")
log_handlers = [logging.FileHandler(log_file_path), logging.StreamHandler()]
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=log_handlers,
)

# Smoke-test the logging configuration.
logging.info("Logging setup complete.")

2025-02-24 11:37:15,207 - INFO - Logging setup complete.


log_directory:  gpt4_Markdown_Llama3_outputs/reg/
base_output_directory:  gpt4_Markdown_Llama3_outputs/reg/
folder_path: gpt4_Markdown_outputs/reg/outputs
dataset_path: vrdu2/registration-form/main/dataset.jsonl.gz
dtype: reg


### Create experiment configurations

In [3]:
# Settings shared by every experiment.
sample_size = 40
use_custom_ocr = False
chunking_method = 'fixed'

# Axes of the experiment grid.
prompt_types = ["no_schema", "few_shot", "chain_of_thought"]
example_num_options = [0, 1, 3, 5]
transformation_methods = ["layout-aware", "naive"]
chunk_size_categories = ["max", "medium", "small"]
level_types = ["STL", "UTL"]

combinations = []

for prompt_type, chunk_size_category, level_type in itertools.product(prompt_types, chunk_size_categories, level_types):
    no_schema = prompt_type == "no_schema"
    # "no_schema" prompts are not crossed with the example counts; they carry a
    # single None placeholder instead, so they are only varied by transformation.
    example_nums = [None] if no_schema else example_num_options
    for example_num, transformation_method in itertools.product(example_nums, transformation_methods):
        combination = {
            'experiment_id': len(combinations),  # IDs are sequential in generation order
            'sample_size': sample_size,
            'use_custom_ocr': use_custom_ocr,
            'prompt_type': prompt_type,
            'chunk_size_category': chunk_size_category,
            'no_schema': no_schema,
            'example_num': example_num,
            'transformation_method': transformation_method,
            'chunking_method': chunking_method,
            'level_type': level_type
        }
        combinations.append(combination)

# Next free experiment ID (equals the number of generated combinations).
experiment_id = len(combinations)

# Persist the full grid next to the other outputs so runs can be traced back to it.
df = pd.DataFrame(combinations)
csv_file_path = path.join(base_output_directory, 'all_experiment_combinations.csv')
df.to_csv(csv_file_path, index=False)

print(f"CSV file has been saved to {csv_file_path}")

CSV file has been saved to gpt4_Markdown_Llama3_outputs/reg/all_experiment_combinations.csv


### Run pipeline with created experiments

If you have already run all experiments and only want to continue with the post-processing steps, skip the next 2 cells (test_experiment_serially, then split_jsonl_file and send_batch_requests) and continue with the following cell (load_experiment_results).

In [None]:
if __name__ == "__main__":
    # Keys of each combination that are bundled into the per-experiment options dict.
    option_keys = ('example_num', 'no_schema', 'transformation_method', 'chunking_method', 'level_type')

    # One tuple per experiment:
    # (sample_size, use_custom_ocr, prompt_type, chunk_size_category, options, experiment_id)
    experiment_params = [
        (
            exp['sample_size'],
            exp['use_custom_ocr'],
            exp['prompt_type'],
            exp['chunk_size_category'],
            {key: exp[key] for key in option_keys},
            exp['experiment_id'],
        )
        for exp in combinations
    ]

    # JSONL file handed to the pipeline as its batch-request output path.
    output_file = path.join(base_output_directory, 'batches/batch_requests.jsonl')

    all_filename_mappings, all_output_data = test_experiment_serially(
        directory,
        base_output_directory,
        dataset_path,
        dtype,
        folder_path,
        model,
        experiment_params,
        output_file,
    )

2025-02-21 14:20:30,890 - INFO - Starting experiment with ID 0
2025-02-21 14:20:30,892 - INFO - Found 40 markdown files in gpt4_Markdown_outputs/reg/outputs/STL/Amendment
2025-02-21 14:20:30,894 - INFO - Found 40 markdown files in gpt4_Markdown_outputs/reg/outputs/STL/Dissemination
2025-02-21 14:20:30,897 - INFO - Found 40 markdown files in gpt4_Markdown_outputs/reg/outputs/STL/Short-Form
2025-02-21 14:20:44,791 - INFO - Experiment 0: Processing file 19950929_Hogan Lovells US LLP_Amendment_Amendment.md
2025-02-21 14:20:44,797 - INFO - Generating chunks for file 19950929_Hogan Lovells US LLP_Amendment_Amendment.md
2025-02-21 14:20:44,798 - INFO - Text length: 1620, Chunk size: 2958
2025-02-21 14:20:44,799 - INFO - Generated chunk: ```markdown U.S. Department of Justice Washington ... with length 1616
2025-02-21 14:20:44,801 - INFO - Generated 1 chunks with total text length 1616
2025-02-21 14:20:44,802 - INFO - Generated 1 chunks with total prompt token size 276
2025-02-21 14:20:45,809 


RAW PRED: [{'file_date': '1945-09-29', 'foreign_principle_name': ['Government of Haiti', 'Embassy of Japan'], 'registrant_name': 'Hogan & Hartson', 'registration_num': '2244', 'signer_name': 'BESSIE O. HOUCK', 'signer_title': 'Notary Public'}]

FLAT PRED:  [{'file_date': '1945-09-29', 'foreign_principle_name': ('Government of Haiti', 'Embassy of Japan'), 'registrant_name': 'Hogan & Hartson', 'registration_num': '2244', 'signer_name': 'BESSIE O. HOUCK', 'signer_title': 'Notary Public'}]

Reconciled PRED:  {'file_date': '1945-09-29', 'foreign_principle_name': ('Embassy of Japan', 'Government of Haiti'), 'registrant_name': 'Hogan & Hartson', 'registration_num': '2244', 'signer_name': 'BESSIE O. HOUCK', 'signer_title': 'Notary Public'}

initial_pred: {'file_date': '1945-09-29', 'foreign_principle_name': ('Embassy of Japan', 'Government of Haiti'), 'registrant_name': 'Hogan & Hartson', 'registration_num': '2244', 'signer_name': 'BESSIE O. HOUCK', 'signer_title': 'Notary Public'}

mapped_pr

2025-02-21 14:20:46,424 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:46,429 - INFO - Raw Model output: {
"file_date": "April 7, 1983",
"foreign_principle_name": "",
"registrant_name": "Bermuda Department of Tourism",
"registration_num": "430",
"signer_name": "Ronald N. Bassett",
"signer_title": "General Manager, North America"
}
2025-02-21 14:20:46,431 - INFO - Extracted Info List: [{'file_date': 'April 7, 1983', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}]
2025-02-21 14:20:46,434 - INFO - Processing annotations for file: 19830401_Bermuda Tourism Authority_Amendment_Amendment.pdf
2025-02-21 14:20:46,440 - INFO - Experiment 0: Processing file 19760201_Italian Government Tourist Board (ENIT), Los Angeles_Amendment_Amendment.md
2025-02-21 14:20:46,462 - INFO - Generating chun


RAW PRED: [{'file_date': 'April 7, 1983', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}]

FLAT PRED:  [{'file_date': 'April 7, 1983', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}]

Reconciled PRED:  {'file_date': 'April 7, 1983', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}

initial_pred: {'file_date': 'April 7, 1983', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}

mapped_pred: {'file_date': 'April

2025-02-21 14:20:46,927 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:46,932 - INFO - Raw Model output: {
"file_date": "",
"foreign_principle_name": "Italian Government",
"registrant_name": "Italian Government Travel Office-E.N.I.T.",
"registration_num": "#1884",
"signer_name": "",
"signer_title": ""
}
2025-02-21 14:20:46,934 - INFO - Extracted Info List: [{'file_date': '', 'foreign_principle_name': 'Italian Government', 'registrant_name': 'Italian Government Travel Office-E.N.I.T.', 'registration_num': '#1884', 'signer_name': '', 'signer_title': ''}]
2025-02-21 14:20:46,936 - INFO - Processing annotations for file: 19760201_Italian Government Tourist Board (ENIT), Los Angeles_Amendment_Amendment.pdf
2025-02-21 14:20:46,940 - INFO - Experiment 0: Processing file 19961018_DLA Piper US LLP_Amendment_Amendment.md
2025-02-21 14:20:46,953 - INFO - Generating chunks for file 19961018_DLA Piper US LLP_Amendment_Amendment.md
2025


RAW PRED: [{'file_date': '', 'foreign_principle_name': 'Italian Government', 'registrant_name': 'Italian Government Travel Office-E.N.I.T.', 'registration_num': '#1884', 'signer_name': '', 'signer_title': ''}]

FLAT PRED:  [{'file_date': '', 'foreign_principle_name': 'Italian Government', 'registrant_name': 'Italian Government Travel Office-E.N.I.T.', 'registration_num': '#1884', 'signer_name': '', 'signer_title': ''}]

Reconciled PRED:  {'file_date': '', 'foreign_principle_name': 'Italian Government', 'registrant_name': 'Italian Government Travel Office-E.N.I.T.', 'registration_num': '#1884', 'signer_name': '', 'signer_title': ''}

initial_pred: {'file_date': '', 'foreign_principle_name': 'Italian Government', 'registrant_name': 'Italian Government Travel Office-E.N.I.T.', 'registration_num': '#1884', 'signer_name': '', 'signer_title': ''}

mapped_pred: {'file_date': '', 'foreign_principle_name': 'Italian Government', 'registrant_name': 'Italian Government Travel Office-E.N.I.T.', 'r

2025-02-21 14:20:47,543 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:47,548 - INFO - Raw Model output: Here are the extracted values in JSON format:

```
{
    "file_date": "1996-10-18",
    "foreign_principle_name": "Government of Oman",
    "registrant_name": "Verner, Liipfert, Bernhard, McPherson and Hand, Chartered",
    "registration_num": "3712",
    "signer_name": "Daniel Manatt",
    "signer_title": ""
}
```
2025-02-21 14:20:47,550 - INFO - Extracted Info List: [{'file_date': '1996-10-18', 'foreign_principle_name': 'Government of Oman', 'registrant_name': 'Verner, Liipfert, Bernhard, McPherson and Hand, Chartered', 'registration_num': '3712', 'signer_name': 'Daniel Manatt', 'signer_title': ''}]
2025-02-21 14:20:47,551 - INFO - Processing annotations for file: 19961018_DLA Piper US LLP_Amendment_Amendment.pdf
2025-02-21 14:20:47,557 - INFO - Experiment 0: Processing file 20020611_CMGRP, Inc._Amendment_Amendment.md


RAW PRED: [{'file_date': '1996-10-18', 'foreign_principle_name': 'Government of Oman', 'registrant_name': 'Verner, Liipfert, Bernhard, McPherson and Hand, Chartered', 'registration_num': '3712', 'signer_name': 'Daniel Manatt', 'signer_title': ''}]

FLAT PRED:  [{'file_date': '1996-10-18', 'foreign_principle_name': 'Government of Oman', 'registrant_name': 'Verner, Liipfert, Bernhard, McPherson and Hand, Chartered', 'registration_num': '3712', 'signer_name': 'Daniel Manatt', 'signer_title': ''}]

Reconciled PRED:  {'file_date': '1996-10-18', 'foreign_principle_name': 'Government of Oman', 'registrant_name': 'Verner, Liipfert, Bernhard, McPherson and Hand, Chartered', 'registration_num': '3712', 'signer_name': 'Daniel Manatt', 'signer_title': ''}

initial_pred: {'file_date': '1996-10-18', 'foreign_principle_name': 'Government of Oman', 'registrant_name': 'Verner, Liipfert, Bernhard, McPherson and Hand, Chartered', 'registration_num': '3712', 'signer_name': 'Daniel Manatt', 'signer_title'

2025-02-21 14:20:48,372 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:48,376 - INFO - Raw Model output: {
"file_date": "3/28/02",
"foreign_principle_name": "",
"registrant_name": "BSMG Worldwide",
"registration_num": "3911",
"signer_name": "Lance Morgan",
"signer_title": ""
}
2025-02-21 14:20:48,378 - INFO - Extracted Info List: [{'file_date': '3/28/02', 'foreign_principle_name': '', 'registrant_name': 'BSMG Worldwide', 'registration_num': '3911', 'signer_name': 'Lance Morgan', 'signer_title': ''}]
2025-02-21 14:20:48,380 - INFO - Processing annotations for file: 20020611_CMGRP, Inc._Amendment_Amendment.pdf
2025-02-21 14:20:48,384 - INFO - Experiment 0: Processing file 20160406_Rokk Solutions, LLC_Amendment_Amendment.md
2025-02-21 14:20:48,401 - INFO - Generating chunks for file 20160406_Rokk Solutions, LLC_Amendment_Amendment.md
2025-02-21 14:20:48,402 - INFO - Text length: 2460, Chunk size: 2958
2025-02-21 14:20:48,404 


RAW PRED: [{'file_date': '3/28/02', 'foreign_principle_name': '', 'registrant_name': 'BSMG Worldwide', 'registration_num': '3911', 'signer_name': 'Lance Morgan', 'signer_title': ''}]

FLAT PRED:  [{'file_date': '3/28/02', 'foreign_principle_name': '', 'registrant_name': 'BSMG Worldwide', 'registration_num': '3911', 'signer_name': 'Lance Morgan', 'signer_title': ''}]

Reconciled PRED:  {'file_date': '3/28/02', 'foreign_principle_name': '', 'registrant_name': 'BSMG Worldwide', 'registration_num': '3911', 'signer_name': 'Lance Morgan', 'signer_title': ''}

initial_pred: {'file_date': '3/28/02', 'foreign_principle_name': '', 'registrant_name': 'BSMG Worldwide', 'registration_num': '3911', 'signer_name': 'Lance Morgan', 'signer_title': ''}

mapped_pred: {'file_date': '3/28/02', 'foreign_principle_name': '', 'registrant_name': 'BSMG Worldwide', 'registration_num': '3911', 'signer_name': 'Lance Morgan', 'signer_title': ''}

cleaned_pred: {'file_date': '2002-03-28', 'foreign_principle_name': 

2025-02-21 14:20:48,947 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:48,952 - INFO - Raw Model output: Here is the extracted information in JSON format:

```
{
  "file_date": "April 06, 2016",
  "foreign_principle_name": "Embassy of the Republic of Korea",
  "registrant_name": "Rock Solutions LLC",
  "registration_num": "6320",
  "signer_name": "Ron Bonjean",
  "signer_title": "Member"
}
```
2025-02-21 14:20:48,954 - INFO - Extracted Info List: [{'file_date': 'April 06, 2016', 'foreign_principle_name': 'Embassy of the Republic of Korea', 'registrant_name': 'Rock Solutions LLC', 'registration_num': '6320', 'signer_name': 'Ron Bonjean', 'signer_title': 'Member'}]
2025-02-21 14:20:48,956 - INFO - Processing annotations for file: 20160406_Rokk Solutions, LLC_Amendment_Amendment.pdf
2025-02-21 14:20:48,962 - INFO - Experiment 0: Processing file 20131127_Independent Diplomat, Inc._Amendment_Amendment.md
2025-02-21 14:20:48,976


RAW PRED: [{'file_date': 'April 06, 2016', 'foreign_principle_name': 'Embassy of the Republic of Korea', 'registrant_name': 'Rock Solutions LLC', 'registration_num': '6320', 'signer_name': 'Ron Bonjean', 'signer_title': 'Member'}]

FLAT PRED:  [{'file_date': 'April 06, 2016', 'foreign_principle_name': 'Embassy of the Republic of Korea', 'registrant_name': 'Rock Solutions LLC', 'registration_num': '6320', 'signer_name': 'Ron Bonjean', 'signer_title': 'Member'}]

Reconciled PRED:  {'file_date': 'April 06, 2016', 'foreign_principle_name': 'Embassy of the Republic of Korea', 'registrant_name': 'Rock Solutions LLC', 'registration_num': '6320', 'signer_name': 'Ron Bonjean', 'signer_title': 'Member'}

initial_pred: {'file_date': 'April 06, 2016', 'foreign_principle_name': 'Embassy of the Republic of Korea', 'registrant_name': 'Rock Solutions LLC', 'registration_num': '6320', 'signer_name': 'Ron Bonjean', 'signer_title': 'Member'}

mapped_pred: {'file_date': 'April 06, 2016', 'foreign_princip

2025-02-21 14:20:49,997 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:50,001 - INFO - Raw Model output: {
"file_date": "11/27/2013",
"foreign_principle_name": "The Government of the Republic of South Sudan",
"registrant_name": "Independent Diplomat",
"registration_num": "5860",
"signer_name": "Carne William Ross",
"signer_title": ""
}
2025-02-21 14:20:50,003 - INFO - Extracted Info List: [{'file_date': '11/27/2013', 'foreign_principle_name': 'The Government of the Republic of South Sudan', 'registrant_name': 'Independent Diplomat', 'registration_num': '5860', 'signer_name': 'Carne William Ross', 'signer_title': ''}]
2025-02-21 14:20:50,004 - INFO - Processing annotations for file: 20131127_Independent Diplomat, Inc._Amendment_Amendment.pdf
2025-02-21 14:20:50,011 - INFO - Experiment 0: Processing file 19820401_Bermuda Tourism Authority_Amendment_Amendment.md
2025-02-21 14:20:50,023 - INFO - Generating chunks for file 1982


RAW PRED: [{'file_date': '11/27/2013', 'foreign_principle_name': 'The Government of the Republic of South Sudan', 'registrant_name': 'Independent Diplomat', 'registration_num': '5860', 'signer_name': 'Carne William Ross', 'signer_title': ''}]

FLAT PRED:  [{'file_date': '11/27/2013', 'foreign_principle_name': 'The Government of the Republic of South Sudan', 'registrant_name': 'Independent Diplomat', 'registration_num': '5860', 'signer_name': 'Carne William Ross', 'signer_title': ''}]

Reconciled PRED:  {'file_date': '11/27/2013', 'foreign_principle_name': 'The Government of the Republic of South Sudan', 'registrant_name': 'Independent Diplomat', 'registration_num': '5860', 'signer_name': 'Carne William Ross', 'signer_title': ''}

initial_pred: {'file_date': '11/27/2013', 'foreign_principle_name': 'The Government of the Republic of South Sudan', 'registrant_name': 'Independent Diplomat', 'registration_num': '5860', 'signer_name': 'Carne William Ross', 'signer_title': ''}

mapped_pred: 

2025-02-21 14:20:50,907 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:50,911 - INFO - Raw Model output: {
"file_date": "1982-04-08",
"foreign_principle_name": "",
"registrant_name": "Bermuda Department of Tourism",
"registration_num": "430",
"signer_name": "Ronald N. Bassett",
"signer_title": "General Manager, North America"
}
2025-02-21 14:20:50,913 - INFO - Extracted Info List: [{'file_date': '1982-04-08', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}]
2025-02-21 14:20:50,915 - INFO - Processing annotations for file: 19820401_Bermuda Tourism Authority_Amendment_Amendment.pdf
2025-02-21 14:20:50,920 - INFO - Experiment 0: Processing file 20180123_Fleishman-Hillard, Inc._Amendment_Amendment.md
2025-02-21 14:20:50,929 - INFO - Generating chunks for file 20180123_Fleishman-Hill


RAW PRED: [{'file_date': '1982-04-08', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}]

FLAT PRED:  [{'file_date': '1982-04-08', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}]

Reconciled PRED:  {'file_date': '1982-04-08', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}

initial_pred: {'file_date': '1982-04-08', 'foreign_principle_name': '', 'registrant_name': 'Bermuda Department of Tourism', 'registration_num': '430', 'signer_name': 'Ronald N. Bassett', 'signer_title': 'General Manager, North America'}

mapped_pred: {'file_date': '1982-04-08', 'for

2025-02-21 14:20:51,689 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:51,694 - INFO - Raw Model output: {
"file_date": "1/23/2018",
"foreign_principle_name": "Special Department of Social Communication of the General Secretariat of the Presidency of the Republic of Brazil",
"registrant_name": "Fleishman-Hillard Inc.",
"registration_num": "5801",
"signer_name": "Kelly R. Garner",
"signer_title": "Senior Vice President, Senior Partner & Corporate Controller"
}
2025-02-21 14:20:51,695 - INFO - Extracted Info List: [{'file_date': '1/23/2018', 'foreign_principle_name': 'Special Department of Social Communication of the General Secretariat of the Presidency of the Republic of Brazil', 'registrant_name': 'Fleishman-Hillard Inc.', 'registration_num': '5801', 'signer_name': 'Kelly R. Garner', 'signer_title': 'Senior Vice President, Senior Partner & Corporate Controller'}]
2025-02-21 14:20:51,697 - INFO - Processing annotations for


RAW PRED: [{'file_date': '1/23/2018', 'foreign_principle_name': 'Special Department of Social Communication of the General Secretariat of the Presidency of the Republic of Brazil', 'registrant_name': 'Fleishman-Hillard Inc.', 'registration_num': '5801', 'signer_name': 'Kelly R. Garner', 'signer_title': 'Senior Vice President, Senior Partner & Corporate Controller'}]

FLAT PRED:  [{'file_date': '1/23/2018', 'foreign_principle_name': 'Special Department of Social Communication of the General Secretariat of the Presidency of the Republic of Brazil', 'registrant_name': 'Fleishman-Hillard Inc.', 'registration_num': '5801', 'signer_name': 'Kelly R. Garner', 'signer_title': 'Senior Vice President, Senior Partner & Corporate Controller'}]

Reconciled PRED:  {'file_date': '1/23/2018', 'foreign_principle_name': 'Special Department of Social Communication of the General Secretariat of the Presidency of the Republic of Brazil', 'registrant_name': 'Fleishman-Hillard Inc.', 'registration_num': '580

2025-02-21 14:20:52,216 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:52,220 - INFO - Raw Model output: {
"file_date": "06/13/2018",
"foreign_principle_name": "Republic of Turkey",
"registrant_name": "Ballard Partners",
"registration_num": "6415",
"signer_name": "Brian D. Ballard",
"signer_title": ""
}
2025-02-21 14:20:52,222 - INFO - Extracted Info List: [{'file_date': '06/13/2018', 'foreign_principle_name': 'Republic of Turkey', 'registrant_name': 'Ballard Partners', 'registration_num': '6415', 'signer_name': 'Brian D. Ballard', 'signer_title': ''}]
2025-02-21 14:20:52,224 - INFO - Processing annotations for file: 20180613_Ballard Partners_Amendment_Amendment.pdf
2025-02-21 14:20:52,230 - INFO - Experiment 0: Processing file 19920601_JETRO, San Francisco_Amendment_Amendment.md
2025-02-21 14:20:52,240 - INFO - Generating chunks for file 19920601_JETRO, San Francisco_Amendment_Amendment.md
2025-02-21 14:20:52,241 - INFO -


RAW PRED: [{'file_date': '06/13/2018', 'foreign_principle_name': 'Republic of Turkey', 'registrant_name': 'Ballard Partners', 'registration_num': '6415', 'signer_name': 'Brian D. Ballard', 'signer_title': ''}]

FLAT PRED:  [{'file_date': '06/13/2018', 'foreign_principle_name': 'Republic of Turkey', 'registrant_name': 'Ballard Partners', 'registration_num': '6415', 'signer_name': 'Brian D. Ballard', 'signer_title': ''}]

Reconciled PRED:  {'file_date': '06/13/2018', 'foreign_principle_name': 'Republic of Turkey', 'registrant_name': 'Ballard Partners', 'registration_num': '6415', 'signer_name': 'Brian D. Ballard', 'signer_title': ''}

initial_pred: {'file_date': '06/13/2018', 'foreign_principle_name': 'Republic of Turkey', 'registrant_name': 'Ballard Partners', 'registration_num': '6415', 'signer_name': 'Brian D. Ballard', 'signer_title': ''}

mapped_pred: {'file_date': '06/13/2018', 'foreign_principle_name': 'Republic of Turkey', 'registrant_name': 'Ballard Partners', 'registration_num

2025-02-21 14:20:52,822 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:52,826 - INFO - Raw Model output: {
"file_date": "1992-06-22",
"foreign_principle_name": "",
"registrant_name": "JETRO San Francisco",
"registration_num": "1813",
"signer_name": "Blancho S. Bergor",
"signer_title": "NOTARY PUBLIC CALIFORNIA"
}
2025-02-21 14:20:52,828 - INFO - Extracted Info List: [{'file_date': '1992-06-22', 'foreign_principle_name': '', 'registrant_name': 'JETRO San Francisco', 'registration_num': '1813', 'signer_name': 'Blancho S. Bergor', 'signer_title': 'NOTARY PUBLIC CALIFORNIA'}]
2025-02-21 14:20:52,830 - INFO - Processing annotations for file: 19920601_JETRO, San Francisco_Amendment_Amendment.pdf
2025-02-21 14:20:52,835 - INFO - Experiment 0: Processing file 20110103_Representative of the Turkish Republic of Northern Cyprus_Amendment_Amendment.md
2025-02-21 14:20:52,846 - INFO - Generating chunks for file 20110103_Representative 


RAW PRED: [{'file_date': '1992-06-22', 'foreign_principle_name': '', 'registrant_name': 'JETRO San Francisco', 'registration_num': '1813', 'signer_name': 'Blancho S. Bergor', 'signer_title': 'NOTARY PUBLIC CALIFORNIA'}]

FLAT PRED:  [{'file_date': '1992-06-22', 'foreign_principle_name': '', 'registrant_name': 'JETRO San Francisco', 'registration_num': '1813', 'signer_name': 'Blancho S. Bergor', 'signer_title': 'NOTARY PUBLIC CALIFORNIA'}]

Reconciled PRED:  {'file_date': '1992-06-22', 'foreign_principle_name': '', 'registrant_name': 'JETRO San Francisco', 'registration_num': '1813', 'signer_name': 'Blancho S. Bergor', 'signer_title': 'NOTARY PUBLIC CALIFORNIA'}

initial_pred: {'file_date': '1992-06-22', 'foreign_principle_name': '', 'registrant_name': 'JETRO San Francisco', 'registration_num': '1813', 'signer_name': 'Blancho S. Bergor', 'signer_title': 'NOTARY PUBLIC CALIFORNIA'}

mapped_pred: {'file_date': '1992-06-22', 'foreign_principle_name': '', 'registrant_name': 'JETRO San Fran

2025-02-21 14:20:54,112 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:54,117 - INFO - Raw Model output: {
"file_date": "12/15/2010",
"foreign_principle_name": "",
"registrant_name": "",
"registration_num": "2619",
"signer_name": "DILEK YANIK",
"signer_title": "Representative Director"
}
2025-02-21 14:20:54,119 - INFO - Extracted Info List: [{'file_date': '12/15/2010', 'foreign_principle_name': '', 'registrant_name': '', 'registration_num': '2619', 'signer_name': 'DILEK YANIK', 'signer_title': 'Representative Director'}]
2025-02-21 14:20:54,121 - INFO - Processing annotations for file: 20110103_Representative of the Turkish Republic of Northern Cyprus_Amendment_Amendment.pdf
2025-02-21 14:20:54,126 - INFO - Experiment 0: Processing file 20120424_Pillsbury Winthrop Shaw Pittman, LLP_Amendment_Amendment.md
2025-02-21 14:20:54,154 - INFO - Generating chunks for file 20120424_Pillsbury Winthrop Shaw Pittman, LLP_Amendment_Amen


RAW PRED: [{'file_date': '12/15/2010', 'foreign_principle_name': '', 'registrant_name': '', 'registration_num': '2619', 'signer_name': 'DILEK YANIK', 'signer_title': 'Representative Director'}]

FLAT PRED:  [{'file_date': '12/15/2010', 'foreign_principle_name': '', 'registrant_name': '', 'registration_num': '2619', 'signer_name': 'DILEK YANIK', 'signer_title': 'Representative Director'}]

Reconciled PRED:  {'file_date': '12/15/2010', 'foreign_principle_name': '', 'registrant_name': '', 'registration_num': '2619', 'signer_name': 'DILEK YANIK', 'signer_title': 'Representative Director'}

initial_pred: {'file_date': '12/15/2010', 'foreign_principle_name': '', 'registrant_name': '', 'registration_num': '2619', 'signer_name': 'DILEK YANIK', 'signer_title': 'Representative Director'}

mapped_pred: {'file_date': '12/15/2010', 'foreign_principle_name': '', 'registrant_name': '', 'registration_num': '2619', 'signer_name': 'DILEK YANIK', 'signer_title': 'Representative Director'}

cleaned_pred:

2025-02-21 14:20:55,100 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:55,104 - INFO - Raw Model output: {
"file_date": "04/24/2012",
"foreign_principle_name": "Secretaria de Economia (Secretariat of Economy)",
"registrant_name": "Pillsbury Winthrop Shaw Pittman LLP",
"registration_num": "5198",
"signer_name": "Stephan E. Becker",
"signer_title": ""
}
2025-02-21 14:20:55,106 - INFO - Extracted Info List: [{'file_date': '04/24/2012', 'foreign_principle_name': 'Secretaria de Economia (Secretariat of Economy)', 'registrant_name': 'Pillsbury Winthrop Shaw Pittman LLP', 'registration_num': '5198', 'signer_name': 'Stephan E. Becker', 'signer_title': ''}]
2025-02-21 14:20:55,109 - INFO - Processing annotations for file: 20120424_Pillsbury Winthrop Shaw Pittman, LLP_Amendment_Amendment.pdf
2025-02-21 14:20:55,113 - INFO - Experiment 0: Processing file 20170901_Bryan Cave, LLP_Amendment_Amendment.md
2025-02-21 14:20:55,124 - INFO -


RAW PRED: [{'file_date': '04/24/2012', 'foreign_principle_name': 'Secretaria de Economia (Secretariat of Economy)', 'registrant_name': 'Pillsbury Winthrop Shaw Pittman LLP', 'registration_num': '5198', 'signer_name': 'Stephan E. Becker', 'signer_title': ''}]

FLAT PRED:  [{'file_date': '04/24/2012', 'foreign_principle_name': 'Secretaria de Economia (Secretariat of Economy)', 'registrant_name': 'Pillsbury Winthrop Shaw Pittman LLP', 'registration_num': '5198', 'signer_name': 'Stephan E. Becker', 'signer_title': ''}]

Reconciled PRED:  {'file_date': '04/24/2012', 'foreign_principle_name': 'Secretaria de Economia (Secretariat of Economy)', 'registrant_name': 'Pillsbury Winthrop Shaw Pittman LLP', 'registration_num': '5198', 'signer_name': 'Stephan E. Becker', 'signer_title': ''}

initial_pred: {'file_date': '04/24/2012', 'foreign_principle_name': 'Secretaria de Economia (Secretariat of Economy)', 'registrant_name': 'Pillsbury Winthrop Shaw Pittman LLP', 'registration_num': '5198', 'signe

2025-02-21 14:20:55,843 - INFO - HTTP Request: POST https://api.groq.com/openai/v1/chat/completions "HTTP/1.1 200 OK"
2025-02-21 14:20:55,847 - INFO - Raw Model output: {"file_date": "09/01/2017", "foreign_principle_name": "", "registrant_name": "Bryan Cave LLP", "registration_num": "6356", "signer_name": "LaDawn Naegle", "signer_title": "Office Managing Partner"}
2025-02-21 14:20:55,849 - INFO - Extracted Info List: [{'file_date': '09/01/2017', 'foreign_principle_name': '', 'registrant_name': 'Bryan Cave LLP', 'registration_num': '6356', 'signer_name': 'LaDawn Naegle', 'signer_title': 'Office Managing Partner'}]
2025-02-21 14:20:55,851 - INFO - Processing annotations for file: 20170901_Bryan Cave, LLP_Amendment_Amendment.pdf
2025-02-21 14:20:55,856 - INFO - Experiment 0: Processing file 20171116_Wiley Rein, LLP_Amendment_Amendment.md
2025-02-21 14:20:55,870 - INFO - Generating chunks for file 20171116_Wiley Rein, LLP_Amendment_Amendment.md
2025-02-21 14:20:55,872 - INFO - Text length:


RAW PRED: [{'file_date': '09/01/2017', 'foreign_principle_name': '', 'registrant_name': 'Bryan Cave LLP', 'registration_num': '6356', 'signer_name': 'LaDawn Naegle', 'signer_title': 'Office Managing Partner'}]

FLAT PRED:  [{'file_date': '09/01/2017', 'foreign_principle_name': '', 'registrant_name': 'Bryan Cave LLP', 'registration_num': '6356', 'signer_name': 'LaDawn Naegle', 'signer_title': 'Office Managing Partner'}]

Reconciled PRED:  {'file_date': '09/01/2017', 'foreign_principle_name': '', 'registrant_name': 'Bryan Cave LLP', 'registration_num': '6356', 'signer_name': 'LaDawn Naegle', 'signer_title': 'Office Managing Partner'}

initial_pred: {'file_date': '09/01/2017', 'foreign_principle_name': '', 'registrant_name': 'Bryan Cave LLP', 'registration_num': '6356', 'signer_name': 'LaDawn Naegle', 'signer_title': 'Office Managing Partner'}

mapped_pred: {'file_date': '09/01/2017', 'foreign_principle_name': '', 'registrant_name': 'Bryan Cave LLP', 'registration_num': '6356', 'signer_n

In [6]:
# Limits passed to split_jsonl_file for each batch file.
MAX_BATCH_BYTES = 200 * 1024 * 1024  # 209715200
MAX_BATCH_LINES = 1000  # adjust as needed

# Split the request file, then submit every resulting part.
batch_file_paths = split_jsonl_file(
    output_file,
    max_size=MAX_BATCH_BYTES,
    max_lines=MAX_BATCH_LINES,
)
batch_ids = send_batch_requests(batch_file_paths)
print(f"Batch IDs: {batch_ids}")

2024-12-10 19:27:23,685 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
2024-12-10 19:27:24,692 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"
2024-12-10 19:27:25,467 - INFO - HTTP Request: POST https://api.openai.com/v1/files "HTTP/1.1 200 OK"
2024-12-10 19:27:26,984 - INFO - HTTP Request: POST https://api.openai.com/v1/batches "HTTP/1.1 200 OK"


Batch IDs: ['batch_6758961c868c8191b4364aa432c93e5b', 'batch_6758961ed0d081919ba1826826f5af80']


In [None]:
if __name__ == "__main__":
    # Reload the filename mappings and output data from the output directory,
    # for when the experiment-running cells above were skipped.
    loaded_results = load_experiment_results(base_output_directory)
    all_filename_mappings, all_output_data = loaded_results
    print("DONE")

In [None]:
# Directory handed to the batch-result step as the location of the responses.
response_dir = path.join(base_output_directory, "responses")

# Only the first value returned by load_dataset is used here.
ground_truth_annotations, _ = load_dataset(dataset_path)
logging.info("Loaded ground truth annotations")

retrieve_and_process_batch_results(
    response_dir,
    ground_truth_annotations,
    all_filename_mappings,
    all_output_data,
    base_output_directory,
)

### Concatenate all experiment results into one CSV file and perform aggregation

In [None]:
import os
import pandas as pd
import logging

def concatenate_experiments(experiments_dir, output_filename):
    """Combine every per-experiment CSV in a directory into a single CSV file.

    Each file directly inside ``experiments_dir`` whose name ends in ``.csv``
    is read into a DataFrame and tagged with an ``experiment_id`` column set to
    the file name without its extension. The frames are stacked with a fresh
    index and written to ``output_filename``.

    Args:
        experiments_dir: Directory holding the per-experiment CSV files.
        output_filename: Path of the combined CSV file to write.

    Returns:
        ``output_filename`` on success. ``None`` when the directory does not
        exist, when it contains no CSV files, or when any other error occurs;
        failures are logged (and printed) rather than raised.
    """
    try:
        # Bail out early with an accurate message when the input is missing.
        if not os.path.exists(experiments_dir):
            logging.error(f"Experiments directory not found: {experiments_dir}")
            return None

        dataframes = []

        # os.listdir returns entries in arbitrary order; sort so the row order
        # of the combined file is reproducible across runs and machines.
        for filename in sorted(os.listdir(experiments_dir)):
            if not filename.endswith(".csv"):
                continue

            file_path = os.path.join(experiments_dir, filename)
            df = pd.read_csv(file_path)

            # The experiment ID is the file name with the extension stripped.
            df['experiment_id'] = os.path.splitext(filename)[0]
            dataframes.append(df)

            logging.info(f"Loaded data from {file_path}")

        # pd.concat raises an opaque ValueError on an empty list; report the
        # actual cause instead.
        if not dataframes:
            logging.error(f"No CSV files found in experiments directory: {experiments_dir}")
            print(f"No CSV files found in experiments directory: {experiments_dir}")
            return None

        combined_df = pd.concat(dataframes, ignore_index=True)
        combined_df.to_csv(output_filename, index=False)

        logging.info(f"All experiment data concatenated and saved to {output_filename}")
        print(f"All experiment data concatenated and saved to {output_filename}")
        return output_filename

    except Exception as e:
        # Best-effort by design: report the failure and let the notebook go on.
        logging.error(f"Error while concatenating experiment data: {e}")
        print(f"Error while concatenating experiment data: {e}")
        return None

# Merge the per-experiment CSVs found under the output root into one metrics file.
# output_filename is read again by the aggregation cell that follows.
output_filename = os.path.join(base_output_directory, 'gpt4_Markdown_gpt3-metrics.csv')
experiments_dir = path.join(base_output_directory, 'experiments')

# Left as the cell's last expression so the notebook displays the return value.
concatenate_experiments(
    experiments_dir=experiments_dir,
    output_filename=output_filename,
)


In [8]:
import pandas as pd
import json
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from modules.postprocessing import *
from modules.data_processing import *
from modules.metrics import *
from config_llama import *

# Read back the combined metrics file and bucket its rows by experiment.
df = pd.read_csv(output_filename)
grouped = df.groupby('experiment_id')

# Every metric column is reduced to its mean within each experiment.
mean_of_each_metric = dict.fromkeys(
    [
        'precision_before',
        'precision_after',
        'recall_before',
        'recall_after',
        'f1_before',
        'f1_after',
        'accuracy_before',
        'accuracy_after',
    ],
    'mean',
)
# reset_index turns experiment_id back into an ordinary column.
aggregated_results = grouped.agg(mean_of_each_metric).reset_index()

# Write one row per experiment next to the other outputs.
output_csv_path = path.join(base_output_directory, 'gpt4_Markdown_gpt3-results.csv')
aggregated_results.to_csv(output_csv_path, index=False)
print("New CSV file created successfully.")

New CSV file created successfully.
