#### Correct batches that have an error.jsonl file

In [4]:
import os
import json

# Directory holding the per-batch error files, and the path of the combined file
error_files_directory = 'gpt4_Markdown_gpt3_outputs/reg/error_files/'
output_file = os.path.join(error_files_directory, 'error.jsonl')

# List every per-batch .jsonl file, leaving out the combined error.jsonl itself
# so it is never fed back into its own input. os.listdir returns names in
# arbitrary order, so sort them to make the merge order reproducible.
input_files = sorted(
    f for f in os.listdir(error_files_directory)
    if f.endswith('.jsonl') and f != 'error.jsonl'
)

# Function to combine error files with error handling for missing files
def combine_error_files(input_files, output_file, input_directory=None):
    """Concatenate several JSONL files into one JSONL file.

    Args:
        input_files: File names (not full paths) to merge, in the given order.
        output_file: Path of the combined file; it is truncated if it exists.
        input_directory: Directory the names in ``input_files`` are relative
            to. When omitted, the notebook-level ``error_files_directory``
            is used, which is what the original implementation always did.

    Every input is copied line by line. A line that lacks a trailing newline
    (only possible for the last line of a file) is terminated before the next
    file is appended, so two records never end up fused on a single line.

    Missing or unreadable inputs are reported and skipped; lines already
    copied from a file that fails part-way stay in the output.
    """
    if input_directory is None:
        input_directory = error_files_directory

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for input_file in input_files:
            input_file_path = os.path.join(input_directory, input_file)
            try:
                with open(input_file_path, 'r', encoding='utf-8') as infile:
                    for line in infile:
                        # Guarantee one record per output line even when the
                        # source file does not end with a newline.
                        outfile.write(line if line.endswith('\n') else line + '\n')
                print(f"Successfully added {input_file} to {output_file}")
            except FileNotFoundError:
                print(f"Warning: {input_file} not found, skipping it.")
            except Exception as e:
                # Best effort: report the problem and continue with the next file.
                print(f"Error reading {input_file}: {e}")

    print(f"All error files have been combined into {output_file}")

# Merge every file named in input_files into the single file at output_file
combine_error_files(input_files, output_file)


Successfully added batch_673d080399c481909f5e2da38bc62a54_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl
Successfully added batch_673d07e73ab081908af93e972e446a43_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl
Successfully added batch_673d07e88224819099032953c0c274a4_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl
Successfully added batch_673d0800d1948190a1a5bb3d6fbc116f_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl
Successfully added batch_673d07f7d3848190b5848f840869111b_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl
Successfully added batch_673d07e464c881909f763681be62463c_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl
Successfully added batch_673d080237f08190a2f9a8528f0f022d_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl
Successfully added batch_673d07f1c5908190b0d98381352860fa_error.jsonl to gpt4_Markdown_gpt3_outputs/reg/

In [5]:
import os
import json

# Directory that holds the per-batch error files and the combined error.jsonl
# (same literal path as the one assigned in the previous cell)
error_files_directory = 'gpt4_Markdown_gpt3_outputs/reg/error_files/'

# Function to count the number of tasks in a .jsonl file
def count_tasks_in_jsonl(file_path):
    """Return the number of records in a JSONL file.

    A record is any line that contains non-whitespace characters; blank
    lines (for example a stray empty line at the end of the file) are not
    counted as tasks.

    Args:
        file_path: Path of the .jsonl file to inspect.

    Returns:
        The record count, or 0 if the file cannot be opened or read. In that
        case the error is printed rather than raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            return sum(1 for line in file if line.strip())
    except Exception as e:
        # Best effort by design: report and fall back to zero.
        print(f"Error counting tasks in {file_path}: {e}")
        return 0

# Sanity check: the per-batch files together must hold exactly as many lines
# as the combined error.jsonl.
total_tasks_in_files = 0
input_files = [
    name
    for name in os.listdir(error_files_directory)
    if name.endswith('.jsonl') and name != 'error.jsonl'
]

# Accumulate the line count of every individual file
for input_file in input_files:
    input_file_path = os.path.join(error_files_directory, input_file)
    tasks_count = count_tasks_in_jsonl(input_file_path)
    total_tasks_in_files = total_tasks_in_files + tasks_count

# Line count of the combined file
error_file_path = os.path.join(error_files_directory, 'error.jsonl')
tasks_in_error_file = count_tasks_in_jsonl(error_file_path)

# Report whether both totals agree
if total_tasks_in_files != tasks_in_error_file:
    print(f"Mismatch! Total tasks in files: {total_tasks_in_files}, tasks in error.jsonl: {tasks_in_error_file}")
else:
    print(f"The total number of tasks in all files: {total_tasks_in_files} matches the number of tasks in error.jsonl: {tasks_in_error_file}.")


The total number of tasks in all files: 820 matches the number of tasks in error.jsonl: 820.


In [52]:
import json
import re

# Define the paths to the files
# - batch_requests_file: the original batch requests (read)
# - error_file: the combined error records (read)
# - new_batch_requests_file: cleaned requests for the failed custom_ids (written)
batch_requests_file = 'gpt4_Markdown_gpt3_outputs/reg/batches/batch_requests.jsonl'
error_file = 'gpt4_Markdown_gpt3_outputs/reg/error_files/error.jsonl'
new_batch_requests_file = 'gpt4_Markdown_gpt3_outputs/reg/batches/new_batch_requests.jsonl'

# Function to load JSONL files
def load_jsonl(file_path):
    """Parse a JSON Lines file into a list of Python objects.

    Args:
        file_path: Path of the .jsonl file to read (decoded as UTF-8).

    Returns:
        One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines carry no record; skipping them keeps a stray empty line
        # from aborting the whole load.
        return [json.loads(line) for line in f if line.strip()]

# Function to clean repetitive patterns from the prompt content
def clean_repetitive_patterns(content):
    """Strip repetitive table/whitespace filler from a prompt string.

    The substitutions below are applied one after another, in this order:

    1. runs of three or more consecutive "| " pairs are removed;
    2. two or more consecutive newlines become a single newline;
    3. a pipe, followed by any number of hyphens/whitespace, followed by a
       pipe, is removed;
    4. any run of two or more whitespace characters becomes a single space.

    Finally, leading and trailing whitespace is stripped.

    Args:
        content: The text to clean.

    Returns:
        The cleaned text.
    """
    # (pattern, replacement) pairs; order matters because each step works on
    # the output of the previous one.
    substitutions = (
        (r'(\| ){3,}', ''),
        (r'\n{2,}', '\n'),
        (r'\|[-\s]*\|', ''),
        (r'\s{2,}', ' '),
    )
    text = content
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

# Read the error records and the original batch requests
error_requests = load_jsonl(error_file)
batch_requests = load_jsonl(batch_requests_file)

# custom_ids that appear in the error records
error_custom_ids = set(record['custom_id'] for record in error_requests)

# Requests that matched an error, with their user prompts cleaned
cleaned_error_requests = []

for request in batch_requests:
    custom_id = request['custom_id']

    # Requests without a matching error record are left out entirely
    if custom_id not in error_custom_ids:
        continue

    # Debug: show which request is being handled
    print(f"Processing custom_id: {custom_id}")

    for message in request['body']['messages']:
        # Only user messages that actually carry content are rewritten
        if message['role'] != 'user' or 'content' not in message:
            continue

        original_content = message['content']
        cleaned_content = clean_repetitive_patterns(original_content)
        message['content'] = cleaned_content  # request is modified in place

        # Debug: first 500 characters before and after cleaning
        print(f"Original Content:\n{original_content[:500]}...")
        print(f"Cleaned Content:\n{cleaned_content[:500]}...")

    cleaned_error_requests.append(request)

# Write one JSON document per line to the new batch requests file
with open(new_batch_requests_file, 'w') as f:
    for request in cleaned_error_requests:
        f.write(f"{json.dumps(request)}\n")

print(f"Resolved tasks have been saved to {new_batch_requests_file}")


Processing custom_id: 0|STL|Amendment|20170815_Pillsbury_Winthrop_Shaw_Pittman_LLP_Amendment_Amendmentmd|no_schema|None|max|layout-aware|0|25
Original Content:
<Document>
```markdown 1. Name of Registrant: Pillsbury Winthrop Shaw Pittman LLP Registration No.: 5198 This amendment is filed to accomplish the following indicated purpose or purposes: To give notice of change in an exhibit previously filed. If this amendment requires the filing of a document or documents, please list: N/A Each item checked above must be explained below in full detail together with, where appropriate, specific reference to and identity of the item in the registration statemen...
Cleaned Content:
<Document>
```markdown 1. Name of Registrant: Pillsbury Winthrop Shaw Pittman LLP Registration No.: 5198 This amendment is filed to accomplish the following indicated purpose or purposes: To give notice of change in an exhibit previously filed. If this amendment requires the filing of a document or documents, please l

In [53]:
# Line count of the cleaned batch file; count_tasks_in_jsonl and
# new_batch_requests_file come from earlier cells.
tasks_in_batch_file = count_tasks_in_jsonl(new_batch_requests_file)
tasks_in_batch_file

820

In [54]:
# Line count of the combined error.jsonl, for comparison with the cleaned
# batch file; os, error_files_directory and count_tasks_in_jsonl come from
# earlier cells.
error_file_path = os.path.join(error_files_directory, 'error.jsonl')
tasks_in_error_file = count_tasks_in_jsonl(error_file_path)
tasks_in_error_file

820

In [55]:
# Line count of the full original batch request file (batch_requests_file is
# defined in an earlier cell).
tasks_in_batch_requests_file = count_tasks_in_jsonl(batch_requests_file)
tasks_in_batch_requests_file

17050

#### Correct batches before sending to the model

In [3]:
import json

# File paths
# - file1_path: base request list; entries with a matching custom_id get replaced
# - file2_path: replacement requests (the cleaned file written under the gpt3 output tree)
# - output_file_path: where the merged request list is written
file1_path = "gpt4_Markdown_gpt4_outputs/reg/batches/batch_requests.jsonl"
file2_path = "gpt4_Markdown_gpt3_outputs/reg/batches/new_batch_requests.jsonl"
output_file_path = "gpt4_Markdown_gpt4_outputs/reg/batches/updated_batch_requests.jsonl"

# Load JSONL data
def load_jsonl(file_path):
    """Parse a JSON Lines file into a list of Python objects.

    Args:
        file_path: Path of the .jsonl file to read (decoded as UTF-8).

    Returns:
        One parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines carry no record; skipping them keeps a stray empty line
        # from aborting the whole load.
        return [json.loads(line) for line in file if line.strip()]

# Save JSONL data
def save_jsonl(data, file_path):
    """Write an iterable of JSON-serialisable items as a JSON Lines file.

    Args:
        data: Items to serialise; each becomes one line of the file.
        file_path: Destination path; the file is created or overwritten and
            written as UTF-8.
    """
    with open(file_path, 'w', encoding='utf-8') as file:
        # One serialised item per line, each terminated by a newline
        file.writelines(f"{json.dumps(item)}\n" for item in data)

# Read the base request list and the replacement requests
file1_tasks = load_jsonl(file1_path)
file2_tasks = load_jsonl(file2_path)

# Index the replacement requests by custom_id for constant-time lookup
file2_task_map = dict((entry['custom_id'], entry) for entry in file2_tasks)

# Walk the base list in order: keep each task unless a replacement with the
# same custom_id exists, in which case the replacement takes its position.
counter = 0  # number of tasks that were replaced
updated_tasks = []
for task in file1_tasks:
    custom_id = task['custom_id']
    if custom_id not in file2_task_map:
        # No replacement available: keep the original task
        updated_tasks.append(task)
        continue
    updated_tasks.append(file2_task_map[custom_id])
    counter += 1

# Persist the merged list
save_jsonl(updated_tasks, output_file_path)

print(f"Updated batch requests saved to {output_file_path}")


Updated batch requests saved to gpt4_Markdown_gpt4_outputs/reg/batches/updated_batch_requests.jsonl


In [4]:
# Number of tasks from file1 that were replaced by their file2 counterpart
counter

820

In [6]:
# Line count of the base request file (file1_path).
# NOTE(review): count_tasks_in_jsonl is not defined in this section's cells;
# confirm it is defined earlier when running on a fresh kernel.
tasks_in_batch_requests_file = count_tasks_in_jsonl(file1_path)
tasks_in_batch_requests_file

17050

In [7]:
# Line count of the merged output file; it should equal the base file's count
# because tasks are replaced one-for-one.
# NOTE(review): this rebinds `output_file`, which an earlier cell uses as a
# path string, to an integer — consider a distinct name.
output_file = count_tasks_in_jsonl(output_file_path)
output_file

17050

In [2]:
import os
import json

# Root directory that is walked recursively below; every .jsonl file found
# under it is rewritten in place.
directory_path = 'gpt4_Markdown_gpt4_outputs/reg/batches'

# Function to process a single file and update model values
def update_models_in_file(file_path, target_model="gpt-4o"):
    """Rewrite a JSONL request file so every request uses ``target_model``.

    Each non-blank line is parsed as JSON. When the parsed object has a
    ``body`` mapping containing a ``model`` key, that value is replaced by
    ``target_model``; every other request is written back unchanged (apart
    from being re-serialised).

    The whole file is parsed before it is reopened for writing, so a
    malformed line raises before the original content is overwritten.

    Args:
        file_path: Path of the .jsonl file to update in place.
        target_model: Model name to store in ``body.model``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    updated_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                continue  # blank lines carry no request
            request = json.loads(line)
            body = request.get("body") if isinstance(request, dict) else None
            if isinstance(body, dict) and "model" in body:
                body["model"] = target_model  # Update the model
            updated_lines.append(json.dumps(request))
    # Save the updated file (overwrite the original). Every record, including
    # the last one, is newline-terminated so the file stays valid JSONL and
    # can be safely concatenated with other files.
    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(updated + "\n" for updated in updated_lines)

# Visit every file below directory_path and rewrite each JSONL file in place
for root, _, files in os.walk(directory_path):
    for file in files:
        # Anything that is not a .jsonl file is left untouched
        if not file.endswith('.jsonl'):
            continue
        file_path = os.path.join(root, file)
        update_models_in_file(file_path)

print("All .jsonl files have been updated to use 'gpt-4o' as the model.")


All .jsonl files have been updated to use 'gpt-4o' as the model.
