# Create JSONL files for fine-tuning - Train

In [8]:
import os
import json
import pandas as pd

# Input and output locations used by the rest of this cell
splits_dir = 'splits'  # folder holding the CSV split files
parquet_path = 'pypdfextraction/extracted_text.parquet'  # extracted PDF text
output_dir = 'finetune_data'  # destination folder for the JSONL files

# The splits folder and at least one CSV inside it are mandatory: stop early otherwise
if not os.path.exists(splits_dir):
    raise FileNotFoundError(f"The splits directory '{splits_dir}' does not exist.")
csv_files = [entry for entry in os.listdir(splits_dir) if entry.endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in the splits directory '{splits_dir}'.")
print(f"Found {len(csv_files)} CSV files in '{splits_dir}'.")

# The parquet file with the extracted text is mandatory as well
if not os.path.isfile(parquet_path):
    raise FileNotFoundError(f"The parquet file '{parquet_path}' does not exist.")
print(f"Parquet file '{parquet_path}' exists.")

# Report the state of the output folder, creating it when it is missing
if os.path.exists(output_dir):
    print(f"The output directory '{output_dir}' already exists.")
    if os.listdir(output_dir):
        print(f"The output directory '{output_dir}' contains files or subdirectories.")
    else:
        print(f"The output directory '{output_dir}' is empty.")
else:
    print(f"The output directory '{output_dir}' does not exist. Creating it now.")
    os.makedirs(output_dir)

# Idempotent safety net: guarantees the folder is there before anything is written
os.makedirs(output_dir, exist_ok=True)

# Load the extracted text DataFrame (the 'filename' column is read here)
extracted_text_df = pd.read_parquet(parquet_path)

# Derive the 'node' key from each PDF filename so it can be matched against
# the 'node' column of the split CSVs:
#   * strip only a trailing '.pdf' (an anchored pattern, so a '.pdf' that
#     appears in the middle of a name is left intact);
#   * replace '@' with '/' as a literal substitution (regex=False keeps the
#     behaviour independent of the pandas version's default).
extracted_text_df['node'] = (
    extracted_text_df['filename']
    .str.replace(r'\.pdf$', '', regex=True)
    .str.replace('@', '/', regex=False)
)

# Set of node identifiers in extracted_text_df for quick lookup
available_nodes = set(extracted_text_df['node'])

# Attribute name -> JSON key used in the answers and system prompt sent to the model.
# All prompts share one template, so they are generated instead of spelled out.
def _extraction_prompt(json_key, plural_label, example_label):
    """Return the system prompt asking for one attribute as a JSON list of objects."""
    example_items = ", ".join(
        '{"%s": "Example %s %d"}' % (json_key, example_label, number)
        for number in (1, 2)
    )
    return (
        'You are a chemist expert in natural products. '
        f'Extract the {plural_label} from the following text. '
        f'Provide the answers in JSON format: [{example_items}]. '
        f'If the {plural_label} are not specified, leave it empty like "".'
    )


attributes_info = {
    attribute: {
        'json_key': json_key,
        'prompt': _extraction_prompt(json_key, plural_label, example_label),
    }
    for attribute, json_key, plural_label, example_label in (
        ('name', 'compoundName', 'compound names', 'Compound Name'),
        ('bioActivity', 'bioactivity', 'bioactivities', 'Bioactivity'),
        ('collectionSpecie', 'species', 'species', 'Species'),
        ('collectionSite', 'collectionSite', 'collection sites', 'Collection Site'),
        ('collectionType', 'isolationType', 'isolation types', 'Isolation Type'),
    )
}

# Iteration and split to use
iteration = '0'
split = '1st'

# One usable text per node:
#   * rows without text are dropped, otherwise NaN would be written as the
#     user message content;
#   * duplicate rows for the same node are collapsed to the first one,
#     otherwise the merge below would repeat every expected answer once per
#     duplicate.
text_by_node = (
    extracted_text_df[['node', 'text']]
    .dropna(subset=['text'])
    .drop_duplicates(subset='node')
)

# For every attribute, write one chat-format training example per node:
# system prompt, extracted text as user message, expected answers as assistant message.
for attribute, info in attributes_info.items():
    # Construct the filename for the CSV file
    csv_filename = f'train_doi_{attribute}_{iteration}_{split}.csv'
    csv_path = os.path.join(splits_dir, csv_filename)

    # Check if the CSV file exists
    if not os.path.exists(csv_path):
        print(f"File {csv_path} does not exist. Skipping attribute '{attribute}'.")
        continue

    # Load the CSV file ('node' and 'neighbor' columns are read below)
    csv_df = pd.read_csv(csv_path)

    # Bring the CSV 'node' values to the same format as extracted_text_df['node']
    csv_df['node'] = csv_df['node'].str.replace('@', '/', regex=False)

    # Filter out rows where the node doesn't have a corresponding PDF text
    csv_df = csv_df[csv_df['node'].isin(available_nodes)]

    if csv_df.empty:
        print(f"No matching nodes found for attribute '{attribute}'. Skipping.")
        continue

    # Attach the text to every (node, neighbor) row; inner join on 'node'
    merged_df = pd.merge(csv_df, text_by_node, on='node')

    finetune_data = []

    # System prompt and answer key for the current attribute
    system_prompt = info['prompt']
    json_key = info['json_key']

    # Group by 'node' so that all expected values of a node end up in one example
    for node, group in merged_df.groupby('node'):
        user_input = group['text'].iloc[0]  # Text extracted from the PDF

        # Collect all expected outputs for the current node
        expected_values = group['neighbor'].dropna().tolist()

        if expected_values:
            expected_output = [{json_key: value} for value in expected_values]
        else:
            # No expected value: a single entry holding an empty string
            expected_output = [{json_key: ""}]

        # The assistant content is itself a JSON-encoded string
        finetune_data.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
                {"role": "assistant", "content": json.dumps(expected_output, ensure_ascii=False)},
            ]
        })

    if not finetune_data:
        print(f"No data to write for attribute '{attribute}'. Skipping.")
        continue

    # Save to JSONL file: one JSON object per line
    output_filename = f'finetune_{attribute}_{iteration}_{split}.jsonl'
    output_path = os.path.join(output_dir, output_filename)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        for entry in finetune_data:
            json_line = json.dumps(entry, ensure_ascii=False)
            f_out.write(json_line + '\n')

    print(f"Saved fine-tune data for attribute '{attribute}' to {output_path}")


Found 400 CSV files in 'splits'.
Parquet file 'pypdfextraction/extracted_text.parquet' exists.
The output directory 'finetune_data' already exists.
The output directory 'finetune_data' contains files or subdirectories.
Saved fine-tune data for attribute 'name' to finetune_data/finetune_name_0_1st.jsonl
Saved fine-tune data for attribute 'bioActivity' to finetune_data/finetune_bioActivity_0_1st.jsonl
Saved fine-tune data for attribute 'collectionSpecie' to finetune_data/finetune_collectionSpecie_0_1st.jsonl
Saved fine-tune data for attribute 'collectionSite' to finetune_data/finetune_collectionSite_0_1st.jsonl
Saved fine-tune data for attribute 'collectionType' to finetune_data/finetune_collectionType_0_1st.jsonl


# Create JSONL files for fine-tuning - Test

In [9]:
import os
import json
import pandas as pd

# Input and output locations used by the rest of this cell
splits_dir = 'splits'  # folder holding the CSV split files
parquet_path = 'pypdfextraction/extracted_text.parquet'  # extracted PDF text
output_dir = 'finetune_data'  # destination folder for the JSONL files

# The splits folder and at least one CSV inside it are mandatory: stop early otherwise
if not os.path.exists(splits_dir):
    raise FileNotFoundError(f"The splits directory '{splits_dir}' does not exist.")
csv_files = [entry for entry in os.listdir(splits_dir) if entry.endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in the splits directory '{splits_dir}'.")
print(f"Found {len(csv_files)} CSV files in '{splits_dir}'.")

# The parquet file with the extracted text is mandatory as well
if not os.path.isfile(parquet_path):
    raise FileNotFoundError(f"The parquet file '{parquet_path}' does not exist.")
print(f"Parquet file '{parquet_path}' exists.")

# Report the state of the output folder, creating it when it is missing
if os.path.exists(output_dir):
    print(f"The output directory '{output_dir}' already exists.")
    if os.listdir(output_dir):
        print(f"The output directory '{output_dir}' contains files or subdirectories.")
    else:
        print(f"The output directory '{output_dir}' is empty.")
else:
    print(f"The output directory '{output_dir}' does not exist. Creating it now.")
    os.makedirs(output_dir)

# Idempotent safety net: guarantees the folder is there before anything is written
os.makedirs(output_dir, exist_ok=True)

# Load the extracted text DataFrame (the 'filename' column is read here)
extracted_text_df = pd.read_parquet(parquet_path)

# Derive the 'node' key from each PDF filename so it can be matched against
# the 'node' column of the split CSVs:
#   * strip only a trailing '.pdf' (an anchored pattern, so a '.pdf' that
#     appears in the middle of a name is left intact);
#   * replace '@' with '/' as a literal substitution (regex=False keeps the
#     behaviour independent of the pandas version's default).
extracted_text_df['node'] = (
    extracted_text_df['filename']
    .str.replace(r'\.pdf$', '', regex=True)
    .str.replace('@', '/', regex=False)
)

# Set of node identifiers in extracted_text_df for quick lookup
available_nodes = set(extracted_text_df['node'])

# Attribute name -> JSON key used in the answers and system prompt sent to the model.
# All prompts share one template, so they are generated instead of spelled out.
def _extraction_prompt(json_key, plural_label, example_label):
    """Return the system prompt asking for one attribute as a JSON list of objects."""
    example_items = ", ".join(
        '{"%s": "Example %s %d"}' % (json_key, example_label, number)
        for number in (1, 2)
    )
    return (
        'You are a chemist expert in natural products. '
        f'Extract the {plural_label} from the following text. '
        f'Provide the answers in JSON format: [{example_items}]. '
        f'If the {plural_label} are not specified, leave it empty like "".'
    )


attributes_info = {
    attribute: {
        'json_key': json_key,
        'prompt': _extraction_prompt(json_key, plural_label, example_label),
    }
    for attribute, json_key, plural_label, example_label in (
        ('name', 'compoundName', 'compound names', 'Compound Name'),
        ('bioActivity', 'bioactivity', 'bioactivities', 'Bioactivity'),
        ('collectionSpecie', 'species', 'species', 'Species'),
        ('collectionSite', 'collectionSite', 'collection sites', 'Collection Site'),
        ('collectionType', 'isolationType', 'isolation types', 'Isolation Type'),
    )
}

# Iteration and split to use
iteration = '0'
split = '1st'

# One usable text per node:
#   * rows without text are dropped, otherwise NaN would be written as the
#     user message content;
#   * duplicate rows for the same node are collapsed to the first one,
#     otherwise the merge below would repeat every expected answer once per
#     duplicate.
text_by_node = (
    extracted_text_df[['node', 'text']]
    .dropna(subset=['text'])
    .drop_duplicates(subset='node')
)

# For every attribute, write one chat-format test example per node:
# system prompt, extracted text as user message, expected answers as assistant message.
for attribute, info in attributes_info.items():
    # Construct the filename for the CSV file
    csv_filename = f'test_doi_{attribute}_{iteration}_{split}.csv'
    csv_path = os.path.join(splits_dir, csv_filename)

    # Check if the CSV file exists
    if not os.path.exists(csv_path):
        print(f"File {csv_path} does not exist. Skipping attribute '{attribute}'.")
        continue

    # Load the CSV file ('node' and 'neighbor' columns are read below)
    csv_df = pd.read_csv(csv_path)

    # Bring the CSV 'node' values to the same format as extracted_text_df['node']
    csv_df['node'] = csv_df['node'].str.replace('@', '/', regex=False)

    # Filter out rows where the node doesn't have a corresponding PDF text
    csv_df = csv_df[csv_df['node'].isin(available_nodes)]

    if csv_df.empty:
        print(f"No matching nodes found for attribute '{attribute}'. Skipping.")
        continue

    # Attach the text to every (node, neighbor) row; inner join on 'node'
    merged_df = pd.merge(csv_df, text_by_node, on='node')

    finetune_data = []

    # System prompt and answer key for the current attribute
    system_prompt = info['prompt']
    json_key = info['json_key']

    # Group by 'node' so that all expected values of a node end up in one example
    for node, group in merged_df.groupby('node'):
        user_input = group['text'].iloc[0]  # Text extracted from the PDF

        # Collect all expected outputs for the current node
        expected_values = group['neighbor'].dropna().tolist()

        if expected_values:
            expected_output = [{json_key: value} for value in expected_values]
        else:
            # No expected value: a single entry holding an empty string
            expected_output = [{json_key: ""}]

        # The assistant content is itself a JSON-encoded string
        finetune_data.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
                {"role": "assistant", "content": json.dumps(expected_output, ensure_ascii=False)},
            ]
        })

    if not finetune_data:
        print(f"No data to write for attribute '{attribute}'. Skipping.")
        continue

    # Save to JSONL file: one JSON object per line
    output_filename = f'finetune_test_{attribute}_{iteration}_{split}.jsonl'
    output_path = os.path.join(output_dir, output_filename)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        for entry in finetune_data:
            json_line = json.dumps(entry, ensure_ascii=False)
            f_out.write(json_line + '\n')

    print(f"Saved fine-tune data for attribute '{attribute}' to {output_path}")


Found 400 CSV files in 'splits'.
Parquet file 'pypdfextraction/extracted_text.parquet' exists.
The output directory 'finetune_data' already exists.
The output directory 'finetune_data' contains files or subdirectories.
Saved fine-tune data for attribute 'name' to finetune_data/finetune_test_name_0_1st.jsonl
Saved fine-tune data for attribute 'bioActivity' to finetune_data/finetune_test_bioActivity_0_1st.jsonl
Saved fine-tune data for attribute 'collectionSpecie' to finetune_data/finetune_test_collectionSpecie_0_1st.jsonl
Saved fine-tune data for attribute 'collectionSite' to finetune_data/finetune_test_collectionSite_0_1st.jsonl
Saved fine-tune data for attribute 'collectionType' to finetune_data/finetune_test_collectionType_0_1st.jsonl


# Validate JSONL files

In [11]:
import os
import json
from jsonschema import validate
from jsonschema.exceptions import ValidationError

# JSON Schema for one line of a fine-tuning file.
# Each message needs a string 'content' and a 'role' from a fixed set ...
_message_schema = {
    "type": "object",
    "properties": {
        "role": {"type": "string", "enum": ["system", "user", "assistant"]},
        "content": {"type": "string"},
    },
    "required": ["role", "content"],
}

# ... and every example is an object with exactly three such messages.
schema = {
    "type": "object",
    "properties": {
        "messages": {
            "type": "array",
            "items": _message_schema,
            "minItems": 3,
            "maxItems": 3,
        },
    },
    "required": ["messages"],
}

def _find_message_problem(messages, line_number):
    """Return the first custom-rule violation in *messages*, or None when all rules hold.

    Called only after the schema check has accepted the example, so *messages*
    holds three dicts that each have string 'role' and 'content' entries.
    """
    system_message, user_message, assistant_message = messages[0], messages[1], messages[2]

    if system_message['role'] != 'system':
        return f"First message must have role 'system' on line {line_number}"
    if "You are a chemist expert in natural products." not in system_message['content']:
        return f"Incorrect system prompt on line {line_number}"
    if user_message['role'] != 'user':
        return f"Second message must have role 'user' on line {line_number}"
    if assistant_message['role'] != 'assistant':
        return f"Third message must have role 'assistant' on line {line_number}"

    # The assistant content is itself JSON text: a list of single-key objects
    try:
        assistant_content = json.loads(assistant_message['content'])
    except json.JSONDecodeError:
        return f"Assistant content is not valid JSON on line {line_number}"
    if not isinstance(assistant_content, list):
        return f"Assistant content must be a list on line {line_number}"
    for item in assistant_content:
        if not (isinstance(item, dict) and len(item) == 1):
            return (
                "Each item in assistant content must be a dictionary with exactly "
                f"one key-value pair on line {line_number}"
            )
    return None


def validate_jsonl(file_path):
    """Validate every line of a fine-tuning JSONL file and print a report.

    Each line must parse as JSON, satisfy the module-level ``schema`` and pass
    the custom message rules (role order, system prompt marker, shape of the
    assistant answer). All problems found are collected and printed; nothing
    is raised for invalid content and nothing is returned.

    The custom rules use explicit checks instead of ``assert`` so they are
    still enforced when Python runs with ``-O``. Unparseable assistant content
    is reported as a custom validation error, so it can be told apart from a
    line that is not JSON at all.

    Args:
        file_path: Path of the JSONL file to check.
    """
    print(f"Validating {file_path}")
    errors = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            try:
                data = json.loads(line)
                # Validate against schema
                validate(instance=data, schema=schema)
            except json.JSONDecodeError:
                errors.append(f"Invalid JSON on line {line_number}")
                continue
            except ValidationError as e:
                errors.append(f"Schema validation error on line {line_number}: {e}")
                continue

            # Additional custom checks; only the first violation per line is reported
            problem = _find_message_problem(data['messages'], line_number)
            if problem is not None:
                errors.append(f"Custom validation error on line {line_number}: {problem}")

    if errors:
        print(f"Errors found in {file_path}:")
        for error in errors:
            print(f"  - {error}")
    else:
        print(f"No errors found in {file_path}")
    print()  # Add a blank line for readability

# Directory containing the JSONL files
finetune_data_dir = 'finetune_data'

# Run the validator over every '.jsonl' file found in that directory
jsonl_paths = (
    os.path.join(finetune_data_dir, entry)
    for entry in os.listdir(finetune_data_dir)
    if entry.endswith('.jsonl')
)
for file_path in jsonl_paths:
    validate_jsonl(file_path)

print("Validation complete.")

Validating finetune_data/finetune_test_collectionType_0_1st.jsonl
No errors found in finetune_data/finetune_test_collectionType_0_1st.jsonl

Validating finetune_data/finetune_test_collectionSpecie_0_1st.jsonl
No errors found in finetune_data/finetune_test_collectionSpecie_0_1st.jsonl

Validating finetune_data/finetune_test_bioActivity_0_1st.jsonl
No errors found in finetune_data/finetune_test_bioActivity_0_1st.jsonl

Validating finetune_data/finetune_test_collectionSite_0_1st.jsonl
No errors found in finetune_data/finetune_test_collectionSite_0_1st.jsonl

Validating finetune_data/finetune_collectionSite_0_1st.jsonl
No errors found in finetune_data/finetune_collectionSite_0_1st.jsonl

Validating finetune_data/finetune_bioActivity_0_1st.jsonl
No errors found in finetune_data/finetune_bioActivity_0_1st.jsonl

Validating finetune_data/finetune_collectionSpecie_0_1st.jsonl
No errors found in finetune_data/finetune_collectionSpecie_0_1st.jsonl

Validating finetune_data/finetune_name_0_1st.js

# Calculate token count for the dataset

In [6]:
import json
import tiktoken
import numpy as np
from collections import defaultdict
import os

# Load the dataset: one parsed JSON object per line of every '.jsonl' file
# NOTE(review): every '.jsonl' file in the directory is loaded; if held-out
# test files are stored there too they are included in the counts below —
# confirm that is intended.
finetune_data_dir = 'finetune_data'
dataset = []
for filename in os.listdir(finetune_data_dir):
    if not filename.endswith('.jsonl'):
        continue
    file_path = os.path.join(finetune_data_dir, filename)
    with open(file_path, 'r', encoding='utf-8') as f:
        # Parse the whole file first so a bad line adds nothing from this file
        parsed_lines = list(map(json.loads, f))
    dataset += parsed_lines

print("Num examples:", len(dataset))
print("First example:")
first_example = dataset[0]
for message in first_example["messages"]:
    print(message)

# Format error checks: count, per category, how often the examples deviate
# from the expected chat format
allowed_message_keys = ("role", "content", "name", "function_call", "weight")
known_roles = ("system", "user", "assistant", "function")
format_errors = defaultdict(int)

for ex in dataset:
    # An example must be a dict carrying a non-empty "messages" list
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        if not all(key in allowed_message_keys for key in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role") not in known_roles:
            format_errors["unrecognized_role"] += 1

        # Content must be a string, and non-empty unless a function call is present
        content = message.get("content")
        function_call = message.get("function_call")
        if not ((content or function_call) and isinstance(content, str)):
            format_errors["missing_content"] += 1

    # Every example needs at least one assistant message
    if all(message.get("role") != "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for category, count in format_errors.items():
        print(f"{category}: {count}")

# Token counting utilities
# Module-level tiktoken encoding ("cl100k_base") that the token-counting
# helpers in this cell read as a global.
encoding = tiktoken.get_encoding("cl100k_base")

def num_tokens_from_messages(messages, tokens_per_message=3, tokens_per_name=1):
    """Return the token count of a conversation.

    Every message contributes ``tokens_per_message`` plus the encoded length
    of each of its values; a message that has a "name" key contributes
    ``tokens_per_name`` on top. A constant 3 is added once per conversation.
    Encoding uses the module-level ``encoding``.
    """
    total = 3  # constant added once per conversation
    for message in messages:
        total += tokens_per_message
        total += sum(len(encoding.encode(value)) for value in message.values())
        if "name" in message:
            total += tokens_per_name
    return total

def num_assistant_tokens_from_messages(messages):
    """Return the summed encoded length of the content of all assistant messages."""
    return sum(
        len(encoding.encode(message["content"]))
        for message in messages
        if message["role"] == "assistant"
    )

def print_distribution(values, name):
    """Print min/max, mean/median and the 5th/95th percentiles of *values*.

    *name* is only used in the heading line. Nothing is returned.
    """
    print(f"\n#### Distribution of {name}:")

    lowest, highest = min(values), max(values)
    print(f"min / max: {lowest}, {highest}")

    mean, median = np.mean(values), np.median(values)
    print(f"mean / median: {mean:.2f}, {median:.2f}")

    p5 = np.quantile(values, 0.05)
    p95 = np.quantile(values, 0.95)
    print(f"p5 / p95: {p5:.2f}, {p95:.2f}")

# Data warnings and token counts: per-example message count, total tokens and
# assistant tokens, plus how many examples lack a system or a user message
n_missing_system = 0
n_missing_user = 0
n_messages = []
convo_lens = []
assistant_message_lens = []

for ex in dataset:
    messages = ex["messages"]

    if all(message["role"] != "system" for message in messages):
        n_missing_system += 1
    if all(message["role"] != "user" for message in messages):
        n_missing_user += 1

    n_messages.append(len(messages))
    convo_lens.append(num_tokens_from_messages(messages))
    assistant_message_lens.append(num_assistant_tokens_from_messages(messages))

print("Num examples missing system message:", n_missing_system)
print("Num examples missing user message:", n_missing_user)

for collected, label in (
    (n_messages, "num_messages_per_example"),
    (convo_lens, "num_total_tokens_per_example"),
    (assistant_message_lens, "num_assistant_tokens_per_example"),
):
    print_distribution(collected, label)

# Count the conversations that exceed the token limit quoted in the message below
n_too_long = sum(1 for length in convo_lens if length > 16385)
print(f"\n{n_too_long} examples may be over the 16,385 token limit, they will be truncated during fine-tuning")

# Cost estimation: derive the epoch count from the dataset size and report
# the number of tokens that would be billed
MAX_TOKENS_PER_EXAMPLE = 16385
TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_train_examples = len(dataset)
examples_at_target = n_train_examples * TARGET_EPOCHS

if examples_at_target < MIN_TARGET_EXAMPLES:
    # Small dataset: raise the epoch count, capped at MAX_DEFAULT_EPOCHS
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif examples_at_target > MAX_TARGET_EXAMPLES:
    # Large dataset: lower the epoch count, but never below MIN_DEFAULT_EPOCHS
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)
else:
    n_epochs = TARGET_EPOCHS

# Each conversation counts for at most MAX_TOKENS_PER_EXAMPLE tokens
n_billing_tokens_in_dataset = sum(
    min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens
)
total_billed_tokens = n_epochs * n_billing_tokens_in_dataset

print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{total_billed_tokens} tokens")
print("See https://openai.com/pricing to estimate total costs.")


Num examples: 145
First example:
{'role': 'system', 'content': 'You are a chemist expert in natural products. Extract the collection sites from the following text. Provide the answers in JSON format: [{"collectionSite": "Example Collection Site 1"}, {"collectionSite": "Example Collection Site 2"}]. If the collection sites are not specified, leave it empty like "".'}
{'role': 'user', 'content': 'Casearin X, Its Degradation Product and Other Clerodane Diterpenes from Leaves of Casearia sylvestris: Evaluation of Cytotoxicity against Normal and Tumor Human Cells by Andre´ Gonzaga dos Santosa), Paulo Michel Pinheiro Ferreirab), Gerardo Magela Vieira Ju´niora), Carla Cristina Pereza), Aristeu Gomes Tininisa), Geraldo Humberto Silvaa), Vanderlan da Silva Bolzania), Letcia Veras Costa-Lotufoc), Cla´udia do O´ Pessoac), and Alberto Jose´ Cavalheiro*a) a) Nu´cleo de Bioensaios, Biossntese e Ecofisiologia de Produtos Naturais, Instituto de Qumica, Sa˜o Paulo State University, CP 355, 14800-900, A