# Create JSONL files for fine-tuning - Train

In [3]:
import os
import json
import pandas as pd

# Locations of the inputs and of the generated fine-tuning files
splits_dir = 'splits/train'  # Folder holding the CSV split files
parquet_path = 'pypdfextraction/extracted_text.parquet'  # Parquet file with the extracted text
output_dir = 'finetune_data'  # Folder that receives the fine-tuning data

# --- Input checks: stop right away when something is missing ---
if not os.path.exists(splits_dir):
    raise FileNotFoundError(f"The splits directory '{splits_dir}' does not exist.")

# Keep only the '.csv' entries of the splits directory
csv_files = [entry for entry in os.listdir(splits_dir) if entry.endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in the splits directory '{splits_dir}'.")
print(f"Found {len(csv_files)} CSV files in '{splits_dir}'.")

if not os.path.isfile(parquet_path):
    raise FileNotFoundError(f"The parquet file '{parquet_path}' does not exist.")
print(f"Parquet file '{parquet_path}' exists.")

# --- Output directory: report its current state ---
if os.path.exists(output_dir):
    print(f"The output directory '{output_dir}' already exists.")
    if os.listdir(output_dir):
        print(f"The output directory '{output_dir}' contains files or subdirectories.")
    else:
        print(f"The output directory '{output_dir}' is empty.")
else:
    print(f"The output directory '{output_dir}' does not exist. Creating it now.")

# Creates the directory when it is absent; does nothing when it is already there
os.makedirs(output_dir, exist_ok=True)

# Load the extracted text DataFrame
extracted_text_df = pd.read_parquet(parquet_path)

# Derive a 'node' column from 'filename' so it can be matched against the
# 'node' column of the split CSVs: strip the '.pdf' extension, then turn '@'
# into '/'.
# The pattern is anchored at the end of the string so that only the extension
# is removed, not a '.pdf' that happens to occur inside the name.
extracted_text_df['node'] = extracted_text_df['filename'].str.replace(r'\.pdf$', '', regex=True)
# regex=False: '@' is meant literally, and stating it explicitly keeps the
# result independent of the default of the installed pandas version.
extracted_text_df['node'] = extracted_text_df['node'].str.replace('@', '/', regex=False)

# Set of node identifiers that have extracted text, for quick membership tests
available_nodes = set(extracted_text_df['node'])

# Attribute name (as used in the split CSV filenames) -> JSON key of the
# expected answer and system prompt for that attribute.
# All prompts share one wording; only the label, the JSON key and the example
# value differ, so every prompt is rendered from a single template.
_PROMPT_TEMPLATE = (
    "You are a chemist expert in natural products. "
    "Extract the {label} from the following text. "
    "Provide the answer in JSON format: [{{\"{json_key}\": \"{example}\"}}]. "
    "If the {label} is not specified, leave it empty like \"\"."
)

# (attribute, JSON key, label used in the prompt, example value in the prompt)
_ATTRIBUTE_SPECS = [
    ('name', 'compoundName', 'compound name', 'Example Compound Name'),
    ('bioActivity', 'bioactivity', 'bioactivity', 'Example Bioactivity'),
    ('collectionSpecie', 'species', 'species', 'Example Species'),
    ('collectionSite', 'collectionSite', 'collection site', 'Example Collection Site'),
    ('collectionType', 'isolationType', 'isolation type', 'Example Isolation Type'),
]

attributes_info = {
    attr_name: {
        'json_key': key,
        'prompt': _PROMPT_TEMPLATE.format(label=label, json_key=key, example=example),
    }
    for attr_name, key, label, example in _ATTRIBUTE_SPECS
}

# Iteration and split to use; both are part of the CSV and JSONL filenames
iteration = '0'
split = '1st'

# For every attribute: read its train split CSV, join it with the extracted
# text on 'node', and write one chat-style example per row to a JSONL file.
for attribute, info in attributes_info.items():
    # Construct the filename for the CSV file
    csv_filename = f'train_doi_{attribute}_{iteration}_{split}.csv'
    csv_path = os.path.join(splits_dir, csv_filename)

    # Check if the CSV file exists
    if not os.path.exists(csv_path):
        print(f"File {csv_path} does not exist. Skipping attribute '{attribute}'.")
        continue

    # Load the CSV file
    csv_df = pd.read_csv(csv_path)

    # Bring 'node' into the same format as extracted_text_df['node'].
    # regex=False: '@' is meant literally, independent of the pandas default.
    csv_df['node'] = csv_df['node'].str.replace('@', '/', regex=False)

    # Filter out rows where the node doesn't have a corresponding PDF text
    csv_df = csv_df[csv_df['node'].isin(available_nodes)]

    if csv_df.empty:
        print(f"No matching nodes found for attribute '{attribute}'. Skipping.")
        continue

    # Merge with the extracted text DataFrame on the 'node'
    merged_df = pd.merge(csv_df, extracted_text_df[['node', 'text']], on='node')

    # A row without text would produce a user message whose content is
    # NaN/null instead of a string, which is not a usable training example.
    has_text = merged_df['text'].notna()
    if not has_text.all():
        print(f"Skipping {int((~has_text).sum())} row(s) without extracted text for attribute '{attribute}'.")
        merged_df = merged_df[has_text]

    # System prompt and answer key for the current attribute
    system_prompt = info['prompt']
    json_key = info['json_key']

    finetune_data = []
    for user_input, neighbor in zip(merged_df['text'], merged_df['neighbor']):
        # The expected answer is the 'neighbor' value; a missing value becomes ""
        expected_output = {json_key: neighbor if pd.notna(neighbor) else ""}

        # One training example: system prompt, PDF text, expected JSON answer
        finetune_data.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
                {"role": "assistant", "content": json.dumps([expected_output], ensure_ascii=False)}
            ]
        })

    if not finetune_data:
        print(f"No data to write for attribute '{attribute}'. Skipping.")
        continue

    # Save to JSONL file: one JSON object per line
    output_filename = f'finetune_{attribute}_{iteration}_{split}.jsonl'
    output_path = os.path.join(output_dir, output_filename)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        f_out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in finetune_data
        )

    print(f"Saved fine-tune data for attribute '{attribute}' to {output_path}")


Found 200 CSV files in 'splits/train'.
Parquet file 'pypdfextraction/extracted_text.parquet' exists.
The output directory 'finetune_data' already exists.
The output directory 'finetune_data' is empty.
Saved fine-tune data for attribute 'name' to finetune_data\finetune_name_0_1st.jsonl
Saved fine-tune data for attribute 'bioActivity' to finetune_data\finetune_bioActivity_0_1st.jsonl
Saved fine-tune data for attribute 'collectionSpecie' to finetune_data\finetune_collectionSpecie_0_1st.jsonl
Saved fine-tune data for attribute 'collectionSite' to finetune_data\finetune_collectionSite_0_1st.jsonl
Saved fine-tune data for attribute 'collectionType' to finetune_data\finetune_collectionType_0_1st.jsonl


# Create JSONL files for fine-tuning - Test

In [1]:
import os
import json
import pandas as pd

# Locations of the inputs and of the generated fine-tuning files
splits_dir = 'splits'  # Folder holding the CSV split files
parquet_path = 'pypdfextraction/extracted_text.parquet'  # Parquet file with the extracted text
output_dir = 'finetune_data'  # Folder that receives the fine-tuning data

# --- Input checks: stop right away when something is missing ---
if not os.path.exists(splits_dir):
    raise FileNotFoundError(f"The splits directory '{splits_dir}' does not exist.")

# Keep only the '.csv' entries of the splits directory
csv_files = [entry for entry in os.listdir(splits_dir) if entry.endswith('.csv')]
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in the splits directory '{splits_dir}'.")
print(f"Found {len(csv_files)} CSV files in '{splits_dir}'.")

if not os.path.isfile(parquet_path):
    raise FileNotFoundError(f"The parquet file '{parquet_path}' does not exist.")
print(f"Parquet file '{parquet_path}' exists.")

# --- Output directory: report its current state ---
if os.path.exists(output_dir):
    print(f"The output directory '{output_dir}' already exists.")
    if os.listdir(output_dir):
        print(f"The output directory '{output_dir}' contains files or subdirectories.")
    else:
        print(f"The output directory '{output_dir}' is empty.")
else:
    print(f"The output directory '{output_dir}' does not exist. Creating it now.")

# Creates the directory when it is absent; does nothing when it is already there
os.makedirs(output_dir, exist_ok=True)

# Load the extracted text DataFrame
extracted_text_df = pd.read_parquet(parquet_path)

# Derive a 'node' column from 'filename' so it can be matched against the
# 'node' column of the split CSVs: strip the '.pdf' extension, then turn '@'
# into '/'.
# The pattern is anchored at the end of the string so that only the extension
# is removed, not a '.pdf' that happens to occur inside the name.
extracted_text_df['node'] = extracted_text_df['filename'].str.replace(r'\.pdf$', '', regex=True)
# regex=False: '@' is meant literally, and stating it explicitly keeps the
# result independent of the default of the installed pandas version.
extracted_text_df['node'] = extracted_text_df['node'].str.replace('@', '/', regex=False)

# Set of node identifiers that have extracted text, for quick membership tests
available_nodes = set(extracted_text_df['node'])

def _build_prompt(label, json_key, example):
    """Return the system prompt asking for *label*, answered under *json_key*."""
    return (
        "You are a chemist expert in natural products. "
        f"Extract the {label} from the following text. "
        f"Provide the answer in JSON format: [{{\"{json_key}\": \"{example}\"}}]. "
        f"If the {label} is not specified, leave it empty like \"\"."
    )


# Attribute name (as used in the split CSV filenames) -> JSON key of the
# expected answer and system prompt for that attribute
attributes_info = {
    'name': {
        'json_key': 'compoundName',
        'prompt': _build_prompt('compound name', 'compoundName', 'Example Compound Name'),
    },
    'bioActivity': {
        'json_key': 'bioactivity',
        'prompt': _build_prompt('bioactivity', 'bioactivity', 'Example Bioactivity'),
    },
    'collectionSpecie': {
        'json_key': 'species',
        'prompt': _build_prompt('species', 'species', 'Example Species'),
    },
    'collectionSite': {
        'json_key': 'collectionSite',
        'prompt': _build_prompt('collection site', 'collectionSite', 'Example Collection Site'),
    },
    'collectionType': {
        'json_key': 'isolationType',
        'prompt': _build_prompt('isolation type', 'isolationType', 'Example Isolation Type'),
    },
}

# Iteration and split to use; both are part of the CSV and JSONL filenames
iteration = '0'
split = '1st'

# For every attribute: read its test split CSV, join it with the extracted
# text on 'node', and write one chat-style example per row to a JSONL file.
for attribute, info in attributes_info.items():
    # Construct the filename for the CSV file
    csv_filename = f'test_doi_{attribute}_{iteration}_{split}.csv'
    csv_path = os.path.join(splits_dir, csv_filename)

    # Check if the CSV file exists
    if not os.path.exists(csv_path):
        print(f"File {csv_path} does not exist. Skipping attribute '{attribute}'.")
        continue

    # Load the CSV file
    csv_df = pd.read_csv(csv_path)

    # Bring 'node' into the same format as extracted_text_df['node'].
    # regex=False: '@' is meant literally, independent of the pandas default.
    csv_df['node'] = csv_df['node'].str.replace('@', '/', regex=False)

    # Filter out rows where the node doesn't have a corresponding PDF text
    csv_df = csv_df[csv_df['node'].isin(available_nodes)]

    if csv_df.empty:
        print(f"No matching nodes found for attribute '{attribute}'. Skipping.")
        continue

    # Merge with the extracted text DataFrame on the 'node'
    merged_df = pd.merge(csv_df, extracted_text_df[['node', 'text']], on='node')

    # A row without text would produce a user message whose content is
    # NaN/null instead of a string, which is not a usable example.
    has_text = merged_df['text'].notna()
    if not has_text.all():
        print(f"Skipping {int((~has_text).sum())} row(s) without extracted text for attribute '{attribute}'.")
        merged_df = merged_df[has_text]

    # System prompt and answer key for the current attribute
    system_prompt = info['prompt']
    json_key = info['json_key']

    finetune_data = []
    for user_input, neighbor in zip(merged_df['text'], merged_df['neighbor']):
        # The expected answer is the 'neighbor' value; a missing value becomes ""
        expected_output = {json_key: neighbor if pd.notna(neighbor) else ""}

        # One example: system prompt, PDF text, expected JSON answer
        finetune_data.append({
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_input},
                {"role": "assistant", "content": json.dumps([expected_output], ensure_ascii=False)}
            ]
        })

    if not finetune_data:
        print(f"No data to write for attribute '{attribute}'. Skipping.")
        continue

    # Save to JSONL file: one JSON object per line
    output_filename = f'finetune_test_doi_{attribute}_{iteration}_{split}.jsonl'
    output_path = os.path.join(output_dir, output_filename)
    with open(output_path, 'w', encoding='utf-8') as f_out:
        f_out.writelines(
            json.dumps(entry, ensure_ascii=False) + '\n' for entry in finetune_data
        )

    print(f"Saved fine-tune test data for attribute '{attribute}' to {output_path}")


Found 400 CSV files in 'splits'.
Parquet file 'pypdfextraction/extracted_text.parquet' exists.
The output directory 'finetune_data' already exists.
The output directory 'finetune_data' contains files or subdirectories.
Saved fine-tune test data for attribute 'name' to finetune_data/finetune_test_doi_name_0_1st.jsonl
Saved fine-tune test data for attribute 'bioActivity' to finetune_data/finetune_test_doi_bioActivity_0_1st.jsonl
Saved fine-tune test data for attribute 'collectionSpecie' to finetune_data/finetune_test_doi_collectionSpecie_0_1st.jsonl
Saved fine-tune test data for attribute 'collectionSite' to finetune_data/finetune_test_doi_collectionSite_0_1st.jsonl
Saved fine-tune test data for attribute 'collectionType' to finetune_data/finetune_test_doi_collectionType_0_1st.jsonl


# Validate JSONL files

In [2]:
import os
import json
from jsonschema import validate
from jsonschema.exceptions import ValidationError

# JSON Schema for one line of a fine-tuning JSONL file, assembled bottom-up.

# A single chat message: a known role plus string content, both mandatory
_message_schema = dict(
    type="object",
    properties=dict(
        role=dict(type="string", enum=["system", "user", "assistant"]),
        content=dict(type="string"),
    ),
    required=["role", "content"],
)

# The conversation: exactly three messages
_messages_schema = dict(
    type="array",
    items=_message_schema,
    minItems=3,
    maxItems=3,
)

# The top-level object: must carry the 'messages' array
schema = dict(
    type="object",
    properties=dict(messages=_messages_schema),
    required=["messages"],
)

def _find_custom_violation(messages, line_number):
    """Return the first custom-rule violation for *messages*, or None.

    *messages* must already have passed the schema check, i.e. it is a list of
    three objects that each have string 'role' and 'content' entries.

    Raises:
        json.JSONDecodeError: if the assistant content is not valid JSON.
    """
    # Check system message
    if messages[0]['role'] != 'system':
        return f"First message must have role 'system' on line {line_number}"
    if "You are a chemist expert in natural products." not in messages[0]['content']:
        return f"Incorrect system prompt on line {line_number}"

    # Check user message
    if messages[1]['role'] != 'user':
        return f"Second message must have role 'user' on line {line_number}"

    # Check assistant message
    if messages[2]['role'] != 'assistant':
        return f"Third message must have role 'assistant' on line {line_number}"

    assistant_content = json.loads(messages[2]['content'])
    if not isinstance(assistant_content, list) or len(assistant_content) != 1:
        return f"Assistant content must be a list with one item on line {line_number}"

    # The single item has to be an object. Without the type check a number or
    # null would raise TypeError in len(), and a one-character string or a
    # one-element list would be accepted by mistake.
    answer = assistant_content[0]
    if not isinstance(answer, dict) or len(answer) != 1:
        return f"Assistant content must have exactly one key-value pair on line {line_number}"

    return None


def validate_jsonl(file_path):
    """Validate every line of a fine-tuning JSONL file and print a report.

    Each line must be valid JSON, conform to the module-level ``schema`` and
    satisfy the custom rules: roles in the order system, user, assistant; the
    expected system prompt; assistant content that is a JSON list holding one
    single-key object. At most one error is recorded per line.

    Explicit checks are used instead of ``assert`` statements, because asserts
    are removed when Python runs with ``-O`` and the checks would then be
    skipped silently.

    Args:
        file_path: Path of the JSONL file to validate.

    Returns:
        The list of error messages that were printed; empty if the file is
        valid.
    """
    print(f"Validating {file_path}")
    errors = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, 1):
            try:
                data = json.loads(line)

                # Validate against schema
                validate(instance=data, schema=schema)

                # Additional custom checks
                violation = _find_custom_violation(data['messages'], line_number)
            except json.JSONDecodeError:
                # Raised for the line itself or for the assistant content
                errors.append(f"Invalid JSON on line {line_number}")
            except ValidationError as e:
                errors.append(f"Schema validation error on line {line_number}: {e}")
            else:
                if violation is not None:
                    errors.append(f"Custom validation error on line {line_number}: {violation}")

    if errors:
        print(f"Errors found in {file_path}:")
        for error in errors:
            print(f"  - {error}")
    else:
        print(f"No errors found in {file_path}")
    print()  # Add a blank line for readability

    return errors

# Folder whose JSONL files get validated
finetune_data_dir = 'finetune_data'

# Walk the folder listing and run the validator on every '.jsonl' entry
for filename in os.listdir(finetune_data_dir):
    if not filename.endswith('.jsonl'):
        continue  # Not a JSONL file; nothing to validate
    file_path = os.path.join(finetune_data_dir, filename)
    validate_jsonl(file_path)

print("Validation complete.")

Validating finetune_data/finetune_test_doi_collectionSpecie_0_1st.jsonl
No errors found in finetune_data/finetune_test_doi_collectionSpecie_0_1st.jsonl

Validating finetune_data/finetune_collectionSite_0_1st.jsonl
No errors found in finetune_data/finetune_collectionSite_0_1st.jsonl

Validating finetune_data/finetune_bioActivity_0_1st.jsonl
No errors found in finetune_data/finetune_bioActivity_0_1st.jsonl

Validating finetune_data/finetune_collectionSpecie_0_1st.jsonl
No errors found in finetune_data/finetune_collectionSpecie_0_1st.jsonl

Validating finetune_data/finetune_name_0_1st.jsonl
No errors found in finetune_data/finetune_name_0_1st.jsonl

Validating finetune_data/finetune_collectionType_0_1st.jsonl
No errors found in finetune_data/finetune_collectionType_0_1st.jsonl

Validating finetune_data/finetune_test_doi_bioActivity_0_1st.jsonl
No errors found in finetune_data/finetune_test_doi_bioActivity_0_1st.jsonl

Validating finetune_data/finetune_test_doi_collectionType_0_1st.jsonl
N