In [None]:
import re
import json
import ast
import chardet  # Detect encoding

# Locations of the raw log to scan and of the JSONL file to produce
input_file = '../scripts/outputs/smiles2gen.txt'
output_file = '../scripts/outputs/smiles2gen.jsonl'

# Sniff the encoding from the first 10000 bytes of the log so that the
# text-mode read further down can be opened with a matching codec.
with open(input_file, 'rb') as f:
    raw_data = f.read(10000)

# chardet.detect returns a dict; only its 'encoding' entry is used here.
detection = chardet.detect(raw_data)
detected_encoding = detection['encoding']

print(f"Detected encoding: {detected_encoding}")

def _find_matching_brace(line, start, honor_quotes):
    """
    Returns the index of the '}' that balances the '{' at ``start``, or -1.

    When ``honor_quotes`` is true, braces that appear inside single- or
    double-quoted string literals (backslash escapes respected) are ignored.
    """
    depth = 0
    open_quote = None  # Quote character of the string literal being scanned
    escaped = False    # True right after a backslash inside a string literal
    for i in range(start, len(line)):
        char = line[i]
        if open_quote is not None:
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == open_quote:
                open_quote = None
            continue
        if honor_quotes and char in ('"', "'"):
            open_quote = char
        elif char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return i
    return -1


def extract_json_object(line):
    """
    Extracts a JSON-like object from a line by finding balanced curly braces.
    Returns the substring starting at the first '{' up to the matching '}',
    or None when the line has no '{' or the braces never balance.

    Braces inside quoted string values are not counted, so a value such as
    'text with }' does not cut the object short. If the quotes on the line are
    unbalanced and the quote-aware scan finds no match, a plain brace count
    (quotes ignored) is used as a fallback.
    """
    start = line.find('{')
    if start == -1:
        return None

    end = _find_matching_brace(line, start, honor_quotes=True)
    if end == -1:
        # Stray quote on the line: fall back to counting every brace
        end = _find_matching_brace(line, start, honor_quotes=False)
    if end == -1:
        return None
    return line[start:end + 1]

def clean_entry(entry):
    """
    Cleans the extracted dictionary by dropping fields that cannot be written
    as JSON (e.g., PIL images).

    A field is kept only if its value is a str, int, float, list or dict AND
    json.dumps accepts the key/value pair. The second check catches lists or
    dicts that contain non-serializable items (sets, bytes, ...) as well as
    keys JSON cannot represent (e.g. tuples), which would otherwise make a
    later json.dumps of the whole entry raise.

    Returns a new dict; the input dict is not modified. Each dropped field is
    reported with print().
    """
    cleaned_entry = {}
    for key, value in entry.items():
        keep = isinstance(value, (str, int, float, list, dict))
        if keep:
            try:
                # Probe the pair exactly the way it will be serialized later
                json.dumps({key: value})
            except (TypeError, ValueError):
                keep = False
        if keep:
            cleaned_entry[key] = value
        else:
            print(f"Skipping non-serializable field: {key} = {type(value)}")
    return cleaned_entry

# Matches an unquoted default-style object repr such as
# <PIL.PngImagePlugin.PngImageFile image mode=RGB size=384x384 at 0x7FBD24FDB430>.
# Quotes are excluded so a match can never span a string boundary.
_OBJECT_REPR_RE = re.compile(r"<[^<>'\"]*\bat 0x[0-9A-Fa-f]+>")


def _literal_eval_or_none(text):
    """Returns ast.literal_eval(text), or None if the text is not a valid literal."""
    try:
        return ast.literal_eval(text)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # All five are documented failure modes of literal_eval on bad input
        return None


def safe_parse(entry_str):
    """
    Attempts to parse a dictionary from entry_str.
    - First tries ast.literal_eval for Python-style dicts
    - If that fails, retries with object reprs like <Foo ... at 0x...>
      replaced by None, so one unparseable field does not lose the whole entry
    - Falls back to json.loads on the text as-is (valid JSON may contain
      apostrophes that must not be rewritten)
    - Finally tries json.loads after replacing single quotes with double quotes

    Returns the result of clean_entry() for a parsed dict, or None when the
    text parses to something that is not a dict.
    Raises json.JSONDecodeError when every attempt fails.
    """
    parsed_entry = _literal_eval_or_none(entry_str)
    if not isinstance(parsed_entry, dict):
        sanitized = _OBJECT_REPR_RE.sub("None", entry_str)
        if sanitized != entry_str:
            parsed_entry = _literal_eval_or_none(sanitized)
    if isinstance(parsed_entry, dict):
        return clean_entry(parsed_entry)

    try:
        # Already-valid JSON (true/false/null, apostrophes inside strings)
        parsed_entry = json.loads(entry_str)
    except json.JSONDecodeError:
        parsed_entry = None
    if parsed_entry is None:
        # Last resort: convert single quotes to double quotes; a failure here
        # propagates to the caller as json.JSONDecodeError.
        parsed_entry = json.loads(entry_str.replace("'", "\""))

    if isinstance(parsed_entry, dict):
        return clean_entry(parsed_entry)
    return None

# Parsed, cleaned records in the order they appear in the log
json_entries = []

# Scan the log line by line, decoding with the sniffed encoding; undecodable
# bytes are replaced rather than aborting the read.
with open(input_file, 'r', encoding=detected_encoding, errors='replace') as f:
    for line in f:
        entry_str = extract_json_object(line)
        if entry_str is not None:
            try:
                entry = safe_parse(entry_str)
                if not entry:
                    # None or an empty dict: nothing worth exporting
                    print(f"Skipping invalid entry: {entry_str}")
                else:
                    json_entries.append(entry)
            except Exception as e:
                print(f"Skipping entry (failed to parse): {entry_str}\nError: {e}")

# Emit one JSON document per line; ensure_ascii=False keeps Unicode readable
with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(
        json.dumps(entry, ensure_ascii=False) + '\n'
        for entry in json_entries
    )

print(f"Extracted {len(json_entries)} JSON entries and saved them to {output_file}")


Detected encoding: Windows-1252
Skipping entry (failed to parse): {'image_id': 'US07321040-20080122-C00061', 'file_path': 'synthetic/indigo_resize/US07321040-20080122-C00061.png', 'SMILES': 'CC(C)C1=NN=C2C=CC(SC3=CC=CC=C3CNC(N)=O)=CN12', 'hydrogen_atom_count': 19.0, 'image': <PIL.PngImagePlugin.PngImageFile image mode=RGB size=384x384 at 0x7FBD24FDB430>}
Error: Expecting value: line 1 column 211 (char 210)
Skipping entry (failed to parse): {'loss': 1.6956, 'grad_norm': 62.95611572265625, 'learning_rate': 9.55002337540907e-07, 'reward': -0.8355000138282775, 'completion_length': 32.6, 'kl': 21.502279472351074, 'example_completion': 'Let me solve this step by step.\n<think> The figure presented is for a chemical structrue</think>\n\n<answer>C</anticlockwise></anticlockwiseO</anti-clockwiseT"></clockwise}
Error: Expecting ',' delimiter: line 1 column 356 (char 355)
Skipping invalid entry: {}
Skipping invalid entry: {}
Skipping invalid entry: {}
Skipping entry (failed to parse): {(CH3)}
Err