In [1]:
from genson import SchemaBuilder

# Name of the export being analysed; compared against 'claude' further down
# to choose between the two seed schemas.
DATASET = 'claude'

# Seed schema for the ChatGPT export: an array of objects whose `mapping`
# member is keyed by node id rather than by fixed property names.
seed_schema_chatgpt = dict(
    type='array',
    items=dict(
        type='object',
        properties=dict(
            mapping=dict(
                type='object',
                patternProperties={
                    # Keys shaped like a UUID. The None placeholder leaves the
                    # sub-schema for GenSON to fill in.
                    r'^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$': None,
                    # Keys starting with "client-created-" (e.g. the
                    # client-created root node).
                    r'^client-created-': None,
                },
            ),
        ),
    ),
)

# Regex used for every `uuid` member of the Claude export schema.
_UUID_PATTERN = r'^[a-fA-F0-9]{8}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{4}-[a-fA-F0-9]{12}$'


def _uuid_field():
    """Return a fresh schema fragment for a string matching the UUID regex."""
    return {'type': 'string', 'pattern': _UUID_PATTERN}


def _string_field():
    """Return a fresh schema fragment for a plain string."""
    return {'type': 'string'}


def _timestamp_field():
    """Return a fresh schema fragment for a `date-time` formatted string."""
    return {'type': 'string', 'format': 'date-time'}


def _array_of_objects(properties):
    """Wrap `properties` as the schema of an array whose items are objects."""
    return {
        'type': 'array',
        'items': {
            'type': 'object',
            'properties': properties,
        },
    }


# Seed schema for the Claude export: an array of conversation objects, each
# carrying an `account` object and a list of `chat_messages`; every message
# in turn has `content` and `attachments` lists of objects.
# Each helper call yields a new dict, so no two members share an object.
seed_schema_claude = _array_of_objects({
    'uuid': _uuid_field(),
    'name': _string_field(),
    'created_at': _timestamp_field(),
    'updated_at': _timestamp_field(),
    'account': {
        'type': 'object',
        'properties': {
            'uuid': _uuid_field(),
        },
    },
    'chat_messages': _array_of_objects({
        'uuid': _uuid_field(),
        'text': _string_field(),
        'content': _array_of_objects({
            'start_timestamp': _timestamp_field(),
            'stop_timestamp': _timestamp_field(),
            'type': _string_field(),
            'text': _string_field(),
            # Item schema intentionally left unspecified.
            'citations': {'type': 'array'},
        }),
        'sender': _string_field(),
        'created_at': _timestamp_field(),
        'updated_at': _timestamp_field(),
        'attachments': _array_of_objects({
            'file_name': _string_field(),
            'file_size': {'type': 'integer'},
            'file_type': _string_field(),
            'extracted_content': _string_field(),
        }),
    }),
})

# Choose the seed that matches DATASET: the Claude seed for 'claude', the
# ChatGPT seed for any other value.
seed_schema = seed_schema_claude if DATASET == 'claude' else seed_schema_chatgpt

# Prime a new GenSON builder with the chosen seed.
builder = SchemaBuilder()
builder.add_schema(seed_schema)

In [2]:
import sys
from pathlib import Path
import json
# Absolute path of the `src` directory that sits next to the current working
# directory's parent, i.e. <cwd>/../src.
PATH = str(Path.cwd().parent.joinpath('src').absolute())
print(PATH)

# Append it to the import search path once; re-running the cell must not add
# a duplicate entry.
if PATH not in sys.path:
    sys.path.append(PATH)

# Same value as the DATASET assignment in the first cell; keep the two in sync.
DATASET = 'claude'

/Users/dmarx/proj/chat2obs/src


In [3]:
# Export directories, one per supported dataset.
root_chatgpt = "../data/ingestion/chatgpt/a40ff5f79c1b3edd3c366f0f628fb79170bae83ecf3a1758b5b258c71f843f53-2025-06-05-03-28-15-df2ed357a4e64443bf464446686c9692/"
# NOTE(review): machine-specific absolute path; will not resolve on another
# machine. Consider moving it under a configurable data directory.
root_claude = "/Users/dmarx/Downloads/data-2025-07-06-16-51-06"

# Default to the ChatGPT export; switch to the Claude export when selected.
root = root_chatgpt
if DATASET == 'claude':
    root = root_claude
fpath = Path(root) / "conversations.json"

# Read through `read_text` so the file handle is closed deterministically
# (the previous `json.load(fpath.open())` never closed it), and decode as
# UTF-8 explicitly instead of relying on the platform default encoding.
convs = json.loads(fpath.read_text(encoding="utf-8"))

In [4]:
# Show the message list of the first loaded conversation.
# NOTE(review): 'chat_messages' is the key declared in the Claude seed schema;
# the ChatGPT seed schema declares 'mapping' instead, so this lookup presumably
# only works when DATASET == 'claude' — confirm before switching datasets.
convs[0]['chat_messages']

[{'uuid': 'de06099d-65b6-48ea-8c3b-0591f4c1c106',
  'text': '',
  'content': [{'start_timestamp': '2024-09-25T03:10:44.580237Z',
    'stop_timestamp': '2024-09-25T03:10:44.580237Z',
    'type': 'text',
    'text': '',
    'citations': []}],
  'sender': 'human',
  'created_at': '2024-09-25T03:10:44.580237Z',
  'updated_at': '2024-09-25T03:10:44.580237Z',
  'attachments': [{'file_name': 'paste.txt',
    'file_size': 51684,
    'file_type': 'txt',
    'extracted_content': "summarize this talk for me:\n\nhello how are you nice to see you thank you to Defcon thank you especially to\n0:06\nthe goons and the other volunteers who are out here busting their humps so we can have such a great time uh and thank\n0:12\nyou all for coming uh I thought I had 50 minutes I have 45 I'm going to talk fast\n0:18\nand instead of Q&A there's a book seller in the book signing areas bringing in some of my books you don't have to buy a\n0:23\nbook just come over and say hello I'm going to sit there until every

In [5]:
from conversation_tagger import create_default_tagger

# Build the default tagger for the selected export format, then tag every
# loaded conversation, keeping results in the same order as `convs`.
tagger = create_default_tagger(source=DATASET)
tagged_results = list(map(tagger.tag_conversation, convs))

In [6]:
# Print the annotations of the first 12 tagged conversations (indices 0-11).
# Iterating over a slice instead of breaking out of the loop leaves `i` and
# `tr` with the same final values; later cells read `tr` after this loop.
for i, tr in enumerate(tagged_results[:12]):
    print(tr.annotations)


{'conversation_length': {'count': 1, 'category': 'single'}}
{'has_code_structure_patterns': True, 'first_user_has_large_content': True, 'conversation_length': {'count': 9, 'category': 'medium'}, 'conversation_has_code_structure_patterns': {'exchange_count': 9, 'total_exchanges': 9, 'percentage': 100.0}}
{'conversation_length': {'count': 4, 'category': 'medium'}}
{'has_code_structure_patterns': True, 'first_user_has_large_content': True, 'has_wiki_links': True, 'conversation_length': {'count': 24, 'category': 'long'}, 'conversation_has_code_structure_patterns': {'exchange_count': 16, 'total_exchanges': 24, 'percentage': 66.7}, 'conversation_has_wiki_links': {'exchange_count': 1, 'total_exchanges': 24, 'percentage': 4.2}}
{'has_code_structure_patterns': True, 'first_user_has_large_content': True, 'conversation_length': {'count': 5, 'category': 'medium'}, 'conversation_has_code_structure_patterns': {'exchange_count': 5, 'total_exchanges': 5, 'percentage': 100.0}}
{'conversation_length': {

In [7]:
# Inspect the exchanges of `tr`. `tr` is not assigned in this cell: it is the
# loop variable left over from the annotation-preview loop, i.e. the last
# tagged conversation that loop printed. Re-run that loop first on a fresh kernel.
tr.exchanges

[Exchange(exchange_id='b31bf92e-3e4e-4bad-8148-ece3a7f3f39b', conversation_id='699ee842-1866-44cd-9a20-5be5eec04fe4', messages=[
 2024-10-23T21:07:34.105205Z - USER: Let's start with a workflow that builds the README dynamically from a jinja template that compiles sections of the readme dynamically from tempaltes defined for each chunk. starter sections: introduct..., 
 2024-10-23T21:07:34.105205Z - ASSISTANT:  I'll help you create a workflow to dynamically generate a README using Jinja2 templates. I'll create a basic structure that can be easily extended.
 
 <antThinking>This is a good candidate for an artif...], annotations={}),
 Exchange(exchange_id='b04ad408-8acc-4e1c-aa90-2024c6ce9145', conversation_id='699ee842-1866-44cd-9a20-5be5eec04fe4', messages=[
 2024-10-23T21:09:43.800394Z - USER: let's put the templates for this readme in docs/readme/, 
 2024-10-23T21:09:43.800394Z - ASSISTANT:  I'll restructure the project to organize the README templates in the `docs/readme/` directory.

In [8]:
# Show the final message of the final exchange of `tr` (the conversation left
# over from the annotation-preview loop).
tr.exchanges[-1].messages[-1]
# TODO: parse out `<antThinking>` spans — they appear inline in the assistant
# message text displayed by this cell.


2024-10-24T21:30:37.356343Z - ASSISTANT:  Good call. Let's simplify by moving the features directly into the introduction template.

<antThinking>Need to remove features from config and place them directly in the introduction template, makin...