# Clean Conversations Data

Extract the user messages from English-language conversations in `data/conversations.jsonl` and save them to `data/prompts.jsonl` as prompts for energy measurement.


In [None]:
import pandas as pd
import json
from pathlib import Path


In [8]:
# The raw dump is JSON Lines: every line of the file is one record.
conversations_path = Path('data/conversations.jsonl')
data = pd.read_json(conversations_path, lines=True)
print(f"Loaded {len(data)} conversations")


Loaded 853062 conversations


In [9]:
# Case-insensitive match on the conversation-level language label.
is_english = data['language'].str.lower() == 'english'
english_data = data.loc[is_english]
print(f"Found {len(english_data)} English conversations")

def extract_user_prompts(conversations, max_prompts=100):
    """Collect non-empty user messages from a frame of conversations.

    Rows are scanned in order and, within each row, messages are scanned in
    order; scanning stops as soon as ``max_prompts`` prompts were collected.

    Parameters
    ----------
    conversations : pandas.DataFrame
        Frame with a ``'conversation'`` column. Each cell is expected to be a
        list (or tuple) of message dicts carrying ``'role'`` and ``'content'``
        keys. Cells of any other type (e.g. ``None``/NaN) are skipped.
    max_prompts : int or None, default 100
        Upper bound on the number of prompts returned. ``None`` disables the
        cap; a value ``<= 0`` yields an empty list.

    Returns
    -------
    list[dict]
        One ``{'prompt_text': <stripped content>, 'processed': 0}`` dict per
        retained user message.
    """
    unlimited = max_prompts is None
    # Checking the cap only after an append would let one prompt through.
    if not unlimited and max_prompts <= 0:
        return []

    prompts = []
    # Only one column is needed, so iterate it directly rather than paying
    # for a full Series per row via iterrows().
    for conversation in conversations['conversation']:
        # Missing cells (None/NaN) are neither iterable nor meaningful here.
        if not isinstance(conversation, (list, tuple)):
            continue

        for message in conversation:
            if not isinstance(message, dict) or message.get('role') != 'user':
                continue

            content = message.get('content')
            # Content may be absent, None, or a non-string payload.
            if not isinstance(content, str):
                continue

            text = content.strip()
            if not text:
                continue

            prompts.append({'prompt_text': text, 'processed': 0})
            if not unlimited and len(prompts) >= max_prompts:
                return prompts
    return prompts

# The cap is spelled out here (it equals the function's default) so the
# count printed below is traceable without reading the helper.
prompts = extract_user_prompts(english_data, max_prompts=100)
prompts_df = pd.DataFrame(data=prompts)
print(f"Extracted {len(prompts_df)} user prompts")


Found 663049 English conversations
Extracted 100 user prompts


In [10]:
# Write the prompts back out as JSON Lines: one record per line.
output_path = Path('data/prompts.jsonl')
prompts_df.to_json(
    path_or_buf=output_path,
    orient='records',
    lines=True,
)
print(f"Saved {len(prompts_df)} prompts to {output_path}")


Saved 100 prompts to data\prompts.jsonl
