In [None]:
import pandas as pd
import re

# Your input data should already be defined as 'data'

def process_data(data):
    """Flatten nested conversation data into one DataFrame row per entry.

    Args:
        data: Mapping of conversation id to an iterable of dialogues, where
            each dialogue is an iterable of entry dicts. Every entry must
            provide "turn", "speaker" and "utterance"; "emotion" and
            "expanded emotion cause span" are optional.

    Returns:
        A pandas DataFrame with the columns conversation_id, turn, speaker,
        utterance, emotion and cause_evidence. A missing emotion defaults to
        "neutral" and a missing cause span defaults to an empty list. The
        columns are present even when ``data`` yields no entries.

    Raises:
        KeyError: If an entry lacks "turn", "speaker" or "utterance".
    """
    columns = [
        "conversation_id",
        "turn",
        "speaker",
        "utterance",
        "emotion",
        "cause_evidence",
    ]
    records = []
    for conversation_id, conversation in data.items():
        for dialogue in conversation:
            for entry in dialogue:
                records.append({
                    "conversation_id": conversation_id,
                    "turn": entry["turn"],
                    "speaker": entry["speaker"],
                    "utterance": entry["utterance"],
                    "emotion": entry.get("emotion", "neutral"),
                    "cause_evidence": entry.get("expanded emotion cause span", []),
                })
    # Naming the columns explicitly keeps the schema when there are no
    # records; otherwise an empty frame has no columns at all and a later
    # df["utterance"] lookup fails with KeyError.
    return pd.DataFrame(records, columns=columns)

# Abbreviations whose trailing period must not be treated as a clause break.
# They are written with a space before the period, presumably because the
# utterances are tokenized that way (verify against the input data).
protected_abbreviations = ["Mr .", "Mrs .", "Ms .", "Dr .", "Prof .", "Sr .", "Jr .", "St .", "vs ."]


def replace_abbreviations(text, reverse=False):
    """Swap protected abbreviations with placeholders, or swap them back.

    Args:
        text: The string to transform.
        reverse: When False (default), each protected abbreviation becomes
            ``__ABBR<i>__``, where ``i`` is its index in
            ``protected_abbreviations``. When True, those placeholders are
            turned back into the abbreviations.

    Returns:
        The transformed string.
    """
    for i, abbr in enumerate(protected_abbreviations):
        placeholder = f"__ABBR{i}__"
        if reverse:
            text = text.replace(placeholder, abbr)
        else:
            # Only match an abbreviation that starts a word. A plain
            # substring replace would also hit the tail of longer words
            # (e.g. "ATMs ." contains "Ms ."), wrongly shielding a real
            # sentence-ending period from the clause splitter.
            text = re.sub(r"(?<!\w)" + re.escape(abbr), placeholder, text)
    return text

def split_into_clauses(text):
    """Break ``text`` into clauses at punctuation marks.

    The text is cut at every ``.``, ``?``, ``!``, ``,``, ``;`` and ``:``.
    Abbreviations handled by ``replace_abbreviations`` are masked before the
    cut and restored afterwards, so their periods do not act as separators.

    Args:
        text: The string to split.

    Returns:
        A list of the non-empty pieces, each stripped of surrounding
        whitespace. The separator characters themselves are discarded.
    """
    masked = replace_abbreviations(text)
    result = []
    for fragment in re.split(r'[.?!,;:]', masked):
        trimmed = fragment.strip()
        if not trimmed:
            # Skip pieces that are empty or whitespace only.
            continue
        result.append(replace_abbreviations(trimmed, reverse=True))
    return result

# Flatten the raw conversations into a table, then add a 'clauses' column
# holding the list of clauses found in each utterance.
df = process_data(data).assign(
    clauses=lambda frame: frame['utterance'].apply(split_into_clauses)
)

# Write the table out as a JSON array of row objects and preview the first rows.
df.to_json(
    "processed_data.json",
    orient="records",
    indent=2,
)
print(df.head())

Tell me the steps of the processing algorithm.