In [None]:
import pandas as pd
import json
import ast
import re

def clean_lmsys_dataframe(df, csv_path="", save_json=True,
                          conversation_json_path="cleaned_conversations_final.json",
                          moderation_json_path="openai_moderation_cleaned.json"):
    """
    Cleans an LMSYS dataframe by:
      - Extracting and parsing JSON-like blocks from 'conversation' and 'openai_moderation' columns
      - Removing unnecessary columns
      - Saving the cleaned DataFrame to CSV (and optional JSON)

    The frame is modified IN PLACE (columns are overwritten / dropped on the
    object that was passed in) and the same object is returned.

    Args:
        df (pd.DataFrame): Original LMSYS dataframe. Must contain the
            'conversation' and 'openai_moderation' columns.
        csv_path (str): Output path for cleaned CSV. When empty (the default),
            no CSV is written.
        save_json (bool): Whether to save extracted JSON data separately
        conversation_json_path (str): Where the parsed conversations are dumped
            when ``save_json`` is true.
        moderation_json_path (str): Where the parsed moderation records are
            dumped when ``save_json`` is true.

    Returns:
        pd.DataFrame: Cleaned dataframe (the same object as ``df``)

    Raises:
        KeyError: If 'conversation' or 'openai_moderation' is missing.
    """

    # ---------- Helper functions ----------
    def _scan_brace_blocks(text, respect_quotes):
        """Return (top-level {...} blocks, whether the scan ended balanced)."""
        blocks = []
        depth = 0
        start = None
        quote = None      # the quote character we are currently inside, if any
        escaped = False   # True right after a backslash inside a quoted string
        for i, ch in enumerate(text):
            if quote is not None:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == quote:
                    quote = None
                continue
            # Quotes are only tracked inside a block, so a stray apostrophe
            # between blocks cannot swallow the rest of the text.
            if respect_quotes and depth and ch in ("'", '"'):
                quote = ch
            elif ch == "{":
                if depth == 0:
                    start = i
                depth += 1
            elif ch == "}" and depth:
                depth -= 1
                if depth == 0:
                    blocks.append(text[start:i + 1])
                    start = None
        return blocks, (quote is None and depth == 0)

    def extract_brace_blocks(text):
        """Extract top-level {...} blocks from a string.

        Braces inside quoted strings are ignored so that message text such as
        code snippets does not split a block. If the quote-aware scan ends
        unbalanced (e.g. an unmatched quote), fall back to plain brace counting.
        """
        if not isinstance(text, str):
            return []
        blocks, balanced = _scan_brace_blocks(text, respect_quotes=True)
        if not balanced:
            blocks, _ = _scan_brace_blocks(text, respect_quotes=False)
        return blocks

    def safe_block_to_dict(s):
        """Convert a single JSON-like string block into a Python object, or None."""
        s = s.strip()
        # Parse the untouched text first: the regex repair below can corrupt
        # otherwise valid literals (e.g. a single-quoted value containing ").
        for parser in (json.loads, ast.literal_eval):
            try:
                return parser(s)
            except Exception:
                continue
        # Last resort: flatten newlines and rewrite single-quoted keys/values
        # to double quotes, then retry both parsers.
        repaired = s.replace("\n", " ")
        repaired = re.sub(r"(?<=\{|,)\s*'([^']+)'\s*:", r'"\1":', repaired)
        repaired = re.sub(r":\s*'([^']+)'(?=,|\})", r': "\1"', repaired)
        for parser in (json.loads, ast.literal_eval):
            try:
                return parser(repaired)
            except Exception:
                continue
        return None

    def clean_list_of_malformed_strings(lst):
        """Clean a cell (string, dict, or list of those) into a list of non-empty dicts."""
        cleaned = []
        items = lst if isinstance(lst, (list, tuple)) else [lst]
        for item in items:
            if isinstance(item, dict):
                # Already parsed (e.g. the frame was cleaned before): keep it.
                if item:
                    cleaned.append(item)
                continue
            if not isinstance(item, str):
                continue
            for block in extract_brace_blocks(item):
                parsed = safe_block_to_dict(block)
                # literal_eval can also yield sets etc.; only dicts are wanted.
                if isinstance(parsed, dict) and parsed:
                    cleaned.append(parsed)
        return cleaned

    # ---------- Clean conversation / openai_moderation columns ----------
    df["conversation"] = df["conversation"].apply(clean_list_of_malformed_strings)
    df["openai_moderation"] = df["openai_moderation"].apply(clean_list_of_malformed_strings)

    # ---------- Drop columns that are not needed downstream ----------
    cols_to_drop = [
        "conversation_id", "moderation_top2", "language",
        "model", "conversation_top2", "turn", "moderation"
    ]
    df.drop(columns=[c for c in cols_to_drop if c in df.columns], inplace=True)

    # ---------- Save outputs ----------
    if csv_path:
        df.to_csv(csv_path, index=False)

    if save_json:
        with open(conversation_json_path, "w", encoding="utf-8") as f:
            json.dump(df["conversation"].to_list(), f, indent=2)
        with open(moderation_json_path, "w", encoding="utf-8") as f:
            json.dump(df["openai_moderation"].to_list(), f, indent=2)

    if csv_path:
        print(f"✅ Cleaning complete. Saved to '{csv_path}'")
    else:
        print("✅ Cleaning complete. No csv_path given, so no CSV was written.")
    return df


In [3]:
import pandas as pd
# Clean split files 1..55 one at a time; each cleaned frame is written back
# over the CSV it was read from (JSON side outputs are disabled).
for i in range(1, 56):
    part_csv = f'dataset/splits/lmsys_chat_train_part_{i}.csv'
    dfi = clean_lmsys_dataframe(
        pd.read_csv(part_csv),
        csv_path=part_csv,
        save_json=False,
    )
    

✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_1.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_2.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_3.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_4.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_5.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_6.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_7.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_8.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_9.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_10.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_11.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_12.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_13.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_14.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_15.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_16.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_17.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_18.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_19.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_20.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_21.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_22.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_23.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_24.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_25.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_26.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_27.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_28.csv'
✅ Cleaning c



✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_35.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_36.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_37.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_38.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_39.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_40.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_41.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_42.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_43.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_44.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_45.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_46.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_47.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_48.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_49.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_50.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_51.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_52.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_53.csv'




✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_54.csv'
✅ Cleaning complete. Saved to 'dataset/splits/lmsys_chat_train_part_55.csv'
