In [None]:
# 🛠️ Step 1: Install ConvoKit
!pip install convokit

# 📦 Step 2: Import and Download the Dataset
import json
from collections import defaultdict
from itertools import islice

import pandas as pd
from convokit import Corpus, download

# Fetch the CMV dataset ("winning-args-corpus"), then build the Corpus
# object from the path that download() hands back.
corpus_path = download("winning-args-corpus")
corpus = Corpus(filename=corpus_path)



Collecting convokit
  Downloading convokit-3.2.0.tar.gz (205 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/205.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m205.0/205.0 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting msgpack-numpy>=0.4.3.2 (from convokit)
  Downloading msgpack_numpy-0.4.8-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting clean-text>=0.6.0 (from convokit)
  Downloading clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting unidecode>=1.1.1 (from convokit)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting pymongo>=4.0 (from convokit)
  Downloading pymongo-4.13.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (22 kB)
Collecting dnspython>=1.16.0 (from 


Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
No configuration file found at /root/.convokit/config.yml; writing with contents: 
# Default Backend Parameters
db_host: localhost:27017
data_directory: ~/.convokit/saved-corpora
model_directory: ~/.convokit/saved-models
default_backend: mem
Downloading winning-args-corpus to /root/.convokit/saved-corpora/winning-args-corpus
Downloading winning-args-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/winning-args-corpus/winning-args-corpus.zip (73.7MB)... Done
Number of Speakers: 34911
Number of Utterances: 293297
Number of Conversations: 3051


In [None]:
# 🌳 Step 3: Extract the first N_CONVERSATIONS full conversation trees

N_CONVERSATIONS = 10  # number of conversations to export


def build_conversation_tree(conv):
    """Return the reply tree of one conversation as nested plain dicts.

    Each utterance becomes a node of the form
    ``{"id", "speaker", "text", "children"}``, where ``children`` lists the
    nodes of the utterances replying to it, in iteration order.

    Args:
        conv: an object exposing ``iter_utterances()``; every utterance must
            provide ``id``, ``reply_to``, ``speaker.id`` and ``text``.

    Returns:
        A list of root nodes, i.e. the utterances whose ``reply_to`` is None.
        NOTE(review): in this corpus that is presumably the original post
        itself rather than the first-level replies -- confirm against the
        dataset documentation.
    """
    utterances = list(conv.iter_utterances())

    # Create every node up front so parents and children can be linked in a
    # single pass. This avoids recursion, so very deep reply chains cannot
    # run into Python's recursion limit.
    nodes = {
        utt.id: {
            "id": utt.id,
            "speaker": utt.speaker.id,
            "text": utt.text.strip(),
            "children": [],
        }
        for utt in utterances
    }

    roots = []
    for utt in utterances:
        node = nodes[utt.id]
        if utt.reply_to is None:
            roots.append(node)
        elif utt.reply_to in nodes:
            nodes[utt.reply_to]["children"].append(node)
        # Otherwise the parent is not part of this conversation; the
        # utterance is unreachable from any root and is left out.

    return roots


# Rebuilt from scratch on every run so re-executing the cell is idempotent.
all_conversations = []

# islice stops after N_CONVERSATIONS items (or earlier if the corpus is smaller).
for conv in islice(corpus.iter_conversations(), N_CONVERSATIONS):
    all_conversations.append({
        "conversation_id": conv.id,
        "title": conv.meta.get("op-title", ""),
        "op_text": conv.meta.get("op-text-body", ""),
        "tree": build_conversation_tree(conv),
    })


In [None]:
# 💾 Step 4: Save the conversation trees to JSON
OUTPUT_PATH = "cmv_10_conversation_trees.json"

# ensure_ascii=False writes non-ASCII characters as-is instead of \u escapes,
# which is why the file is opened explicitly as UTF-8.
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(all_conversations, f, ensure_ascii=False, indent=2)

# Report the number of trees actually written instead of a hard-coded count.
print(f"✅ Saved {len(all_conversations)} conversation trees to '{OUTPUT_PATH}'")


✅ Saved 10 conversation trees to 'cmv_10_conversation_trees.json'
