In [None]:
# ✅ Install dependencies
!pip install convokit pandas tqdm

# ✅ Import packages
from convokit import Corpus, download
import pandas as pd
from tqdm import tqdm

# ✅ Load the dataset
corpus = Corpus(filename=download("winning-args-corpus"))

# ✅ Extract (claim, reply, delta) triples
# Each row pairs an utterance (the claim) with an utterance that directly
# replies to it, together with the reply's "success" label.
rows = []

for convo in tqdm(corpus.iter_conversations()):
    # Index the thread by utterance id so that every reply can be matched to
    # the utterance named in its `reply_to` field. Neighbours in iteration
    # order are not necessarily parent and child, so pairing them by position
    # can attach a reply to a claim it never answered.
    utterances_by_id = {utt.id: utt for utt in convo.iter_utterances()}

    for reply in utterances_by_id.values():
        # Skip utterances with no parent (reply_to is None) and those whose
        # parent is not among this conversation's utterances.
        parent = utterances_by_id.get(reply.reply_to)
        if parent is None:
            continue

        # Only take pairs where the reply is from a different speaker
        if parent.speaker.id == reply.speaker.id:
            continue

        claim = parent.text.strip()
        response = reply.text.strip()
        # NOTE(review): "success" is believed to be 1 for a persuasive
        # (delta-earning) argument, 0 for an unpersuasive one, and None when
        # unlabeled — confirm against the corpus documentation.
        delta = reply.meta.get("success")

        # Drop pairs with empty text on either side or without a label.
        if claim and response and delta is not None:
            rows.append({
                "claim": claim,
                "reply": response,
                "delta": delta,  # 1 = successful, 0 = unsuccessful
            })

# ✅ Create DataFrame
# Pin the columns so the frame keeps its schema even when no pairs were
# extracted; otherwise grouping by "claim" below would raise KeyError.
df = pd.DataFrame(rows, columns=["claim", "reply", "delta"])
print(f"Total successful/unsuccessful pairs: {len(df)}")

# ✅ Remove repeated claims by selecting one reply per claim
# GroupBy.sample draws the row directly from each group and leaves the
# "claim" column in the result. The previous groupby(...).apply(...) form
# operated on the grouping column, which pandas has deprecated; once grouping
# columns are excluded from apply, reset_index(drop=True) would discard
# "claim" altogether.
# The draw is still reproducible through random_state, but the specific reply
# kept for a claim with several replies may differ from the apply-based
# version, which reseeded for every group.
df_unique_claims = (
    df.groupby("claim")
    .sample(n=1, random_state=42)
    .reset_index(drop=True)
)

# ✅ Save to CSV
df_unique_claims.to_csv("cmv_argument_pairs_unique_claims.csv", index=False)

# ✅ Show final result
print(f"Total unique claims: {len(df_unique_claims)}")
# Cap the preview size so a result with fewer than five rows does not raise.
df_unique_claims.sample(min(5, len(df_unique_claims)))




Please restructure your imports with 'import unsloth' at the top of your file.
  import unsloth


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Downloading winning-args-corpus to /root/.convokit/saved-corpora/winning-args-corpus
Downloading winning-args-corpus from http://zissou.infosci.cornell.edu/convokit/datasets/winning-args-corpus/winning-args-corpus.zip (73.7MB)... Done


3051it [00:01, 1576.43it/s]


Total successful/unsuccessful pairs: 19439


  df_unique_claims = df.groupby("claim").apply(lambda x: x.sample(1, random_state=42)).reset_index(drop=True)


Total unique claims: 19292


Unnamed: 0,claim,reply,delta
6007,"First off, let me say that I fully understand ...","&gt; One person saying, ""no, I won't buy that""...",0
11712,Let us define these concepts first:\n\n**Patri...,I'd say that a lot of feminists aren't quite a...,1
11386,It's more like dressing up as a Nazi and expec...,&gt; I suppose a good example of that point wo...,0
14707,The Japanese were also training civilian milit...,There were three days betweent the first and t...,0
154,"&gt; Do you mean factually wrong, morally wro...",&gt; I'm not talking about legal free speech h...,1
