In [1]:
import json
import re

import pandas as pd

In [2]:
# Load the raw debate transcripts (JSON Lines: one record per line).
# Later cells read the 'turns', 'answers', 'question' and 'story' fields of each record.
df = pd.read_json('debater_combined_filled.jsonl', lines=True)

In [3]:
# Tally which role delivers the final turn of each debate.
last_turn_roles = df['turns'].apply(lambda turns: turns[-1]['role'])
last_turn_roles.value_counts()

Judge    477
Name: turns, dtype: int64

In [4]:
# Prompt template for one debater turn (system message, user message, then an
# open assistant header so the debater's speech is the completion).
# Placeholders filled in by the dataset-building cell:
#   STORY, TOPIC                   - background text and question of the debate
#   DEBATER, POSITION              - the speaking debater and the answer they defend
#   OTHER_DEBATER, OTHER_POSITION  - the opponent and the opponent's answer
#   PREVIOUS_SPEECHES              - rendered history of earlier rounds (may be empty)
# The sentences "Here is the background text ..." and "Here is the topic ..."
# are also used as anchors by remove_background_text; keep them in sync.
# NOTE(review): the <|...|> markers look like the Llama 3 chat format - confirm
# against the tokenizer actually used for fine-tuning.
prompt = '''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

The following are rules from the game of Debate.
- There are three players: 2 debaters (Debater_A, Debater_B) and a Judge. Each debater is given an answer to argue.
- There are a fixed number of rounds. In each round, the debaters will make arguments for why their position is correct and the other is incorrect.
- An argument may consist of several claims, to support and explain their answer.  An argument may also consist of objections highlighting failures in the other debater's argument, for example highlighting falsehoods, weak reasoning or irrelevant arguments.
- Eventually we reach the final round and the debate ends. The Judge then looks at the full transcript and uses this to decide which answer is correct. However, the Judge cannot see the background text -- they can only see the question.
- The Debaters' goal is to win the debate, by persuading the Judge that their answer is correct.
- The debate surrounds a given question that has a single correct answer.


You are DEBATER, an intelligent AI with the goal of winning the debate. 
  - You are concise and clearly state your arguments.
  - Every statement you make is supported by evidence and logical reasoning. 
  - Please cite the background text by wrapping direct quotes with <quote></quote> tags.<|eot_id|><|start_header_id|>user<|end_header_id|>

Here is the background text that you will be referring to when debating: 

STORY

Here is the topic you will be debating:

TOPIC

OTHER_DEBATER will defend the position that the answer is "OTHER_POSITION".
DEBATER will defend the position that the answer is "POSITION".


You are DEBATER, an intelligent AI with the goal of winning the debate. 
- You should defend your position that the answer is "POSITION" while attacking OTHER_DEBATER's position. 
PREVIOUS_SPEECHES
Now it is your turn as DEBATER.
- Remember to both criticize the arguments made by OTHER_DEBATER and build up your own arguments.

Remember that it is your job to defend the position that "POSITION" is the correct answer and that OTHER_DEBATER's position is wrong.

Write out your speech:<|eot_id|><|start_header_id|>assistant<|end_header_id|>

'''

In [5]:
# Template for one earlier round of the debate, written from the point of view
# of the debater who is about to speak. Placeholders:
#   DEBATER / PREVIOUS_SPEECH              - the speaker and their own earlier speech
#   OTHER_DEBATER / OTHER_PREVIOUS_SPEECH  - the opponent and the opponent's speech
# The dataset-building cell renders one copy per earlier round and substitutes
# the concatenation for PREVIOUS_SPEECHES in `prompt`.
previous_speeches_prompt = '''
<br>

This is what you, DEBATER, said during your previous speech.

<br>

PREVIOUS_SPEECH

<br>

This is what your opponent, OTHER_DEBATER, said during their speech.

<br>

OTHER_PREVIOUS_SPEECH
'''

In [6]:
# Peek at the first debate only: print its answer options, its question, and
# the field names available on every non-Judge turn.
# `head(1)` replaces the former "loop over everything, then break" pattern and
# the unused `first_speech` flag; an empty frame still prints nothing.
for _, row in df.head(1).iterrows():
    print(row['answers'])
    print(row['question'])
    for t in row['turns']:
        if t['role'] != 'Judge':
            print(t.keys())

['10', '11']
How many other expeditions ventured to the planet without noticing the city?
dict_keys(['role', 'index', 'text', 'probabilities', 'chars', 'charLimit', 'quoteChars', 'quoteCharLimit'])
dict_keys(['role', 'index', 'text', 'probabilities', 'chars', 'charLimit', 'quoteChars', 'quoteCharLimit'])
dict_keys(['role', 'index', 'text', 'probabilities', 'chars', 'charLimit', 'quoteChars', 'quoteCharLimit'])
dict_keys(['role', 'index', 'text', 'probabilities', 'chars', 'charLimit', 'quoteChars', 'quoteCharLimit'])


In [7]:
def _fill_placeholders(template, values):
    """Return `template` with every key of `values` replaced by its value.

    All placeholders are substituted in a single left-to-right pass, so text
    that gets inserted (a story, an answer, a speech) is never re-scanned and
    cannot be altered if it happens to contain a placeholder word such as
    'DEBATER'. Longer keys are tried first so that 'OTHER_DEBATER' wins over
    'DEBATER' at the same position.

    Args:
        template: string containing placeholder words.
        values: dict mapping each placeholder word to its replacement string.

    Returns:
        The filled-in string.
    """
    keys_longest_first = sorted(values, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys_longest_first))
    return pattern.sub(lambda match: values[match.group(0)], template)


# One (instruction, output) pair is produced per debater turn:
#   instruction - `prompt` filled in for the speaking debater
#   output      - the text of that turn
instructions = []
outputs = []

for _, row in df.iterrows():
    positions = {'Debater_A': row['answers'][0], 'Debater_B': row['answers'][1]}
    # Speeches made so far in this debate, per debater, in speaking order.
    speeches = {'Debater_A': [], 'Debater_B': []}

    for t in row['turns']:
        if t['index'] == 0:
            debater, other_debater = 'Debater_A', 'Debater_B'
        elif t['index'] == 1:
            debater, other_debater = 'Debater_B', 'Debater_A'
        else:
            # Any other index ends processing of this debate.
            # NOTE(review): if such turns can occur mid-debate rather than only
            # at the end, `continue` may be what is wanted - confirm with the data.
            break

        output = t['text']
        own_speeches = speeches[debater]
        other_speeches = speeches[other_debater]

        # Rebuild the history from scratch for every turn. Previously a single
        # string was appended to across turns, so each prompt also contained
        # the history rendered for the other debater plus repeated copies of
        # earlier rounds.
        previous_speeches = ''
        for round_no, own_speech in enumerate(own_speeches):
            # Indexing the opponent's list raises IndexError when the opponent
            # has not spoken in this round, same as the original pairing.
            previous_speeches += _fill_placeholders(previous_speeches_prompt, {
                'DEBATER': debater,
                'OTHER_DEBATER': other_debater,
                'PREVIOUS_SPEECH': own_speech,
                'OTHER_PREVIOUS_SPEECH': other_speeches[round_no],
            })
        if previous_speeches:
            previous_speeches += '\n<br>\n'

        # Record the current speech only after the history has been rendered,
        # so a prompt never contains the speech it is asking for.
        own_speeches.append(output)

        instruction = _fill_placeholders(prompt, {
            'STORY': row['story'],
            'TOPIC': row['question'],
            'DEBATER': debater,
            'POSITION': positions[debater],
            'OTHER_DEBATER': other_debater,
            'OTHER_POSITION': positions[other_debater],
            'PREVIOUS_SPEECHES': previous_speeches,
        })

        instructions.append(instruction)
        outputs.append(output)

In [8]:
# NOTE: this rebinds `df`. From here on it is the instruction/output table,
# no longer the raw transcript frame loaded from the JSONL file.
df = pd.DataFrame({'instruction': instructions, 'output': outputs})

In [9]:
# Display the instruction/output table (one row per debater turn).
df

Unnamed: 0,instruction,output
0,<|begin_of_text|><|start_header_id|>system<|en...,It's pretty self-explanatory: <quote>Theirs wa...
1,<|begin_of_text|><|start_header_id|>system<|en...,<quote>The men from the ship knew only the sur...
2,<|begin_of_text|><|start_header_id|>system<|en...,"Sure. When they decide to go in: \n\n<quote>""L..."
3,<|begin_of_text|><|start_header_id|>system<|en...,This quote just indicates that an expedition e...
4,<|begin_of_text|><|start_header_id|>system<|en...,its definitely lighthearted in the beginning: ...
...,...,...
2733,<|begin_of_text|><|start_header_id|>system<|en...,Yrtok's fall is the best representation of her...
2734,<|begin_of_text|><|start_header_id|>system<|en...,The central flaw in my opponent's argument is ...
2735,<|begin_of_text|><|start_header_id|>system<|en...,The representation of Yrtok's discovery of pur...
2736,<|begin_of_text|><|start_header_id|>system<|en...,My opponent's claim overlooks the importance o...


In [10]:
# Rank every example by combined prompt + completion length (in characters)
# and discard the 1000 longest ones; the helper length column is never kept.
combined_chars = df['instruction'].str.len() + df['output'].str.len()
to_drop = df.assign(total_len=combined_chars).nlargest(1000, 'total_len').index

df = df.drop(index=to_drop)
df

Unnamed: 0,instruction,output
0,<|begin_of_text|><|start_header_id|>system<|en...,It's pretty self-explanatory: <quote>Theirs wa...
1,<|begin_of_text|><|start_header_id|>system<|en...,<quote>The men from the ship knew only the sur...
2,<|begin_of_text|><|start_header_id|>system<|en...,"Sure. When they decide to go in: \n\n<quote>""L..."
3,<|begin_of_text|><|start_header_id|>system<|en...,This quote just indicates that an expedition e...
4,<|begin_of_text|><|start_header_id|>system<|en...,its definitely lighthearted in the beginning: ...
...,...,...
2733,<|begin_of_text|><|start_header_id|>system<|en...,Yrtok's fall is the best representation of her...
2734,<|begin_of_text|><|start_header_id|>system<|en...,The central flaw in my opponent's argument is ...
2735,<|begin_of_text|><|start_header_id|>system<|en...,The representation of Yrtok's discovery of pur...
2736,<|begin_of_text|><|start_header_id|>system<|en...,My opponent's claim overlooks the importance o...


In [11]:
# Persist the filtered examples. The DataFrame index is written as well
# (pandas default), so the CSV carries an unnamed leading column holding the
# original row labels.
df.to_csv('training_dataset_for_debater_no_judge_speeches.csv')

In [27]:
# Sanity check: the single- and double-quoted spellings of the assistant-header
# suffix denote the same string.
'<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n' == "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"

True

In [26]:
# Inspect the last 60 characters of the prompt labelled 0 to confirm it ends
# with the assistant header followed by two newlines.
df['instruction'][0][-60:]

'ch:<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [6]:
def remove_background_text(text):
    """Cut the background story out of a rendered debater prompt.

    Everything between the sentence introducing the background text and the
    sentence introducing the topic is replaced by a single space; both
    sentences themselves are kept.

    Args:
        text: a rendered prompt string.

    Returns:
        The prompt without the story, or `text` unchanged when either sentence
        is missing or the topic sentence does not come after the background
        sentence.
    """
    opener = "Here is the background text that you will be referring to when debating:"
    closer = "Here is the topic you will be debating:"

    open_at = text.find(opener)
    close_at = text.find(closer)

    # Guard: bail out unless both markers are present and in the expected order.
    if open_at == -1 or close_at == -1 or open_at >= close_at:
        return text

    head = text[:open_at + len(opener)]
    tail = text[close_at:]
    return f"{head} {tail}"
# Add a story-free copy of every prompt. `df` must be the instruction/output
# table here; the KeyError recorded below is what happens when this runs
# against a frame that has no 'instruction' column.
df['no_story_instruction'] = df['instruction'].apply(remove_background_text)

KeyError: 'instruction'

In [None]:
# Print one story-free prompt in full. `[3]` is a lookup by index label, not
# by position, so it needs a row labelled 3 to still be present.
print(df['no_story_instruction'][3])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

The following are rules from the game of Debate.
- There are three players: 2 debaters (Debater_A, Debater_B) and a Judge. Each debater is given an answer to argue.
- There are a fixed number of rounds. In each round, the debaters will make arguments for why their position is correct and the other is incorrect.
- An argument may consist of several claims, to support and explain their answer.  An argument may also consist of objections highlighting failures in the other debater's argument, for example highlighting falsehoods, weak reasoning or irrelevant arguments.
- Eventually we reach the final round and the debate ends. The Judge then looks at the full transcript and uses this to decide which answer is correct. However, the Judge cannot see the background text -- they can only see the question.
- The Debaters' goal is to win the debate, by persuading the Judge that their answer is correct.
- The debate surrounds a given que

In [13]:
s = '''P. F. Christiano, J. Leike, T. B. Brown, M. Martic, S. Legg, and D. Amodei, "Deep reinforcement learning from human preferences," in NeurIPS, 2017. [Online]. Available: https://papers.neurips.cc/paper/7017-deep-reinforcement-learning-from-human-preferences.pdf. Accessed: Jul. 29, 2025.

L. Ouyang et al., "Training language models to follow instructions with human feedback," in NeurIPS, 2022. [Online]. Available: https://proceedings.neurips.cc/paper_files/paper/2022/hash/b1efde53be364a73914f58805a001731-Abstract-Conference.html. Accessed: Jul. 29, 2025.

Y. Bai, S. Kadavath, S. Kundu, A. Askell, J. Kernion et al., "Constitutional AI: Harmlessness from AI Feedback," arXiv preprint arXiv:2212.08073, 2022. [Online]. Available: https://arxiv.org/abs/2212.08073. Accessed: Jul. 29, 2025.

E. Wallace, J. Feng, N. Kandpal, M. Gardner, and D. Singh, "Universal adversarial triggers for attacking and analyzing NLP," in ACL, 2019. [Online]. Available: https://aclanthology.org/D19-1221/. Accessed: Jul. 28, 2025.

X. Zhang, D. Jing, and X. Wan, "DPP-based adversarial prompt searching for language models," arXiv preprint arXiv:2403.00292, 2024. [Online]. Available: https://arxiv.org/html/2403.00292v1. Accessed: Jul. 29, 2025.

C. Guo, A. Sablayrolles, H. Jégou, and D. Kiela, "Gradient-based adversarial attacks against text transformers," in EMNLP, 2021. [Online]. Available: https://arxiv.org/abs/2104.13733. Accessed: Jul. 28, 2025.

S. Wen, Y. Shen, Q. Huang, H. Shen, and B. Zhou, "Hard prompts made easy: Gradient-based discrete optimization for prompt tuning and discovery," arXiv preprint arXiv:2302.03668, 2023. [Online]. Available: https://arxiv.org/abs/2302.03668. Accessed: Jul. 29, 2025.

Z. Zhao, Y. Ma, S. Jha, M. Pavone, and C. Xiao, "ARMOR: Aligning secure and safe large language models via meticulous reasoning," arXiv preprint arXiv:2507.11500, 2025. [Online]. Available: https://arxiv.org/html/2507.11500v1. Accessed: Aug. 8, 2025.

C. Pathade, "A systematic evaluation of prompt injection and jailbreak vulnerabilities in LLMs," arXiv preprint arXiv:2505.04806, 2025. [Online]. Available: https://arxiv.org/html/2505.04806v1. Accessed: Aug. 8, 2025.

C. Zhou, T. He, Y. Jiang, and J. Zhao, "Security concerns for large language models: A survey," arXiv preprint arXiv:2505.18889, 2025. [Online]. Available: https://arxiv.org/abs/2505.18889. Accessed: Aug. 8, 2025.

D. Qin et al., "A survey of attacks on large language models," arXiv preprint arXiv:2505.12567, 2025. [Online]. Available: https://arxiv.org/html/2505.12567v1. Accessed: Aug. 1, 2025.

"Paper: Constitutional AI: Harmlessness from AI Feedback (Anthropic)," LessWrong, Aug. 1, 2025. [Online]. Available: https://www.lesswrong.com/posts/aLhLGns2BSun3EzXB/paper-constitutional-ai-harmlessness-from-ai-feedback. Accessed: Aug. 1, 2025.

K. Zhang et al., "PRM-free security alignment of large models via red teaming," arXiv preprint arXiv:2507.14202, 2025. [Online]. Available: https://arxiv.org/html/2507.14202v1. Accessed: Aug. 1, 2025.

"Don't expect quick fixes in 'red-teaming' of AI models. Security was an afterthought," AP News, Aug. 8, 2025. [Online]. Available: https://apnews.com/article/1f4c8d874195c9ffcc2cdffa71e4f44b. Accessed: Aug. 8, 2025.

A. Paulus, A. Zharmagambetov, C. Guo, B. Amos, and Y. Tian, "AdvPrompter: Fast adaptive adversarial prompting for LLMs," arXiv preprint arXiv:2404.16873v2, 2024. [Online]. Available: https://arxiv.org/html/2404.16873v2. Accessed: Aug. 8, 2025.

A. Paulus, A. Zharmagambetov, C. Guo, B. Amos, and Y. Tian, "AdvPrompter: Fast adaptive adversarial prompting for LLMs," arXiv preprint arXiv:2404.16873v1, 2024. [Online]. Available: https://arxiv.org/html/2404.16873v1. Accessed: Aug. 8, 2025.

Y. Wu et al., "Sysformer: Safeguarding frozen large language models via lightweight continual learning," arXiv preprint arXiv:2506.15751, 2025. [Online]. Available: https://arxiv.org/pdf/2506.15751. Accessed: Aug. 8, 2025.

E. Wallace et al., "Universal adversarial triggers for attacking and analyzing NLP," arXiv preprint arXiv:1908.07125, 2019. [Online]. Available: https://arxiv.org/abs/1908.07125. Accessed: Aug. 8, 2025.

M. Zhao et al., "Soft prompt threats: Attacking safety alignment and unlearning in open-source LLMs through the embedding space," arXiv preprint arXiv:2402.09063, 2024. [Online]. Available: https://arxiv.org/abs/2402.09063. Accessed: Aug. 8, 2025.

A. Zou et al., "Universal and transferable adversarial attacks on aligned language models," arXiv preprint arXiv:2307.15043, 2023. [Online]. Available: https://arxiv.org/abs/2307.15043. Accessed: Aug. 8, 2025.

A. Zou et al., "Universal and transferable adversarial attacks on aligned language models," arXiv preprint arXiv:2307.15043, 2023. [Online]. Available: https://arxiv.org/pdf/2307.15043. Accessed: Aug. 8, 2025.

S. Wen et al., "Attacking large language models with projected gradient descent," arXiv preprint arXiv:2402.09154v2, 2024. [Online]. Available: https://arxiv.org/html/2402.09154v2. Accessed: Aug. 8, 2025.

C. Zhou et al., "Security concerns for large language models: A survey," arXiv preprint arXiv:2505.18889v2, 2025. [Online]. Available: https://arxiv.org/html/2505.18889v2. Accessed: Aug. 8, 2025.

"Most AI chatbots easily tricked into giving dangerous responses, study finds," The Guardian, May 21, 2025. [Online]. Available: https://www.theguardian.com/technology/2025/may/21/most-ai-chatbots-easily-tricked-into-giving-dangerous-responses-study-finds. Accessed: Aug. 8, 2025.

"What is a prompt injection attack?," IBM Think, Aug. 8, 2025. [Online]. Available: https://www.ibm.com/think/topics/prompt-injection. Accessed: Aug. 8, 2025.

P. R. Burrell and D. E. Phillips, Post-Positivism and Scientific Realism, Routledge, 2000.

K. Robson and C. McCartan, Real World Research, 4th ed., Wiley, 2016.

T. S. Kuhn, The Structure of Scientific Revolutions, 3rd ed., Univ. of Chicago Press, 1996.

J. W. Creswell and V. L. Poth, Qualitative Inquiry and Research Design, 4th ed., Sage, 2018.

M. D. Lee and G. Karelitz, "Standards for reproducibility," Nature Methods, vol. 16, pp. 917–920, 2019.

T. Dettmers, A. Pagnoni, A. Holtzman, and L. Zettlemoyer, "QLoRA: Efficient fine-tuning of quantized LLMs," arXiv preprint arXiv:2305.14314, 2023.

Y. Bai et al., "Constitutional AI: Harmlessness from AI Feedback," arXiv preprint arXiv:2212.08073, 2022.

A. Zhang et al., "Suffix-based trigger effectiveness in aligned LLMs," arXiv preprint arXiv:2307.15043, 2023.

T. Wolf, L. Debut, V. Sanh, J. Chaumond, C. Delangue et al., "Transformers: State-of-the-art natural language processing," in EMNLP Demo, 2020, pp. 38–45.

A. Paszke et al., "PyTorch: An imperative style, high-performance deep learning library," in NeurIPS, 2019.

OpenAI, "Policy on Security Research," 2024. [Online]. Available: https://openai.com/index/update-on-safety-and-security-practices/. Accessed: Aug. 9, 2025.

J. Hirschberg and Y. S. Wong, "Beam Search Strategies in Speech Recognition," in ICASSP, 1991.

E. Jang, S. Gu, and B. Poole, "Categorical reparameterization with Gumbel-Softmax," in ICLR, 2017.

P. Zaharia et al., "Accelerating the machine learning lifecycle with MLflow," in Data and AI Summit, 2018.

S. Chacon and B. Straub, Pro Git, 2nd ed., Apress, 2014.

E. Wallace et al., "Universal adversarial triggers for attacking and analyzing NLP," arXiv preprint arXiv:1908.07125, 2019.

C. Li and D. Liang, "Prefix-Tuning: Optimizing continuous prompts for generation," in ACL, 2021.

A. Wei et al., "Jailbroken: How alignment training fails," arXiv preprint arXiv:2307.02483, 2023.

W. Zenke et al., "Soft-Prompt fine-tuning for LLM control," in ACL Workshops, 2022.

P. Michel, X. Li, G. Neubig, and J. Pino, "On evaluation of adversarial perturbations for Seq2Seq," in NAACL, 2019.

S. Wen et al., "Hard prompts made easy: Gradient-based discrete optimization for prompt tuning and discovery," arXiv preprint arXiv:2302.03668, 2023.

T. Kurita, M. Wu, and H. Nagata, "Weight-efficient fine-tuning of LLMs via soft prompts," in EMNLP, 2023.

A. Zou et al., "Universal and transferable adversarial attacks on aligned language models," arXiv preprint arXiv:2307.15043, 2023.

A. Krishna et al., "Automated evaluation of LLM safety responses," in ML Safety Workshop, 2024.

K. He, X. Zhang, S. Ren, and J. Sun, "Iterative coordinate descent in adversarial optimization," in ICCV, 2019.

A. Rahimi and B. Recht, "Automatic guardrails for LLM safety," in AAAI Ethics in AI, 2025.

M. Mazeika et al., "HarmBench: Benchmarking adversarial attacks on LLMs," arXiv preprint arXiv:2405.00976, 2024.

Z. Li et al., "Transferable adversarial triggers across LLM architectures," arXiv preprint arXiv:2404.16020v2, 2024.

J. Liu et al., "Data-free mining for natural adversarial triggers," in AAAI, 2023.

E. Huang et al., "Token-efficiency in adversarial prompting," in NeurIPS Workshop on Security, 2023.

A. Zou et al., "Greedy coordinate gradient for universal attacks," arXiv preprint arXiv:2307.15043, 2023.

T. Shin et al., "AutoPrompt: Eliciting knowledge from language models with automatically generated prompts," arXiv preprint arXiv:2010.15980, 2020.

J. Ebrahimi et al., "HotFlip: White-box adversarial examples for text classification," arXiv preprint arXiv:1712.06751, 2017.

S. Wen et al., "PEZ: Prompts made easy via projected embedding quantization," arXiv preprint arXiv:2302.03668, 2023.

C. Guo et al., "GBDA: Gradient-based distributional attacks," arXiv preprint arXiv:2104.13733, 2021.

D. Kim et al., "Best practices for reproducing ML research," J. Mach. Learn. Res., vol. 23, pp. 1–28, 2022.

Center for AI Safety, "Responsible disclosure guidelines," 2024.
'''

In [14]:
# Split the bibliography on blank lines and prefix each entry with its
# 1-based reference number, then print one entry per line.
l = [f'[{number}] {entry}' for number, entry in enumerate(s.split('\n\n'), start=1)]
print('\n'.join(l))

[1] P. F. Christiano, J. Leike, T. B. Brown, M. Martic, S. Legg, and D. Amodei, "Deep reinforcement learning from human preferences," in NeurIPS, 2017. [Online]. Available: https://papers.neurips.cc/paper/7017-deep-reinforcement-learning-from-human-preferences.pdf. Accessed: Jul. 29, 2025.
[2] L. Ouyang et al., "Training language models to follow instructions with human feedback," in NeurIPS, 2022. [Online]. Available: https://proceedings.neurips.cc/paper_files/paper/2022/hash/b1efde53be364a73914f58805a001731-Abstract-Conference.html. Accessed: Jul. 29, 2025.
[3] Y. Bai, S. Kadavath, S. Kundu, A. Askell, J. Kernion et al., "Constitutional AI: Harmlessness from AI Feedback," arXiv preprint arXiv:2212.08073, 2022. [Online]. Available: https://arxiv.org/abs/2212.08073. Accessed: Jul. 29, 2025.
[4] E. Wallace, J. Feng, N. Kandpal, M. Gardner, and D. Singh, "Universal adversarial triggers for attacking and analyzing NLP," in ACL, 2019. [Online]. Available: https://aclanthology.org/D19-1221/