# liaisons-experiments - Framework Try-out

In [None]:
from dotenv import load_dotenv

# Load the .env file so that the API keys and tokens read through os.environ
# in the cells below are available to this kernel.
load_dotenv()

In [None]:
from liaisons_experiments.experiments.multi_experiment import MultiExperiment
from tqdm.notebook import tqdm
from langchain_community.chat_models import ChatOllama
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import GoogleGenerativeAI
import os

# Ollama-served models paired with the per-model options handed to
# MultiExperiment. Every model shares the same sampling settings; only the
# model tag and the number of concurrent workers vary.
self_hosted_llms = [
    (
        ChatOllama(
            model=model_tag,
            temperature=0.7,
            # ChatOllama does not declare a `max_tokens` field, so passing it
            # does not cap the generation; `num_predict` is the Ollama option
            # that limits the number of generated tokens.
            num_predict=2,
            top_p=1,
        ),
        # A fresh options dict per model so entries never share state.
        {"num_workers": num_workers},
    )
    for model_tag, num_workers in (
        ("llama3:8b", 5),
        ("phi3:3.8b", 10),
        ("phi3:14b", 3),
        ("gemma:2b", 10),
        ("gemma:2b-it", 10),
        ("gemma:7b", 5),
        ("gemma:7b-it", 5),
        ("gemma2:2b", 10),
        ("gemma2:2b-it", 10),
        ("gemma2:9b", 5),
        ("gemma2:9b-it", 5),
        ("gemma2:27b", 2),
        ("gemma2:27b-it", 2),
    )
]

# Provider-hosted models paired with the per-model options handed to
# MultiExperiment. Entries keep the order OpenAI -> Anthropic -> Google, and
# each provider's API key is read from the environment (a missing key raises
# KeyError while this list is being built).
platform_hosted_llms = [
    # OpenAI chat models.
    *(
        (
            ChatOpenAI(
                model=model_id,
                temperature=0.7,
                max_tokens=2,
                top_p=1,
                api_key=os.environ["LIAISONS_EXPERIMENTS_OPENAI_API_KEY"],
            ),
            {"num_workers": workers},
        )
        for model_id, workers in (
            ("gpt-3.5-turbo-0125", 16),
            ("gpt-4-turbo-2024-04-09", 1),
            ("gpt-4o-2024-05-13", 1),
            ("gpt-4o-mini-2024-07-18", 16),
        )
    ),
    # Anthropic chat models.
    *(
        (
            ChatAnthropic(
                model=model_id,
                temperature=0.7,
                max_tokens=2,
                top_p=1,
                api_key=os.environ["LIAISONS_EXPERIMENTS_ANTHROPIC_API_KEY"],
            ),
            {"num_workers": workers},
        )
        for model_id, workers in (
            ("claude-3-haiku-20240307", 2),
            ("claude-3-sonnet-20240229", 2),
            ("claude-3-opus-20240229", 1),
            ("claude-3-5-sonnet-20240620", 2),
        )
    ),
    # Google Gemini models (note the differently named token-limit and key arguments).
    *(
        (
            GoogleGenerativeAI(
                model=model_id,
                temperature=0.7,
                max_output_tokens=2,
                top_p=1,
                google_api_key=os.environ["LIAISONS_EXPERIMENTS_GOOGLE_API_KEY"],
            ),
            {"num_workers": workers},
        )
        for model_id, workers in (
            ("gemini-1.5-flash", 16),
            ("gemini-1.5-pro", 16),
        )
    ),
]

# Only the self-hosted models are handed to the experiment runner here;
# platform_hosted_llms is built above but not passed in.
exps = MultiExperiment(self_hosted_llms, tqdm=tqdm)

In [None]:
from datasets import load_dataset

# Hugging Face access token; `.get` yields None when the variable is unset.
# NOTE(review): unlike the other keys in this notebook, this variable has no
# LIAISONS_EXPERIMENTS_ prefix and is read leniently - confirm the name matches
# the .env file.
hf_token = os.environ.get("LIAISONS_HUGGING_FACE_API_KEY")

# Fetch the claim-stance sample; its 'binary' and 'ternary' entries are
# converted to DataFrames further down.
dataset = load_dataset(
    "coding-kelps/liaisons-claim-stance-sample",
    token=hf_token,
)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

def _plot_f1_results(results, colors: list[str], title: str | None = None) -> None:
    """Draw a grouped bar chart of the per-model metrics held by ``results``.

    ``results.f1_scores`` and ``results.metadata`` are merged on their shared
    columns, then every column except ``model_name`` is melted into a
    (Metric, Value) pair and drawn as one bar per metric for each model.

    Args:
        results: Object exposing ``f1_scores`` and ``metadata`` DataFrames (in
            this notebook, the value returned by ``exps.run_from_df``).
        colors: Palette passed to seaborn, one colour per metric.
        title: Optional figure title; nothing is set when it is ``None``.
    """
    plot_df = pd.merge(results.f1_scores, results.metadata).melt(
        id_vars='model_name', var_name='Metric', value_name='Value'
    )

    # Explicit figure/axes instead of relying on pyplot's implicit current axes.
    fig, ax = plt.subplots(figsize=(14, 8))
    sns.barplot(data=plot_df, x='model_name', y='Value', hue='Metric', palette=colors, ax=ax)

    if title is not None:
        ax.set_title(title)
    ax.set_xlabel("Model Name")
    ax.set_ylabel("Benchmarks")

    # Pin the tick locations before relabelling them, as recommended by
    # https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.set_xticklabels.html
    ax.set_xticks(ax.get_xticks())
    # Rotate the model names and anchor them on their right end so they stay
    # under their bars.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    fig.tight_layout()
    plt.show()


def plot_binary_results(binary_results, title: str | None = None):
    """Plot the metrics of a binary (support/attack) experiment run.

    Args:
        binary_results: Object exposing ``f1_scores`` and ``metadata`` DataFrames.
        title: Optional figure title.
    """
    # Four-colour palette used for the binary metrics.
    _plot_f1_results(
        binary_results,
        colors=["#1F77B4", "#FF7F0F", "#2BA02B", "#D62727"],
        title=title,
    )


def plot_ternary_results(ternary_results, title: str | None = None):
    """Plot the metrics of a ternary (support/attack/unrelated) experiment run.

    Args:
        ternary_results: Object exposing ``f1_scores`` and ``metadata`` DataFrames.
        title: Optional figure title.
    """
    # Same palette as the binary plot with one extra colour inserted in third position.
    _plot_f1_results(
        ternary_results,
        colors=["#1F77B4", "#FF7F0F", "#9467BD", "#2BA02B", "#D62727"],
        title=title,
    )

In [None]:
def binary_ibm_few_shot_prompting(parent_argment: str, child_argument: str) -> str:
    """Build the few-shot prompt for binary (support/attack) relation prediction.

    Four labelled example pairs are followed by the pair to classify, whose
    ``Relation:`` field is left empty for the model to complete.

    Args:
        parent_argment: Text inserted as ``Arg1`` of the pair to classify.
        child_argument: Text inserted as ``Arg2`` of the pair to classify.

    Returns:
        The complete prompt string.
    """
    # (Arg1, Arg2, relation) demonstrations, in the order they appear in the prompt.
    demonstrations = (
        (
            "Even in the case of provocateurs, it can be an effective strategy to call their bluff, by offering them a chance to have a rational conversation. In this case, the failure to do so is their responsibility alone.",
            "No-platforming hinders productive discourse.",
            "attack",
        ),
        (
            "A country used to receiving ODA may be perpetually bound to depend on handouts (pp. 197).",
            "Government structures adapt to handle and distribute incoming ODA. As the funding from ODA is significant, countries have vested bureaucratic interest to remain bound to aid (pp. 197).",
            "support",
        ),
        (
            "Elections would limit the influence of lobbyists on the appointment of Supreme Court judges.",
            "The more individuals take part in a decision, as would be the case in a popular vote compared to a vote in the Senate, the harder it is to sway the outcome.",
            "support",
        ),
        (
            "ChatGPT will reach AGI level before 2030.",
            "To reach AGI it should be able to generate its own goals and intentions: where would it draw these from?",
            "attack",
        ),
    )

    # Each block opens with a newline: this yields the leading blank line of
    # the prompt and the blank separator between consecutive examples.
    shots = "".join(
        f"\nArg1: {arg1}\nArg2: {arg2}\nRelation: {relation}\n"
        for arg1, arg2, relation in demonstrations
    )

    # The space after the final "Relation:" is part of the prompt.
    return f"{shots}\nArg1: {parent_argment}\nArg2: {child_argument}\nRelation: \n"

def binary_ibm_augmented_few_shot_prompting(parent_argment: str, child_argument: str) -> str:
    """Build the few-shot prompt for binary prediction with an explicit instruction.

    Same four labelled examples as the plain few-shot prompt, followed by a
    ``---`` separator and a one-line instruction naming the allowed answers,
    then the pair to classify with an empty ``Relation:`` field.

    Args:
        parent_argment: Text inserted as ``Arg1`` of the pair to classify.
        child_argument: Text inserted as ``Arg2`` of the pair to classify.

    Returns:
        The complete prompt string.
    """
    # (Arg1, Arg2, relation) demonstrations, in the order they appear in the prompt.
    demonstrations = (
        (
            "Even in the case of provocateurs, it can be an effective strategy to call their bluff, by offering them a chance to have a rational conversation. In this case, the failure to do so is their responsibility alone.",
            "No-platforming hinders productive discourse.",
            "attack",
        ),
        (
            "A country used to receiving ODA may be perpetually bound to depend on handouts (pp. 197).",
            "Government structures adapt to handle and distribute incoming ODA. As the funding from ODA is significant, countries have vested bureaucratic interest to remain bound to aid (pp. 197).",
            "support",
        ),
        (
            "Elections would limit the influence of lobbyists on the appointment of Supreme Court judges.",
            "The more individuals take part in a decision, as would be the case in a popular vote compared to a vote in the Senate, the harder it is to sway the outcome.",
            "support",
        ),
        (
            "ChatGPT will reach AGI level before 2030.",
            "To reach AGI it should be able to generate its own goals and intentions: where would it draw these from?",
            "attack",
        ),
    )

    # Each block opens with a newline: this yields the leading blank line of
    # the prompt and the blank separator between consecutive examples.
    shots = "".join(
        f"\nArg1: {arg1}\nArg2: {arg2}\nRelation: {relation}\n"
        for arg1, arg2, relation in demonstrations
    )

    # Instruction text is reproduced verbatim: it is part of the experiment.
    instruction = "What the relation between Arg1 and Arg2, respond with one word: support or attack:"

    # The space after the final "Relation:" is part of the prompt.
    return (
        f"{shots}\n---\n\n{instruction}\n"
        f"\nArg1: {parent_argment}\nArg2: {child_argument}\nRelation: \n"
    )

In [None]:
# Convert the 'binary' entry of the loaded dataset to a pandas DataFrame,
# the form in which it is handed to exps.run_from_df below.
binary_df = dataset['binary'].to_pandas()

In [None]:
# Run every configured model on the binary sample with the plain few-shot
# prompt, then chart the resulting scores.
few_shot_binary_results = exps.run_from_df(
    binary_df,
    binary_ibm_few_shot_prompting,
    relation_dim="binary",
)

plot_binary_results(
    few_shot_binary_results,
    title=(
        "Large Language Models for binary argumentative relation prediction "
        "over the IBM Debater preprocessed dataset sample using few shot prompting"
    ),
)

In [None]:
# Run every configured model on the binary sample with the instruction-augmented
# few-shot prompt, then chart the resulting scores.
augmented_few_shot_binary_results = exps.run_from_df(
    binary_df,
    binary_ibm_augmented_few_shot_prompting,
    relation_dim="binary",
)

# Title wording fixed from "few augmented shot" to "augmented few shot", the
# phrasing used by the matching ternary plot.
plot_binary_results(
    augmented_few_shot_binary_results,
    title=(
        "Large Language Models for binary argumentative relation prediction "
        "over the IBM Debater preprocessed dataset sample using augmented few shot prompting"
    ),
)

In [None]:
def ternary_ibm_few_shot_prompting(parent_argment: str, child_argument: str) -> str:
    """Build the few-shot prompt for ternary (support/attack/unrelated) prediction.

    Four labelled example pairs (one attack, one support, two unrelated) are
    followed by the pair to classify, whose ``Relation:`` field is left empty
    for the model to complete.

    Args:
        parent_argment: Text inserted as ``Arg1`` of the pair to classify.
        child_argument: Text inserted as ``Arg2`` of the pair to classify.

    Returns:
        The complete prompt string.
    """
    # (Arg1, Arg2, relation) demonstrations, in the order they appear in the prompt.
    demonstrations = (
        (
            "Even in the case of provocateurs, it can be an effective strategy to call their bluff, by offering them a chance to have a rational conversation. In this case, the failure to do so is their responsibility alone.",
            "No-platforming hinders productive discourse.",
            "attack",
        ),
        (
            "ChatGPT will reach AGI level before 2030.",
            "Government structures adapt to handle and distribute incoming ODA. As the funding from ODA is significant, countries have vested bureaucratic interest to remain bound to aid (pp. 197).",
            "unrelated",
        ),
        (
            "Elections would limit the influence of lobbyists on the appointment of Supreme Court judges.",
            "The more individuals take part in a decision, as would be the case in a popular vote compared to a vote in the Senate, the harder it is to sway the outcome.",
            "support",
        ),
        (
            "A country used to receiving ODA may be perpetually bound to depend on handouts (pp. 197).",
            "To reach AGI it should be able to generate its own goals and intentions: where would it draw these from?",
            "unrelated",
        ),
    )

    # Each block opens with a newline: this yields the leading blank line of
    # the prompt and the blank separator between consecutive examples.
    shots = "".join(
        f"\nArg1: {arg1}\nArg2: {arg2}\nRelation: {relation}\n"
        for arg1, arg2, relation in demonstrations
    )

    # The space after the final "Relation:" is part of the prompt.
    return f"{shots}\nArg1: {parent_argment}\nArg2: {child_argument}\nRelation: \n"

def ternary_ibm_augmented_few_shot_prompting(parent_argment: str, child_argument: str) -> str:
    """Build the few-shot prompt for ternary prediction with an explicit instruction.

    Same four labelled examples as the plain ternary few-shot prompt, followed
    by a ``---`` separator and a one-line instruction naming the allowed
    answers, then the pair to classify with an empty ``Relation:`` field.

    Args:
        parent_argment: Text inserted as ``Arg1`` of the pair to classify.
        child_argument: Text inserted as ``Arg2`` of the pair to classify.

    Returns:
        The complete prompt string.
    """
    # (Arg1, Arg2, relation) demonstrations, in the order they appear in the prompt.
    demonstrations = (
        (
            "Even in the case of provocateurs, it can be an effective strategy to call their bluff, by offering them a chance to have a rational conversation. In this case, the failure to do so is their responsibility alone.",
            "No-platforming hinders productive discourse.",
            "attack",
        ),
        (
            "ChatGPT will reach AGI level before 2030.",
            "Government structures adapt to handle and distribute incoming ODA. As the funding from ODA is significant, countries have vested bureaucratic interest to remain bound to aid (pp. 197).",
            "unrelated",
        ),
        (
            "Elections would limit the influence of lobbyists on the appointment of Supreme Court judges.",
            "The more individuals take part in a decision, as would be the case in a popular vote compared to a vote in the Senate, the harder it is to sway the outcome.",
            "support",
        ),
        (
            "A country used to receiving ODA may be perpetually bound to depend on handouts (pp. 197).",
            "To reach AGI it should be able to generate its own goals and intentions: where would it draw these from?",
            "unrelated",
        ),
    )

    # Each block opens with a newline: this yields the leading blank line of
    # the prompt and the blank separator between consecutive examples.
    shots = "".join(
        f"\nArg1: {arg1}\nArg2: {arg2}\nRelation: {relation}\n"
        for arg1, arg2, relation in demonstrations
    )

    # Instruction text is reproduced verbatim: it is part of the experiment.
    instruction = "What the relation between Arg1 and Arg2, respond with one word: support, attack, or unrelated:"

    # The space after the final "Relation:" is part of the prompt.
    return (
        f"{shots}\n---\n\n{instruction}\n"
        f"\nArg1: {parent_argment}\nArg2: {child_argument}\nRelation: \n"
    )

In [None]:
# Convert the 'ternary' entry of the loaded dataset to a pandas DataFrame,
# the form in which it is handed to exps.run_from_df below.
ternary_df = dataset['ternary'].to_pandas()

In [None]:
# Run every configured model on the ternary sample with the plain few-shot
# prompt, then chart the resulting scores.
few_shot_ternary_results = exps.run_from_df(
    ternary_df,
    ternary_ibm_few_shot_prompting,
    relation_dim="ternary",
)

plot_ternary_results(
    few_shot_ternary_results,
    title=(
        "Large Language Models for ternary argumentative relation prediction "
        "over the IBM Debater preprocessed dataset sample using few shot prompting"
    ),
)

In [None]:
# Run every configured model on the ternary sample with the instruction-augmented
# few-shot prompt, then chart the resulting scores.
augmented_few_shot_ternary_results = exps.run_from_df(
    ternary_df,
    ternary_ibm_augmented_few_shot_prompting,
    relation_dim="ternary",
)

plot_ternary_results(
    augmented_few_shot_ternary_results,
    title=(
        "Large Language Models for ternary argumentative relation prediction "
        "over the IBM Debater preprocessed dataset sample using augmented few shot prompting"
    ),
)

In [None]:
# Score tables of both binary runs, each tagged with the prompting strategy
# that produced it.
binary_f1 = pd.concat([
    augmented_few_shot_binary_results.f1_scores.assign(prompting="augmented_few_shot"),
    few_shot_binary_results.f1_scores.assign(prompting="few_shot"),
])

# Stacked metadata of both runs; retained for backward compatibility with
# anything that inspects it, but no longer used for the join below.
binary_metadata = pd.concat([
    augmented_few_shot_binary_results.metadata,
    few_shot_binary_results.metadata,
])

# Join each run's scores with that run's OWN metadata before stacking.
# Merging the stacked tables instead matches every score row against the
# metadata rows of both runs (each model key appears twice on the right side),
# which duplicates or cross-pairs rows in the exported file.
# NOTE(review): assumes metadata has no 'prompting' column and shares its join
# column(s) with f1_scores, as the plain pd.merge in the plotting helpers
# already relies on - confirm.
binary_results = pd.concat(
    [
        augmented_few_shot_binary_results.f1_scores
        .assign(prompting="augmented_few_shot")
        .merge(augmented_few_shot_binary_results.metadata),
        few_shot_binary_results.f1_scores
        .assign(prompting="few_shot")
        .merge(few_shot_binary_results.metadata),
    ],
    ignore_index=True,
)

binary_results.to_csv("binary_results.csv", index=False)

In [None]:
# Score tables of both ternary runs, each tagged with the prompting strategy
# that produced it.
ternary_f1 = pd.concat([
    augmented_few_shot_ternary_results.f1_scores.assign(prompting="augmented_few_shot"),
    few_shot_ternary_results.f1_scores.assign(prompting="few_shot"),
])

# Stacked metadata of both runs; retained for backward compatibility with
# anything that inspects it, but no longer used for the join below.
ternary_metadata = pd.concat([
    augmented_few_shot_ternary_results.metadata,
    few_shot_ternary_results.metadata,
])

# Join each run's scores with that run's OWN metadata before stacking.
# Merging the stacked tables instead matches every score row against the
# metadata rows of both runs (each model key appears twice on the right side),
# which duplicates or cross-pairs rows in the exported file.
# NOTE(review): assumes metadata has no 'prompting' column and shares its join
# column(s) with f1_scores, as the plain pd.merge in the plotting helpers
# already relies on - confirm.
ternary_results = pd.concat(
    [
        augmented_few_shot_ternary_results.f1_scores
        .assign(prompting="augmented_few_shot")
        .merge(augmented_few_shot_ternary_results.metadata),
        few_shot_ternary_results.f1_scores
        .assign(prompting="few_shot")
        .merge(few_shot_ternary_results.metadata),
    ],
    ignore_index=True,
)

ternary_results.to_csv("ternary_results.csv", index=False)