## Setup

In [1]:
# Load the `dotenv` IPython extension and read environment variables from ../.env
# (presumably this is where HUGGINGFACEHUB_API_TOKEN comes from — confirm).
%load_ext dotenv
%dotenv ../.env

In [2]:
import os
# Fail fast if the Hugging Face API token is not present in the environment.
assert "HUGGINGFACEHUB_API_TOKEN" in os.environ

In [3]:
from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint

# Text-generation endpoint for zephyr-7b-beta: sampling enabled at temperature 0.4,
# at most 512 newly generated tokens per call.
llm = HuggingFaceEndpoint(
    repo_id="HuggingFaceH4/zephyr-7b-beta",
    task="text-generation",
    max_new_tokens=512,
    do_sample=True,
    temperature=0.4,
)

# Chat wrapper around the endpoint; this is the object handed to the chains below.
chat_model = ChatHuggingFace(llm=llm)

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /Users/ggbetz/.cache/huggingface/token
Login successful


In [4]:
# test API
#from langchain_core.prompts import ChatPromptTemplate
#chain = ChatPromptTemplate.from_messages([("user","Tell me a joke about {x}!")]) | chat_model
#chain.invoke({"x":"kids"})

## Tests

In [5]:
import random 

# Read one tag per line, stripping trailing whitespace (including the newline).
with open('../data/tags.txt') as file:
    tags = [line.rstrip() for line in file]
# Unseeded draw: the 8 tags used for this smoke test change from run to run.
testtags = random.sample(tags, 8)
print(f"Loaded {len(tags)} tags. Choose: {testtags}.")


Loaded 300 tags. Choose: ['Copyright', 'Art', 'Basketball', 'Police', 'HumanRights', 'Protest', 'Religion', 'FBI'].


In [6]:
from syncialo.chains.debate_design import SuggestTopicsChain

# Build the topic-suggestion chain on top of the chat model defined in the setup section.
chain = SuggestTopicsChain.build(chat_model)

In [7]:
import ujson

from langchain.globals import set_verbose

# Switch on LangChain's global verbose flag for the call below.
set_verbose(True)

# Request 5 debate topics for the randomly sampled test tags.
output = chain.invoke({
    "tags": testtags,
    "debates_per_tag_cluster": 5
})

# Pretty-print the chain output as JSON.
print(ujson.dumps(output, indent=2))

[
  {
    "idx": 1,
    "topic": "Should artists have the right to copyright their street art, and how should police handle the removal or destruction of these works?"
  },
  {
    "idx": 2,
    "topic": "Should the FBI investigate cases of perceived religious extremism, and how should human rights be balanced in such investigations?"
  },
  {
    "idx": 3,
    "topic": "Is basketball a form of protest, and should athletes be encouraged to speak out on social and political issues through their sports platform?"
  },
  {
    "idx": 4,
    "topic": "Should human rights be considered a higher priority than police actions in cases of deadly force?"
  },
  {
    "idx": 5,
    "topic": "Should artists' right to freedom of expression take precedence over copyright laws, and should religious organizations be able to claim copyright on religious texts and artwork?"
  }
]


## Pipeline

1. partition 300 tags:

    * 280 universal tags
    * 10 validation tags
    * 10 test tags

2. determine debate tag-clusters 

    * train: 100 clusters composed of universal tags
    * eval: 5 mixed tag-clusters with 50% eval tags, 50% universal tags each
    * test: 5 mixed tag-clusters with 50% test tags, 50% universal tags each

3. topics and motions

4. recursively generate balanced argument tree
    


In [7]:
# Number of tags drawn for every tag cluster.
TAGS_PER_CLUSTER = 8
# Number of topics requested per tag cluster; also the divisor that turns a
# split size into a number of clusters (split size // debates per cluster).
DEBATES_PER_TAG_CLUSTER = 10
# Target number of debates per split.
TRAIN_SPLIT_SIZE = 1000
EVAL_SPLIT_SIZE = 50
TEST_SPLIT_SIZE = 50
# Candidate degree configurations; one is picked at random for each topic and
# later passed to the debate builder as `degree_config`.
# Presumably each entry is the branching degree at one depth level — TODO confirm against the builder.
DEGREE_CONFIGS = [
    [6,6,1,0],
    [5,5,2,0],
    [3,2,2,1,1,0],
    [4,3,2,1,0],
    [3,4,2,1,0]     
]


## Tags

In [8]:
# Read one tag per line, stripping trailing whitespace.
# NOTE(review): the test section reads '../data/tags.txt' while this cell reads
# 'tags.txt' relative to the working directory — confirm both refer to the same file.
with open('tags.txt') as file:
    tags = [line.rstrip() for line in file]
len(tags)

300

In [9]:
import random

# Split the tag list into three disjoint pools using one RNG seeded with 42:
# 10 eval tags are drawn first, then 10 test tags from what is left; everything
# that was not drawn stays in the universal pool.
random_tagging = random.Random(42)

tags_eval = random_tagging.sample(tags, k=10)
tags_universal = [tag for tag in tags if tag not in tags_eval]

tags_test = random_tagging.sample(tags_universal, k=10)
tags_universal = [tag for tag in tags_universal if tag not in tags_test]

# Sanity check: the three pools together cover every tag.
assert set(tags) == set(tags_universal) | set(tags_test) | set(tags_eval)

In [10]:
# Build the tag clusters that seed topic generation, one list of
# {"tags": [...], "split": ...} records per split.
#
# Seeding: train cluster i keeps its own RNG seeded with 42+i (one draw per RNG).
# The eval and test splits each get a dedicated RNG stream. Re-creating
# random.Random(42+i) for every single draw, as this cell did before, makes the
# draws correlated: the same seed with the same call yields the same universal
# tags for eval cluster i and test cluster i, the same index positions inside
# the held-out pools, and the same shuffle order.

def _mixed_tag_clusters(held_out_tags, split, n_clusters, rng):
    """Return `n_clusters` clusters mixing universal and held-out tags half and half.

    held_out_tags: pool of split-specific tags (eval or test tags).
    split: value stored under the "split" key of every record.
    n_clusters: number of clusters to create.
    rng: random.Random instance that all draws of this split are taken from.
    """
    half = TAGS_PER_CLUSTER // 2
    clusters = []
    for _ in range(n_clusters):
        tc = rng.sample(tags_universal, k=half)
        tc += rng.sample(held_out_tags, k=half)
        rng.shuffle(tc)  # interleave universal and held-out tags
        clusters.append({"tags": tc, "split": split})
    return clusters


# train tag clusters
n = TRAIN_SPLIT_SIZE // DEBATES_PER_TAG_CLUSTER
tag_clusters_train = []
for i in range(n):
    tc = random.Random(42+i).sample(tags_universal, k=TAGS_PER_CLUSTER)
    tag_clusters_train.append({"tags":tc, "split":"train"})

# eval tag clusters (string seeds cannot collide with the integer train seeds)
n = EVAL_SPLIT_SIZE // DEBATES_PER_TAG_CLUSTER
tag_clusters_eval = _mixed_tag_clusters(
    tags_eval, "eval", n, random.Random("42-eval")
)

# test tag clusters
n = TEST_SPLIT_SIZE // DEBATES_PER_TAG_CLUSTER
tag_clusters_test = _mixed_tag_clusters(
    tags_test, "test", n, random.Random("42-test")
)

## Topics and motions

In [11]:
import os
import pandas as pd
import random
from tqdm import tqdm
import uuid

import queries

if os.path.isdir(corpus_path):
    raise ValueError(f"Directory {corpus_path} exists. Delete before creating topics and motions.")
os.makedirs(corpus_path)

metadata = []

logger.info("Creating topics")
for tags in tqdm(tag_clusters_train+tag_clusters_eval+tag_clusters_test):
    topics = await queries.suggest_topics(
        model=model,
        tags=tags["tags"],
        debates_per_tag_cluster=DEBATES_PER_TAG_CLUSTER,
        decoder="sample",
        temperature=.6,
    )
    for topic in topics:
        degree_config = random.choice(DEGREE_CONFIGS)
        metadata.append({**tags, "topic": topic, "degree_config": degree_config})

        
logger.info("Creating motions")
for record in tqdm(metadata):
    motion = await queries.suggest_motion(
        model=model,
        tags=record["tags"],
        topic=record["topic"],
        decoder="sample",
        temperature=.6,
    )
    record.update({"motion": motion, "uid": str(uuid.uuid4())})

    
df_metadata = pd.DataFrame(metadata)
df_metadata.to_csv(os.path.join(corpus_path,"metadata.csv"), index=False)

df_metadata.head()    

ValueError: Directory ./debates/SOLAR-10.7B-Instruct-v1.0-AWQ exists. Delete before creating topics and motions.

In [None]:
# number of claims in each debate
import math 

def n_claims(profile):
    """Return the number of claims implied by a degree profile.

    Every entry of `profile` is doubled and used as the branching factor of one
    level; the result is the sum of the cumulative products of these factors,
    i.e. the sizes of all levels added up. An empty profile yields 0.
    (The doubling presumably stands for one pro and one con side per degree —
    confirm against the debate builder.)
    """
    total = 0
    level_size = 1  # number of claims on the current level
    for degree in profile:
        level_size *= 2 * degree
        total += level_size
    return total

# Number of claims per debate, derived from each row's degree configuration.
# NOTE(review): the generation cells parse `degree_config` with literal_eval after
# reading metadata.csv, so this presumably needs the in-memory frame whose
# column still holds lists — confirm before running on a reloaded frame.
nc = df_metadata.degree_config.apply(n_claims)

# Total claim count for each split.
for split in ["train", "eval", "test"]:
    print(f"{split}: {nc[df_metadata.split.eq(split)].sum()}")

## Debate Generation

In [None]:
# multithreaded

from ast import literal_eval
import asyncio
import json
import os   # used for all corpus paths below
import sys  # needed for the stderr log sink; not imported anywhere else in the notebook
import networkx as nx
import pandas as pd
import uuid

from loguru import logger

from builder import DebateBuilder, to_kialo

# Upper bound on the number of debates that are built concurrently.
BATCH_SIZE = 128

# Replace the existing log handlers with a timestamped log file plus stderr.
logger.remove()
logger.add("debate_creation_{time}.log", level="INFO")
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")

# NOTE(review): `model` and `corpus_path` are not defined in any cell visible
# in this notebook; they must be set before this cell is run.
builder = DebateBuilder(
    tags_universal=tags_universal,
    tags_per_cluster=TAGS_PER_CLUSTER,
    model=model,
)

# Reload topics and motions from disk; list-valued columns come back as strings.
df_metadata = pd.read_csv(os.path.join(corpus_path,"metadata.csv"))


async def build(i, row):
    """Build one debate and store it as a Kialo text file and a node-link JSON file.

    i: index of the metadata row, only used in log messages (reported as i+1).
    row: one row of `df_metadata`; its split, uid, topic, motion, tags and
        degree_config fields are read.

    Nothing is built if either output file already exists; a warning is logged
    and the coroutine returns None.
    """
    kialo_file_path = os.path.join(corpus_path, "kialo", row.split, f"{row.uid}.txt")
    json_file_path = os.path.join(corpus_path, "json", row.split, f"{row.uid}.json")

    # Skip debates that already have output, checking the Kialo file first.
    for existing_path in (kialo_file_path, json_file_path):
        if os.path.exists(existing_path):
            logger.warning(f"File '{existing_path}' exists, skipping debate #{i+1}.")
            return

    logger.info(f"Building debate #{i+1} (of {len(df_metadata)}) with topic '{row.topic}' and central claim '{row.motion}' ...")

    # NOTE(review): degree_config is parsed from its CSV string form, whereas
    # row.tags is passed on unparsed — confirm the builder expects a string here.
    tree = await builder.build_debate(
        root_claim=row.motion,
        topic=row.topic,
        tag_cluster=row.tags,
        degree_config=literal_eval(row.degree_config),
    )

    os.makedirs(os.path.dirname(kialo_file_path), exist_ok=True)
    with open(kialo_file_path, 'w') as kialo_file:
        kialo_file.writelines(f"{line}\n" for line in to_kialo(tree, topic=row.topic))

    os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
    with open(json_file_path, 'w') as json_file:
        json.dump(nx.node_link_data(tree), json_file)

        
# At most BATCH_SIZE build() coroutines hold the semaphore at any time.
sem = asyncio.Semaphore(BATCH_SIZE)

async def safe_build(i, row):
    """Run build(i, row) once a semaphore slot is available and return its result."""
    async with sem:  # semaphore limits num of simultaneous debate builds
        return await build(i, row)


# One task per metadata row; all tasks are scheduled up front, the semaphore
# throttles how many of them actually build at once.
tasks = [
    asyncio.ensure_future(safe_build(i, row))  # creating task starts coroutine
    for i, row 
    in df_metadata.iterrows()
]
await asyncio.gather(*tasks)  # await moment all debates built    
    


2024-01-22T09:41:50.046323+0000 INFO Building debate #4 (of 1100) with topic 'The Ethics of Genetic Engineering: Can it Enhance or Endanger Future Generations?' and central claim 'Genetic engineering, when strictly regulated and ethically applied, has the potential to enhance future generations without causing significant endangerment to human life and values.' ...
tokenizer_config.json: 100%|██████████| 1.41k/1.41k [00:00<00:00, 9.62MB/s]
tokenizer.model: 100%|██████████| 493k/493k [00:00<00:00, 59.0MB/s]
tokenizer.json: 100%|██████████| 1.80M/1.80M [00:00<00:00, 55.8MB/s]
special_tokens_map.json: 100%|██████████| 551/551 [00:00<00:00, 4.30MB/s]
2024-01-22T09:41:50.829504+0000 INFO Building debate #5 (of 1100) with topic 'Parental Controls in the Digital Age: Should Apple Play a Greater Role or Leave it to Parents?' and central claim 'Apple should significantly enhance parental controls in their digital products to ensure a safer online environment for children, beyond the responsibil

In [None]:
# no threading

from ast import literal_eval
import json
import os   # used for all corpus paths below
import sys  # needed for the stderr log sink; not imported anywhere else in the notebook
import networkx as nx
import pandas as pd
import uuid

from loguru import logger

from builder import DebateBuilder, to_kialo

# Replace the existing log handlers with a timestamped log file plus stderr.
logger.remove()
logger.add("debate_creation_{time}.log", level="INFO")
logger.add(sys.stderr, format="{time} {level} {message}", level="INFO")

# NOTE(review): `model` and `corpus_path` are not defined in any cell visible
# in this notebook; they must be set before this cell is run.
builder = DebateBuilder(
    tags_universal=tags_universal,
    tags_per_cluster=TAGS_PER_CLUSTER,
    model=model,
)

# Reload topics and motions from disk; list-valued columns come back as strings.
df_metadata = pd.read_csv(os.path.join(corpus_path,"metadata.csv"))

for i, row in df_metadata.iterrows():

    kialo_file_path = os.path.join(corpus_path,"kialo",row.split,f"{row.uid}.txt") 
    json_file_path = os.path.join(corpus_path,"json",row.split,f"{row.uid}.json") 

    if os.path.exists(kialo_file_path):
        logger.warning(f"File '{kialo_file_path}' exists, skipping debate #{i+1}.")
        continue
    if os.path.exists(json_file_path):
        logger.warning(f"File '{json_file_path}' exists, skipping debate #{i+1}.")
        continue

    logger.info(f"Building debate #{i+1} (of {len(df_metadata)}) with topic '{row.topic}' and central claim '{row.motion}' ...")    
        
    degree_config = literal_eval(row.degree_config)
        
    tree = await builder.build_debate(
        root_claim=row.motion,
        topic=row.topic,
        tag_cluster=row.tags,
        degree_config=degree_config,
    )

    os.makedirs(os.path.dirname(kialo_file_path), exist_ok=True)
    with open(kialo_file_path, 'w') as f:
        for line in to_kialo(tree, topic=row.topic):
            f.write(f"{line}\n")

    os.makedirs(os.path.dirname(json_file_path), exist_ok=True)
    with open(json_file_path, 'w') as f:
        json.dump(nx.node_link_data(tree), f)
    
    


In [None]:
def to_kialo(tree, topic = ""):
    """Render an argument tree as a list of Kialo-style text lines.

    tree: directed graph whose nodes carry a "claim" attribute and whose edges
        carry a "valence" attribute; edges point from an argument to the claim
        it addresses. The root is the first node without outgoing edges.
    topic: text placed in the "Discussion Title" header line.

    Returns the header line, an empty line, and one line per claim in
    depth-first order. Each line starts with a hierarchical counter ("1.",
    "1.1.", "1.1.2.", ...) followed by " PRO: " or " CON: " for arguments; the
    root (and any edge whose valence is None) only gets a single space.
    """
    lines = [f"Discussion Title: {topic}", ""]

    def emit(node_id, prefix, valence = None):
        # Marker between counter and claim text.
        if valence is None:
            marker = " "
        elif valence == queries.PRO:
            marker = " PRO: "
        else:
            marker = " CON: "

        lines.append(prefix + marker + tree.nodes[node_id]["claim"])

        # Arguments addressing this claim are numbered 1, 2, ... below its counter.
        for position, (child, _, attrs) in enumerate(tree.in_edges(node_id, data=True), start=1):
            emit(child, f"{prefix}{position}.", attrs['valence'])

    root_id = next(n for n in tree.nodes if len(tree.out_edges(n))==0)
    emit(root_id, "1.")

    return lines
    
# Preview the Kialo rendering of the global `tree` (the non-threaded generation
# loop leaves it set to the last debate built). No topic is passed, so the
# title line stays empty.
print("\n".join(to_kialo(tree)))

# Appendices

## LMQL Queries Tests

In [None]:
import queries

In [None]:
# test topic suggestion
# NOTE(review): `model` is not defined in any cell visible in this notebook.
result = await queries.suggest_topics(
    model=model,
    tags=tag_clusters_train[0]["tags"],
    debates_per_tag_cluster=DEBATES_PER_TAG_CLUSTER,
)
result

In [None]:
# test motion suggestion for a fixed topic
result = await queries.suggest_motion(
    model=model,
    tags=tag_clusters_train[0]["tags"],
    topic="The Effects of Lockdowns on Children's Mental Health: A Worldwide Analysis."
)
import pprint
pprint.pprint(result)

In [None]:
# test premise identification for a supporting (PRO) argument
result = await queries.identify_premises(
    model=model,
    argument="Racial and Ethnic Disparities: Systemic racism has historically restricted educational opportunities for certain minority communities, such as Black, Hispanic, and Native American students, who continue to face barriers to accessing higher education.",
    conclusion="Improved Equity and Inclusion: Free college education addresses historical disparities faced by underrepresented groups in higher education, promoting inclusivity and equal opportunity across various racial, ethnic, gender, and socioeconomic identities.",
    valence=queries.PRO,
)

In [None]:
pprint.pprint(result)

In [None]:
# test premise identification with CON valence
result = await queries.identify_premises(
    model=model,
    argument="Improved Equity and Inclusion: Free college education addresses historical disparities faced by underrepresented groups in higher education, promoting inclusivity and equal opportunity across various racial, ethnic, gender, and socioeconomic identities.",
    conclusion="College education should be free.",
    valence=queries.CON,
)

In [None]:
pprint.pprint(result)

In [None]:
# fixed premise list used as input by the remaining query tests
premises = ['Free college education addresses historical disparities in higher education.',
 'Promoting inclusivity and equal opportunity is important across various '
 'racial, ethnic, gender, and socioeconomic identities.',
 'Underrepresented groups face challenges in higher education access.',
 'Improving equity and inclusion in higher education can be achieved through '
 'free college education.']

In [None]:
# test plausibility ranking (beam decoding, n=2); only the first result is shown
result = await queries.rank_by_plausibility(
    model=model,
    premises=premises,
    tags=tag_clusters_train[0]["tags"],
    decoder="beam",
    n=2,
)
print(result[0])

In [None]:
# test supporting arguments for premise index 1 (n=4, sampled)
result = await queries.supporting_argument(
    model=model,
    premises=premises,
    target_idx=1,
    n=4,
    tags=tag_clusters_train[0]["tags"],
    decoder="sample",
    temperature=.6,
)
pprint.pprint(result)

In [None]:
# test supporting argument for premise index 2 (no explicit n)
result = await queries.supporting_argument(
    model=model,
    premises=premises,
    target_idx=2,
    tags=tag_clusters_train[0]["tags"],
    decoder="sample",
    temperature=.6,    
)
pprint.pprint(result)

In [None]:
# test attacking argument for premise index 2
result = await queries.attacking_argument(
    model=model,
    premises=premises,
    target_idx=2,
    tags=tag_clusters_train[0]["tags"],
    decoder="sample",
    temperature=.6,
)
pprint.pprint(result)

In [None]:
# back of envelope: args per debate

import math 

# Degree profile under consideration; the alternatives are listed in the trailing comment.
profile = [6,6,1,0] # [6,6,1,0]  [5,5,2,0]  [3,2,2,1,1,0] # [4,3,2,1,0] [3,4,2,1,0] 
# Each degree is doubled to get the branching factor of its level.
profile = [2*degree for degree in profile]

total = 0
level_size = 1  # number of arguments on the current level
for branching in profile:
    level_size *= branching
    total += level_size
    print(total)  # running total after each level
total

In [None]:
def print_argtree(tree):
    """Print an argument tree as an indented outline.

    tree: directed graph whose nodes carry a "claim" attribute and whose edges
        carry a "valence" attribute; edges point from an argument to the claim
        it addresses. The root is the first node without outgoing edges.

    The root is printed without indentation; every further level is indented
    by two more spaces and prefixed with "+ " for PRO and "- " for any other
    valence (no prefix if the valence is None).
    """

    def show(node_id, depth, valence = None):
        if valence is None:
            marker = ""
        elif valence == queries.PRO:
            marker = "+ "
        else:
            marker = "- "

        print(" " * depth + marker + tree.nodes[node_id]["claim"])

        # Recurse into the arguments addressing this claim.
        for child, _, attrs in tree.in_edges(node_id, data=True):
            show(child, depth + 2, attrs['valence'])

    root_id = next(n for n in tree.nodes if len(tree.out_edges(n))==0)
    show(root_id, 0)
    
# Outline of the global `tree` (the non-threaded generation loop leaves it set
# to the last debate built).
print_argtree(tree)