### CaseHOLD labeling

In this notebook, we run a labeling procedure on the test set of the CaseHOLD dataset. Each example consists of a "citing prompt" (a passage containing a `<HOLDING>` tag) and five multiple-choice answer options. We ask the LLM to read the prompt and the options, then select the option that best replaces the tag.

We test seven different LLMs: three Amazon Nova models (Micro, Lite, Pro) and four Meta Llama 3.2 models (1B, 3B, 11B, 90B). We record each model's responses for further analysis elsewhere.

In [None]:
import pandas as pd
from datasets import load_dataset
from langchain_aws import ChatBedrockConverse
from boto3 import client
from botocore.config import Config

# botocore settings for the Bedrock runtime client: read timeout raised to 1000
# (botocore expresses this value in seconds).
config = Config(read_timeout=1000)

# NOTE: this rebinds the name `client`, shadowing the boto3 factory imported
# above. Later cells pass this instance as `client=client`, so the name is kept.
client = client(
    service_name='bedrock-runtime',
    region_name="us-east-1",
    config=config,
)

# Load the "all" configuration of the CaseHOLD dataset.
ds = load_dataset(path="casehold/casehold", name="all")

In [None]:
def create_prompt(r):
    """Render the multiple-choice labeling prompt for one CaseHOLD row.

    Args:
        r: A row-like mapping (e.g. a pandas Series) exposing the keys
            'citing_prompt' and 'holding_0' .. 'holding_4'.

    Returns:
        The prompt text: the citing prompt in double quotes, the question,
        the five holdings labelled A-E, the answer-format instruction, and a
        trailing "REASONING: " cue for the model to continue from.
    """
    # Every line after the first starts with four spaces, and the two
    # "blank" lines consist of four spaces; this whitespace is part of the
    # prompt text sent to the models, so it is spelled out explicitly here.
    template = (
        'Input: "{}"\n'
        '    Question: Consider the appropriate legal grammar and reasoning. Select which of the following clauses is the best replacement for the <HOLDING> tag above? \n'
        '    A: {}\n'
        '    B: {}\n'
        '    C: {}\n'
        '    D: {}\n'
        '    E: {}\n'
        '    \n'
        '    First think it through. Then provide your answer formatted in backticks like ANSWER:`A` or ANSWER:`C`.\n'
        '    \n'
        '    REASONING: '
    )
    # Positional order of the placeholders: citing prompt first, then options A-E.
    fields = (
        'citing_prompt',
        'holding_0',
        'holding_1',
        'holding_2',
        'holding_3',
        'holding_4',
    )
    return template.format(*(r[key] for key in fields))

In [None]:
# Materialise the test split as a DataFrame and cast the label column to float.
test = ds['test'].to_pandas()
test = test.astype({'label': float})

# Render one prompt string per row; the labeling loop reads it from 'final_prompt'.
test['final_prompt'] = test.apply(create_prompt, axis=1)

In [None]:
# Disabled pre-initialisation of the per-model response columns. The labeling
# loop below assigns to these same column names via `test.loc[i, <column>]`.
#test['response_nova_micro'] = ''
#test['response_nova_lite'] = ''
#test['response_nova_pro'] = ''
#test['response_llama_1b'] = ''
#test['response_llama_3b'] = ''
#test['response_llama_11b'] = ''
#test['response_llama_90b'] = ''


In [None]:
def _bedrock_chat(model_id):
    """Return a ChatBedrockConverse for `model_id` with the shared settings.

    Every model uses the module-level Bedrock `client`, region "us-east-1",
    and temperature 0.
    """
    return ChatBedrockConverse(
        model=model_id,
        region_name="us-east-1",
        temperature=0,
        client=client,
    )


# Amazon Nova models.
llm_nova_micro = _bedrock_chat("amazon.nova-micro-v1:0")
llm_nova_lite = _bedrock_chat("amazon.nova-lite-v1:0")
llm_nova_pro = _bedrock_chat("amazon.nova-pro-v1:0")

# Meta Llama 3.2 instruct models.
llm_llama32_1b = _bedrock_chat("us.meta.llama3-2-1b-instruct-v1:0")
llm_llama32_3b = _bedrock_chat("us.meta.llama3-2-3b-instruct-v1:0")
llm_llama32_11b = _bedrock_chat("us.meta.llama3-2-11b-instruct-v1:0")
llm_llama32_90b = _bedrock_chat("us.meta.llama3-2-90b-instruct-v1:0")

In [None]:
# Output column -> model that fills it. Dict order is the invocation order and
# matches the original hand-written sequence (Nova micro/lite/pro, then Llama
# 1b/3b/11b/90b).
response_models = {
    'response_nova_micro': llm_nova_micro,
    'response_nova_lite': llm_nova_lite,
    'response_nova_pro': llm_nova_pro,
    'response_llama_1b': llm_llama32_1b,
    'response_llama_3b': llm_llama32_3b,
    'response_llama_11b': llm_llama32_11b,
    'response_llama_90b': llm_llama32_90b,
}

# Label every row with every model. `test.loc[i]` is a label lookup, so this
# relies on `test` having the index labels 0..n-1.
for i in range(0, test.shape[0]):

    print(i)
    row = test.loc[i]
    messages = [
        ("user", row['final_prompt'])
    ]

    for column, llm in response_models.items():
        # Resume support: if this cell already holds a non-empty response
        # (e.g. the loop was interrupted by a service error and the cell is
        # being re-run in the same kernel), do not invoke the model again.
        # A missing column, NaN, or '' all count as "not labeled yet".
        existing = row.get(column)
        if isinstance(existing, str) and existing:
            continue

        # Assigning through .loc creates the column on first use.
        test.loc[i, column] = llm.invoke(messages).content

In [None]:
# Persist the prompts and every model's raw responses for analysis elsewhere.
# `index` is left at its default, so the row index is written as the first CSV column.
test.to_csv(path_or_buf='casehold_test.csv')