In [10]:
import os
import fnmatch
import json 
from sklearn.metrics import r2_score

def find_files_with_pattern(directory, pattern):
    """Recursively collect files under ``directory`` whose name matches ``pattern``.

    Args:
        directory: Root folder to walk (sub-folders are included).
        pattern: Shell-style wildcard (``fnmatch`` syntax) tested against the
            bare file name, not the full path.

    Returns:
        A list of paths (each one the walked folder joined with the file
        name), in the order ``os.walk`` yields them.
    """
    return [
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk(directory)
        for name in names
        if fnmatch.fnmatch(name, pattern)
    ]
def exact_match(actual, predicted):
    """Compare a prediction with the ground truth, as a whole and per part.

    Both values are split on ``'.'``; the first piece is compared for the
    ``user`` flag and the second piece for the ``assistant`` flag.
    NOTE(review): presumably the answers look like
    "<user part>.<assistant part>" — confirm against the dataset, since any
    extra '.' inside a part shifts the pieces.

    Args:
        actual: Ground-truth answer.
        predicted: Model answer.

    Returns:
        A ``(both, user, assistant)`` tuple of booleans:
        ``both`` is True when the two values are equal; ``user`` when the
        text before the first '.' matches; ``assistant`` when the text
        between the first and second '.' matches. If either value is not a
        string, ``user`` and ``assistant`` are False. If either value has no
        '.', ``assistant`` is False (even when ``both`` is True).
    """
    both = bool(actual == predicted)

    # Non-string values (e.g. a missing response loaded as None) cannot be
    # split into parts; report no partial match instead of relying on a
    # catch-all exception handler.
    if not (isinstance(actual, str) and isinstance(predicted, str)):
        return both, False, False

    actual_parts = actual.split('.')
    predicted_parts = predicted.split('.')

    # str.split always returns at least one element, so index 0 is safe.
    user = predicted_parts[0] == actual_parts[0]

    # The second part only exists when the string contains a '.'.
    assistant = (
        len(actual_parts) > 1
        and len(predicted_parts) > 1
        and predicted_parts[1] == actual_parts[1]
    )
    return both, user, assistant

def evaluate_CaSiNo(directory, pattern):
    """Print exact-match accuracy for every result file matching ``pattern``.

    Each matching file is loaded with ``json.load`` (so it must contain one
    JSON document holding a sequence of records, whatever its extension) and
    every record's ``'response'`` is scored against its ``'ground_truth'``
    with ``exact_match``. One line per file is printed with the fraction of
    records where both parts, the user part, and the assistant part matched,
    rounded to two decimals.

    Args:
        directory: Root folder searched recursively for result files.
        pattern: ``fnmatch``-style wildcard applied to file names.

    Returns:
        None. Results are only printed.
    """
    files = find_files_with_pattern(directory, pattern)
    print(files)
    for file in files:
        with open(file, 'r') as fin:
            data = json.load(fin)

        # Use os.path so the label is the file name minus its extension on
        # any platform, and is not cut at a '.' inside the name itself.
        name = os.path.splitext(os.path.basename(file))[0]

        total = len(data)
        if total == 0:
            # Nothing to average over; avoid dividing by zero.
            print(f"The accuracy for {name}:\tno examples found, skipping")
            continue

        both, user, assistant = 0, 0, 0
        for solution in data:
            b, u, a = exact_match(solution['ground_truth'], solution['response'])
            if b:
                both += 1
            if u:
                user += 1
            if a:
                assistant += 1
        print(f"The accuracy for {name}:\tBoth: {round(both/total, 2)}\tUser: {round(user/total, 2)}\tAssistant: {round(assistant/total, 2)}")

In [11]:
# Report exact-match accuracy for every CaSiNo result file found under ./controls.
evaluate_CaSiNo(directory='./controls', pattern='CaSiNo *.jsonl')

['./controls/CaSiNo Shallow Llama-3 1B.jsonl', './controls/CaSiNo Middle Llama-3 1B.jsonl', './controls/CaSiNo Deep Llama-3 1B.jsonl', './controls/CaSiNo Shallow Llama-3 3B.jsonl', './controls/CaSiNo Deep Llama-3 8B.jsonl', './controls/CaSiNo Middle Llama-3 3B.jsonl', './controls/CaSiNo Shallow Llama-3 8B.jsonl', './controls/CaSiNo Deep Llama-3 3B.jsonl', './controls/CaSiNo Middle Llama-3 8B.jsonl']
The accuracy for CaSiNo Shallow Llama-3 1B:	Both: 0.0	User: 0.0	Assistant: 0.0
The accuracy for CaSiNo Middle Llama-3 1B:	Both: 0.07	User: 0.3	Assistant: 0.23
The accuracy for CaSiNo Deep Llama-3 1B:	Both: 0.0	User: 0.03	Assistant: 0.0
The accuracy for CaSiNo Shallow Llama-3 3B:	Both: 0.2	User: 0.5	Assistant: 0.53
The accuracy for CaSiNo Deep Llama-3 8B:	Both: 0.13	User: 0.27	Assistant: 0.3
The accuracy for CaSiNo Middle Llama-3 3B:	Both: 0.2	User: 0.4	Assistant: 0.37
The accuracy for CaSiNo Shallow Llama-3 8B:	Both: 0.3	User: 0.53	Assistant: 0.63
The accuracy for CaSiNo Deep Llama-3 3B:	Bo

In [18]:
# Score predicted buyer/seller prices (R^2) for each CraigslistBargain result file.
# NOTE(review): the token positions below presumably correspond to a fixed
# sentence template used by both 'ground_truth' and 'response' — confirm
# against the code that generates these files.
BUYER_PRICE_TOKEN = -2   # buyer price is read from the second-to-last space-separated token
SELLER_PRICE_TOKEN = 7   # seller price is read from the eighth space-separated token
FALLBACK_PRICE = 10      # value substituted when a response cannot be parsed


def _price_at(text, token_index):
    """Return the integer at ``token_index`` of ``text`` split on single spaces.

    Commas are removed before conversion, which covers both thousands
    separators and a comma trailing the number. Raises if the token is
    missing or not an integer.
    """
    return int(text.split(' ')[token_index].replace(',', ''))


def _predicted_price(item, token_index):
    """Parse a price from ``item['response']``, or return ``FALLBACK_PRICE``.

    Only the failures that a missing or malformed response can produce are
    caught, so unrelated errors (e.g. an interrupt) still surface.
    """
    try:
        return _price_at(item['response'], token_index)
    except (AttributeError, IndexError, KeyError, TypeError, ValueError):
        return FALLBACK_PRICE


files = find_files_with_pattern('./controls', 'CRAIGSLISTBARGAIN *.jsonl')
for file in files:
    with open(file, 'r') as fin:
        data = json.load(fin)
    buyer_price_actual, buyer_price_predicted, seller_price_actual, seller_price_predicted = [], [], [], []

    # File name without directory or extension, independent of the path separator.
    run_name = os.path.splitext(os.path.basename(file))[0]
    if not data:
        # An empty file has nothing to score.
        print(f'{run_name} \t++>\t no examples found, skipping')
        continue

    for item in data:
        # Ground truth is parsed strictly: a malformed label should fail loudly.
        buyer_price_actual.append(_price_at(item['ground_truth'], BUYER_PRICE_TOKEN))
        buyer_price_predicted.append(_predicted_price(item, BUYER_PRICE_TOKEN))
        seller_price_actual.append(_price_at(item['ground_truth'], SELLER_PRICE_TOKEN))
        seller_price_predicted.append(_predicted_price(item, SELLER_PRICE_TOKEN))

    buyer_r2 = round(r2_score(buyer_price_actual, buyer_price_predicted), 2)
    seller_r2 = round(r2_score(seller_price_actual, seller_price_predicted), 2)
    print(f'{run_name} \t++>\t R^2 Score for Buyer: {buyer_r2}\tR^2 Score for Seller: {seller_r2}')

CRAIGSLISTBARGAIN Shallow Llama-3 1B 	++>	 R^2 Score for Buyer: -0.54	R^2 Score for Seller: -0.48
CRAIGSLISTBARGAIN Shallow Llama-3 3B 	++>	 R^2 Score for Buyer: 0.59	R^2 Score for Seller: 0.05
CRAIGSLISTBARGAIN Middle Llama-3 8B 	++>	 R^2 Score for Buyer: 0.95	R^2 Score for Seller: 0.98
CRAIGSLISTBARGAIN Deep Llama-3 3B 	++>	 R^2 Score for Buyer: 0.95	R^2 Score for Seller: 0.98
CRAIGSLISTBARGAIN Deep Llama-3 8B 	++>	 R^2 Score for Buyer: 0.93	R^2 Score for Seller: 0.97
CRAIGSLISTBARGAIN Deep Llama-3 1B 	++>	 R^2 Score for Buyer: 0.96	R^2 Score for Seller: 0.98
CRAIGSLISTBARGAIN Shallow Llama-3 8B 	++>	 R^2 Score for Buyer: 0.93	R^2 Score for Seller: 0.99
CRAIGSLISTBARGAIN Middle Llama-3 1B 	++>	 R^2 Score for Buyer: 0.78	R^2 Score for Seller: 0.88
CRAIGSLISTBARGAIN Middle Llama-3 3B 	++>	 R^2 Score for Buyer: 0.97	R^2 Score for Seller: 0.98
