# Preamble

In [None]:
from openai import OpenAI
import json
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import confusion_matrix

# OpenAI

In [None]:
# Chat model used for every request and embedded in the result file names.
# Swap in the commented line to run the same notebook with GPT-3.5 instead.
#MODEL= "gpt-3.5-turbo"
MODEL= "gpt-4-0613"

In [None]:
import os

# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# stored in the notebook itself. The 'ENTER-KEY' placeholder is kept as the
# fallback when the variable is unset or empty.
API_KEY = os.environ.get('OPENAI_API_KEY') or 'ENTER-KEY'

In [None]:
# Shared OpenAI client, authenticated with API_KEY; query() sends all requests through it.
client = OpenAI(api_key= API_KEY)

In [None]:
def query(question, verbose=False):
    """Ask the configured chat model a binary causal question.

    The request goes through the module-level ``client`` using ``MODEL``,
    with ``seed=0`` and ``temperature=0.0``. A system message instructs the
    model to answer with 'Yes' or 'No'; ``question`` is sent as the user
    message.

    Args:
        question: Text sent as the user message.
        verbose: When true, print the raw completion object before returning.

    Returns:
        The message content of the first choice in the completion.
    """
    system_message = {"role": "system", "content": "Answer binary causal questions with 'Yes' or 'No'."}
    user_message = {"role": "user", "content": question}

    response = client.chat.completions.create(
        model=MODEL,
        seed=0,
        temperature=0.0,
        messages=[system_message, user_message],
    )

    if verbose:
        print(response)

    first_choice = response.choices[0]
    return first_choice.message.content
    

# Smoke test: send one sample question with verbose=True so the raw completion is printed.
query("May smoking cause cancer?", verbose=True)

# Evaluation

In [None]:
def compute_metrics(labels, predictions):
    """Compute binary classification metrics, rounded to three decimals.

    Args:
        labels: Ground-truth values, one per example.
        predictions: Predicted values, aligned with ``labels``.

    Returns:
        A dict with the keys 'accuracy', 'f1_score', 'recall', 'precision',
        'tp', 'fn', 'fp' and 'tn' (in that insertion order).

    Raises:
        AssertionError: If ``labels`` and ``predictions`` differ in length.
    """
    assert len(labels) == len(predictions)

    metrics = {'accuracy': round(accuracy_score(labels, predictions), 3)}

    # These three scorers share the same call signature.
    binary_scorers = (
        ('f1_score', f1_score),
        ('recall', recall_score),
        ('precision', precision_score),
    )
    for metric_name, scorer in binary_scorers:
        metrics[metric_name] = round(scorer(labels, predictions, average='binary'), 3)

    # Confusion matrix over the classes [0, 1], flattened and unpacked as tn, fp, fn, tp.
    tn, fp, fn, tp = confusion_matrix(labels, predictions, labels=[0, 1]).ravel()
    for cell_name, count in (('tp', tp), ('fn', fn), ('fp', fp), ('tn', tn)):
        metrics[cell_name] = round(count, 3)

    return metrics

# MS MARCO

In [None]:
# Load the MS MARCO test set. The context manager closes the file handle as
# soon as parsing is done instead of leaving it open for the rest of the
# session. The encoding is given explicitly so the JSON is decoded the same
# way regardless of the platform's default encoding.
with open('../datasets/msmarco_test.json', encoding='utf-8') as f:
    data = json.load(f)

In [None]:
# Keep only the items whose extracted answer is not the
# 'No Answer Present.' marker.
filtered = [
    entry for entry in data
    if entry['answer:Extracted'] != ['No Answer Present.']
]

In [None]:
# Number of items kept after filtering.
len(filtered)

In [None]:
# Query the model once per question and store the answer on the item itself,
# wrapped in a single-element list like the 'answer:Extracted' field.
for entry in filtered:
    entry['answer:Predicted'] = [query(entry['question'])]

In [None]:
# Output file for the MS MARCO predictions; the model name is part of the file name.
FILE_MS_MARCO_RESULTS = f'msmarco_test_{MODEL}.json'

In [None]:
# Write the filtered items (including the model's answers) to the results file as JSON.
with open(FILE_MS_MARCO_RESULTS, 'w') as f:
    json.dump(filtered, f)

In [None]:
# Reload the stored results so the evaluation below works from the file on
# disk. The context manager closes the handle once the JSON has been parsed
# instead of leaving it open.
with open(FILE_MS_MARCO_RESULTS) as f:
    filtered = json.load(f)

In [None]:
# Gold labels: True when the first extracted answer is exactly 'Yes'.
labels = [entry['answer:Extracted'][0] == 'Yes' for entry in filtered]

In [None]:
# Predicted labels: True when the model's answer starts with "yes" (case-insensitive).
predictions = [
    entry['answer:Predicted'][0].lower().startswith('yes')
    for entry in filtered
]

In [None]:
# Score the MS MARCO predictions against the gold labels.
compute_metrics(labels, predictions)

# SemEval

In [None]:
# Load the SemEval test set; later cells read its `cause`, `effect` and `causal` columns.
df = pd.read_csv('../datasets/sem_test.csv')

In [None]:
# Turn each (cause, effect) pair into a "Can X cause Y?" question, ask the
# model, and attach the raw answers to the frame as a new column.
predictions = [
    query(f'Can {row.cause} cause {row.effect}?')
    for row in df.itertuples()
]
df['predictions'] = predictions

In [None]:
# Output file for the SemEval predictions; the model name is part of the file name.
FILE_SEMEVAL_RESULTS = f'sem_test_{MODEL}.csv'

In [None]:
# Save the SemEval frame, including the new `predictions` column, as CSV.
# NOTE(review): to_csv is called with its defaults, so the row index is presumably
# written as an extra unnamed column — confirm this is acceptable for consumers of the file.
df.to_csv(FILE_SEMEVAL_RESULTS)

In [None]:
# Reload the stored SemEval results so the evaluation below works from the file on disk.
df = pd.read_csv(FILE_SEMEVAL_RESULTS)

In [None]:
# Convert the gold annotations and the stored model answers into booleans.
# A label is True when the `causal` column equals 'causal'; a prediction is
# True when the answer starts with "yes" (case-insensitive).
#
# str() guards against answers that are no longer strings after the CSV round
# trip: pandas reads an empty field back as NaN (a float), which has no
# .lower() and would abort the evaluation. Such answers count as "not yes".
labels = []
predictions = []
for item in df.itertuples():
    labels.append(item.causal == 'causal')
    predictions.append(str(item.predictions).lower().startswith('yes'))

In [None]:
# Number of SemEval predictions collected.
len(predictions)

In [None]:
# Score the SemEval predictions against the gold labels.
compute_metrics(labels, predictions)