In [47]:
import json
import random
import requests
import concurrent.futures
from tqdm import tqdm
import re
import threading

In [69]:
API_URL = "https://api.deepseek.com/chat/completions"
API_KEY = ""
headers = {
    "Authorization": f"Bearer {API_KEY}",
    "Content-Type": "application/json"
}
NSHOT = 2
file_lock = threading.Lock()

In [3]:
def apply_template(example):
    formatted = []
    formatted.append("Observation 1: " + example['obs1'] + "\n")
    formatted.append("Observation 2: " + example['obs2'] + "\n")
    formatted.append("Possible Explanations:\n")
    formatted.append("1: " + example['hyp1'] + "\n")
    formatted.append("2: " + example['hyp2'] + "\n")
    formatted.append("Answer: ")
    return "".join(formatted)

def generate_few_shot_testcases(n): # for n shot
    # Load JSONL data
    x_train = []
    with open("alphanli-train-dev/train.jsonl") as f:
        for line in f:
            x_train.append(json.loads(line))

    y_train = []
    with open("alphanli-train-dev/train-labels.lst") as f:
        y_train = [int(line.strip()) for line in f]

    x_val = []
    with open("alphanli-train-dev/dev.jsonl") as f:
        for line in f:
            x_val.append(json.loads(line))

    y_val = []
    with open("alphanli-train-dev/dev-labels.lst") as f:
        y_val = [int(line.strip()) for line in f]

    prompts = []
    for testcase in x_val:
        prompt = []
        indices = random.sample(range(len(x_train)), n)
        for i in indices:
            example = x_train[i]
            label = y_train[i]
            prompt.append(apply_template(example))
            prompt.append(str(label) + "\n\n")
        prompt.append(apply_template(testcase))
        prompts.append("".join(prompt))
    return prompts, y_val

In [70]:
def query_deepseek(prompt):
    data = {
        "model": "deepseek-reasoner",  # Use 'deepseek-reasoner' for R1 model or 'deepseek-chat' for V3 model
        "messages": [
            {"role": "system", "content": "You are an AI model that selects the most plausible explanation for a given pair of observations. Given two observations and multiple possible explanations, choose the explanation that best follows logically from the observations. Respond only with the correct answer number."},
            {"role": "user", "content": prompt}
        ],
        "temperature": 0.5,
        "stream": False  # Disable streaming
    }
    
    response = requests.post(API_URL, headers=headers, json=data)
    return response.json()

def process_prompt(index, prompt):
    result = query_deepseek(prompt)
    
    # Save to .jsonl immediately
    with file_lock:
        with open("deepseek_r1_results.jsonl", "a") as f:
            # Include metadata for retries
            line = json.dumps({
                "index": index,
                "response": result,
                "success": "error" not in result
            })
            f.write(line + "\n")  # JSONL requires one JSON object per line
    return result

In [72]:
prompts, labels = generate_few_shot_testcases(NSHOT)
for i, prompt in enumerate(prompts):
    with open("prompts.jsonl", "a") as f:
        line = json.dumps({
            "index": i,
            "prompt": prompt,
        })
        f.write(line + "\n")

In [74]:
results = []
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
    # Use executor.submit to pass multiple arguments
    futures = [executor.submit(process_prompt, index, prompt) for index, prompt in enumerate(prompts)]
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        result = future.result()  # Wait for completion
        results.append(result)

  0%|▎                                                                                                                                            | 3/1532 [10:00<84:58:39, 200.08s/it]


JSONDecodeError: Expecting value: line 10 column 1 (char 9)

In [76]:
results = []
with open("deepseek_r1_results.jsonl") as f:
    for line in f:
        results.append(json.loads(line))

In [80]:
y_val = []
with open("alphanli-train-dev/dev-labels.lst") as f:
    y_val = [int(line.strip()) for line in f]

labels = []
for result in results:
    labels.append(y_val[result['index']])

In [84]:
predictions = [int(re.search(r"\d+", result['response']['choices'][0]['message']['content']).group()) for result in results]

In [85]:
accuracy = sum(a == b for a, b in zip(predictions, labels)) / len(labels)

In [86]:
print(accuracy)

0.873769024171889
