In [None]:
import json
import os
import re
import time

import ollama
from sklearn.metrics import accuracy_score, classification_report

# Evaluate an Ollama-served model on a True/False claim-classification task.
#
# The script asks the model to label each claim as True or False, parses the
# answer, and reports accuracy, wall-clock time and the number of unparsable
# ("null") answers per run, followed by the averages over all runs. The
# per-claim predictions of the final run are saved to a JSON file as an example.

# Define the model
model = "deepseek-r1:7b"

# Number of runs
num_runs = 5

# Input dataset (JSONL with "prompt" and "completion" fields) and output file
dataset_file = r"Datasets\PolitiFact_PromptCompletion_Test.jsonl"
output_file = r"NoTrainTest\PolitiFact_Deepseek_r1_7B_Test.json"

# Compiled once: these patterns are applied to every model response.
# Strips everything except letters, whitespace and braces (the original
# intent: remove LaTeX markup and special characters).
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s{}]')
_VERDICT_RE = re.compile(r'\b(true|false)\b')


def _parse_verdict(text):
    """Return 1 for "true", 0 for "false", or None if neither word is found.

    The text is lower-cased and cleaned first. When both words occur, the
    LAST occurrence wins, so a final answer takes precedence over any
    earlier mention in the response.
    """
    cleaned = _NON_LETTER_RE.sub('', text.strip().lower())
    verdicts = _VERDICT_RE.findall(cleaned)
    if not verdicts:
        return None
    return 1 if verdicts[-1] == "true" else 0


# Create the output directory up front so a missing folder is discovered
# immediately rather than after all runs have completed.
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Load dataset from JSONL file once; it is identical for every run, so
# re-reading it inside the loop would only add work to each run's timing.
with open(dataset_file, "r", encoding="utf-8") as f:
    # Skip blank lines (e.g. a trailing newline) that json.loads would reject
    clean_dataset = [json.loads(line) for line in f if line.strip()]

# Limit to the first 100 entries
clean_dataset = clean_dataset[:100]

total_entries = len(clean_dataset)
print(f"✅ Loaded dataset with {total_entries} entries.")

# Store accuracy, time, and null values for each run
accuracy_list = []
time_list = []
null_values_list = []  # List to store null counts per run

# Run the entire process `num_runs` times
for run in range(1, num_runs + 1):
    print(f"\n🔄 Starting Run {run} of {num_runs}...")

    # Start the timer
    start_time = time.time()

    # Store actual labels & predictions (reset for every run)
    actual_labels = []
    predicted_labels = []

    # Iterate through dataset and query the model
    for entry in clean_dataset:
        claim = entry["prompt"]
        # Create a prompt without revealing the label
        prompt = f"The claim: '{claim}'\nIs this claim True or False? No explanation is required."

        # Anything other than "true" (case-insensitive) counts as False
        actual_label = 1 if entry["completion"].strip().lower() == "true" else 0

        # Send the prompt to Ollama
        response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
        model_output = response['message']['content'].strip().lower()

        # None marks a response containing neither "true" nor "false"
        predicted_labels.append(_parse_verdict(model_output))
        actual_labels.append(actual_label)

    # Count the number of `None` (null) values
    null_count = predicted_labels.count(None)
    null_values_list.append(null_count)

    # Print how many null values were found in this run
    print(f"⚠️ Run {run} Null Values: {null_count}")

    # Remove None values (if any) for accuracy calculation; accuracy is
    # therefore measured only over the responses that could be parsed.
    scored_pairs = [
        (actual, predicted)
        for actual, predicted in zip(actual_labels, predicted_labels)
        if predicted is not None
    ]
    filtered_actual = [actual for actual, _ in scored_pairs]
    filtered_predicted = [predicted for _, predicted in scored_pairs]

    # Calculate accuracy
    if filtered_predicted:
        accuracy = accuracy_score(filtered_actual, filtered_predicted)
    else:
        # Nothing could be parsed, so there is nothing to score; record 0
        # instead of passing empty inputs to accuracy_score.
        accuracy = 0.0
    print(f"✅ Run {run} Accuracy: {accuracy:.2%}")

    # Store accuracy for averaging later
    accuracy_list.append(accuracy)

    # End the timer
    end_time = time.time()

    # Calculate total elapsed time in minutes
    elapsed_time = (end_time - start_time) / 60
    print(f"⏳ Run {run} Time: {elapsed_time:.2f} minutes")

    # Store time for averaging later
    time_list.append(elapsed_time)

# 🎯 Calculate and Print Averages
average_accuracy = sum(accuracy_list) / num_runs
average_time = sum(time_list) / num_runs
average_nulls = sum(null_values_list) / num_runs

# Save results back to a JSON file. Only the labels from the final run are
# still in memory here, so the file is an example of one run's output.
results = [
    {
        "prompt": entry["prompt"],
        "actual_label": actual,
        "predicted_label": predicted,
    }
    for entry, actual, predicted in zip(clean_dataset, actual_labels, predicted_labels)
]

with open(output_file, "w", encoding="utf-8") as file:
    json.dump(results, file, indent=4, ensure_ascii=False)

# Report the configured number of runs rather than a hard-coded "5"
print(f"\n📊 **Final Summary After {num_runs} Runs**")
print(f"✅ **Average Accuracy:** {average_accuracy:.2%}")
print(f"⏳ **Average Execution Time:** {average_time:.2f} minutes")
print(f"⚠️ **Average Null Values per Run:** {average_nulls:.2f}")
print(f"✅ **Example Output file saved as {output_file}")


🔄 Starting Run 1 of 5...
✅ Loaded dataset with 100 entries.
⚠️ Run 1 Null Values: 0
✅ Run 1 Accuracy: 55.00%
⏳ Run 1 Time: 93.28 minutes

🔄 Starting Run 2 of 5...
✅ Loaded dataset with 100 entries.
⚠️ Run 2 Null Values: 0
✅ Run 2 Accuracy: 58.00%
⏳ Run 2 Time: 90.20 minutes

🔄 Starting Run 3 of 5...
✅ Loaded dataset with 100 entries.
⚠️ Run 3 Null Values: 0
✅ Run 3 Accuracy: 61.00%
⏳ Run 3 Time: 94.20 minutes

🔄 Starting Run 4 of 5...
✅ Loaded dataset with 100 entries.
⚠️ Run 4 Null Values: 0
✅ Run 4 Accuracy: 58.00%
⏳ Run 4 Time: 91.26 minutes

🔄 Starting Run 5 of 5...
✅ Loaded dataset with 100 entries.
⚠️ Run 5 Null Values: 0
✅ Run 5 Accuracy: 56.00%
⏳ Run 5 Time: 90.83 minutes

📊 **Final Summary After 5 Runs**
✅ **Average Accuracy:** 57.60%
⏳ **Average Execution Time:** 91.95 minutes
⚠️ **Average Null Values per Run:** 0.00
✅ **Example Output file saved as C:\Users\charl\OneDrive\Documents\Dissertation\Code\NoTrainTest\PolitiFact_Deepseek_r1_7B_Test.json
