In [1]:

from transformers import AutoModelForCausalLM, AutoTokenizer, pipelines
from accelerate import init_empty_weights
import torch

In [2]:
# Checking if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
system_prompt = """You are a predicate logic assistant specializing in formal logic analysis.

Your capabilities include:
1. Evaluating the validity of logical arguments using predicate logic
2. Identifying logical fallacies in arguments
3. Converting natural language statements to predicate logic notation
4. Determining if formulas are well-formed
5. Checking logical equivalence between formulas
6. Performing logical operations (negation, conjunction, disjunction, implication)
7. Translating between natural language and first-order logic

When given a logical problem and query, respond ONLY with "yes" or "no".
"""

In [5]:
import pandas as pd
dataset_path = "/home/fozle/Course Work/Y1S2/CISC 844/Projects/LLM-Predicate-Logic/dataset/data.csv"
df = pd.read_csv(dataset_path)

In [113]:
sample_size = 50  # Adjust based on how many examples you want to test
df_sample = df.sample(sample_size, random_state=42)

In [114]:
# Initialize results storage
results = []

In [115]:
system_prompt = """You are a predicate logic assistant specializing in formal logic analysis.

Your capabilities include:
1. Evaluating the validity of logical arguments using predicate logic
2. Identifying logical fallacies in arguments
3. Converting natural language statements to predicate logic notation
4. Determining if formulas are well-formed
5. Checking logical equivalence between formulas
6. Performing logical operations (negation, conjunction, disjunction, implication)
7. Translating between natural language and first-order logic

When given a logical query, respond ONLY with "yes" if the statement is logically correct else "no".
"""

In [116]:
from transformers import pipeline
import torch
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B-Instruct",  # or 3B version
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Device set to use cuda:0


In [117]:
#  load only one row from the dataset
row = df_sample.iloc[0]
# user_prompt = row['query']
user_prompt = "Consider the following premises: It is cloudy or Richard is playing tennis. If it is cloudy, then it is sunny. If Richard plays tennis, then it is sunny. Can we infer the following from them?"
print(user_prompt)
ground_truth = row['answer']
ground_truth

Consider the following premises: It is cloudy or Richard is playing tennis. If it is cloudy, then it is sunny. If Richard plays tennis, then it is sunny. Can we infer the following from them?


'no'

In [118]:
 # Prepare messages for the model
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt}
]

outputs = pipe(
            messages,
            max_new_tokens=128,  # Shorter since we only need yes/no
            do_sample=False  # Deterministic response
        )
        
        
# Extract response content
response = outputs[0]["generated_text"][-1]["content"].strip().lower()
response

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'yes'

In [119]:
import time
from tqdm.notebook import tqdm
from IPython.display import display, HTML

# Run inference on samples with fixed progress tracking
for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="Testing model"):
    # Construct user prompt from problem and query
    user_prompt = f"{row['problem']} {row['query']}"
    
    # Prepare messages for the model
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    
    # Get model response
    try:
        start_time = time.time()
        outputs = pipe(
            messages,
            max_new_tokens=128,  # Shorter since we only need yes/no
            do_sample=False  # Deterministic response
        )
        inference_time = time.time() - start_time
        
        # Extract response content
        response = outputs[0]["generated_text"][-1]["content"].strip().lower()
        
        # Check if response contains yes or no
        if "yes" in response and "no" not in response:
            prediction = "yes"
        elif "no" in response:
            prediction = "no"
        else:
            prediction = "unclear"
            
        # Compare with ground truth
        correct = prediction == row["answer"]
        
        # Store results
        result = {
            "id": row["id"],
            "logic": row["logic"],
            "rule_category": row["rule_category"],
            "rule": row["rule"],
            "problem": row["problem"],
            "query": row["query"],
            "ground_truth": row["answer"],
            "prediction": prediction,
            "correct": correct,
            "inference_time": inference_time
        }
        results.append(result)
        
        # Add a small delay to avoid rate limiting
        time.sleep(0.5)
        
    except Exception as e:
        print(f"Error processing row {idx}: {e}")

Testing model:   0%|          | 0/50 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

In [122]:
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,id,logic,rule_category,rule,problem,query,ground_truth,prediction,correct,inference_time
0,1127,propositional,inference,disjunction elimination,unrelated,Consider the following premises: It is cloudy ...,no,no,True,0.029729
1,80,predicate,equivalent,Law of quantifier distribution,contradiction,Consider the following premises: There is at l...,no,yes,False,0.015643
2,3261,propositional,inference,resolution,contradiction,Consider the following premises: Jennifer is r...,no,yes,False,0.014647
3,1737,predicate,equivalent,existential biconditional laws,contradiction,Consider the following premises: There is at l...,no,yes,False,0.014481
4,4527,predicate,equivalent,universal distributive laws,inference,"Consider the following premises: For all x, x ...",yes,no,False,0.014226


In [123]:
results_df.to_csv("results.csv", index=False)

In [None]:
# Convert results to DataFrame for analysis
results_df = pd.DataFrame(results)

# Calculate overall accuracy
accuracy = results_df["correct"].mean() * 100
print(f"Overall Accuracy: {accuracy:.2f}%")

# Display number of samples by prediction
prediction_counts = results_df["prediction"].value_counts()
print("\nPrediction counts:")
for pred, count in prediction_counts.items():
    print(f"  {pred}: {count}")

# # Display accuracy by logic type
print("\nAccuracy by Logic Type:")
logic_acc = results_df.groupby("logic")["correct"].agg(["mean", "count"])
logic_acc["accuracy_pct"] = logic_acc["mean"] * 100
display(logic_acc[["count", "accuracy_pct"]])

# # Display accuracy by rule category
print("\nAccuracy by Rule Category:")
rule_acc = results_df.groupby("rule_category")["correct"].agg(["mean", "count"])
rule_acc["accuracy_pct"] = rule_acc["mean"] * 100
display(rule_acc.sort_values("count", ascending=False)[["count", "accuracy_pct"]])

# # Save results to CSV
results_file = "llama_predicate_logic_results.csv"
results_df.to_csv(results_file, index=False)
print(f"\nResults saved to {results_file}")

Overall Accuracy: 44.00%

Prediction counts:
  yes: 36
  no: 14

Accuracy by Logic Type:


Unnamed: 0_level_0,count,accuracy_pct
logic,Unnamed: 1_level_1,Unnamed: 2_level_1
predicate,37,37.837838
propositional,13,61.538462



Accuracy by Rule Category:


Unnamed: 0_level_0,count,accuracy_pct
rule_category,Unnamed: 1_level_1,Unnamed: 2_level_1
inference,25,56.0
equivalent,17,41.176471
fallacy,8,12.5


In [128]:
results_df.value_counts("correct")

correct
False    28
True     22
Name: count, dtype: int64