In [1]:

from transformers import AutoModelForCausalLM, AutoTokenizer, pipelines
from accelerate import init_empty_weights
import torch

In [2]:
# Checking if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Create an ICL prompt that includes example problems and their answers
system_prompt = """You are a predicate logic assistant specializing in formal logic analysis.

Your capabilities include:
1. Evaluating the validity of logical arguments using predicate logic
2. Identifying contradictions and fallacies
3. Translating natural language into predicate logic notation and vice versa
4. Assessing logical equivalence between statements
5. Performing logical operations such as negation, conjunction, disjunction, and implication

You will be given a new logical problem and query. Respond ONLY with "yes" or "no" based on whether the conclusion logically follows from the premises.
"""

In [4]:
import pandas as pd
dataset_path = "/home/fozle/Course Work/Y1S2/CISC 844/Projects/LLM-Predicate-Logic/dataset/data.csv"
df = pd.read_csv(dataset_path)

In [6]:
sample_size = 5000  # Adjust based on how many examples you want to test
df_sample = df.sample(sample_size, random_state=42)

In [7]:
from transformers import pipeline
import torch
pipe = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B-Instruct",  # or 3B version
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

Device set to use cuda:0


In [8]:
#  load only one row from the dataset
row = df_sample.iloc[3]
row

id                                                            1737
logic                                                    predicate
rule_category                                           equivalent
rule                                existential biconditional laws
problem                                              contradiction
query            Consider the following premises: There is at l...
answer                                                          no
Name: 1736, dtype: object

In [9]:

# user_prompt = row['query']
user_prompt = row['query']
print(user_prompt)
ground_truth = row['answer']
ground_truth

Consider the following premises: There is at least one x for which if x writes a letter, then x is a lawyer and if x were a lawyer, then x is writing a letter. Can we infer the following from them? Answer yes or no: For all x, it is not true that x will write a letter if and only if x is a lawyer.


'no'

In [10]:
system_prompt

'You are a predicate logic assistant specializing in formal logic analysis.\n\nYour capabilities include:\n1. Evaluating the validity of logical arguments using predicate logic\n2. Identifying contradictions and fallacies\n3. Translating natural language into predicate logic notation and vice versa\n4. Assessing logical equivalence between statements\n5. Performing logical operations such as negation, conjunction, disjunction, and implication\n\nYou will be given a new logical problem and query. Respond ONLY with "yes" or "no" based on whether the conclusion logically follows from the premises.\n'

In [11]:
 # Prepare messages for the model
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": df_sample.iloc[0]['query']},
    {"role": "assistant", "content": df_sample.iloc[0]['answer']},
    {"role": "user", "content": df_sample.iloc[1]['query']},
    {"role": "assistant", "content": df_sample.iloc[1]['answer']},
    {"role": "user", "content": df_sample.iloc[2]['query']},
    {"role": "assistant", "content": df_sample.iloc[2]['answer']},
    {"role": "user", "content": df_sample.iloc[3]['query']},
    {"role": "assistant", "content": df_sample.iloc[3]['answer']},
    {"role": "user", "content": user_prompt}
]

outputs = pipe(
            messages,
            max_new_tokens=512,  # Shorter since we only need yes/no
            # do_sample=False  # Deterministic response
        )
        
        
# Extract response content
response = outputs[0]["generated_text"][-1]["content"].strip().lower()
response

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


'no'

In [12]:
import time
from tqdm.notebook import tqdm
from IPython.display import display, HTML

# Initialize results storage
results = []

# Run inference on samples with fixed progress tracking
for idx, row in tqdm(df_sample.iterrows(), total=len(df_sample), desc="ICL Inference"):
    # Construct user prompt from problem and query
    user_prompt = f"{row['problem']} {row['query']}"
    
    # Prepare messages for the model
    messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": df_sample.iloc[0]['query']},
    {"role": "assistant", "content": df_sample.iloc[0]['answer']},
    {"role": "user", "content": df_sample.iloc[1]['query']},
    {"role": "assistant", "content": df_sample.iloc[1]['answer']},
    {"role": "user", "content": df_sample.iloc[2]['query']},
    {"role": "assistant", "content": df_sample.iloc[2]['answer']},
    {"role": "user", "content": df_sample.iloc[3]['query']},
    {"role": "assistant", "content": df_sample.iloc[3]['answer']},
    {"role": "user", "content": user_prompt}
]
    
    # Get model response
    try:
        start_time = time.time()
        outputs = pipe(
            messages,
            max_new_tokens=512,  # Shorter since we only need yes/no
            # do_sample=False  # Deterministic response
        )
        inference_time = time.time() - start_time
        
        # Extract response content
        response = outputs[0]["generated_text"][-1]["content"].strip().lower()
        
        # Check if response contains yes or no
        if "yes" in response and "no" not in response:
            prediction = "yes"
        elif "no" in response:
            prediction = "no"
        else:
            prediction = "unclear"
            
        # Compare with ground truth
        correct = prediction == row["answer"]
        
        # Store results
        result = {
            "id": row["id"],
            "logic": row["logic"],
            "rule_category": row["rule_category"],
            "rule": row["rule"],
            "problem": row["problem"],
            "query": row["query"],
            "ground_truth": row["answer"],
            "prediction": prediction,
            "correct": correct,
            "inference_time": inference_time
        }
        results.append(result)
        
        # Add a small delay to avoid rate limiting
        # time.sleep(0.5)
        
    except Exception as e:
        print(f"Error processing row {idx}: {e}")

ICL Inference:   0%|          | 0/5000 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_to

In [13]:
results_df = pd.DataFrame(results)
results_df.head()

Unnamed: 0,id,logic,rule_category,rule,problem,query,ground_truth,prediction,correct,inference_time
0,1127,propositional,inference,disjunction elimination,unrelated,Consider the following premises: It is cloudy ...,no,no,True,0.042453
1,80,predicate,equivalent,Law of quantifier distribution,contradiction,Consider the following premises: There is at l...,no,yes,False,0.025735
2,3261,propositional,inference,resolution,contradiction,Consider the following premises: Jennifer is r...,no,yes,False,0.020917
3,1737,predicate,equivalent,existential biconditional laws,contradiction,Consider the following premises: There is at l...,no,yes,False,0.022545
4,4527,predicate,equivalent,universal distributive laws,inference,"Consider the following premises: For all x, x ...",yes,no,False,0.019829


In [14]:
results_df.to_csv("results_icl.csv", index=False)

In [15]:
# Convert results to DataFrame for analysis
results_df = pd.DataFrame(results)

# Calculate overall accuracy
accuracy = results_df["correct"].mean() * 100
print(f"Overall Accuracy: {accuracy:.2f}%")

# Display number of samples by prediction
prediction_counts = results_df["prediction"].value_counts()
print("\nPrediction counts:")
for pred, count in prediction_counts.items():
    print(f"  {pred}: {count}")

# # Display accuracy by logic type
print("\nAccuracy by Logic Type:")
logic_acc = results_df.groupby("logic")["correct"].agg(["mean", "count"])
logic_acc["accuracy_pct"] = logic_acc["mean"] * 100
display(logic_acc[["count", "accuracy_pct"]])

# # Display accuracy by rule category
print("\nAccuracy by Rule Category:")
rule_acc = results_df.groupby("rule_category")["correct"].agg(["mean", "count"])
rule_acc["accuracy_pct"] = rule_acc["mean"] * 100
display(rule_acc.sort_values("count", ascending=False)[["count", "accuracy_pct"]])

# # Save results to CSV
results_file = "llama_predicate_logic_results.csv"
results_df.to_csv(results_file, index=False)
print(f"\nResults saved to {results_file}")

Overall Accuracy: 68.90%

Prediction counts:
  no: 3400
  yes: 1600

Accuracy by Logic Type:


Unnamed: 0_level_0,count,accuracy_pct
logic,Unnamed: 1_level_1,Unnamed: 2_level_1
predicate,3511,67.900883
propositional,1489,71.255876



Accuracy by Rule Category:


Unnamed: 0_level_0,count,accuracy_pct
rule_category,Unnamed: 1_level_1,Unnamed: 2_level_1
inference,2591,70.822076
equivalent,1952,67.930328
fallacy,457,62.14442



Results saved to llama_predicate_logic_results.csv


In [16]:
results_df.value_counts("correct")

correct
True     3445
False    1555
Name: count, dtype: int64