## Evaluation of
## "complete_predict_raw_llama31_8b.json",
## "complete_predict_4omini_structed.json",
## "complete_predict_4_1mini_structed.json", and
## our fine-tuned model's results: "generated_predictions_all_llama31_2.jsonl"

In [2]:

import json
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
# Prediction file to evaluate. Files ending in ".jsonl" are read line by line
# (one JSON object per line); anything else is read as a single JSON document,
# so switching between the two formats no longer requires editing this cell.
input_file = "generated_predictions_all_llama31_2.jsonl"

# Maximum number of JSONL records to load.
MAX_SAMPLES = 65

a = []
with open(input_file, "r", encoding="utf-8") as f:
    if input_file.endswith(".jsonl"):
        for line in f:
            if len(a) >= MAX_SAMPLES:
                break
            # A blank line (e.g. a trailing newline at the end of the file)
            # is not a record; json.loads would raise on it.
            if not line.strip():
                continue
            a.append(json.loads(line))
    else:
        # Plain .json file: the whole document is loaded as-is.
        a = json.load(f)
        
def eval_metrics(vul_range):
    """
    Expand a line-range specification into the set of line numbers it covers.

    Accepts strings like '7', '7-12', '7,8,9', '7,9-12'. An int (a single
    line) or a list/tuple/set of such values is accepted as well, because the
    "vulnerableLines" field of a parsed prediction is not guaranteed to be a
    string.

    Parts that cannot be parsed (non-numeric text, a dangling '-') are
    ignored. A range whose start is greater than its end contributes no lines.

    Returns a set of all lines covered; empty for None, '' or any other
    falsy input.
    """
    if not vul_range:
        return set()

    # Normalise non-string inputs to the comma-separated string form.
    if isinstance(vul_range, (list, tuple, set)):
        vul_range = ",".join(str(item) for item in vul_range)
    elif not isinstance(vul_range, str):
        vul_range = str(vul_range)

    if vul_range.strip() == "":
        return set()
    vul_range = vul_range.replace(" ", "")  # Remove whitespace

    result = set()
    for part in vul_range.split(","):
        try:
            if "-" in part:
                # Raises ValueError when there are not exactly two integers.
                start, end = map(int, part.split("-"))
                result.update(range(start, end + 1))
            else:
                result.add(int(part))
        except ValueError:
            continue
    return result

def parse_field(field):
    """
    Return a "label"/"predict" field as a dict.

    A dict is returned unchanged. A string is decoded as JSON. Anything that
    does not yield a JSON object (empty or blank string, None, invalid JSON,
    valid JSON that is a list/string/number, or a non-string value) falls
    back to {"vulnerableLines": ""} so callers can always use `.get`.
    """
    empty = {"vulnerableLines": ""}
    if isinstance(field, dict):
        return field
    if not field or (isinstance(field, str) and field.strip() == ""):
        return empty
    try:
        parsed = json.loads(field)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError is raised for
        # inputs that are not str/bytes/bytearray.
        return empty
    # Valid JSON that is not an object has no "vulnerableLines" to read.
    return parsed if isinstance(parsed, dict) else empty

# Sample-level binary classification: a sample counts as positive when it has
# at least one vulnerable line. A prediction is only a true positive when its
# lines overlap the labelled lines.
y_true, y_pred, results = [], [], []

for idx, record in enumerate(a):
    gold_fields = parse_field(record.get("label", ""))
    guess_fields = parse_field(record.get("predict", ""))

    gold_lines = eval_metrics(gold_fields.get("vulnerableLines", ""))
    guess_lines = eval_metrics(guess_fields.get("vulnerableLines", ""))

    # outcome = (true label, predicted label, tag written to the results file)
    if gold_lines and guess_lines:
        if gold_lines & guess_lines:
            outcome = (1, 1, "TP")
        else:
            # Both sides flagged lines, but none in common.
            outcome = (1, 0, "FN (no overlap)")
    elif gold_lines:
        outcome = (1, 0, "FN")
    elif guess_lines:
        outcome = (0, 1, "FP")
    else:
        outcome = (0, 0, "TN")

    truth, decision, tag = outcome
    y_true.append(truth)
    y_pred.append(decision)
    results.append({"id": idx, "result": tag})

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
accuracy = accuracy_score(y_true, y_pred)

print("Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-score : {f1:.4f}")

# Optional: Save results (one JSON object per line)
with open(f"eval_{input_file}.json", "w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(row) + "\n" for row in results)
        #print(r)


Evaluation Metrics:
Accuracy : 0.8923
Precision: 0.9429
Recall   : 0.8684
F1-score : 0.9041


## Factual Consistency: predict["vulnerabilityReason"] <--> label["vulnerabilityReason"]

In [3]:
from sentence_transformers import SentenceTransformer, util
import json
from tqdm import tqdm
# The former `tqdm(disable=True)` call was removed: it only built a disabled
# progress-bar object and discarded it, which has no effect on any other
# progress bar. To silence encoding progress, pass `show_progress_bar=False`
# to `model.encode(...)` instead.

# Load lightweight sentence embedding model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Safe JSON parser
def safe_parse_json(field):
    """
    Return `field` as a dict, or {} when it does not hold a JSON object.

    A dict is returned unchanged and a string is decoded as JSON. Empty or
    blank strings, None, invalid JSON, valid JSON that is not an object
    (list, string, number), and non-string values all yield {} so that
    callers can safely call `.get` on the result.
    """
    if isinstance(field, dict):
        return field
    if not field or (isinstance(field, str) and field.strip() == ""):
        return {}
    try:
        parsed = json.loads(field)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError is raised for
        # inputs that are not str/bytes/bytearray.
        return {}
    return parsed if isinstance(parsed, dict) else {}

# Compute semantic similarity between label and predicted vulnerabilityReason
factual_scores = []

for sample in a:  # Use all samples
    label = safe_parse_json(sample.get("label", ""))
    pred = safe_parse_json(sample.get("predict", ""))

    label_reason = label.get("vulnerabilityReason", "")
    pred_reason = pred.get("vulnerabilityReason", "")

    # Only score pairs where both reasons are non-empty strings. The
    # isinstance check matters because a JSON null (or any other non-string
    # value) in this field would otherwise make `.strip()` raise.
    if not (isinstance(label_reason, str) and label_reason.strip()):
        continue
    if not (isinstance(pred_reason, str) and pred_reason.strip()):
        continue

    label_emb = model.encode(label_reason, convert_to_tensor=True)
    pred_emb = model.encode(pred_reason, convert_to_tensor=True)

    score = util.cos_sim(label_emb, pred_emb).item()
    factual_scores.append(score)

# Report (the average is taken over the scored pairs only; 0 if there are none)
avg_score = sum(factual_scores) / len(factual_scores) if factual_scores else 0
print(f"Average Factual Consistency Score (Label Predict Reason): {avg_score:.4f}")


Average Factual Consistency Score (Label Predict Reason): 0.7737


## (FixedCode <--> FixedCode)
## (vulnerableCode <--> vulnerableCode)

In [4]:
import warnings
# NOTE(review): with no category or module argument this filter ignores every
# warning raised after this point, not only the noisy ones from this cell.
# Consider narrowing it to the specific warning being silenced.
warnings.filterwarnings('ignore')

from sentence_transformers import SentenceTransformer, util
import json

# Load lightweight sentence embedding model
# NOTE(review): the same model name is already loaded into `model` by the
# Factual Consistency cell above, so this constructs it a second time.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Safe JSON parser
def safe_parse_json(field):
    """
    Return `field` as a dict, or {} when it does not hold a JSON object.

    A dict is returned unchanged and a string is decoded as JSON. Empty or
    blank strings, None, invalid JSON, valid JSON that is not an object
    (list, string, number), and non-string values all yield {} so that
    callers can safely call `.get` on the result.
    """
    if isinstance(field, dict):
        return field
    if not field or (isinstance(field, str) and field.strip() == ""):
        return {}
    try:
        parsed = json.loads(field)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError is raised for
        # inputs that are not str/bytes/bytearray.
        return {}
    return parsed if isinstance(parsed, dict) else {}

# Helper to extract code snippet from possible dict/list structure
def extract_code_snippet(item):
    """
    Return the code text carried by one element of a code field, always as str.

    - A string is returned unchanged.
    - A dict yields its "code" value (change the key below if the data uses a
      different one); when that value is missing or empty, the string form of
      the whole dict is used instead.
    - Anything else is converted with str().

    The result is always a string so callers can safely join and strip it,
    even when "code" holds a non-string value such as a list or a number.
    """
    if isinstance(item, str):
        return item
    if isinstance(item, dict):
        # Change 'code' to another key if needed
        code = item.get("code", "")
        if not code:
            return str(item)
        return code if isinstance(code, str) else str(code)
    return str(item)

code_similarity_scores = []

for record in a:  # Use all samples
    # Flatten the "fixedCode" field of the prediction and then of the label
    # into one string each; the field may be a list of snippets, a single
    # value, or null.
    flattened = []
    for side in ("predict", "label"):
        parsed = safe_parse_json(record.get(side, ""))
        raw_code = parsed.get("fixedCode", [])
        if isinstance(raw_code, list):
            text = "\n".join(extract_code_snippet(piece) for piece in raw_code)
        elif raw_code is None:
            text = ""
        else:
            text = extract_code_snippet(raw_code)
        flattened.append(text)
    predicted_fix, reference_fix = flattened

    # Only pairs with non-blank code on both sides are scored.
    if not (predicted_fix.strip() and reference_fix.strip()):
        continue

    predicted_emb = model.encode(predicted_fix, convert_to_tensor=True)
    reference_emb = model.encode(reference_fix, convert_to_tensor=True)

    code_similarity_scores.append(util.cos_sim(predicted_emb, reference_emb).item())

# Report
if code_similarity_scores:
    avg_score = sum(code_similarity_scores) / len(code_similarity_scores)
else:
    avg_score = 0
print(f"Average Code-Aware Similarity (fixed code of prediction ↔ label): {avg_score:.4f}")

Average Code-Aware Similarity (fixed code of prediction ↔ label): 0.8862


## (potentialSecurityRisk <--> potentialSecurityRisk)

In [5]:
from sentence_transformers import SentenceTransformer, util
import json

# Load lightweight sentence embedding model
# NOTE(review): the same model name is already loaded into `model` by earlier
# cells of this notebook, so this constructs it again.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Safe JSON parser
def safe_parse_json(field):
    """
    Return `field` as a dict, or {} when it does not hold a JSON object.

    A dict is returned unchanged and a string is decoded as JSON. Empty or
    blank strings, None, invalid JSON, valid JSON that is not an object
    (list, string, number), and non-string values all yield {} so that
    callers can safely call `.get` on the result.
    """
    if isinstance(field, dict):
        return field
    if not field or (isinstance(field, str) and field.strip() == ""):
        return {}
    try:
        parsed = json.loads(field)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError is raised for
        # inputs that are not str/bytes/bytearray.
        return {}
    return parsed if isinstance(parsed, dict) else {}

# Compute semantic similarity between the predicted and the labelled potentialSecurityRisk
risk_alignment_scores = []

for sample in a:  # Use all samples
    pred = safe_parse_json(sample.get("predict", ""))
    label = safe_parse_json(sample.get("label", ""))

    pred_risk = pred.get("potentialSecurityRisk", "")
    label_risk = label.get("potentialSecurityRisk", "")

    # Only score pairs where both texts are non-empty strings. The isinstance
    # check matters because a JSON null (or any other non-string value) in
    # this field would otherwise make `.strip()` raise.
    if not (isinstance(pred_risk, str) and pred_risk.strip()):
        continue
    if not (isinstance(label_risk, str) and label_risk.strip()):
        continue

    pred_risk_emb = model.encode(pred_risk, convert_to_tensor=True)
    label_risk_emb = model.encode(label_risk, convert_to_tensor=True)

    score = util.cos_sim(pred_risk_emb, label_risk_emb).item()
    risk_alignment_scores.append(score)

# Report (the average is taken over the scored pairs only; 0 if there are none)
avg_score = sum(risk_alignment_scores) / len(risk_alignment_scores) if risk_alignment_scores else 0
print(f"Average Risk Assessment Alignment (Risk ↔ Risk): {avg_score:.4f}")


Average Risk Assessment Alignment (Risk ↔ Risk): 0.6891


## OpenAI's embedding Model:

In [6]:
import os
import openai, json
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# SECURITY: the API key must never be stored in the notebook. The key that was
# previously hard-coded on this line has been exposed and should be revoked.
# The key is now read from the OPENAI_API_KEY environment variable; a missing
# variable raises KeyError here rather than failing later inside an API call.
openai.api_key = os.environ["OPENAI_API_KEY"]


In [30]:
# Prediction file to score. Other files evaluated with this cell:
#   generated_predictions_all_llama31_2.jsonl
#   complete_predict_4_1mini_structed.json
file_path = "complete_predict_4omini_structed.json"
max_samples = 65


def load_json_or_jsonl(file_path, max_items=None):
    """
    Loads a file as either .json (object/array) or .jsonl (one JSON object per line).

    The file is first read as a single JSON document. If that fails it is
    re-read from the start as JSONL, skipping blank lines.

    Args:
        file_path: Path of the file to read (UTF-8).
        max_items: Optional cap on the number of items returned from a JSON
            array or a JSONL file. None means no cap.

    Returns:
        A list of the decoded items. A top-level JSON object is wrapped in a
        one-element list.

    Raises:
        json.JSONDecodeError: if a non-blank JSONL line is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError:
            # Not a single JSON document, so treat it as JSONL. json.load has
            # already consumed the whole file, so rewind first; without this
            # the loop below sees no lines and the result is always empty.
            f.seek(0)
            items = []
            for line in f:
                if max_items is not None and len(items) >= max_items:
                    break
                line = line.strip()
                if not line:
                    continue
                items.append(json.loads(line))
            return items

    if isinstance(data, list):
        return data if max_items is None else data[:max_items]
    # If it's a dict (or any other single value), wrap in a list for compatibility
    return [data]


# Load only after load_json_or_jsonl is defined, so the cell also runs on a
# fresh kernel (previously the call came before the definition).
data = load_json_or_jsonl(file_path, max_items=max_samples)

all_reasons = []
index_map = []

def safe_parse_json(field):
    """
    Return `field` as a dict, or {} when it does not hold a JSON object.

    A dict is returned unchanged and a string is decoded as JSON. Empty or
    blank strings, None, invalid JSON, valid JSON that is not an object
    (list, string, number), and non-string values all yield {} so that
    callers can safely call `.get` on the result.
    """
    if isinstance(field, dict):
        return field
    if not field or (isinstance(field, str) and field.strip() == ""):
        return {}
    try:
        parsed = json.loads(field)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError is raised for
        # inputs that are not str/bytes/bytearray.
        return {}
    return parsed if isinstance(parsed, dict) else {}

def get_code_as_str(val):
    """
    Flatten a code field into a single stripped string.

    None becomes "", a list is joined with newlines after converting each
    element with str(), a string is used as-is, and any other value is
    converted with str(). Leading and trailing whitespace is removed from
    the result.
    """
    if val is None:
        return ""
    if isinstance(val, list):
        text = "\n".join(map(str, val))
    else:
        text = val if isinstance(val, str) else str(val)
    return text.strip()

def get_embedding(text, model="text-embedding-3-small"):
    """
    Request the embedding of a single text from the OpenAI embeddings endpoint.

    Returns None without making a request when `text` is empty or only
    whitespace; otherwise returns the `embedding` of the first item in the
    response.
    """
    if text.strip():
        reply = openai.embeddings.create(input=[text], model=model)
        return reply.data[0].embedding
    return None

# ----- Main Code -----

# Gather the fixedCode text of every sample that has it on both sides. The
# texts are interleaved as [pred_0, label_0, pred_1, label_1, ...] so that the
# embeddings returned below can be paired up again by position.
raw_pairs = []  # raw (predict, label) fixedCode values, aligned with index_map
for i, entry in enumerate(data):
    pred = safe_parse_json(entry.get("predict", ""))
    label = safe_parse_json(entry.get("label", ""))

    pred_code = get_code_as_str(pred.get("fixedCode", ""))
    label_code = get_code_as_str(label.get("fixedCode", ""))

    if pred_code and label_code:
        all_reasons.extend([pred_code, label_code])
        index_map.append(i)
        # Keep the raw values so the samples need not be parsed a second time.
        raw_pairs.append((pred.get("fixedCode", ""), label.get("fixedCode", "")))

# Get all embeddings in one batch call. The call is skipped when no sample
# qualified, so an empty input list is never sent.
embeddings = []
if all_reasons:
    response = openai.embeddings.create(
        input=all_reasons,
        model="text-embedding-3-small"
    )
    embeddings = [item.embedding for item in response.data]

# Compute similarity
similarity_scores = []
for pair_pos, (raw_pred, raw_label) in enumerate(raw_pairs):
    emb_pred = embeddings[2 * pair_pos]
    emb_label = embeddings[2 * pair_pos + 1]
    score = cosine_similarity([emb_pred], [emb_label])[0][0]

    similarity_scores.append({
        "predict": raw_pred,
        "label": raw_label,
        "similarity": score
    })

# Report: 0 when nothing was scored, matching the other similarity cells
# (this previously raised ZeroDivisionError).
sum_similarity = sum(item["similarity"] for item in similarity_scores)
avg_score = sum_similarity / len(similarity_scores) if similarity_scores else 0
print(f"Average Semantic Similarity:{avg_score:.4f}")

Average Semantic Similarity:0.7413
