In [28]:
import Modules.utils as utils
import matplotlib.pyplot as plt
import numpy as np


# Global matplotlib settings applied to every figure created afterwards in this
# notebook: pdflatex as the pgf TeX system, a serif font family and usetex on.
plt.rcParams.update({
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "text.usetex": True,         # text goes through LaTeX, so labels must escape specials such as %
    # NOTE(review): presumably disabled so exported .pgf files pick up the fonts
    # of the embedding LaTeX document - confirm.
    "pgf.rcfonts": False,
})

In [29]:
# Setups and model identifiers requested for every extractor.
_requested_setups = ("Baseline", "ZERO-O1", "ZERO-O2", "FS-O1", "FS-O2")
_requested_models = (
    "llama_3_2_1b_instruct",
    "llama_3_2_3b_instruct",
    "llama_3_1_8b_instruct",
    "llama_3_1_70b_instruct",
    "qwen_2_5_1p5b_instruct",
    "qwen_2_5_3b_instruct",
    "qwen_2_5_7b_instruct",
    "qwen_2_5_72b_instruct",
)

# Every call receives its own fresh list copies of the two selections above.
malware_extractor_data = utils.load_json_files(
    "./Modules/MalwareExtractor",
    list(_requested_setups),
    models=list(_requested_models),
)

threat_actor_extractor_data = utils.load_json_files(
    "./Modules/ThreatActorExtractor/",
    list(_requested_setups),
    models=list(_requested_models),
)

attack_pattern_extractor_data = utils.load_json_files(
    "./Modules/AttackPatternExtractor/",
    list(_requested_setups),
    models=list(_requested_models),
)

targets_extractor_data = utils.load_json_files(
    "./Modules/TargetsExtractor/",
    list(_requested_setups),
    models=list(_requested_models),
)



def calc_improvements(data: dict):
    """Return the mean-F1 gain of each optimised setup over the baseline.

    ``data`` maps the setup names "Baseline", "ZERO-O1", "ZERO-O2", "FS-O1"
    and "FS-O2" to a dict whose "f1" entry is a non-empty sequence of
    scores. For each non-baseline setup the arithmetic mean of its scores
    minus the baseline mean is reported; a negative difference is reported
    as 0.0 instead.

    Returns:
        dict with the keys "zs_o1_gain", "zs_o2_gain", "fs_o1_gain" and
        "fs_o2_gain", in that order.
    """
    def mean_f1(setup):
        scores = data[setup]["f1"]
        return sum(scores) / len(scores)

    baseline_mean = mean_f1("Baseline")

    gain_sources = (
        ("zs_o1_gain", "ZERO-O1"),
        ("zs_o2_gain", "ZERO-O2"),
        ("fs_o1_gain", "FS-O1"),
        ("fs_o2_gain", "FS-O2"),
    )

    gains = {}
    for gain_key, setup in gain_sources:
        delta = mean_f1(setup) - baseline_mean
        # A setup that scores below the baseline counts as "no gain".
        gains[gain_key] = 0.0 if delta < 0 else delta

    return gains



# One (initially empty) list of per-extractor gain dicts for every model.
gains_by_model = {
    model_name: []
    for model_name in (
        "llama_3_2_1b_instruct",
        "llama_3_2_3b_instruct",
        "llama_3_1_8b_instruct",
        "llama_3_1_70b_instruct",
        "qwen_2_5_1p5b_instruct",
        "qwen_2_5_3b_instruct",
        "qwen_2_5_7b_instruct",
        "qwen_2_5_72b_instruct",
    )
}

# Gains are appended in a fixed extractor order:
# 1. malware, 2. threat actor, 3. attack pattern, 4. targets.
for extractor_results in (
    malware_extractor_data,
    threat_actor_extractor_data,
    attack_pattern_extractor_data,
    targets_extractor_data,
):
    for model, value in extractor_results.items():
        res = calc_improvements(value)
        gains_by_model[model].append(res)

In [30]:
# Identifiers of the Llama models; membership in this list routes a model's
# scores into the llama_* buckets filled further down in this cell.
llama_llms = [
    "llama_3_2_1b_instruct",
    "llama_3_2_3b_instruct",
    "llama_3_1_8b_instruct",
    "llama_3_1_70b_instruct",
]

# Identifiers of the Qwen models; membership in this list routes a model's
# scores into the qwen_* buckets filled further down in this cell.
qwen_llms = [
    "qwen_2_5_1p5b_instruct",
    "qwen_2_5_3b_instruct",
    "qwen_2_5_7b_instruct",
    "qwen_2_5_72b_instruct"
]




from collections import defaultdict

def populate_performances(value):
    """Return the mean F1 per setup, never reporting less than the baseline.

    ``value`` maps the setup names "ZERO-O1", "ZERO-O2", "FS-O1", "FS-O2"
    and "Baseline" to a dict whose "f1" entry holds the scores to average.
    Any optimised setup whose mean lies below the baseline mean is replaced
    by the baseline mean.

    Returns:
        tuple ``(zs_o1, zs_o2, fs_o1, fs_o2, baseline)`` of mean scores.
    """
    optimised_means = [
        np.mean(value[setup]["f1"])
        for setup in ("ZERO-O1", "ZERO-O2", "FS-O1", "FS-O2")
    ]
    baseline = np.mean(value["Baseline"]["f1"])

    # Raise every optimised mean to at least the baseline level.
    zs_o1, zs_o2, fs_o1, fs_o2 = (max(score, baseline) for score in optimised_means)

    return zs_o1, zs_o2, fs_o1, fs_o2, baseline




# Score buckets per extractor and model family. Each *_delta bucket is a
# defaultdict(list) keyed by "zs-o1", "zs-o2", "fs-o1", "fs-o2" and "baseline";
# the *_opt lists start out empty.

# 1. Malware Extractor
llama_malware_extractor_delta, qwen_malware_extractor_delta = defaultdict(list), defaultdict(list)
llama_malware_opt, qwen_malware_opt = [], []

# 2. Threat Actor Extractor
llama_actor_delta, qwen_actor_delta = defaultdict(list), defaultdict(list)
llama_actor_opt, qwen_actor_opt = [], []

# 3. Attack Pattern Extractor
llama_attack_delta, qwen_attack_delta = defaultdict(list), defaultdict(list)
llama_attacks_opt, qwen_attacks_opt = [], []

# 4. Targets Extractor
llama_targes_delta, qwen_targets_delta = defaultdict(list), defaultdict(list)
llama_targets_opt, qwen_targets_opt = [], []

for extractor_results, llama_bucket, qwen_bucket in (
    (malware_extractor_data, llama_malware_extractor_delta, qwen_malware_extractor_delta),
    (threat_actor_extractor_data, llama_actor_delta, qwen_actor_delta),
    (attack_pattern_extractor_data, llama_attack_delta, qwen_attack_delta),
    (targets_extractor_data, llama_targes_delta, qwen_targets_delta),
):
    for model, value in extractor_results.items():
        zs_o1, zs_o2, fs_o1, fs_o2, baseline = populate_performances(value)

        # Pick the bucket of the family the model belongs to.
        if model in llama_llms:
            bucket = llama_bucket
        elif model in qwen_llms:
            bucket = qwen_bucket
        else:
            raise Exception("Error in qwen_llms or llama_llms for value:", model)

        for setup_key, score in (
            ("zs-o1", zs_o1),
            ("zs-o2", zs_o2),
            ("fs-o1", fs_o1),
            ("fs-o2", fs_o2),
            ("baseline", baseline),
        ):
            bucket[setup_key].append(score)

In [None]:
# For every extractor print, per setup key, the spread (max - min) of the
# collected scores scaled by 100 and rounded to two decimals: Llama bucket
# first, then the Qwen bucket.
for heading, llama_scores, qwen_scores in (
    ("Malware Extractor", llama_malware_extractor_delta, qwen_malware_extractor_delta),
    ("Threat Actor", llama_actor_delta, qwen_actor_delta),
    ("Attack Patterns", llama_attack_delta, qwen_attack_delta),
    ("Targets", llama_targes_delta, qwen_targets_delta),
):
    print(heading)
    for family_scores in (llama_scores, qwen_scores):
        for key, value in family_scores.items():
            spread = max(value) - min(value)
            print(key, round(spread * 100, 2))

In [None]:
# Detailed view of the Qwen buckets only: minimum, maximum and their
# difference (scaled by 100, two decimals) for every setup key.
for heading, qwen_scores in (
    ("Malware Extractor", qwen_malware_extractor_delta),
    ("Threat Actor", qwen_actor_delta),
    ("Attack Patterns", qwen_attack_delta),
    ("Targets", qwen_targets_delta),
):
    print(heading)
    for key, value in qwen_scores.items():
        lowest, highest = min(value), max(value)
        print(key, "min:", lowest, "max:", highest, "delta", round((highest - lowest) * 100, 2))

In [None]:
# Four side-by-side bar charts, one per extractor, each showing three
# hard-coded F1 scores.
fig, axs = plt.subplots(1, 4, figsize=(10, 3), constrained_layout=True)

colors = ["#4074ff", "#003bd8", "#7d8178"]
bar_count = 3
bar_width = 0.05
gap = 0.01

# Left edge-to-edge step between neighbouring bars.
bar_step = bar_width + gap
bar_x = [i * bar_step for i in range(bar_count)]

# (panel title, bar labels, scores) in subplot order.
panels = (
    ("Malware Extractor",
     ["Qwen-72B-ZS-O2", "Qwen-72B-FS-O1", "aCTIon (GPT 3.5)"],
     [79, 81, 72]),
    ("Threat-Actor-Extractor",
     ["Llama-70B-ZS-O1", "Llama-70B-FS-O1", "aCTIon (GPT 3.5)"],
     [76, 78, 80]),
    ("Attack-Pattern-Extractor",
     ["Qwen-72B-ZS-O1", "Llama-70B-FS-O1", "aCTIon (GPT 3.5)"],
     [43, 44, 54]),
    ("Targets-Extractor",
     ["Llama-70B-ZS-O2", "Llama-8B-FS-O1", "aCTIon (GPT 3.5)"],
     [52, 68, 56]),
)

# Only the leftmost panel carries the y-axis label.
axs[0].set_ylabel("F1-Score")

for ax, (title, models, scores) in zip(axs, panels):
    bars = ax.bar(bar_x, scores, width=bar_width, color=colors)

    ax.set_title(title, fontsize=13)
    ax.set_ylim(0, 100)
    ax.set_xticks(bar_x)
    ax.set_xticklabels(models, rotation=30, ha='right', fontsize=11)
    ax.grid(axis='y', linestyle='dashed', linewidth=0.5, alpha=0.4)

    # Optional: show the score above each bar.
    for j, bar in enumerate(bars):
        ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                str(scores[j]), ha='center', va='bottom', fontsize=8)

plt.show()
# fig.savefig("./pgfs/PerformanceComparisonSOTA.pgf")

In [None]:
def summarize_model_gains(gains_by_model: dict):
    """Average each model's gain dicts key by key.

    ``gains_by_model`` maps a model name to a non-empty list of dicts, each
    holding the keys "zs_o1_gain", "zs_o2_gain", "fs_o1_gain" and
    "fs_o2_gain".

    Returns:
        dict mapping every model name to a dict with the same four keys,
        each holding the arithmetic mean over that model's list.
    """
    gain_keys = ("zs_o1_gain", "zs_o2_gain", "fs_o1_gain", "fs_o2_gain")
    summary = {}

    for model, per_extractor in gains_by_model.items():
        entry_count = len(per_extractor)
        averaged = {}

        for gain_key in gain_keys:
            # Accumulate in list order, then divide once.
            total = 0
            for entry in per_extractor:
                total += entry[gain_key]
            averaged[gain_key] = total / entry_count

        summary[model] = averaged

    return summary

In [None]:
# Average every model's gains over its collected entries and list the result,
# one model per line.
gains_by_llm = summarize_model_gains(gains_by_model)

for model in gains_by_llm:
    value = gains_by_llm[model]
    print(model, value)

In [None]:
# Model identifiers in the order in which they appear along the x-axis.
models = [
    "llama_3_2_1b_instruct",
    "qwen_2_5_1p5b_instruct",
    "qwen_2_5_3b_instruct",
    "llama_3_2_3b_instruct",
    "qwen_2_5_7b_instruct",
    "llama_3_1_8b_instruct",
    "llama_3_1_70b_instruct",
    "qwen_2_5_72b_instruct"
]

# Average gain per model and optimiser setup, scaled by 100 for the
# percentage axis.
zs_o1 = [gains_by_llm[m]["zs_o1_gain"] * 100 for m in models]
zs_o2 = [gains_by_llm[m]["zs_o2_gain"] * 100 for m in models]
fs_o1 = [gains_by_llm[m]["fs_o1_gain"] * 100 for m in models]
fs_o2 = [gains_by_llm[m]["fs_o2_gain"] * 100 for m in models]

fig, ax = plt.subplots(1, 1, figsize=(6, 4), layout="constrained")

x = np.arange(len(models))
width = 0.17

# Four bars per model: the zero-shot pair sits left of the tick, the few-shot
# pair right of it, shifted by an extra 0.05 to separate the two pairs.
ax.bar(x - 1.5*width, zs_o1, width, label='ZS-O1', color='blue')
ax.bar(x - 0.5*width, zs_o2, width, label='ZS-O2', color='red')

ax.bar(x + 0.5*width+0.05, fs_o1, width, label='FS-O1', color='purple')
ax.bar(x + 1.5*width+0.05, fs_o2, width, label='FS-O2', color='green')

ax.set_xticks(x)
# Human-readable tick labels, in the same order as `models`.
models_labels = [
    "Llama 3.2 1B instruct",
    "Qwen 2.5 1.5B instruct",
    "Qwen 2.5 3B instruct",
    "Llama 3.2 3B instruct",
    "Qwen 2.5 7B instruct",
    "Llama 3.1 8B instruct",
    "Llama 3.1 70B instruct",
    "Qwen 2.5 72B instruct"
]

ax.set_xticklabels(models_labels, rotation=30, ha='right', fontsize=10)
ax.legend(loc="upper right", prop={'size': 10})

# Use the explicit Axes API, consistent with the other calls in this cell.
ax.grid(axis="y", linestyle="dashed", color="gray", linewidth=0.4, alpha=0.5)

# Raw string: "\%" is not a valid Python escape sequence and triggers a
# warning in a normal string literal; the resulting text is unchanged.
ax.set_ylabel(r"Optimizer-Gain (\%)")

# fig.savefig("./pgfs/MetaAvgGainPerLlm.pgf")

In [None]:
# Mean of each setup's per-model values, printed with two decimals.
average_gains = {
    label: np.mean(series)
    for label, series in (
        ("ZS-O1", zs_o1),
        ("ZS-O2", zs_o2),
        ("FS-O1", fs_o1),
        ("FS-O2", fs_o2),
    )
}

for key, value in average_gains.items():
    print(f"{key}: {value:.2f}")

In [None]:
# Model identifiers grouped by family.
llama_llms = [
    "llama_3_2_1b_instruct",
    "llama_3_2_3b_instruct",
    "llama_3_1_8b_instruct",
    "llama_3_1_70b_instruct",
]

qwen_llms = [
    "qwen_2_5_1p5b_instruct",
    "qwen_2_5_3b_instruct",
    "qwen_2_5_7b_instruct",
    "qwen_2_5_72b_instruct"
]

# "zs_o1_gain" of every model, scaled by 100, collected per family.
llama_zs_o1_gains = [gains_by_llm[llm]["zs_o1_gain"]*100 for llm in llama_llms]
qwen_zs_o1_gains = [gains_by_llm[llm]["zs_o1_gain"]*100 for llm in qwen_llms]

# Family means: Llama first, then Qwen.
print(np.mean(llama_zs_o1_gains), np.mean(qwen_zs_o1_gains))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Hard-coded gains per model and setup. Assigning them here replaces whatever
# `gains_by_llm` held before this cell ran.
gains_by_llm = {
    'llama_3_2_1b_instruct': {'zs_o1_gain': 0.0405, 'zs_o2_gain': 0.0506, 'fs_o1_gain': 0.2495, 'fs_o2_gain': 0.3001},
    'llama_3_2_3b_instruct': {'zs_o1_gain': 0.0781, 'zs_o2_gain': 0.0504, 'fs_o1_gain': 0.2770, 'fs_o2_gain': 0.3417},
    'llama_3_1_8b_instruct': {'zs_o1_gain': 0.0079, 'zs_o2_gain': 0.0189, 'fs_o1_gain': 0.1302, 'fs_o2_gain': 0.1244},
    'llama_3_1_70b_instruct': {'zs_o1_gain': 0.0382, 'zs_o2_gain': 0.0779, 'fs_o1_gain': 0.1546, 'fs_o2_gain': 0.1493},
    'qwen_2_5_1p5b_instruct': {'zs_o1_gain': 0.0540, 'zs_o2_gain': 0.0844, 'fs_o1_gain': 0.2099, 'fs_o2_gain': 0.2591},
    'qwen_2_5_3b_instruct': {'zs_o1_gain': 0.0626, 'zs_o2_gain': 0.0817, 'fs_o1_gain': 0.1942, 'fs_o2_gain': 0.1784},
    'qwen_2_5_7b_instruct': {'zs_o1_gain': 0.0883, 'zs_o2_gain': 0.1242, 'fs_o1_gain': 0.1877, 'fs_o2_gain': 0.2207},
    'qwen_2_5_72b_instruct': {'zs_o1_gain': 0.0815, 'zs_o2_gain': 0.0909, 'fs_o1_gain': 0.1085, 'fs_o2_gain': 0.1402}
}

# One array per setup, with the models in dict order, so that position i
# refers to the same model in all four arrays (required for the paired tests).
models = list(gains_by_llm)
zs_o1, zs_o2, fs_o1, fs_o2 = (
    np.array([gains_by_llm[name][gain_key] for name in models])
    for gain_key in ("zs_o1_gain", "zs_o2_gain", "fs_o1_gain", "fs_o2_gain")
)

# Paired t-tests between setups.
# Zero-Shot O1 vs. Few-Shot O1
t3, p3 = stats.ttest_rel(zs_o1, fs_o1)
# Zero-Shot O2 vs. Few-Shot O2
t4, p4 = stats.ttest_rel(zs_o2, fs_o2)
# Zero-Shot O1 vs. Zero-Shot O2
t_zs, p_zs = stats.ttest_rel(zs_o1, zs_o2)
# Few-Shot O1 vs. Few-Shot O2
t_fs, p_fs = stats.ttest_rel(fs_o1, fs_o2)

print(f"Zero-Shot O1 vs. Few-Shot O1: t = {t3}, p = {p3}")
print(f"Zero-Shot O2 vs. Few-Shot O2: t = {t4}, p = {p4}")
print(f"Zero-Shot O1 vs. Zero-Shot O2: t = {t_zs}, p = {p_zs}")
print(f"Few-Shot O1 vs. Few-Shot O2: t = {t_fs}, p = {p_fs}")

In [None]:
from scipy import stats


# Size value assigned to each model included in the correlation; the 70B and
# 72B models are left out here.
model_sizes = {
    "llama_3_2_1b_instruct": 1.0,
    "llama_3_2_3b_instruct": 3.0,
    "llama_3_1_8b_instruct": 8.0,
    "qwen_2_5_1p5b_instruct": 1.5,
    "qwen_2_5_3b_instruct": 3.0,
    "qwen_2_5_7b_instruct": 7.0,
}

zs_o1_scores_by_llm, zs_o2_scores_by_llm = {}, {}
fs_o1_scores_by_llm, fs_o2_scores_by_llm = {}, {}

for extractor_data in (
    malware_extractor_data,
    threat_actor_extractor_data,
    attack_pattern_extractor_data,
    targets_extractor_data,
):
    # Start from empty score tables for every extractor.
    zs_o1_scores_by_llm, zs_o2_scores_by_llm = {}, {}
    fs_o1_scores_by_llm, fs_o2_scores_by_llm = {}, {}

    for model, value in extractor_data.items():
        zs_o1_scores_by_llm[model] = np.mean(value["ZERO-O1"]["f1"])
        zs_o2_scores_by_llm[model] = np.mean(value["ZERO-O2"]["f1"])
        fs_o1_scores_by_llm[model] = np.mean(value["FS-O1"]["f1"])
        fs_o2_scores_by_llm[model] = np.mean(value["FS-O2"]["f1"])

    # Every size is listed twice because the O1 and O2 scores are concatenated.
    sized_models = list(model_sizes)
    doubled_sizes = [model_sizes[m] for m in sized_models] * 2

    # Zero-shot scores vs. size (one-sided test for a positive correlation).
    zero_shot_scores = [zs_o1_scores_by_llm[m] for m in sized_models] + [zs_o2_scores_by_llm[m] for m in sized_models]
    r, p = stats.pearsonr(doubled_sizes, zero_shot_scores, alternative="greater")
    print(r, p < 0.05, p)

    # Few-shot scores vs. size (one-sided test for a positive correlation).
    few_shot_scores = [fs_o1_scores_by_llm[m] for m in sized_models] + [fs_o2_scores_by_llm[m] for m in sized_models]
    r, p = stats.pearsonr(doubled_sizes, few_shot_scores, alternative="greater")
    print(r, p < 0.05, p)

In [None]:
# Size value per model for the reduced model set (70B / 72B models excluded).
model_sizes_1_to_8 = {
    "llama_3_2_1b_instruct": 1.0,
    "llama_3_2_3b_instruct": 3.0,
    "llama_3_1_8b_instruct": 8.0,
    "qwen_2_5_1p5b_instruct": 1.5,
    "qwen_2_5_3b_instruct": 3.0,
    "qwen_2_5_7b_instruct": 7.0,
}

# Size value per model for the full model set.
model_sizes_all = {
    "llama_3_2_1b_instruct": 1.0,
    "llama_3_2_3b_instruct": 3.0,
    "llama_3_1_8b_instruct": 8.0,
    "llama_3_1_70b_instruct": 70.0,
    "qwen_2_5_1p5b_instruct": 1.5,
    "qwen_2_5_3b_instruct": 3.0,
    "qwen_2_5_7b_instruct": 7.0,
    "qwen_2_5_72b_instruct": 72.0,
}


def _size_score_correlation(sizes, first_scores, second_scores):
    """Correlate model size with the concatenation of two score tables.

    Args:
        sizes: dict mapping model name to its size value; its key order
            fixes the model order.
        first_scores: dict mapping model name to a score.
        second_scores: dict mapping model name to a score.

    Returns:
        The result of ``stats.pearsonr`` with ``alternative="greater"``;
        it unpacks to ``(r, p)``. Every size appears twice, once for each
        score table.
    """
    names = list(sizes)
    size_values = [sizes[m] for m in names] * 2
    score_values = [first_scores[m] for m in names] + [second_scores[m] for m in names]
    return stats.pearsonr(size_values, score_values, alternative="greater")


# `extractor_name` rather than `type`, so the builtin `type` is not shadowed
# for the rest of the kernel session.
for extractor_name, extractor_data in zip(
    ["Malware-Extractor", "Threat-Actor-Extractor", "Attack-Pattern-Extractor", "Targets-Extractor"],
    [malware_extractor_data, threat_actor_extractor_data, attack_pattern_extractor_data, targets_extractor_data],
):
    baseline_scores_by_llm = dict()
    zs_o1_scores_by_llm, zs_o2_scores_by_llm = dict(), dict()
    fs_o1_scores_by_llm, fs_o2_scores_by_llm = dict(), dict()

    print("\n++++++++++++++++++++++++++++++++\n", extractor_name, sep="")

    # Collect the mean F1 of every setup for all models.
    for model, value in extractor_data.items():
        baseline_scores_by_llm[model] = np.mean(value["Baseline"]["f1"])

        zs_o1_scores_by_llm[model] = np.mean(value["ZERO-O1"]["f1"])
        zs_o2_scores_by_llm[model] = np.mean(value["ZERO-O2"]["f1"])

        fs_o1_scores_by_llm[model] = np.mean(value["FS-O1"]["f1"])
        fs_o2_scores_by_llm[model] = np.mean(value["FS-O2"]["f1"])

    # Correlations for the reduced model set ("small") and the full set ("all").
    small_r, small_p = _size_score_correlation(model_sizes_1_to_8, zs_o1_scores_by_llm, zs_o2_scores_by_llm)
    all_r, all_p = _size_score_correlation(model_sizes_all, zs_o1_scores_by_llm, zs_o2_scores_by_llm)
    print("Zero-Shot O1/O2:", np.round(small_r, 3), np.round(small_p, 4), small_p<0.05, "||", np.round(all_r, 3), np.round(all_p, 4), all_p<0.05)

    small_r, small_p = _size_score_correlation(model_sizes_1_to_8, fs_o1_scores_by_llm, fs_o2_scores_by_llm)
    all_r, all_p = _size_score_correlation(model_sizes_all, fs_o1_scores_by_llm, fs_o2_scores_by_llm)
    print("Few-Shot O1/O2:", np.round(small_r, 3), np.round(small_p, 4), small_p<0.05, "||", np.round(all_r, 3), np.round(all_p, 4), all_p<0.05)