In [1]:
import ast
from collections import Counter
from pathlib import Path

import pandas as pd
from spacy.scorer import PRFScore

In [2]:
# Directory holding the data frames of this experiment.
exp_dir = Path("dev_dataset_split_by_names/")

# Training split: reference ("true") frame and the frame produced by the model.
train_true_df_path = exp_dir.joinpath("cyber_attrs_training_df.zip")
train_pred_df_path = exp_dir.joinpath("cyber_attrs_training_by_model_df.zip")
# Eval split: reference ("true") frame and the frame produced by the model.
test_true_df_path = exp_dir.joinpath("cyber_attrs_eval_df.zip")
test_pred_df_path = exp_dir.joinpath("cyber_attrs_eval_by_model_df.zip")

#### utils

In [4]:
def count_names(df: pd.DataFrame, col_name: str) -> Counter:
    """Count how often each name occurs in a column of name lists.

    Each cell of ``df[col_name]`` is either a list of names or the string
    representation of such a list (e.g. ``"['apt30']"``, as produced when the
    frame is round-tripped through CSV).

    Args:
        df: frame holding the column to count.
        col_name: name of the column whose cells are name lists.

    Returns:
        A ``Counter`` mapping every name to its number of occurrences over
        all rows.

    Raises:
        ValueError, SyntaxError: if a string cell is not a valid Python literal.
    """

    def _as_list(cell):
        # Parse string cells with literal_eval instead of eval: the text comes
        # from a data file, and eval would execute arbitrary expressions.
        # Non-string cells are assumed to be lists already and pass through.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    # Decide per cell, so one already-parsed cell cannot leave the remaining
    # string cells unparsed (they would then be counted character by character).
    return Counter(name for names in df[col_name].map(_as_list) for name in names)


def get_lower_names(counter: Counter):
    """Return the set of the counter's keys converted to lower case."""
    lowered = set()
    for key in counter:
        lowered.add(key.lower())
    return lowered

# Evaluate on test-set

In [None]:
# Read the reference frame and the model-output frame of the eval split.
gold_df, cand_df = (
    pd.read_csv(csv_path) for csv_path in (test_true_df_path, test_pred_df_path)
)

In [5]:
# Column to evaluate; it is read from both `gold_df` and `cand_df`.
entity_type = "group_name"

# Occurrence counts per name on each side, then the distinct lower-cased names.
gold_counter, cand_counter = (
    count_names(frame, entity_type) for frame in (gold_df, cand_df)
)
gold_names, cand_names = (
    get_lower_names(counts) for counts in (gold_counter, cand_counter)
)

# Every name that appears on either side.
all_names = gold_names | cand_names

# Sizes of: gold set, candidate set, gold-only names, candidate-only names, union.
print(f"{entity_type}: {len(gold_names)=}")
print(f"{entity_type}: {len(cand_names)=}")
print(f"{entity_type}: {len(gold_names-cand_names)=}")
print(f"{entity_type}: {len(cand_names-gold_names)=}")
print(f"{entity_type} {len(all_names)=}")


group_name: len(gold_names)=19
group_name: len(cand_names)=149
group_name: len(gold_names-cand_names)=6
group_name: len(cand_names-gold_names)=136
group_name len(all_names)=155


In [6]:
# slice into interesting lines only -> then put the gold and candidate name lists side by side
assert gold_df.shape == cand_df.shape
# A row is interesting when either side has a non-empty list. The comparison is
# against the literal "[]" because the cells are still raw strings here.
# The mask is built once and applied to both frames so they keep the same rows.
has_names_mask = (gold_df[entity_type] != "[]") | (cand_df[entity_type] != "[]")
gold_names_series = gold_df.loc[has_names_mask, entity_type]
cand_names_series = cand_df.loc[has_names_mask, entity_type]

comp_gold_cand_df = pd.concat([gold_names_series, cand_names_series], axis=1)
comp_gold_cand_df.columns = ["gold_names", "cand_names"]
print(f"{comp_gold_cand_df.shape=}")


# convert to be list based
# literal_eval only accepts Python literals, so text coming from the data file
# is parsed without being executed; cells that are not strings are kept as-is.
for col in comp_gold_cand_df.columns:
    comp_gold_cand_df[col] = comp_gold_cand_df[col].map(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )

comp_gold_cand_df.shape=(2147, 2)


In [7]:
#TODO CALC AGAIN ALL_NAME??

# One PRFScore accumulator per name in `all_names` (which holds lower-cased names).
prf_names = {name: PRFScore() for name in all_names}
# Walk the rows by column position (0 = gold lists, 1 = candidate lists).
for line_gold_names, line_cand_names in zip(
    comp_gold_cand_df.iloc[:, 0], comp_gold_cand_df.iloc[:, 1]
):
    # The keys of `prf_names` are lower-cased, so normalise the per-row names
    # the same way; a mixed-case name would otherwise raise KeyError below.
    line_gold_names = [name.lower() for name in line_gold_names]
    line_cand_names = [name.lower() for name in line_cand_names]

    # Calc TruePositive + FalsePositive (counted once per predicted occurrence)
    for name in line_cand_names:
        if name in line_gold_names:
            prf_names[name].tp += 1
        else:
            prf_names[name].fp += 1
    # Calc FalseNegative (once per distinct gold name missing from the prediction)
    for name in set(line_gold_names) - set(line_cand_names):
        prf_names[name].fn += 1


# NOTE(review): the last entry, labelled "acc" below, is tp / (tp + fn); the
# values displayed for "acc" equal those of "recall" — confirm this is intended.
prf_dict = {
    name: [prf.tp, prf.fp, prf.fn, prf.recall, prf.precision, prf.fscore, (prf.tp / (prf.tp + prf.fn + 1e-100))]
    for name, prf in prf_names.items()
}
# The dict yields one column per name; transpose to get one row per name.
prd_df = pd.DataFrame(prf_dict).transpose()
prd_df.columns = ["tp", "fp", "fn", "recall", "precision", "f1", "acc"]
print(f"{prd_df.shape=}")
with pd.option_context('display.max_rows', 500, 'display.max_columns', 10):
    display(prd_df)

prd_df.shape=(155, 7)


Unnamed: 0,tp,fp,fn,recall,precision,f1,acc
−− hklm\software\microsoft\windows\currentversion\,0.0,1.0,0.0,0.0,0.0,0.0,0.0
your network,0.0,3.0,0.0,0.0,0.0,0.0,0.0
apt30,202.0,0.0,11.0,0.948357,1.0,0.973494,0.948357
java zero,0.0,7.0,0.0,0.0,0.0,0.0,0.0
apt28 ttps,0.0,2.0,0.0,0.0,0.0,0.0,0.0
apt29 phishing,0.0,2.0,0.0,0.0,0.0,0.0,0.0
splm,0.0,2.0,0.0,0.0,0.0,0.0,0.0
webc2,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ktae,0.0,1.0,0.0,0.0,0.0,0.0,0.0
mimikatz,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Evaluate on train-set (code copy-pasted from the test-set evaluation above)

In [None]:
# Read the reference frame and the model-output frame of the training split.
# NOTE(review): in the saved notebook this cell has no execution count, and the
# outputs recorded for the following cell equal the test-set outputs line for
# line — re-run from a fresh kernel to confirm the train-set numbers really
# come from these files.
gold_df, cand_df = (
    pd.read_csv(csv_path) for csv_path in (train_true_df_path, train_pred_df_path)
)

In [9]:
# Column to evaluate; it is read from both `gold_df` and `cand_df`.
entity_type = "group_name"

gold_counter = count_names(gold_df, entity_type)
gold_names = get_lower_names(gold_counter)

cand_counter = count_names(cand_df, entity_type)
cand_names = get_lower_names(cand_counter)

all_names = cand_names.union(gold_names)

print(f"{entity_type}: {len(gold_names)=}")
print(f"{entity_type}: {len(cand_names)=}")
print(f"{entity_type}: {len(gold_names-cand_names)=}")
print(f"{entity_type}: {len(cand_names-gold_names)=}")
print(f"{entity_type} {len(all_names)=}")
################################################################################
# slice into interesting lines only -> then put the gold and candidate name lists side by side
assert gold_df.shape == cand_df.shape
# A row is interesting when either side has a non-empty list. The comparison is
# against the literal "[]" because the cells are still raw strings here.
# The mask is built once and applied to both frames so they keep the same rows.
has_names_mask = (gold_df[entity_type] != "[]") | (cand_df[entity_type] != "[]")
gold_names_series = gold_df.loc[has_names_mask, entity_type]
cand_names_series = cand_df.loc[has_names_mask, entity_type]

comp_gold_cand_df = pd.concat([gold_names_series, cand_names_series], axis=1)
comp_gold_cand_df.columns = ["gold_names", "cand_names"]
print(f"{comp_gold_cand_df.shape=}")


# convert to be list based
# literal_eval only accepts Python literals, so text coming from the data file
# is parsed without being executed; cells that are not strings are kept as-is.
for col in comp_gold_cand_df.columns:
    comp_gold_cand_df[col] = comp_gold_cand_df[col].map(
        lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
    )
################################################################################
# One PRFScore accumulator per name in `all_names` (which holds lower-cased names).
prf_names = {name: PRFScore() for name in all_names}
# Walk the rows by column position (0 = gold lists, 1 = candidate lists).
for line_gold_names, line_cand_names in zip(
    comp_gold_cand_df.iloc[:, 0], comp_gold_cand_df.iloc[:, 1]
):
    # The keys of `prf_names` are lower-cased, so normalise the per-row names
    # the same way; a mixed-case name would otherwise raise KeyError below.
    line_gold_names = [name.lower() for name in line_gold_names]
    line_cand_names = [name.lower() for name in line_cand_names]

    # Calc TruePositive + FalsePositive (counted once per predicted occurrence)
    for name in line_cand_names:
        if name in line_gold_names:
            prf_names[name].tp += 1
        else:
            prf_names[name].fp += 1
    # Calc FalseNegative (once per distinct gold name missing from the prediction)
    for name in set(line_gold_names) - set(line_cand_names):
        prf_names[name].fn += 1


# NOTE(review): the last entry, labelled "acc" below, is tp / (tp + fn); the
# values displayed for "acc" equal those of "recall" — confirm this is intended.
prf_dict = {
    name: [prf.tp, prf.fp, prf.fn, prf.recall, prf.precision, prf.fscore, (prf.tp / (prf.tp + prf.fn + 1e-100))]
    for name, prf in prf_names.items()
}
# The dict yields one column per name; transpose to get one row per name.
prd_df = pd.DataFrame(prf_dict).transpose()
prd_df.columns = ["tp", "fp", "fn", "recall", "precision", "f1", "acc"]
print(f"{prd_df.shape=}")
with pd.option_context('display.max_rows', 500, 'display.max_columns', 10):
    display(prd_df)

group_name: len(gold_names)=19
group_name: len(cand_names)=149
group_name: len(gold_names-cand_names)=6
group_name: len(cand_names-gold_names)=136
group_name len(all_names)=155
comp_gold_cand_df.shape=(2147, 2)
prd_df.shape=(155, 7)


Unnamed: 0,tp,fp,fn,recall,precision,f1,acc
−− hklm\software\microsoft\windows\currentversion\,0.0,1.0,0.0,0.0,0.0,0.0,0.0
your network,0.0,3.0,0.0,0.0,0.0,0.0,0.0
apt30,202.0,0.0,11.0,0.948357,1.0,0.973494,0.948357
java zero,0.0,7.0,0.0,0.0,0.0,0.0,0.0
apt28 ttps,0.0,2.0,0.0,0.0,0.0,0.0,0.0
apt29 phishing,0.0,2.0,0.0,0.0,0.0,0.0,0.0
splm,0.0,2.0,0.0,0.0,0.0,0.0,0.0
webc2,0.0,1.0,0.0,0.0,0.0,0.0,0.0
ktae,0.0,1.0,0.0,0.0,0.0,0.0,0.0
mimikatz,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Micro-averaged totals: sum the per-name counts of `prd_df` into a single
# PRFScore and read recall / precision / f-score from it.
prf = PRFScore()
prf.tp = prd_df.tp.sum()
prf.fp = prd_df.fp.sum()
prf.fn = prd_df.fn.sum()

# Build a one-row frame with the same seven entries as the per-name table.
prf_tot_df = pd.DataFrame([prf.tp, prf.fp, prf.fn,prf.recall, prf.precision, prf.fscore, (prf.tp/(prf.tp+prf.fn+1e-100))]).transpose()
# Reuse the per-name column labels so the totals row reads like the table above
# (the original cell ended in an unfinished `prd_df.` statement, a syntax error,
# and displayed `prd_df` instead of the totals).
prf_tot_df.columns = prd_df.columns
prf_tot_df

Unnamed: 0,tp,fp,fn,recall,precision,f1,acc
0,1925.0,339.0,717.0,0.728615,0.850265,0.784753,0.728615
