In [14]:
import pandas as pd
from os import path, makedirs

# Lift the column limit so wide DataFrames are displayed without truncation.
pd.set_option('display.max_columns', None)

In [15]:
# Scratch directory ("./_tmp") created next to the notebook.
tmp_data_directory = path.join(path.dirname('./'), '_tmp')

# exist_ok=True makes the creation idempotent and removes the window between
# an "exists?" check and the actual mkdir in which another process could
# create the directory and make makedirs() raise FileExistsError.
makedirs(tmp_data_directory, exist_ok=True)

In [16]:
def coalesce(df: pd.DataFrame, cols: list[str]) -> pd.Series:
    """
    Return, row by row, the first non-null value found across the given columns.

    Columns are consulted in the order they appear in ``cols``: a later column
    only fills the positions that are still null after all earlier ones.

    :param df: DataFrame holding the columns.
    :param cols: Names of the columns to coalesce, in priority order.
    :return: Series with the first non-null value of each row.
    :raises ValueError: If ``cols`` is empty.
    """
    if not cols:
        raise ValueError("A lista de colunas não pode estar vazia.")

    head, *tail = cols
    merged = df[head]
    for name in tail:
        # combine_first keeps existing values and fills nulls from the next column.
        merged = merged.combine_first(df[name])
    return merged


In [17]:
def write_dataframe(df: pd.DataFrame, name: str):
    """
    Persist ``df`` as a Parquet file inside the ``./_tmp`` directory.

    :param df: DataFrame to write.
    :param name: File name without extension; ``.parquet`` is appended.
    """
    target = f"./_tmp/{name}.parquet"
    df.to_parquet(target)

In [18]:
def read_columns_txt(path: str) -> list[str]:
    """
    Read a text file and return its lines without the line terminators.

    :param path: Location of the text file to read. Inside this function the
        parameter shadows the ``os.path`` module imported by the notebook; the
        name is kept because callers pass it by keyword.
    :return: One string per line of the file, in file order.
    """
    # The context manager closes the handle deterministically; the previous
    # bare open(...).read() left that to the garbage collector.
    with open(path, "r") as handle:
        return handle.read().splitlines()

In [19]:
def get_vcdb(columns: list = None) -> pd.DataFrame:
    """
    Load ``./data/vcdb.csv`` restricted to a subset of its columns.

    :param columns: Column names to load. When omitted (or empty) every name
        listed in ``columns.txt`` is loaded instead.
    :return: DataFrame with the requested columns.
    """
    selected = columns if columns else read_columns_txt(path="columns.txt")
    return pd.read_csv("./data/vcdb.csv", low_memory=False, usecols=selected)

In [20]:
def filter_columns(include_patterns: list[str], exclude_patterns: list[str] = None) -> list[str]:
    """
    Select column names from ``columns.txt`` using dotted wildcard patterns.

    A pattern is split on ``"."`` and compared segment by segment with the
    column name; ``"*"`` matches any single segment. The comparison is a
    *prefix* match: a column with more segments than the pattern still matches
    as long as its leading segments agree (``"asset.*"`` matches
    ``"asset.cloud.Unknown"``). A column with fewer segments never matches.

    The result keeps the order in which the columns appear in ``columns.txt``
    and contains no duplicates, so it is identical on every run. Callers pick
    "the first matching column", which makes a stable order essential.

    :param include_patterns: Patterns of the columns to keep.
    :param exclude_patterns: Patterns of columns to drop from the included set.
    :return: Selected column names; ``"incident_id"`` is always present
        (appended at the end when no include pattern selected it, or when an
        exclude pattern removed it).
    """
    columns = read_columns_txt("columns.txt")

    def matches(col_parts: list[str], pattern_parts: list[str]) -> bool:
        if len(col_parts) < len(pattern_parts):
            return False
        # zip() stops at the shorter sequence, hence the prefix semantics.
        return all(p == "*" or p == c for p, c in zip(pattern_parts, col_parts))

    # Split every pattern once instead of once per (pattern, column) pair.
    include_parts = [pattern.split(".") for pattern in include_patterns]
    exclude_parts = [pattern.split(".") for pattern in (exclude_patterns or [])]

    included = []
    # dict.fromkeys de-duplicates while preserving file order; the previous
    # list(set(...)) produced a hash-dependent order that could change between
    # kernel restarts.
    for col in dict.fromkeys(columns):
        col_parts = col.split(".")
        if not any(matches(col_parts, parts) for parts in include_parts):
            continue
        if any(matches(col_parts, parts) for parts in exclude_parts):
            continue
        included.append(col)

    # The incident identifier is always included.
    if "incident_id" not in included:
        included.append("incident_id")

    return included


In [21]:
def undo_onehot_encoding(df: pd.DataFrame, prefix: str) -> pd.Series:
    """
    Collapse a group of one-hot flag columns into a single label column.

    Every column whose name starts with ``prefix + "."`` belongs to the group.
    For each row the label of the first set flag (in ``df`` column order) is
    returned; the label is the last dot-separated segment of the column name.

    :param df: DataFrame containing the flag columns.
    :param prefix: Dotted prefix shared by the flag columns (without the
        trailing dot).
    :return: Series aligned with ``df.index`` holding the label of the first
        set flag per row, or ``None`` when no flag of the group is set (or no
        column matches the prefix).
    """
    marker = prefix + "."
    columns = [col for col in df.columns if col.startswith(marker)]
    # Strip only the leading prefix (str.replace would also remove later
    # occurrences of it inside the name) and keep the final segment.
    labels = [col[len(marker):].split(".")[-1] for col in columns]

    def first_label(flags):
        for label, flag in zip(labels, flags):
            # A missing flag (NaN/None/NA) is "not set"; NaN alone is truthy
            # in Python and would otherwise be reported as the row's label.
            if pd.notna(flag) and flag:
                return label
        return None

    # Iterating the underlying array is far cheaper than DataFrame.apply with
    # axis=1, which builds a Series object for every row.
    values = [first_label(row) for row in df[columns].to_numpy()]
    return pd.Series(values, index=df.index, dtype=object)

In [22]:
def first_true_column(df: pd.DataFrame, cols: list[str]) -> pd.Series:
    """
    Return, for each row, the name of the first column holding a truthy value.

    Columns are checked in the order given by ``cols``. Missing values
    (``None``, ``NaN``, ``pd.NA``) count as not set, so the function works both
    on boolean flags and on label columns where "absent" is encoded as null.

    :param df: DataFrame containing the columns.
    :param cols: Names of the columns to inspect, in priority order.
    :return: Series aligned with ``df.index`` holding the name of the first
        truthy column per row, or ``None`` when there is none.
    """
    names = list(cols)

    def pick(row_values):
        for name, value in zip(names, row_values):
            # NaN is truthy and pd.NA raises in a boolean context, so nulls
            # are filtered out explicitly before the truthiness test.
            if pd.notna(value) and value:
                return name
        return None

    # Plain iteration over the array avoids the per-row Series construction
    # of DataFrame.apply(axis=1).
    picked = [pick(row) for row in df[names].to_numpy()]
    return pd.Series(picked, index=df.index, dtype=object)

# Actor

In [23]:
cols = filter_columns(include_patterns=["actor.*.variety.*", "actor.*.motive.*"])
actors_varieties = get_vcdb(cols)

actor_types = ["internal", "external", "partner"]

# Collapse the one-hot variety flags of every actor type into a label column.
# The prefix is restricted to ".variety": the frame also holds
# actor.<type>.motive.* flags, and the broader "actor.<type>" prefix let a
# motive label end up in `actor_variety` whenever a motive column came first.
for actor in actor_types:
    actors_varieties[actor] = undo_onehot_encoding(actors_varieties[cols], f"actor.{actor}.variety")

# ("incident_id") is a plain string, not a tuple, so `not in` performed a
# substring test; compare for equality instead.
cols = [s for s in cols if s != "incident_id"]

motive_cols = [col for col in cols if "motive" in col]

# Keep only the last segment of the first set motive flag; rows without any
# motive flag stay None.
actors_varieties['motive'] = first_true_column(actors_varieties, motive_cols).apply(
    lambda x: x.split(".")[-1] if isinstance(x, str) else None
)
# The actor is the first actor type that has a variety label; its label
# becomes the actor variety.
actors_varieties['actor'] = first_true_column(actors_varieties, actor_types)
actors_varieties['actor_variety'] = coalesce(actors_varieties, actor_types)

# Drop the raw flags and the per-type helper columns.
actors_varieties = actors_varieties.drop(columns=cols + actor_types)

write_dataframe(actors_varieties, "actors_varieties")

print(actors_varieties.shape)
print(actors_varieties.dtypes)

actors_varieties.head()

(10394, 4)
incident_id      object
motive           object
actor            object
actor_variety    object
dtype: object


Unnamed: 0,incident_id,motive,actor,actor_variety
0,0001AA7F-C601-424A-B2B8-BE6C9F5164E7,Unknown,internal,Unknown
1,0eb22d90-8e87-11ec-a600-657ec2694f8f,,internal,
2,0008DADB-E83D-4278-A19A-CEE01610CF43,Financial,external,Unknown
3,000D403E-2DC9-4EA7-9294-BD3938D1C3C7,Unknown,partner,Unknown
4,0012CC25-9167-40D8-8FE3-3D0DFD8FB6BB,Financial,external,Unknown


# Asset

In [24]:
cols = filter_columns(include_patterns=["asset.*"], exclude_patterns=["asset.assets.amount.*", "asset.country.*"])
assets = get_vcdb(cols)

# Columns kept as-is in the output; everything else is a one-hot flag.
cols = [s for s in cols if s not in ("incident_id", "asset.total_amount")]

# Output column -> prefix of the one-hot group it is decoded from.
asset_prefixes = {
    "asset_cloud": "asset.cloud",
    "asset_hosting": "asset.hosting",
    "asset_management": "asset.management",
    "asset_ownership": "asset.ownership",
    "asset_role": "asset.role",
    "asset_variety": "asset.assets.variety",
}
for target, prefix in asset_prefixes.items():
    assets[target] = undo_onehot_encoding(assets[cols], prefix)

# Keep only the text after the last " - " separator of the variety label.
# Rows with no variety flag hold None and are passed through unchanged;
# calling .split on them raised AttributeError.
assets["asset_variety"] = assets["asset_variety"].apply(
    lambda x: x.split(" - ")[-1] if isinstance(x, str) else x
)

assets = assets.drop(columns=cols)
write_dataframe(assets, "assets")

print(assets.shape)
print(assets.dtypes)

assets.head()

(10394, 8)
asset.total_amount    float64
incident_id            object
asset_cloud            object
asset_hosting          object
asset_management       object
asset_ownership        object
asset_role             object
asset_variety          object
dtype: object


Unnamed: 0,asset.total_amount,incident_id,asset_cloud,asset_hosting,asset_management,asset_ownership,asset_role,asset_variety
0,,0001AA7F-C601-424A-B2B8-BE6C9F5164E7,Unknown,,,,,Database
1,1.0,0eb22d90-8e87-11ec-a600-657ec2694f8f,External Cloud Asset(s),,,,IT,File
2,,0008DADB-E83D-4278-A19A-CEE01610CF43,Unknown,,,,,Desktop or laptop
3,,000D403E-2DC9-4EA7-9294-BD3938D1C3C7,Unknown,,,,,Disk media
4,,0012CC25-9167-40D8-8FE3-3D0DFD8FB6BB,Unknown,,,,,Web application


# Action


In [25]:
cols = filter_columns(include_patterns=["action.*.variety.*"])
action_varieties = get_vcdb(cols)

action_types = ["environmental", "error", "hacking", "malware", "misuse", "physical", "social"]

# Collapse the one-hot variety flags of every action type into a label column.
for action in action_types:
    action_varieties[action] = undo_onehot_encoding(action_varieties[cols], f"action.{action}")

# ("incident_id") is a plain string, not a tuple, so `not in` performed a
# substring test; compare for equality instead.
cols = [s for s in cols if s != "incident_id"]

action_varieties = action_varieties.drop(columns=cols)

# The action is the first type (in `action_types` order) that has a variety
# label; its label becomes the action variety.
action_varieties['action'] = first_true_column(action_varieties, action_types)
action_varieties['action_variety'] = coalesce(action_varieties, action_types)
action_varieties = action_varieties.drop(columns=action_types)

write_dataframe(action_varieties, "action_varieties")
print(action_varieties.shape)
print(action_varieties.dtypes)
action_varieties.head()

(10394, 3)
incident_id       object
action            object
action_variety    object
dtype: object


Unnamed: 0,incident_id,action,action_variety
0,0001AA7F-C601-424A-B2B8-BE6C9F5164E7,misuse,Privilege abuse
1,0eb22d90-8e87-11ec-a600-657ec2694f8f,error,Misconfiguration
2,0008DADB-E83D-4278-A19A-CEE01610CF43,physical,Theft
3,000D403E-2DC9-4EA7-9294-BD3938D1C3C7,error,Loss
4,0012CC25-9167-40D8-8FE3-3D0DFD8FB6BB,hacking,Exploit vuln


In [26]:
cols = filter_columns(include_patterns=["*.country.*", "actor.external.country.*"])
location = get_vcdb(cols)

# Everything except the identifier is a one-hot country flag.
cols = [name for name in cols if name != "incident_id"]

# Decode each group of country flags into one label column, then discard
# the raw flag columns.
location = location.assign(
    external_actor_country=undo_onehot_encoding(location[cols], "actor.external.country"),
    asset_country=undo_onehot_encoding(location[cols], "asset"),
    victim_country=undo_onehot_encoding(location[cols], "victim"),
).drop(columns=cols)

write_dataframe(location, "location")
print(location.shape)
print(location.dtypes)

location.head()

(10394, 4)
incident_id               object
external_actor_country    object
asset_country             object
victim_country            object
dtype: object


Unnamed: 0,incident_id,external_actor_country,asset_country,victim_country
0,0001AA7F-C601-424A-B2B8-BE6C9F5164E7,,,US
1,0eb22d90-8e87-11ec-a600-657ec2694f8f,,,ES
2,0008DADB-E83D-4278-A19A-CEE01610CF43,Unknown,,US
3,000D403E-2DC9-4EA7-9294-BD3938D1C3C7,,,US
4,0012CC25-9167-40D8-8FE3-3D0DFD8FB6BB,Unknown,,GB


In [27]:
# Load the incident date parts, shorten the column names to their last
# dotted segment and store day/month as nullable 8-bit integers.
timeline = (
    get_vcdb(["incident_id", "timeline.incident.year", "timeline.incident.month", "timeline.incident.day"])
    .rename(columns=lambda name: name.split(".")[-1])
    .astype({"day": "Int8", "month": "Int8"})
)

write_dataframe(timeline, "timeline")
print(timeline.shape)
print(timeline.dtypes)

timeline.head()

(10394, 4)
incident_id    object
day              Int8
month            Int8
year            int64
dtype: object


Unnamed: 0,incident_id,day,month,year
0,0001AA7F-C601-424A-B2B8-BE6C9F5164E7,9.0,4.0,2010
1,0eb22d90-8e87-11ec-a600-657ec2694f8f,,,2020
2,0008DADB-E83D-4278-A19A-CEE01610CF43,,,2014
3,000D403E-2DC9-4EA7-9294-BD3938D1C3C7,,,2014
4,0012CC25-9167-40D8-8FE3-3D0DFD8FB6BB,,,2012


# Misc

## Action Notes

In [28]:
cols = filter_columns(include_patterns=["action.*.notes"])
action_notes = get_vcdb(cols)

# Only the notes columns take part in the coalesce.
cols.remove("incident_id")

# Keep the first non-null note per incident and discard the per-action columns.
action_notes = action_notes.assign(
    action_notes=coalesce(action_notes, cols)
).drop(columns=cols)
write_dataframe(action_notes, "action_notes")

print(action_notes.shape)
print(action_notes.dtypes)
action_notes.loc[action_notes["action_notes"].notna()].head()

(10394, 2)
incident_id     object
action_notes    object
dtype: object


Unnamed: 0,incident_id,action_notes
11,39B1FCE8-E94B-49DD-98AF-8B8E75F52F96,it was as easy as using a commonly used passwo...
13,E12ACA81-EFCF-4096-8D2D-4D54E9898077,"\nVERIS 1_3_7 to 1_4_0 Migration script, to fi..."
14,d2f15020-8c28-11e7-944b-11adeb58a7b0,\nVERIS 1_3_7 to 1_4_0 Migration script to rem...
20,F1CD56BA-3C76-4F21-B738-4E806BE33542,"\nVERIS 1_3_7 to 1_4_0 Migration script, to fi..."
26,91830E0B-8896-413F-A921-84915E52B122,"\nVERIS 1_3_7 to 1_4_0 Migration script, to fi..."


# Dataframe Builder