# Pseudonymize LFwC for Review.

In [None]:
import secrets
from pathlib import Path

import pandas as pd
import ssdeep

In [None]:
# Relative path of the corpus CSV that this notebook loads and pseudonymizes.
CORPUS_PATH: Path = Path("lfwc.csv")

In [None]:
# Load the corpus; the first CSV column is used as the DataFrame index.
df: pd.DataFrame = pd.read_csv(filepath_or_buffer=CORPUS_PATH, index_col=0)

In [None]:
# Digest sizes in bytes of the fixed-length hashes; the hex rendering of each
# digest is twice as many characters.
length_mapping: dict[str, int] = {"md5": 16, "sha1": 20, "sha256": 32, "sha512": 64}


def replace_hash(_: str, kind: str) -> str:
    """Return a freshly generated random stand-in for a hash of type *kind*.

    The original value is deliberately ignored: the replacement is drawn from
    ``secrets`` and has no relation to it, so two calls with the same input
    yield different results.

    Args:
        _: The original hash value (unused).
        kind: ``"tlsh"``, ``"ssdeep"``, or one of the keys of
            ``length_mapping``.

    Returns:
        For ``"tlsh"``, the prefix ``"T1"`` followed by 70 uppercase hex
        digits. For ``"ssdeep"``, the ssdeep hash of 1024 random bytes.
        Otherwise a lowercase hex string of ``2 * length_mapping[kind]``
        characters.

    Raises:
        KeyError: If *kind* is none of the supported hash types.
    """
    if kind == "tlsh":
        # 35 random bytes -> 70 hex digits, upper-cased and prefixed with "T1".
        return f"T1{secrets.token_hex(nbytes=35).upper()}"
    if kind == "ssdeep":
        # Hash random data so the value is produced by ssdeep itself.
        return ssdeep.hash(secrets.token_bytes(nbytes=1024))
    return secrets.token_hex(nbytes=length_mapping[kind])

In [None]:
# Overwrite every hash column with random values of the matching hash type.
for column in ("md5", "sha1", "sha256", "sha512", "ssdeep", "tlsh"):
    # Series.apply forwards extra keyword arguments to the callable, so each
    # cell value is passed positionally and the column name as ``kind``.
    df[column] = df[column].apply(replace_hash, kind=column)

In [None]:
# Remembers the pseudonym handed out for each device name seen so far.
pseudo_name_map: dict[str, str] = {}


def pseudonymize_device_names(name: str) -> str:
    """Return the pseudonym for *name*, creating one on first sight.

    A new pseudonym is a random 24-character hex token. It is stored in
    ``pseudo_name_map`` so that later calls with the same name return the
    same token.
    """
    try:
        return pseudo_name_map[name]
    except KeyError:
        alias = secrets.token_hex(nbytes=12)
        pseudo_name_map[name] = alias
        return alias

In [None]:
# Device names get a consistent pseudonym per distinct name.
df["device_name"] = df["device_name"].apply(pseudonymize_device_names)

# The remaining identifying columns are overwritten with fixed placeholders.
for masked_column, mask_value in {
    "filename": "*****",
    "source_link": "https://*****",
    "wayback": "https://web.archive.org/*****",
}.items():
    df[masked_column] = mask_value

In [None]:
# Bare expression: in a notebook cell this renders the DataFrame as the cell
# output, allowing a visual check before it is written out.
df

In [None]:
# Write the DataFrame (including its index) to the masked CSV file.
df.to_csv(path_or_buf="../public_data/lfwc_masked.csv")