Author: Filip Bucko  
Email: xbucko05@vutbr.cz  
Institution: Brno University of Technology - Faculty of Information Technology  
Date: 18.5.2024

# Flatten Domain–IP JSON to a Tidy DataFrame

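This notebook processes large JSON-Lines exports of domain-IP mappings (one malicious set and one benign set). For each domain, the `ip_data` list is summarized into geolocation tokens (the IP count plus the unique countries, regions, cities, and timezones), which are joined with the domain name into a single transformer-ready input string. The two classes are then downsampled to the same size, shuffled, and the final balanced dataset is written to CSV for downstream modeling.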

In [None]:
# Standard libraries
import json
from pathlib import Path

# Data handling
import pandas as pd

In [8]:
# Helper functions
def stream_json(path: Path):
    """
    Lazily parse a JSON-Lines file.

    The file is opened as UTF-8 text and consumed line by line. Each line is
    stripped of surrounding whitespace; lines that are empty after stripping
    are skipped, and every remaining line is decoded with ``json.loads`` and
    yielded. The file stays open only while the generator is being consumed.
    """
    with path.open("r", encoding="utf-8") as handle:
        # strip -> drop blanks -> decode, all evaluated one line at a time
        non_blank = filter(None, map(str.strip, handle))
        yield from map(json.loads, non_blank)

def geo_tokens_from_ip_data(ip_data):
    """
    Convert the geo sub-objects in ``ip_data`` into summary tokens.

    Parameters
    ----------
    ip_data : list | tuple | None | Any
        Per-IP records. For every record that is a dict, its optional
        ``"geo"`` dict is read for ``country`` (falling back to
        ``country_code``), ``region``, ``city`` and ``timezone``.
        Anything that is not a list or tuple is treated as "no IPs"; this
        covers ``None`` and the float NaN that pandas puts in a row when the
        source record had no ``ip_data`` key.

    Returns
    -------
    list[str]
        Five tokens, in this order: ``ip_count``, ``countries``, ``regions``,
        ``cities``, ``timezones``. ``ip_count`` is the number of entries in
        ``ip_data`` (including entries without usable geo data). Each of the
        other tokens lists the unique values sorted and comma-separated, or
        ``NA`` when none were found.
    """
    # NaN is truthy and has no len(), so a plain `if not ip_data` check is not
    # enough to protect the code below from missing values.
    records = ip_data if isinstance(ip_data, (list, tuple)) else []

    seen = {"countries": set(), "regions": set(), "cities": set(), "timezones": set()}

    for rec in records:
        if not isinstance(rec, dict):
            continue
        geo = rec.get("geo")
        if not isinstance(geo, dict):
            geo = {}
        found = {
            "countries": geo.get("country") or geo.get("country_code"),
            "regions": geo.get("region"),
            "cities": geo.get("city"),
            "timezones": geo.get("timezone"),
        }
        for field, value in found.items():
            # Missing values are skipped outright instead of being stored as an
            # "NA" placeholder: "NA" is also the country code of Namibia and
            # must not be mistaken for "missing".
            if value:
                # str() keeps sorted()/join() safe if a value is not a string.
                seen[field].add(str(value))

    def uniq(vals):
        return ", ".join(sorted(vals)) if vals else "NA"

    return [
        f"ip_count: {len(records)}",
        f"countries: {uniq(seen['countries'])}",
        f"regions: {uniq(seen['regions'])}",
        f"cities: {uniq(seen['cities'])}",
        f"timezones: {uniq(seen['timezones'])}"
    ]

def prepare_geo_input_string(row: pd.Series) -> str:
    """
    Build a single input string for transformer models from domain_name and ip_data.

    Parameters
    ----------
    row : pd.Series
        A record exposing ``domain_name`` and ``ip_data`` through ``.get``.

    Returns
    -------
    str
        ``"[CLS] domain: <name> [SEP] <geo token> [SEP] ... [SEP]"``, where
        the domain is lower-cased with one leading ``www.`` label removed and
        the geo tokens come from ``geo_tokens_from_ip_data``. A missing,
        empty, or non-string ``domain_name`` is rendered as ``na``.
    """
    CLS, SEP = "[CLS]", "[SEP]"

    raw_domain = row.get("domain_name")
    # A missing key shows up as None or as a float NaN in a DataFrame row;
    # NaN is truthy and has no .lower(), so check the type explicitly.
    if not isinstance(raw_domain, str) or not raw_domain:
        raw_domain = "NA"
    domain = raw_domain.lower()

    # str.lstrip("www.") strips every leading 'w' and '.' character, which
    # mangles names such as "web.com" -> "eb.com". Remove only the exact prefix.
    if domain.startswith("www."):
        domain = domain[len("www."):]

    tokens = [f"domain: {domain}"] + geo_tokens_from_ip_data(row.get("ip_data"))
    return f"{CLS} " + f" {SEP} ".join(tokens) + f" {SEP}"

def build_dataset(json_path: Path, label: int) -> pd.DataFrame:
    """
    Turn one JSONL export into a two-column labelled frame.

    Every record read by ``stream_json`` becomes a row; each row is rendered
    to text with ``prepare_geo_input_string`` and tagged with the constant
    ``label``. Only the ``input_string`` and ``label`` columns are returned.
    """
    output_columns = ["input_string", "label"]

    records = pd.DataFrame(stream_json(json_path))
    # Render the text before adding any new column, so each row passed to the
    # formatter holds only the original record fields.
    rendered = records.apply(prepare_geo_input_string, axis=1)
    labelled = records.assign(input_string=rendered, label=label)
    return labelled[output_columns]


In [10]:
# Load, balance, and save

# One seed for both downsampling steps and the final shuffle, so a re-run
# reproduces the same CSV.
SEED = 42

project_root = Path.cwd().parent.parent

# To build the phishing dataset instead, swap in:
#   malicious_path = project_root / "datasets" / "phishing" / "phishing_strict_ip_2024.json"
#   out_path       = project_root / "datasets" / "phishing" / "geo_phishing_preprocessed.csv"
malicious_path = project_root / "datasets" / "malware" / "malware_strict_ip_2024.json"
benign_path = project_root / "datasets" / "benign" / "benign_2312_anonymized_ip_2024.json"
out_path = project_root / "datasets" / "malware" / "geo_malware_preprocessed_2.csv"

# Name of the malicious class for the log message, taken from the dataset
# folder ("malware" or "phishing") so it cannot drift from the active path.
malicious_kind = malicious_path.parent.name

df_malicious = build_dataset(malicious_path, label=1)
df_benign = build_dataset(benign_path, label=0)

# Downsample both classes to the size of the smaller one.
minority_size = min(len(df_malicious), len(df_benign))
malicious_bal = df_malicious.sample(n=minority_size, random_state=SEED)
benign_bal = df_benign.sample(n=minority_size, random_state=SEED)

# Concatenate, then shuffle so the classes are interleaved in the output file.
balanced = pd.concat([malicious_bal, benign_bal], ignore_index=True)
balanced = balanced.sample(frac=1, random_state=SEED).reset_index(drop=True)

balanced.to_csv(out_path, index=False)

print(f"Wrote {out_path.resolve()} with {balanced.shape[0]} rows "
      f"({minority_size} {malicious_kind} + {minority_size} benign)")


Wrote /home/fetagpu/Work/01-transofmers/datasets/malware/geo_malware_preprocessed_2.csv with 201618 rows (100809 phishing + 100809 benign)
