In [None]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm 
import os
import json
from collections import Counter
from collections import defaultdict
import seaborn as sns

In [None]:
# Configuration
data_dir = "./dataset"   # directory scanned for *.json input files
percentage_to_load = 5   # share of the JSON files to sample, in percent
seed = 42                # seed for the file sampling below

# File sampling
# os.listdir() returns entries in arbitrary order, so sort them first:
# otherwise the same seed can select different files on another machine or
# filesystem, and the sample is not reproducible.
json_files = sorted(f for f in os.listdir(data_dir) if f.endswith('.json'))
total_files = len(json_files)
if total_files == 0:
    # Fail with a clear message instead of random.sample()'s generic
    # "Sample larger than population" ValueError.
    raise ValueError(f"No .json files found in {data_dir!r}")

# Load at least one file, but never more than are available (a percentage
# above 100 would otherwise make random.sample() raise).
sample_size = min(total_files, max(1, int((percentage_to_load / 100.0) * total_files)))

random.seed(seed)
selected_files = random.sample(json_files, sample_size)

# Initialize storage and counters
records = []
skipped_no_ip = 0
skipped_malformed = 0


def _parse_host_record(row):
    """Flatten one decoded host entry into the record stored in ``records``.

    Nested objects are type-checked before use, so a null or non-dict nested
    field yields ``None``/empty values for the derived columns instead of
    raising (an exception here would abort the rest of the file being read).

    Args:
        row: One decoded JSON object (a dict).

    Returns:
        The flattened record dict, or ``None`` when the entry has no
        ``host_identifier.ipv4`` value.
    """
    host_identifier = row.get("host_identifier")
    ip = host_identifier.get("ipv4") if isinstance(host_identifier, dict) else None
    if not ip:
        return None

    # Digit strings are converted to int; plain JSON integers are accepted
    # too (bool is excluded because it is an int subclass). isdecimal() is
    # used rather than isdigit() because isdigit() also accepts characters
    # such as superscript digits that int() rejects.
    raw_ports = row.get("ports_list")
    ports = []
    if isinstance(raw_ports, list):
        for p in raw_ports:
            if isinstance(p, str) and p.isdecimal():
                ports.append(int(p))
            elif isinstance(p, int) and not isinstance(p, bool):
                ports.append(p)

    services = row.get("services")
    service_dicts = (
        [s for s in services if isinstance(s, dict)]
        if isinstance(services, list)
        else []
    )
    transport_list = [s.get("transport") for s in service_dicts]
    service_name_list = [s.get("service_name") for s in service_dicts]

    whois = row.get("whois")
    organization = whois.get("organization") if isinstance(whois, dict) else None
    org_name = organization.get("name") if isinstance(organization, dict) else None

    # A null or non-numeric ASN becomes None rather than raising.
    asn = None
    autonomous_system = row.get("autonomous_system")
    if isinstance(autonomous_system, dict) and "asn" in autonomous_system:
        try:
            asn = int(autonomous_system["asn"])
        except (TypeError, ValueError, OverflowError):
            asn = None

    location = row.get("location")
    if not isinstance(location, dict):
        location = {}

    # "reverse_dns" may be present but null, so check each level.
    dns = row.get("dns")
    reverse_dns = dns.get("reverse_dns") if isinstance(dns, dict) else None
    reverse_names = reverse_dns.get("names") if isinstance(reverse_dns, dict) else None

    return {
        "ip": ip,
        "ports": ports,
        "transports": transport_list,
        "service_names": service_name_list,
        "org_name": org_name,
        "asn": asn,
        "country_code": location.get("country_code"),
        "province": location.get("province"),
        "has_reverse_dns": int(bool(reverse_names)),
        "udp_ratio": (transport_list.count("UDP") / len(transport_list) if transport_list else 0),
        "unique_services": len(set(service_name_list)),
        "num_svr": service_name_list.count("SVR"),
    }


# Read & Parse
# Each selected file is read line by line, one JSON document per line.
for json_file in tqdm(selected_files, desc=f"Loading {percentage_to_load}% of files"):
    file_path = os.path.join(data_dir, json_file)
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    row = json.loads(line)
                except json.JSONDecodeError:
                    skipped_malformed += 1
                    continue

                if not isinstance(row, dict):
                    # Valid JSON but not an object (e.g. a list or number):
                    # count it as malformed instead of failing on row.get().
                    skipped_malformed += 1
                    continue

                record = _parse_host_record(row)
                if record is None:
                    skipped_no_ip += 1
                    continue
                records.append(record)

    except Exception as e:
        # Deliberate best-effort boundary: report the file-level failure
        # (unreadable file, decoding error, ...) and go on to the next file.
        print(f"Error reading {file_path}: {e}")

# Build initial DataFrame
# Name the columns explicitly so the frame keeps its schema (including the
# "ports" column read by the port-matrix step) even when no record was loaded;
# pd.DataFrame([]) would otherwise have no columns at all and df['ports']
# would raise KeyError. The list must mirror the keys of the dicts appended
# to `records`, in the same order.
record_columns = [
    "ip",
    "ports",
    "transports",
    "service_names",
    "org_name",
    "asn",
    "country_code",
    "province",
    "has_reverse_dns",
    "udp_ratio",
    "unique_services",
    "num_svr",
]
df = pd.DataFrame(records, columns=record_columns)

# Loading summary: hosts kept and entries skipped, by reason.
print(f"Loaded {len(df)} hosts into DataFrame ({percentage_to_load}% of data)")
print("Skipped entries:")
print(f" - Malformed JSON lines: {skipped_malformed}")
print(f" - Missing or invalid IP: {skipped_no_ip}")

# Binary port matrix creation
# One uint8 column per port number 1..MAX_PORT; a cell is 1 when the host in
# that row lists the port. NOTE(review): the matrix is dense, i.e. about
# 64 KiB per host before it is wrapped in a DataFrame.
MAX_PORT = 65535
port_columns = [f"port_{p}" for p in range(1, MAX_PORT + 1)]
port_matrix = np.zeros((len(df), MAX_PORT), dtype=np.uint8)

for row_idx, host_ports in enumerate(df['ports']):
    # Port p lives in column p - 1; values outside 1..MAX_PORT are ignored.
    in_range = [port - 1 for port in host_ports if 1 <= port <= MAX_PORT]
    if in_range:
        port_matrix[row_idx, in_range] = 1

# Wrap the matrix in a DataFrame with named port columns
port_df = pd.DataFrame(port_matrix, columns=port_columns)

# Replace the list-valued "ports" column with the one-hot port columns
df_final = pd.concat([df.drop(columns="ports"), port_df], axis="columns")

# Save the final table. The file name is derived from percentage_to_load so
# it always matches the sample actually loaded (at 5 % this is the same
# "superhost_5percent.parquet" as before).
output_path = f"superhost_{percentage_to_load}percent.parquet"
df_final.to_parquet(output_path, index=False)
print(f"Saved {df_final.shape[0]} records to: {output_path}")

# Preview final output
print("Final DataFrame shape:", df_final.shape)
print(df_final.head())