In [None]:
!pip install python-dotenv
from openai import OpenAI
import os
from dotenv import load_dotenv

In [None]:
import os
import json
import re
import PyPDF2
def load_papers_from_jsonl(file_path):
    """Load paper records from a JSON Lines file.

    Every non-blank line is parsed as one JSON document. Lines that cannot be
    parsed are skipped and reported with their line number. Read failures are
    reported on stdout rather than raised, so the function always returns a
    list (possibly empty).

    Args:
        file_path: Path of the ``.jsonl`` file; it is opened as UTF-8.

    Returns:
        list: The parsed records, in file order.
    """
    papers = []
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            for line_number, line in enumerate(file, start=1):
                line = line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) are not records,
                    # so they should not be reported as decoding errors.
                    continue
                try:
                    papers.append(json.loads(line))
                except json.JSONDecodeError:
                    print(f"Skipping line {line_number} due to JSON decoding error.")
    except FileNotFoundError:
        print(f"The file {file_path} was not found.")
        # Nothing was read, so do not claim that papers were loaded.
        return papers
    except Exception as e:
        print(f"An unexpected error occurred while reading the file: {e}")

    print(f"Loaded {len(papers)} papers from {file_path}.")
    return papers

# Input corpus: one JSON record per line.
jsonl_file_path = "extracted_dfrws_papers_NEWEST_final.jsonl"

# Read every record into memory.
papers = load_papers_from_jsonl(jsonl_file_path)

# Preview only the first five records so the cell output stays readable.
separator = "-" * 50
for position, paper in enumerate(papers[:5], start=1):
    print(f"Paper {position}:")
    print(f"Title: {paper.get('title', 'No title provided')}")
    print("Content:")
    # Only the first 500 characters of the body are shown.
    body = paper.get('content', 'No content provided')
    print(body[:500])
    print(separator)


Initial Taxonomy

In [None]:
def generate_all_metadata_prompt(task, paper, ontology_json=None):
    """Build the LLM prompt for one extraction task on one paper.

    Args:
        task: Either ``"title"`` (extract the paper title) or
            ``"taxonomy_classification"`` (classify the paper against the
            digital forensics taxonomy embedded in the prompt).
        paper: Mapping with the keys ``'title'`` and ``'content'``; both are
            interpolated verbatim into the prompt.
        ontology_json: Accepted for call compatibility; not used by this
            function.

    Returns:
        str: The complete prompt text.

    Raises:
        ValueError: If ``task`` is not one of the two supported values.
        KeyError: If ``paper`` lacks ``'title'`` or ``'content'``.
    """
    title = paper['title']
    content = paper['content']

    if task == "title":
        return f'''
        You are tasked with extracting the full title from the digital forensics paper titled "{title}".

        Guidelines:
        - The title is usually at the top of the first page or in the first section.
        - Extract the title in its entirety.

        Your response must be in the following JSON format:
        {{
            "title": "Title of the paper here"
        }}

        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>

        Your response: """
        '''

    elif task == "taxonomy_classification":
        # Doubled braces are f-string escapes for literal "{" and "}".
        # The taxonomy and both examples are kept as valid JSON on purpose:
        # the model is asked to answer in JSON, and an unbalanced taxonomy
        # (or an example with a missing comma) both misstates the domain
        # hierarchy and demonstrates malformed output.
        return f'''
          You are tasked with classifying the research paper titled "{title}" using the digital forensics taxonomy below.
          
          Rules & Guidelines:
          \t1- Choose only ONE value for each of the following fields: primary_domain, sub_domain.
          \t2- If no match is found, return "unknown" for that field.
          \t3- If a new domain or sub_domain not in the list is clearly found, return it directly.
          \t4- tags MUST be a list of 5–10 short strings.
          \t5- Use the taxonomy below as your starting point. It is not final; you may extend it using rule 3.
          \t6- Object lists are illustrative examples to aid disambiguation; a paper may be classified under a sub_domain even if it does not explicitly mention the listed objects.
          \t7- Artifact types and low-level sub-objects (e.g., heaps, stacks, page tables, registry keys, prefetch files) may be included as tags for Computer Forensics to capture fine-grained technical details and aid disambiguation (e.g., distinguishing physical vs. process memory), 
          but should not be used as values for primary_domain, sub_domain, or object.
         
          TAXONOMY (initial):
        {{
          "Computer Forensics": {{
          "Memory Forensics": {{ "Physical Memory Forensics": [  "Physical RAM", "Memory Dump", "DMA Artifacts" ],
          "Process Memory Forensics": [ "Heaps", "Stacks","Loaded Modules/DLLs", "Decrypted Artifacts"],
          "Kernel Memory Forensics": ["Kernel Objects", "Drivers/Modules","Kernel Hooks/Rootkits"],
          "Virtual Address Space Analysis": ["Page Tables","Address Translation","Paging Structures"],
          "Memory Acquisition": ["Live Acquisition","Offline Dump Analysis","Acquisition Integrity"]
        }},
        
        "Disk / Storage Forensics": {{ 
        "File System Forensics": ["NTFS/FAT/ext/APFS","Directories","File Allocation"],
        "File System Metadata & System Artifacts": ["Registry","Journals","Browser Artifacts","System Logs","Temp Artifacts"],
        "Unallocated Space & File Carving": ["Deleted Files","Fragmented Files","Carved Objects"],
        "Low-Level Sector Forensics": ["Sectors","Bad Sectors","Firmware/Controller Areas"],
        "Disk Imaging & Acquisition": ["Imaging Tools","Faulty Sector Handling","Integrity/Hashing"]
        }},

        "System & OS Artifact Forensics": {{ 
        "Operating System State": ["Running Services","Loaded Drivers", "System Configuration"],
        "User & Account Artifacts": ["User Accounts","Login Sessions","Access Control Data"],
        "Execution & Activity Artifacts": ["Prefetch","ShimCache","Amcache","Recent File Artifacts"],
        "Application & System Logs": ["Event Logs","Audit Logs","Application Logs", "E-mail artifacts"]
        }}
        }},
 
        "Vehicle / Automotive Forensics": {{
        "Automotive Forensics": ["Vehicle Electronic Control Units (ECUs)", "In-Vehicle Networks (CAN/LIN/FlexRay/Ethernet)", "On-Board Diagnostics Interface (OBD-II)", "Vehicle Service Systems"],
        "Drone / UAV Forensics": ["Unmanned Aerial Vehicles (UAVs)", "Flight Controllers", "Onboard Sensor Modules", "Communication Modules", "Ground Control Stations"],
        "Telematics & Event Data Forensics": [ "Event Data Recorders (EDR)", "Vehicle Telematics Control Units (TCU)", "Navigation / Infotainment Systems"]
        }},
          
        "Software Forensics": {{
        "Operating System Forensics": ["File systems for Windows/Mac/Unix/Linux", "Windows/Mac/Unix/Linux"],
        "Application Software Forensics": ["Mail Services", "Web Services", "DBMS", "Access Control Systems", "E-Commerce Services"],
        "Forensic Tools Analysis (Open source/Proprietary)": ["E4Case/FTK/File Hound/Sleuthkit/WinHex"]
        }},
          
        "Database Forensics": {{
        "Database Metadata/Contents Forensics": ["DBMS", "Databases"]
        }},
          
        "Multimedia Forensics": {{
        "Image Forensics": ["Digital Images"],
        "Video Forensics": ["Digital Video"],
        "Audio Forensics": ["Digital Audio"]
        }},
          
        "Device Forensics": {{
        "Peripherals Device Forensics": ["Copiers", "Printers", "Scanners"],
        "Network Enabled Device Forensics": ["Wireless AP", "IDS", "Firewalls", "Hubs", "Switches", "Routers"],
        "Storage Device Forensics": ["RFID Tags/Smart cards/Memory cards", "DVD/CD/Floppy/Tapes", "External Hard Drives", "Thumb Drive", "Digital Music Players"],
        "Large Scale Device Forensics": ["SAN (Storage Area Network)", "NAS (Network Attached Storage)"],
        "Obscure Device Forensics": ["Recording Devices (Video/Audio)", "Gaming Devices"],
        "Mobile Forensics": ["PDAs", "Smart/Cell phones", "Tablets"],
        "Small Scale Device Forensics": ["Embedded Devices"],
        "Wearable & Immersive Device Forensics": ["Smart Watches", "Smart Glasses", "VR Headsets", "AR Glasses", "Motion Controllers"],
        "Additive Manufacturing / 3D Printer Forensics": ["3D Printer"],
        "Medical Device Forensics": ["Implantable Medical Devices", "Wearable Medical Devices","Diagnostic Medical Equipment", "Therapeutic Devices"]
        }},
          
        "Network Forensics": {{
        "Cloud Forensics": ["Clouds (Cloud Computing)"],
        "Telecom Network Forensics": ["Cell Phone/Telecom Service Provider Network"],
        "Internet Forensics": ["Web Documents", "Webmails", "Emails", "Domain Name Records", "ISP Logs"],
        "Wireless Forensics": ["Bluetooth, Infrared, Wi-Fi"]
        }},
          
        "IoT Forensics": {{
        "Smart Home & Building Systems": [ "IoT Cameras",  "AI Speaker Devices", "Thermostats",  "Relays / Switch Actuators", "Building Automation Systems", "HVAC Controllers", "Access Control Systems", 
        "Smart Lighting", "Energy Management Systems", "Occupancy Sensors"],
        "Industrial IoT Systems": ["SCADA Systems", "ICS Platforms / Control Systems","Embedded Controllers (PLCs, RTUs)"],
        "Medical IoT Devices": ["Smart Contact Lenses", "Glucose Monitoring Devices", "Remote Patient Monitoring Devices", "Networked Medical Devices","Implantable Medical Devices", "Wearable Medical Devices", "Diagnostic Medical Equipment"]
        }},
          
        "AI Forensics": {{
        "AI Training Forensics": ["Training Process Forensics", "Dataset Forensics", "Environment Forensics"],
        "AI Substrate Forensics": ["Disk, Network, Sensor, Actuator"],
        "AI Application Forensics": ["API, Artifacts"],
        "AI Model Forensics": ["Model Authentication/Fingerprinting/Identification/Performance/Malware/Chain of Custody"]
        }},
        
        "Blockchain Forensics": {{
        "Wallet Forensics": ["Cryptocurrency Wallets", "Wallet Key Stores"],
        "Transaction Analysis": ["Blockchain Ledger"]
        }},
          
        "Knowledge Systematization": {{
        "Systematization of Knowledge (SoK)": [],
        "Systematic Literature Review (SLR)": [],
        "Ontology Development/Taxonomy Development": [],
        "Survey Papers": [],
        "Frameworks": ["Validation Methodologies", "Community Standards"],
        "Policy Papers": [],
        "Education Papers": [],
        "Legal & Regulatory Studies": []
        }},

        "AI-Assisted Digital Forensics": {{
        "AI-Assisted Evidence Interpretation": ["Forensic Data Interpretation"],
        "AI-Assisted Investigative Reasoning": ["Forensic Reasoning Support"],
        "AI-Assisted Explanation & Sensemaking": ["Explainability Support"],
        "AI-Assisted Automation & Triage": ["Forensic Workflow Support"]
        }}
    }}
        
        Return ONLY valid JSON in this format:
        
        {{
         "primary_domain": "...",
         "sub_domain": "...",
         "object": "...",
         "tags": ["...", "...", "...", "...", "..."]
        }}
        
        \tExamples:
        
        \t- In the paper “Avoiding Burnout at the Digital Forensics Coalface”, the authors perform a thematic synthesis and propose evidence-based 
        frameworks for stress management in the digital forensics workforce.

        {{
          "primary_domain": "Knowledge Systematization",
          "sub_domain": "Frameworks",
          "object": "Validation Methodologies",
          "tags": ["Resilience","training", "stress management", "Organisational/occupational job stress", "Stress management strategies"]
        }}

        \t- In the paper “TLS Key Material Identification and Extraction from Memory”, the authors extract TLS session keys from 
        system memory and use them to decrypt previously captured encrypted network traffic.

        {{
          "primary_domain": "Computer Forensics",
          "sub_domain": "Memory Forensics",
          "object": "Process Memory",
          "tags": ["Memory forensics", "Network forensics", "TLS", "Transport layer security", "Live forensics", "Volatile memory", "TLS key material extraction", "Encrypted traffic decryption"]
        }}

        
        Here is the paper content:
        <Start of Paper Content>
        {content}
        <End of Paper Content>
        Your response: """
        
        '''

    else:
        raise ValueError("Invalid task")


In [None]:
import os
import csv
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables (expected to include the API key) from the
# local "api_key.env" file.
load_dotenv("api_key.env")

# OpenAI client; the key is read from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# ----------------------------
# FILES (separate!)
# ----------------------------
# Two distinct outputs are kept on purpose: the incremental file is appended
# to after every paper and is what a re-run resumes from, while the final
# file is rewritten from scratch at the end of a run.
incremental_csv_path = "results_incremental.csv"   # append + resume
final_csv_path = "results_new_taxonomy_analysis.csv"  # final clean export

# ----------------------------
# Incremental helpers
# ----------------------------
def load_processed_titles(csv_path):
    """Return the set of paper titles already recorded in a results CSV.

    The first row is treated as a header and ignored. A title is the stripped
    first cell of a row; rows that are empty or whose first cell is blank are
    skipped. A missing or completely empty file yields an empty set.
    """
    if not os.path.isfile(csv_path):
        return set()

    with open(csv_path, "r", newline="", encoding="utf-8") as handle:
        rows = csv.reader(handle)
        # csv.reader never yields None, so None here means "no header row".
        if next(rows, None) is None:
            return set()
        return {row[0].strip() for row in rows if row and row[0].strip()}

def ensure_header(csv_path, tasks):
    """Write the header row if the CSV does not exist yet or is empty.

    The header is "Paper Title" followed by the task names. A file that
    already has content is left untouched.
    """
    if os.path.isfile(csv_path) and os.path.getsize(csv_path) != 0:
        return

    with open(csv_path, "w", newline="", encoding="utf-8") as handle:
        csv.writer(handle).writerow(["Paper Title"] + tasks)

def append_incremental_row(csv_path, paper_title, tasks, results_for_paper):
    """Append one paper's results as a single CSV row.

    The row is the paper title followed by one cell per task, in ``tasks``
    order; a task with no entry in ``results_for_paper`` is written as
    "No result".
    """
    with open(csv_path, "a", newline="", encoding="utf-8") as handle:
        cells = [paper_title]
        for task_name in tasks:
            cells.append(results_for_paper.get(task_name, "No result"))
        csv.writer(handle).writerow(cells)
        # Push the row out of the Python buffer before the file is closed.
        handle.flush()

# ----------------------------
# Main processor (same as yours + incremental save)
# ----------------------------
def process_papers_for_tasks(papers, tasks):
    """Run every task prompt for every not-yet-processed paper.

    Titles already present in the incremental CSV are skipped. For each
    remaining paper, one chat completion is requested per task; the raw
    response text (or "error: <message>" when the request raises) is stored,
    and the paper's row is appended to the incremental CSV as soon as all of
    its tasks are done.

    Args:
        papers: Iterable of mappings that each provide a "title" key and
            whatever generate_all_metadata_prompt() needs.
        tasks: List of task names, used as CSV column names.

    Returns:
        dict: ``{paper_title: {task: response_text}}`` for the papers
        processed in this call only (skipped papers are absent).
    """
    # Resume state comes ONLY from the incremental file.
    already_done = load_processed_titles(incremental_csv_path)
    ensure_header(incremental_csv_path, tasks)

    task_results = {}

    for index, paper in enumerate(papers):
        paper_title = str(paper["title"]).strip()
        print(f"Processing paper: {paper_title}")

        if paper_title in already_done:
            print(f"Skipping already processed paper: {paper_title}")
            continue

        outputs = {}
        task_results[paper_title] = outputs

        for task in tasks:
            user_prompt = generate_all_metadata_prompt(task, paper)

            try:
                response = client.chat.completions.create(
                    model="gpt-4o-mini",
                    messages=[{"role": "user", "content": user_prompt}],
                    temperature=0.2,
                    max_tokens=1500
                )
                response_text = response.choices[0].message.content
                print(response_text)
                outputs[task] = response_text

            except Exception as e:
                # Record the failure in place of the response so the row is
                # still written and the run continues.
                print(f"Error processing {task} for paper {index+1}: {e}")
                outputs[task] = "error: " + str(e)

        # Persist this paper immediately so an interrupted run can resume.
        append_incremental_row(incremental_csv_path, paper_title, tasks, outputs)
        already_done.add(paper_title)

    return task_results

# ----------------------------
# Run
# ----------------------------
test_papers = papers[:]
tasks = ["title", "taxonomy_classification"]

# Process papers (writes incremental as it runs)
all_results = process_papers_for_tasks(test_papers, tasks)

# ----------------------------
# Final CSV export (fresh overwrite, separate file)
# ----------------------------
# all_results only contains the papers handled in THIS run; papers skipped
# because an earlier run already processed them exist only in the incremental
# file. Start from the incremental rows so that a resumed run does not
# overwrite the final export with a partial result set.
final_rows = {}
if os.path.isfile(incremental_csv_path):
    with open(incremental_csv_path, "r", newline="", encoding="utf-8") as file:
        for record in csv.DictReader(file):
            saved_title = (record.get("Paper Title") or "").strip()
            if not saved_title:
                continue
            # Keep only the columns for the current tasks; columns missing
            # from the file fall back to "No result" when the row is written.
            final_rows[saved_title] = {
                task: record[task] for task in tasks if record.get(task) is not None
            }

# Results produced in this run take precedence over what was read back.
final_rows.update(all_results)

with open(final_csv_path, "w", newline="", encoding="utf-8") as file:
    csv_writer = csv.writer(file)
    headers = ["Paper Title"] + tasks
    csv_writer.writerow(headers)

    for paper_title, results in final_rows.items():
        row = [paper_title]
        for task in tasks:
            row.append(results.get(task, "No result"))
        csv_writer.writerow(row)

print(f"Incremental results: {incremental_csv_path}")
print(f"Final results: {final_csv_path}")


Venue × Discipline table (USA, EU, APAC) — robust + MEDIAN & CV


In [None]:
import re, json, ast
import pandas as pd

# -------------------------------
# Robust parsing helpers
# -------------------------------
def safe_parse_json(cell):
    """Parse a CSV cell that may hold JSON wrapped in Markdown code fences.

    The cell is stringified, fence markers (with or without a "json" tag) are
    removed, and the remainder is parsed as JSON, then as a Python literal if
    JSON parsing fails.

    Returns:
        The parsed Python object, or None when the cell is empty, is one of
        "nan"/"none"/"null" (case-insensitive), or cannot be parsed.
    """
    text = str(cell)

    # Drop fence markers wherever they occur.
    text = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).strip()
    text = text.replace("```", "").strip()

    if not text or text.lower() in ("nan", "none", "null"):
        return None

    # Strict JSON first, then Python-literal syntax (e.g. single quotes).
    for parser in (json.loads, ast.literal_eval):
        try:
            return parser(text)
        except Exception:
            continue
    return None

def extract_from_raw(raw: str, key: str):
    """Pull a quoted string value for ``key`` out of possibly malformed JSON.

    Used as a fallback when the cell cannot be parsed: after removing code
    fence markers, the first occurrence of ``"key": "value"`` is located with
    a regular expression.

    Returns:
        The stripped value, or None when ``raw`` is None or no non-empty
        quoted value follows the key.
    """
    if raw is None:
        return None

    text = re.sub(r"```(?:json)?", "", str(raw), flags=re.IGNORECASE).strip()
    text = text.replace("```", "")

    pattern = rf'"{re.escape(key)}"\s*:\s*"([^"]+)"'
    found = re.search(pattern, text)
    if found is None:
        return None
    return found.group(1).strip()


# -------------------------------
# Venue mapping 
# -------------------------------
def map_conference_to_venue(conf_str: str) -> str:
    """Map a free-text conference name to one of the DFRWS venues.

    The short codes USA, EU and APAC are matched as standalone tokens (not
    directly surrounded by other letters, optionally glued to a "DFRWS"
    prefix), so that unrelated words which merely contain those letters,
    e.g. "Lausanne" (contains "USA") or "Museum" (contains "EU"), do not
    decide the venue. The longer names "EUROPE" and "ASIA" are still matched
    as substrings. Matching is case-insensitive and checked in the order
    USA, EU, APAC.

    Args:
        conf_str: Conference description; None and other non-string values
            are treated as unknown.

    Returns:
        "DFRWS USA", "DFRWS EU", "DFRWS APAC", or "Unknown".
    """
    if not isinstance(conf_str, str):
        return "Unknown"

    s = conf_str.strip().upper()

    def has_code(code: str) -> bool:
        # Letter-based lookarounds instead of \b so that "USA2019" and
        # "DFRWS_USA" still count; "(?:DFRWS)?" keeps "DFRWSUSA" working.
        return re.search(rf"(?<![A-Z])(?:DFRWS)?{code}(?![A-Z])", s) is not None

    if has_code("USA"):
        return "DFRWS USA"
    if has_code("EU") or "EUROPE" in s:
        return "DFRWS EU"
    if has_code("APAC") or "ASIA" in s:
        return "DFRWS APAC"
    return "Unknown"

def extract_conference(row) -> str:
    """Return the conference name for a row, or "" if none can be found.

    Lookup order:
      1. the 'conference' cell parsed as JSON, using its "conference" key and
         falling back to its "name" key;
      2. the raw 'conference' cell when it is a non-blank string;
      3. the first non-blank string among the alternate columns
         'venue', 'conf', 'event', 'conference_name'.
    """
    parsed = safe_parse_json(row.get("conference", ""))
    if isinstance(parsed, dict):
        candidate = parsed.get("conference", "") or parsed.get("name", "")
        if isinstance(candidate, str) and candidate.strip():
            return candidate.strip()

    plain = row.get("conference", "")
    if isinstance(plain, str) and plain.strip():
        return plain.strip()

    for column in ("venue", "conf", "event", "conference_name"):
        if column not in row:
            continue
        if isinstance(row[column], str) and row[column].strip():
            return row[column].strip()

    return ""


# -------------------------------
#  extract PRIMARY DOMAIN from taxonomy_classification
# -------------------------------
def extract_primary_domain(row) -> str:
    """Return the stripped primary_domain of a row, or "" if unavailable.

    The 'taxonomy_classification' cell is parsed first; when that does not
    yield a non-blank string, the value is recovered from the raw text with
    the regex fallback so that malformed JSON rows are not lost.
    """
    raw = row.get("taxonomy_classification", "")
    parsed = safe_parse_json(raw)

    if isinstance(parsed, dict):
        value = parsed.get("primary_domain", "")
        if isinstance(value, str) and value.strip():
            return value.strip()

    # Malformed JSON: fall back to a textual search for the field.
    recovered = extract_from_raw(raw, "primary_domain")
    if isinstance(recovered, str) and recovered.strip():
        return recovered.strip()
    return ""


# -------------------------------
# Venue × Primary Domain table (USA, EU, APAC) — MEDIAN & CV
# -------------------------------
records = []
# Bookkeeping of how many rows were kept and why the others were skipped.
debug_counts = {
    "rows": 0,
    "kept": 0,
    "no_conf": 0,
    "unknown_venue": 0,
    "no_domain": 0,
}

for _, row in df.iterrows():
    debug_counts["rows"] += 1

    conf_raw = extract_conference(row)
    venue = map_conference_to_venue(conf_raw)
    domain = extract_primary_domain(row)

    # A row is attributed to the first failing check, in this order.
    if not domain:
        skip_reason = "no_domain"
    elif not conf_raw:
        skip_reason = "no_conf"
    elif venue == "Unknown":
        skip_reason = "unknown_venue"
    else:
        skip_reason = None

    if skip_reason is not None:
        debug_counts[skip_reason] += 1
        continue

    records.append({"venue": venue, "primary_domain": domain})
    debug_counts["kept"] += 1

venue_domain_df = pd.DataFrame.from_records(records, columns=["venue", "primary_domain"])

if not venue_domain_df.empty:
    # Venue (rows) x primary domain (columns) paper counts.
    venue_domain_matrix = pd.crosstab(
        venue_domain_df["venue"],
        venue_domain_df["primary_domain"]
    ).astype(int)

    # Columns sorted by overall paper count, largest first.
    col_order = venue_domain_matrix.sum(axis=0).sort_values(ascending=False).index
    venue_domain_matrix = venue_domain_matrix[col_order]

    # Fixed venue order, restricted to the venues that actually occur.
    preferred_order = ["DFRWS USA", "DFRWS EU", "DFRWS APAC"]
    desired_rows = [v for v in preferred_order if v in venue_domain_matrix.index]
    venue_domain_matrix = venue_domain_matrix.reindex(desired_rows)

    # ---- Stats ----
    # Statistics are computed on the count columns only, before any summary
    # column is appended to the matrix.
    core = venue_domain_matrix[col_order]

    venue_domain_matrix["TOTAL"] = core.sum(axis=1)

    # Median across domains; domains with zero papers are included.
    venue_domain_matrix["MEDIAN"] = core.median(axis=1).round(2)

    # Coefficient of variation = sample std / mean; a zero mean is masked so
    # the division cannot blow up, and the result is reported as 0.
    means = core.mean(axis=1)
    stds = core.std(axis=1, ddof=1)
    safe_means = means.replace(0, pd.NA)
    venue_domain_matrix["CV"] = (stds / safe_means).fillna(0.0).round(3)

    print("\n=== Venue × Primary Domain (counts + MEDIAN & CV) ===")
    print(venue_domain_matrix)

    out_path = "venue_primary_domain_matrix.csv"
    venue_domain_matrix.to_csv(out_path)
    print("\nSaved:", out_path)
    print("\n[Debug summary]", debug_counts)
else:
    print("\n[WARN] No venue/domain records parsed. Debug:", debug_counts)
    print("Tip: Inspect a few raw cells, e.g.:")
    print(" - df['conference'].head() =", df.get("conference", pd.Series(dtype=object)).head().to_list())
    print(" - df['taxonomy_classification'].head() =", df.get("taxonomy_classification", pd.Series(dtype=object)).head().to_list())


In [None]:
import os, re, json, ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# CONFIG
# -------------------------
# Input results CSV and the directory that receives every output of this cell.
CSV_PATH = "results_new_taxonomy_analysis.csv"
OUTDIR = "dfrws_top10_country_domain"
os.makedirs(OUTDIR, exist_ok=True)  # created up front; no error if it already exists

# Inclusive publication-year window applied when filtering rows.
YEAR_MIN, YEAR_MAX = 2002, 2025
TOP_COUNTRIES = 10               # number of countries kept (ranked by total weight)
TOP_DOMAINS = 12                 # heatmap columns

# Domain extraction mode:
# - "primary_domain"             : use primary_domain (fallback to sub_domain if primary missing)
# - "primary_domain_sub_domain"  : show "primary / sub" when both exist (fallback to whichever exists)
DOMAIN_MODE = "primary_domain"

# How to count country contribution per paper:
# - "presence"   : each (paper,country) counts as 1 regardless of #authors from that country
# - "fractional" : each paper contributes total=1 split across its unique countries (1/k per country)
COUNT_MODE = "presence"          # or "fractional"

# Domains you always want in the heatmap columns (if present after filtering),
# even when they do not rank among the TOP_DOMAINS largest.
FORCE_INCLUDE_DOMAINS = ["AI Forensics"]

# Write rows that were dropped because domain couldn't be extracted (for debugging)
WRITE_DROPPED_DOMAIN_DEBUG = True


# -------------------------
# SAFE PARSING
# -------------------------
def safe_parse(cell):
    """Parse a CSV cell that may hold JSON wrapped in Markdown code fences.

    The cell is stringified and fence markers (with or without a "json" tag)
    are removed. The remainder is parsed as JSON; if that fails it is parsed
    as a Python literal.

    Returns:
        The parsed Python object, or None for empty / "nan" / "none" / "null"
        cells (case-insensitive) and for anything that cannot be parsed.
    """
    text = str(cell)

    # Remove fence markers wherever they appear in the cell.
    text = re.sub(r"```(?:json)?", "", text, flags=re.IGNORECASE).strip()
    text = text.replace("```", "").strip()

    # Null-like placeholders carry no data.
    if text == "" or text.lower() in ("nan", "none", "null"):
        return None

    try:
        return json.loads(text)
    except Exception:
        pass

    # Not strict JSON: accept Python-literal syntax as a second attempt.
    try:
        return ast.literal_eval(text)
    except Exception:
        return None


# Canonical country name -> the normalized spellings that should map to it.
_ALIAS_GROUPS = {
    "USA": (
        "usa", "us", "u s", "u.s.", "u.s.a.",
        "united states", "united states of america",
    ),
    "United Kingdom": (
        "uk", "u k", "u.k.", "united kingdom",
        "england", "scotland", "wales", "great britain", "britain",
    ),
    "United Arab Emirates": (
        "uae", "u a e", "u.a.e.",
        "united arab emirates", "emirates",
    ),
    "South Korea": (
        "republic of korea", "korea, republic of", "south korea",
        "korea", "r o k", "r.o.k.",
    ),
}

# Flat lookup table: normalized spelling -> canonical country name.
ALIASES = {
    alias: canonical
    for canonical, aliases in _ALIAS_GROUPS.items()
    for alias in aliases
}

def canon_country(c):
    """Return the canonical country name for a raw country string.

    The lookup key is the stringified input, stripped, lower-cased, with
    whitespace runs collapsed to one space and all dots removed (so
    "U.S.A." becomes "usa"). Inputs without an alias are returned stripped
    but otherwise unchanged.
    """
    original = str(c).strip()
    key = re.sub(r"\s+", " ", original.lower())
    key = key.replace(".", "")
    return ALIASES.get(key, original)

def extract_countries(obj):
    """Return the raw country strings contained in a parsed cell.

    A dict is unwrapped through its "author_countries" key (falling back to
    "countries"). From a list, plain strings are taken as-is and dicts
    contribute their "country" value; other entries are ignored. A bare
    string becomes a one-element list. Anything else yields an empty list.
    """
    if obj is None:
        return []

    payload = obj
    if isinstance(payload, dict):
        payload = payload.get("author_countries", payload.get("countries", []))

    if isinstance(payload, list):
        return [
            entry if isinstance(entry, str) else entry["country"]
            for entry in payload
            if isinstance(entry, str) or (isinstance(entry, dict) and "country" in entry)
        ]

    return [payload] if isinstance(payload, str) else []

def extract_domain(obj):
    """Return the domain label of a parsed taxonomy object, or None.

    IMPORTANT: This function NEVER returns "Unknown". A field counts as
    missing when it is absent, None, blank, or one of the placeholder words
    "unknown" / "none" / "null" / "nan" in any letter case ("unknown" is what
    the classification prompt asks the model to emit when nothing matches).

    With DOMAIN_MODE == "primary_domain_sub_domain" the result is
    "primary / sub" when both fields are present, otherwise whichever one
    exists. In every other mode the primary_domain is returned, falling back
    to sub_domain when the primary is missing.

    Args:
        obj: Parsed taxonomy cell; anything that is not a dict yields None.

    Returns:
        str | None: The domain label, or None when nothing usable exists.
    """
    if not isinstance(obj, dict):
        return None

    def clean(value):
        # str(None) would otherwise become the literal domain name "None".
        if value is None:
            return ""
        text = str(value).strip()
        if text.lower() in ("unknown", "none", "null", "nan"):
            return ""
        return text

    d = clean(obj.get("primary_domain", ""))
    s = clean(obj.get("sub_domain", ""))

    if DOMAIN_MODE == "primary_domain_sub_domain":
        if d and s:
            return f"{d} / {s}"
        return d or s or None

    # default: primary_domain, but fall back to sub_domain if primary missing
    return d or s or None

def extract_year(obj):
    """Convert a parsed year cell to an int, or return None.

    A dict contributes its "year" value; any other object is converted
    directly. None, dicts without a "year" key, and values that ``int()``
    rejects all yield None.
    """
    if obj is None:
        return None

    has_year_key = isinstance(obj, dict) and "year" in obj
    candidate = obj["year"] if has_year_key else obj

    try:
        return int(candidate)
    except Exception:
        return None

def extract_conf(obj):
    """Return the conference name of a parsed cell as a string.

    None becomes "". A dict with a "conference" key contributes that value;
    every other object is stringified as-is.
    """
    if obj is None:
        return ""

    wrapped = isinstance(obj, dict) and "conference" in obj
    source = obj["conference"] if wrapped else obj
    return str(source)


# -------------------------
# LOAD DATA
# -------------------------
# The export step of this pipeline writes the results CSV as UTF-8. Decoding
# it as latin1 never fails but silently turns every non-ASCII character into
# mojibake (e.g. accented country names), so UTF-8 is tried first; latin1 is
# kept only as a fallback for files that were re-saved in a legacy encoding.
try:
    df = pd.read_csv(CSV_PATH, dtype=str, keep_default_na=False, encoding="utf-8")
except UnicodeDecodeError:
    df = pd.read_csv(CSV_PATH, dtype=str, keep_default_na=False, encoding="latin1")

# Parse the raw (possibly fenced JSON) cells into Python objects / scalars.
df["countries"] = df["author_countries"].apply(safe_parse)
df["taxonomy"]  = df["taxonomy_classification"].apply(safe_parse)
df["year"]      = df["published_year"].apply(lambda x: extract_year(safe_parse(x)))
df["conf"]      = df["conference"].apply(lambda x: extract_conf(safe_parse(x)))

df["domain"] = df["taxonomy"].apply(extract_domain)

# filter DFRWS + years
df = df[df["conf"].str.startswith("DFRWS", na=False)]
df = df[df["year"].notna()]
df = df[(df["year"] >= YEAR_MIN) & (df["year"] <= YEAR_MAX)]

# build UNIQUE COUNTRY LIST per paper (dedup within paper!)
# The set removes repeated countries; sorting makes the list deterministic.
df["country_list"] = df["countries"].apply(
    lambda x: sorted({canon_country(c) for c in extract_countries(x) if str(c).strip()})
)

# -------------------------
# DROP ROWS WITH NO DOMAIN (so "Unknown" can never exist)
# -------------------------
before = len(df)

# A domain is unusable when it is NA or blank after stripping.
missing_domain = df["domain"].isna() | (df["domain"].astype(str).str.strip() == "")
dropped = df[missing_domain].copy()
df = df[~missing_domain]

after = len(df)
print(f"[INFO] Dropped {before - after} rows with missing/unparseable domain (no Unknown bucket).")

if WRITE_DROPPED_DOMAIN_DEBUG and len(dropped) > 0:
    # Export the dropped rows, limited to the diagnostic columns that exist.
    debug_path = os.path.join(OUTDIR, "dropped_rows_missing_domain.csv")
    candidate_cols = ["title", "conf", "year", "conference", "published_year", "author_countries", "taxonomy_classification"]
    keep_cols = [col for col in candidate_cols if col in dropped.columns]
    dropped[keep_cols].to_csv(debug_path, index=False, encoding="utf-8")
    print(f"[DEBUG] Wrote {debug_path} (rows dropped for missing domain)")


# -------------------------
# COUNTRY COUNTING (PER PAPER, NOT PER AUTHOR)
# -------------------------
# One (country, domain, weight) tuple per paper and distinct country.
rows = []
for paper_countries, paper_domain in zip(df["country_list"], df["domain"]):
    valid = [name for name in paper_countries if name and name != "Unknown"]
    if not valid:
        continue

    # "fractional": the paper's single unit is split evenly across its
    # countries; otherwise every country present counts as 1.
    if COUNT_MODE == "fractional":
        weight = 1.0 / len(valid)
    else:
        weight = 1.0

    rows.extend((name, paper_domain, weight) for name in valid)

bin_df = pd.DataFrame(rows, columns=["country", "domain", "weight"])

# totals per country (sum of weights)
country_totals = bin_df.groupby("country")["weight"].sum().reset_index(name="total")

# Top countries by total
ranked_countries = country_totals.sort_values("total", ascending=False)
topN = ranked_countries.head(TOP_COUNTRIES)["country"].tolist()

# Restrict both tables to the selected countries.
bin_df = bin_df[bin_df["country"].isin(topN)]
country_totals = country_totals[country_totals["country"].isin(topN)]

# domain counts per country (sum of weights)
country_domain = (
    bin_df.groupby(["country", "domain"])["weight"]
    .sum()
    .reset_index(name="count")
)

# Attach each country's total so a domain's share of that country's output
# can be expressed as a percentage.
country_domain = country_domain.merge(country_totals, on="country", how="left")
country_domain["share_pct"] = 100 * country_domain["count"] / country_domain["total"]

# -------------------------
# SAVE TABLE (TOP COUNTRIES ONLY)
# -------------------------
count_mode_label = "fractional" if COUNT_MODE == "fractional" else "presence"
csv_out = os.path.join(OUTDIR, f"top{TOP_COUNTRIES}_country_domain_{count_mode_label}.csv")
sorted_table = country_domain.sort_values(["country", "share_pct"], ascending=[True, False])
sorted_table.to_csv(csv_out, index=False)
print(f"[OK] Wrote {csv_out}")

# -------------------------
# HEATMAP (TOP COUNTRIES × TOP DOMAINS)
# -------------------------
# Domains ranked by their summed count over the selected countries.
domain_ranking = country_domain.groupby("domain")["count"].sum().sort_values(ascending=False)
top_domains = domain_ranking.head(TOP_DOMAINS).index.tolist()

# Force-include domains if present
present_domains = set(country_domain["domain"].unique())
for forced in FORCE_INCLUDE_DOMAINS:
    if forced in present_domains and forced not in top_domains:
        top_domains.append(forced)

plot_df = country_domain[country_domain["domain"].isin(top_domains)]

# Countries as rows (in ranking order), domains as columns, shares as cells;
# combinations that never occur are shown as 0.
pivot = plot_df.pivot(index="country", columns="domain", values="share_pct")
pivot = pivot.reindex(topN).fillna(0)

fig, ax = plt.subplots(figsize=(12, 6))
im = ax.imshow(pivot.values, aspect="auto")

n_rows, n_cols = pivot.shape
ax.set_yticks(range(n_rows))
ax.set_yticklabels(pivot.index)
ax.set_xticks(range(n_cols))
ax.set_xticklabels(pivot.columns, rotation=45, ha="right")

# Add numbers inside cells (0 decimals); the label switches to white from a
# share of 18 upwards.
for (i, j), val in np.ndenumerate(pivot.values):
    txt_color = "white" if val >= 18 else "black"
    ax.text(j, i, f"{val:.0f}", ha="center", va="center",
            color=txt_color, fontsize=8, fontweight="bold")

cbar = plt.colorbar(im, ax=ax)
cbar.set_label("Share of country output (%)")

title_mode = "Fractional" if COUNT_MODE == "fractional" else "Country-presence"
ax.set_title(f"Top-{TOP_COUNTRIES} Countries × Domains (DFRWS, {YEAR_MIN}-{YEAR_MAX}) — {title_mode} counting")

plt.tight_layout()

fig_path = os.path.join(OUTDIR, f"top{TOP_COUNTRIES}_country_domain_heatmap_{count_mode_label}.png")
plt.savefig(fig_path, dpi=300)
plt.close()

print(f"[OK] Saved {fig_path}")
print("[DONE]")


CDFS

In [None]:
import pandas as pd, json, re
import matplotlib.pyplot as plt

# ---------- config ----------
# Results CSV that is read for the cumulative plots.
INPUT_CSV = "results_new_taxonomy_analysis.csv"
# Inclusive range of publication years that are kept.
YEAR_MIN, YEAR_MAX = 2002, 2025
# Every year in the range, so years without papers can appear as zeros.
all_years = list(range(YEAR_MIN, YEAR_MAX + 1))

# ---------- robust parsers ----------
def strip_fences(s: str) -> str:
    """Remove Markdown code-fence markers from a cell and strip whitespace.

    Non-string input (for example a float NaN from pandas) yields "".
    """
    if not isinstance(s, str):
        return ""
    # remove ANY ``` or ```json occurrences, anywhere in the cell
    s = re.sub(r"```(?:json)?", "", s, flags=re.IGNORECASE)
    s = s.replace("```", "")
    return s.strip()

def parse_json_messy(cell, default=None):
    """Parse a possibly fenced or broken JSON cell into a dict.

    The result is always a dict, because callers immediately call ``.get`` on
    it. Valid JSON that is not an object (a bare number, a list, null, ...)
    is therefore treated like unparseable text instead of being returned.

    Args:
        cell: Raw cell value; a dict is returned unchanged.
        default: Value returned when nothing can be recovered; ``None``
            means an empty dict.

    Returns:
        dict: The parsed object, or ``{"year": "YYYY"}`` recovered by regex
        from broken text, or the default.
    """
    if isinstance(cell, dict):
        return cell

    s = strip_fences(cell)
    try:
        parsed = json.loads(s)
    except Exception:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # fallback: pull "year": "2007" with regex if JSON is broken
    m = re.search(r'"year"\s*:\s*"?(?P<y>\d{4})"?', s)
    if m:
        return {"year": m.group("y")}
    return default if default is not None else {}

def coerce_year(y):
    """Return *y* as an int inside [YEAR_MIN, YEAR_MAX], or None.

    Anything that cannot be read as an integer after ``str(y).strip()`` and
    any year outside the configured window maps to None.
    """
    try:
        year = int(str(y).strip())
        in_window = YEAR_MIN <= year <= YEAR_MAX
    except Exception:
        return None
    if in_window:
        return year
    return None

# ---------- rebuild df (year + primary_domain) ----------
raw = pd.read_csv(INPUT_CSV, encoding="latin1")

# Each metadata cell is expected to hold a JSON object. A cell that parses to
# anything else is treated as "no value" instead of failing on .get().
years = raw.get("published_year", pd.Series([""] * len(raw))).apply(parse_json_messy, default={})
years = years.apply(lambda d: d.get("year") if isinstance(d, dict) else None)
years = years.apply(coerce_year)

onto = raw.get("taxonomy_classification", pd.Series([""] * len(raw))).apply(parse_json_messy, default={})
primary_domain = (
    onto.apply(lambda d: d.get("primary_domain", "") if isinstance(d, dict) else "")
        .fillna("").astype(str).str.strip()
)

# Rows without a usable year are dropped; rows without a domain are kept so
# they still count towards "ALL Papers".
df = pd.DataFrame({"year": years, "primary_domain": primary_domain}).dropna(subset=["year"])
df["year"] = df["year"].astype(int)

# ---------- year × primary_domain matrix (zeros for missing years) ----------
counts = (
    df.groupby(["year", "primary_domain"])
      .size()
      .reset_index(name="count")
)
mat = (
    counts.pivot(index="year", columns="primary_domain", values="count")
          .reindex(all_years, fill_value=0)
          .fillna(0).astype(int)
)

# ---------- CDF-style: ALL papers + Top-N primary_domain ----------
# Number of domain curves drawn next to "ALL Papers". The print label, plot
# title and export file names below are all derived from this one value.
TOP_N_DOMAINS = 6

# ALL papers per year
yearly_all = df.groupby("year").size().reindex(all_years, fill_value=0)

# Top-N primary_domain by total volume. The blank label means "no domain was
# extracted"; it is not a domain, so it is kept out of the ranking.
domain_totals = mat.sum(axis=0).sort_values(ascending=False)
domain_totals = domain_totals[domain_totals.index != ""]
top_primary_domains = domain_totals.head(TOP_N_DOMAINS).index.tolist()
top5 = top_primary_domains  # legacy name, kept for any later cell that reads it
print(f"Top-{TOP_N_DOMAINS} Primary Domain:", top_primary_domains)

# Yearly counts table for ALL + Top-N
cdf_counts = pd.DataFrame({"ALL Papers": yearly_all})
for d in top_primary_domains:
    cdf_counts[d] = mat[d].reindex(all_years, fill_value=0)

# Cumulative sums (CDF curves)
cdf_cum = cdf_counts.cumsum()

# Plot cumulative counts
plt.figure(figsize=(14, 7))
for col in cdf_cum.columns:
    plt.plot(cdf_cum.index, cdf_cum[col], linewidth=2.2, label=col)
plt.title(f"Cumulative Papers by Year (ALL + Top-{TOP_N_DOMAINS} Domains)")
plt.xlabel("Year"); plt.ylabel("Cumulative Count")
plt.xticks(all_years, rotation=45); plt.grid(True, linestyle="--", alpha=0.35)
plt.legend(title="Series", loc="center left", bbox_to_anchor=(1.02, 0.5), frameon=False)
plt.tight_layout(); plt.show()

# (Optional) Normalized CDFs (0..1) to compare timing of growth:
# every curve is divided by its own final cumulative total.
cdf_norm = cdf_cum.div(cdf_cum.iloc[-1])
plt.figure(figsize=(14, 7))
for col in cdf_norm.columns:
    plt.plot(cdf_norm.index, cdf_norm[col], linewidth=2.0, label=col)
plt.title("Normalized CDF (Proportion Reached by Year)")
plt.xlabel("Year"); plt.ylabel("Proportion of Final Total"); plt.ylim(0, 1.02)
plt.xticks(all_years, rotation=45); plt.grid(True, linestyle="--", alpha=0.35)
plt.legend(title="Series", loc="center left", bbox_to_anchor=(1.02, 0.5), frameon=False)
plt.tight_layout(); plt.show()

# Exports
cdf_counts.to_csv(f"cdf_all_plus_top{TOP_N_DOMAINS}_yearly_counts.csv")
cdf_cum.to_csv(f"cdf_all_plus_top{TOP_N_DOMAINS}_cumulative_counts.csv")
cdf_norm.to_csv(f"cdf_all_plus_top{TOP_N_DOMAINS}_normalized_cdf.csv")
print("Saved CDF CSVs.")


Country-Level Domain Analysis

In [None]:
import os, re, json, ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# CONFIG
# -------------------------
# Input table and output folder; the folder is created on first run.
CSV_PATH = "results_new_taxonomy_analysis.csv"
OUTDIR = "dfrws_top10_country_domain"
os.makedirs(OUTDIR, exist_ok=True)

# Inclusive publication-year window applied when filtering papers.
YEAR_MIN, YEAR_MAX = 2002, 2025
# Number of countries kept (ranked by summed weight) for the table and heatmap.
TOP_COUNTRIES = 10
TOP_DOMAINS = 12  # columns in heatmap

# Domain extraction mode:
# - "primary_domain"              : use primary_domain only (fallback to sub_domain if primary missing)
# - "primary_domain_sub_domain"   : show "primary / sub" when both exist (fallback to whichever exists)
DOMAIN_MODE = "primary_domain"

# How to count country contribution per paper:
# - "presence"   : each (paper,country) counts as 1 regardless of #authors from that country
# - "fractional" : each paper contributes total=1 split across its unique countries (1/k per country)
COUNT_MODE = "presence"  # or "fractional"


# Domains appended to the heatmap columns even when they do not rank within
# TOP_DOMAINS, provided they occur in the plotted data.
FORCE_INCLUDE_DOMAINS = [
    "AI Forensics",
]

# When True, rows whose domain is "Unknown" are removed before the heatmap
# columns are chosen.
DROP_UNKNOWN_FROM_HEATMAP = True

# When True, papers whose domain resolved to "Unknown" are written to a debug
# CSV inside OUTDIR.
WRITE_UNKNOWN_DEBUG = True

# -------------------------
# SAFE PARSING (WITH OPTIONAL DEBUG)
# -------------------------
# One record per cell that neither json nor ast could decode; dumped to CSV later.
parse_errors = []

def safe_parse(cell, field=""):
    """Decode one CSV cell into a Python object, or return None.

    The cell is stringified and stripped of Markdown code fences. Empty text
    and the markers nan/none/null (any case) mean "missing". The text is
    tried as JSON first and as a Python literal second. When both fail, a
    record tagged with *field* is appended to ``parse_errors``.
    """
    text = re.sub(r"```(?:json)?", "", str(cell), flags=re.IGNORECASE).strip()
    text = text.replace("```", "").strip()

    if text == "" or text.lower() in ("nan", "none", "null"):
        return None

    try:
        return json.loads(text)
    except Exception as json_exc:
        json_msg = str(json_exc)

    try:
        return ast.literal_eval(text)
    except Exception as ast_exc:
        parse_errors.append({
            "field": field,
            "preview": text[:400],
            "json_error": json_msg,
            "ast_error": str(ast_exc),
        })
    return None

# Lower-cased spelling variant -> canonical country name.
ALIASES = {
    variant: canonical
    for canonical, variants in (
        ("USA", (
            "usa", "us", "u s", "u.s.", "u.s.a.",
            "united states", "united states of america",
        )),
        ("United Kingdom", (
            "uk", "u k", "u.k.", "united kingdom",
            "england", "scotland", "wales", "great britain", "britain",
        )),
        ("United Arab Emirates", (
            "uae", "u a e", "u.a.e.",
            "united arab emirates", "emirates",
        )),
        ("South Korea", (
            "republic of korea", "korea, republic of", "south korea",
            "korea", "r o k", "r.o.k.",
        )),
    )
    for variant in variants
}

def canon_country(c):
    """Map a raw country string to its canonical name.

    The lookup key is the trimmed, lower-cased text with whitespace runs
    collapsed to one space and all dots removed. Names without an alias are
    returned trimmed but otherwise unchanged.
    """
    trimmed = str(c).strip()
    lookup_key = re.sub(r"\s+", " ", trimmed.lower()).replace(".", "")
    return ALIASES.get(lookup_key, trimmed)

def extract_countries(obj):
    """Collect the raw country strings found in a parsed cell.

    A dict is unwrapped through its "author_countries" key (falling back to
    "countries"). A list contributes its string items plus the "country"
    value of any dict item. A bare string becomes a one-item list. Every
    other input gives an empty list.
    """
    if isinstance(obj, dict):
        obj = obj.get("author_countries", obj.get("countries", []))
    if isinstance(obj, str):
        return [obj]
    if not isinstance(obj, list):
        return []
    return [
        item if isinstance(item, str) else item["country"]
        for item in obj
        if isinstance(item, str) or (isinstance(item, dict) and "country" in item)
    ]

def extract_domain(obj):
    """
    Extract the domain label from a parsed taxonomy dict.

    Returns "Unknown" when *obj* is not a dict or carries no usable domain.
    A key whose value is None (JSON null) counts as missing; it is no longer
    stringified into the literal label "None".

    With DOMAIN_MODE == "primary_domain_sub_domain" and both values present
    the result is "primary / sub". In every other case the primary domain is
    used, falling back to the sub-domain, then to "Unknown".
    """
    if not isinstance(obj, dict):
        return "Unknown"

    primary = obj.get("primary_domain")
    sub = obj.get("sub_domain")
    d = "" if primary is None else str(primary).strip()
    s = "" if sub is None else str(sub).strip()

    # The mode only matters when both parts exist; both modes share the same
    # fallback chain otherwise.
    if d and s and DOMAIN_MODE == "primary_domain_sub_domain":
        return f"{d} / {s}"
    return d or s or "Unknown"

def extract_year(obj):
    """Return the publication year as an int, or None.

    Accepts a dict carrying a "year" key or a bare value. Whatever ``int()``
    cannot convert yields None.
    """
    has_year_key = isinstance(obj, dict) and "year" in obj
    candidate = obj["year"] if has_year_key else obj
    if candidate is None:
        return None
    try:
        return int(candidate)
    except Exception:
        return None

def extract_conf(obj):
    """Return the conference name as a string ("" when *obj* is None).

    A dict with a "conference" key yields that value stringified; any other
    object is stringified as a whole.
    """
    if obj is None:
        return ""
    wrapped = isinstance(obj, dict) and "conference" in obj
    return str(obj["conference"] if wrapped else obj)

# -------------------------
# LOAD DATA
# -------------------------
# dtype=str with keep_default_na=False: cells are read as text and no value
# is converted to NaN on load; safe_parse() decides what counts as missing.
df = pd.read_csv(CSV_PATH, dtype=str, keep_default_na=False, encoding="latin1")

# Decode the raw metadata cells; the second argument tags any parse-error record.
df["countries"] = df["author_countries"].apply(lambda x: safe_parse(x, "author_countries"))
df["taxonomy"]  = df["taxonomy_classification"].apply(lambda x: safe_parse(x, "taxonomy_classification"))
df["year"]      = df["published_year"].apply(lambda x: extract_year(safe_parse(x, "published_year")))
df["conf"]      = df["conference"].apply(lambda x: extract_conf(safe_parse(x, "conference")))

df["domain"] = df["taxonomy"].apply(extract_domain)

# filter DFRWS + years
# (rows without a year are removed before the numeric range comparison)
df = df[df["conf"].str.startswith("DFRWS", na=False)]
df = df[df["year"].notna()]
df = df[(df["year"] >= YEAR_MIN) & (df["year"] <= YEAR_MAX)]

# build UNIQUE COUNTRY LIST per paper (dedup within paper!)
# Names are canonicalised first, so spelling variants of one country collapse
# into a single entry; the result is a sorted list.
df["country_list"] = df["countries"].apply(
    lambda x: sorted({canon_country(c) for c in extract_countries(x) if str(c).strip()})
)

# -------------------------
# OPTIONAL: DEBUG UNKNOWN DOMAIN ROWS
# -------------------------
# Lists the papers that survived the conference/year filters but whose domain
# resolved to "Unknown", so the offending taxonomy cells can be inspected.
if WRITE_UNKNOWN_DEBUG:
    unk = df[df["domain"] == "Unknown"].copy()
    print(f"[DEBUG] Unknown domain rows after filters: {len(unk)}")

    if len(unk) > 0:
        debug_path = os.path.join(OUTDIR, "unknown_domain_rows_debug.csv")
        # Save both raw and parsed previews
        unk_out = unk[["year", "conf", "published_year", "taxonomy_classification", "domain"]].copy()
        unk_out.to_csv(debug_path, index=False, encoding="utf-8")
        print(f"[DEBUG] Wrote {debug_path}")

# Also dump parse errors (if any)
# parse_errors is filled by safe_parse() while the columns above are decoded.
if parse_errors:
    pe_path = os.path.join(OUTDIR, "parse_errors_debug.csv")
    pd.DataFrame(parse_errors).to_csv(pe_path, index=False, encoding="utf-8")
    print(f"[DEBUG] Wrote {pe_path} ({len(parse_errors)} parse errors)")

# -------------------------
# COUNTRY COUNTING (PER PAPER, NOT PER AUTHOR)
# -------------------------
# Build one (country, domain, weight) row per paper/country pair.
rows = []
for _, r in df.iterrows():
    # Papers with no usable country are skipped entirely.
    countries = [c for c in r["country_list"] if c and c != "Unknown"]
    if not countries:
        continue

    # weight per (paper,country)
    w = (1.0 / len(countries)) if COUNT_MODE == "fractional" else 1.0

    for c in countries:
        rows.append((c, r["domain"], w))

bin_df = pd.DataFrame(rows, columns=["country", "domain", "weight"])

# totals per country (sum of weights)
country_totals = bin_df.groupby("country")["weight"].sum().reset_index(name="total")

# Top-10 countries by total
# (the cut-off is TOP_COUNTRIES; the list is ordered by descending total)
top10 = (
    country_totals
    .sort_values("total", ascending=False)
    .head(TOP_COUNTRIES)["country"]
    .tolist()
)

# Restrict both tables to the selected countries.
bin_df = bin_df[bin_df["country"].isin(top10)]
country_totals = country_totals[country_totals["country"].isin(top10)]

# domain counts per country (sum of weights)
country_domain = (
    bin_df.groupby(["country", "domain"])["weight"]
    .sum()
    .reset_index(name="count")
)

# share_pct = a domain's weight as a percentage of that country's total weight
country_domain = country_domain.merge(country_totals, on="country", how="left")
country_domain["share_pct"] = 100 * country_domain["count"] / country_domain["total"]

# -------------------------
# SAVE TABLE (TOP-10 ONLY)
# -------------------------
# Long-format table, sorted by country and then by descending share.
count_mode_label = "fractional" if COUNT_MODE == "fractional" else "presence"
csv_out = os.path.join(OUTDIR, f"top10_country_domain_country_{count_mode_label}.csv")
country_domain.sort_values(["country", "share_pct"], ascending=[True, False]).to_csv(csv_out, index=False)
print(f"[OK] Wrote {csv_out}")

# -------------------------
# HEATMAP (TOP COUNTRIES × TOP DOMAINS)
# -------------------------
# Optionally remove Unknown from the domain pool so it can't appear in the heatmap columns
domain_pool = country_domain.copy()
if DROP_UNKNOWN_FROM_HEATMAP:
    domain_pool = domain_pool[domain_pool["domain"] != "Unknown"]

# Columns: the TOP_DOMAINS domains with the largest summed count across the
# selected countries.
top_domains = (
    domain_pool.groupby("domain")["count"]
    .sum()
    .sort_values(ascending=False)
    .head(TOP_DOMAINS)
    .index.tolist()
)

# Force-include rare domains (e.g., "AI Forensics") if they exist in the dataset
present_domains = set(domain_pool["domain"].unique())
for d in FORCE_INCLUDE_DOMAINS:
    if d in present_domains and d not in top_domains:
        top_domains.append(d)

plot_df = domain_pool[domain_pool["domain"].isin(top_domains)]

# Wide matrix: one row per country (in ranking order), one column per domain;
# country/domain pairs with no papers become 0.
pivot = (
    plot_df.pivot(index="country", columns="domain", values="share_pct")
    .reindex(top10)
    .fillna(0)
)

fig, ax = plt.subplots(figsize=(12, 6))
im = ax.imshow(pivot.values, aspect="auto")

ax.set_yticks(range(len(pivot.index)))
ax.set_yticklabels(pivot.index)
ax.set_xticks(range(len(pivot.columns)))
ax.set_xticklabels(pivot.columns, rotation=45, ha="right")

# Add numbers inside cells (0 decimals).
# The label colour is derived from the luminance of the colour the cell is
# actually drawn in, so it stays readable for any colormap and data range.
# (A fixed value cut-off puts black text on the dark low-value cells and
# white text on the bright high-value cells of the default colormap.)
for i in range(pivot.shape[0]):
    for j in range(pivot.shape[1]):
        val = pivot.values[i, j]
        rgba = im.cmap(im.norm(val))
        luminance = 0.299 * rgba[0] + 0.587 * rgba[1] + 0.114 * rgba[2]
        txt_color = "black" if luminance > 0.5 else "white"
        ax.text(j, i, f"{val:.0f}", ha="center", va="center",
                color=txt_color, fontsize=8, fontweight="bold")

cbar = plt.colorbar(im, ax=ax)
cbar.set_label("Share of country output (%)")

title_mode = "Fractional" if COUNT_MODE == "fractional" else "Country-presence"
ax.set_title(f"Top-{TOP_COUNTRIES} Countries × Domains (DFRWS, {YEAR_MIN}-{YEAR_MAX}) — {title_mode} counting")

plt.tight_layout()

# The file name follows TOP_COUNTRIES, like the title, so a different cut-off
# is not saved under a "top10" name.
fig_path = os.path.join(OUTDIR, f"top{TOP_COUNTRIES}_country_domain_heatmap_{count_mode_label}.png")
fig.savefig(fig_path, dpi=300)
plt.close(fig)

print(f"[OK] Saved {fig_path}")
print("[DONE]")


In [None]:
import os, re, json, ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# -------------------------
# CONFIG
# -------------------------
# Input table and output folder; the folder is created on first run.
CSV_PATH = "results_new_taxonomy_analysis.csv"
OUTDIR = "dfrws_top10_country_domain"
os.makedirs(OUTDIR, exist_ok=True)

# Inclusive publication-year window applied when filtering papers.
YEAR_MIN, YEAR_MAX = 2002, 2025
# Number of countries (rows) and domains (columns) kept for the heatmap.
TOP_COUNTRIES = 10
TOP_DOMAINS = 13
# The only alternative extract_domain() recognises is "primary_domain_sub_domain",
# which labels a paper "primary / sub" when both values are present.
DOMAIN_MODE = "primary_domain"   # or "primary_domain_sub_domain"

# -------------------------
# SAFE PARSING
# -------------------------
def safe_parse(cell):
    """Decode one CSV cell into a Python object, or return None.

    The cell is stringified and stripped of Markdown code fences. Fence
    language tags are matched in any letter case, so "```JSON" is removed
    just like "```json". Empty text and the markers nan/none/null (any case)
    mean "missing". The text is tried as JSON first and as a Python literal
    second. Undecodable cells yield None on purpose (best-effort parsing).
    """
    s = str(cell)
    s = re.sub(r"```(?:json)?", "", s, flags=re.IGNORECASE).strip()
    if not s or s.lower() in ("nan", "none", "null"):
        return None
    try:
        return json.loads(s)
    except Exception:
        pass
    try:
        # Second chance for Python-style literals (single quotes, True/None).
        return ast.literal_eval(s)
    except Exception:
        return None

# Lower-cased spelling variant -> canonical country name.
ALIASES = {
    # USA
    "usa":"USA","u.s.":"USA","u.s.a.":"USA","u.s":"USA","us":"USA","u s":"USA",
    "united states":"USA","united states of america":"USA",
    # UK
    "uk":"United Kingdom","u.k.":"United Kingdom","u.k":"United Kingdom","u k":"United Kingdom",
    "united kingdom":"United Kingdom","england":"United Kingdom",
    "scotland":"United Kingdom","wales":"United Kingdom","great britain":"United Kingdom",
    "britain":"United Kingdom",
    # UAE
    "uae":"United Arab Emirates","u.a.e.":"United Arab Emirates","u a e":"United Arab Emirates",
    "united arab emirates":"United Arab Emirates","emirates":"United Arab Emirates",
    # Korea
    "republic of korea":"South Korea","korea, republic of":"South Korea",
    "south korea":"South Korea","korea":"South Korea","r o k":"South Korea",
}

def canon_country(c):
    """Map a raw country string to its canonical name.

    The lookup key is the trimmed, lower-cased text with whitespace runs
    collapsed to one space and all dots removed, so "U. S.", "u.s.a." and
    "United  Kingdom" all resolve. This is the same normalisation and alias
    coverage as the country/domain analysis earlier in this file, so both
    analyses group countries identically. Names without an alias are
    returned trimmed but otherwise unchanged.
    """
    raw = str(c).strip()
    key = re.sub(r"\s+", " ", raw.lower()).replace(".", "")
    return ALIASES.get(key, raw)

def extract_countries(obj):
    """Return the raw country strings contained in a parsed cell.

    Dicts are unwrapped via "author_countries" (else "countries"). Lists
    contribute their string items and the "country" value of dict items. A
    bare string is wrapped in a list. Anything else produces [].
    """
    payload = obj
    if isinstance(payload, dict):
        payload = payload.get("author_countries", payload.get("countries", []))

    if isinstance(payload, str):
        return [payload]
    if not isinstance(payload, list):
        return []

    found = []
    for entry in payload:
        if isinstance(entry, str):
            found.append(entry)
            continue
        if isinstance(entry, dict) and "country" in entry:
            found.append(entry["country"])
    return found

def extract_domain(obj):
    """Return the domain label of a parsed taxonomy dict, or "Unknown".

    A key whose value is None (JSON null) counts as missing; it is no longer
    stringified into the literal label "None". With
    DOMAIN_MODE == "primary_domain_sub_domain" and both values present the
    label is "primary / sub". In every other case only the primary domain is
    used.
    """
    if not isinstance(obj, dict):
        return "Unknown"
    primary = obj.get("primary_domain")
    sub = obj.get("sub_domain")
    d = "" if primary is None else str(primary).strip()
    s = "" if sub is None else str(sub).strip()
    if d and s and DOMAIN_MODE == "primary_domain_sub_domain":
        return f"{d} / {s}"
    return d if d else "Unknown"

# -------------------------
# LOAD DATA
# -------------------------
# dtype=str with keep_default_na=False: cells are read as text and no value
# is converted to NaN on load; safe_parse() decides what counts as missing.
df = pd.read_csv(CSV_PATH, dtype=str, keep_default_na=False, encoding="latin1")

# Decode the raw country and taxonomy cells (None where a cell is undecodable).
df["countries"] = df["author_countries"].apply(safe_parse)
df["taxonomy"]  = df["taxonomy_classification"].apply(safe_parse)

# robust year/conf extraction
def get_year(x):
    """Return the int "year" of a cell that decodes to a dict, else None."""
    parsed = safe_parse(x)
    if not isinstance(parsed, dict) or "year" not in parsed:
        return None
    try:
        return int(parsed["year"])
    except Exception:
        return None

def get_conf(x):
    """Return the trimmed "conference" of a cell that decodes to a dict, else ""."""
    parsed = safe_parse(x)
    has_conference = isinstance(parsed, dict) and "conference" in parsed
    return str(parsed["conference"]).strip() if has_conference else ""

# Derived columns: publication year, conference name and domain label.
df["year"] = df["published_year"].apply(get_year)
df["conf"] = df["conference"].apply(get_conf)

df["primary_domain"] = df["taxonomy"].apply(extract_domain)

# filter DFRWS + years
# (between() is inclusive on both ends)
df = df[df["conf"].str.startswith("DFRWS")]
df = df[df["year"].between(YEAR_MIN, YEAR_MAX)]

# build country lists (dedupe AFTER normalization)
# Each paper ends up with a sorted list of distinct canonical country names.
df["country_list"] = df["countries"].apply(
    lambda x: sorted(set(
        canon_country(c) for c in extract_countries(x) if str(c).strip()
    ))
)

# drop papers with no countries (if any exist)
df = df[df["country_list"].map(len) > 0]

# -------------------------
# BINARY COUNTING
# -------------------------
# One (country, domain) row per paper/country pair: a country counts once per
# paper because country_list holds distinct names.
rows = []
for _, r in df.iterrows():
    for c in r["country_list"]:
        rows.append((c, r["primary_domain"]))

bin_df = pd.DataFrame(rows, columns=["country","primary_domain"])

# Number of rows (papers) per country; used for ranking and as the share denominator.
country_totals = bin_df.groupby("country").size().reset_index(name="total")

# Top-10 countries
# (the cut-off is TOP_COUNTRIES; the list is ordered by descending total)
top10 = (
    country_totals.sort_values("total", ascending=False)
    .head(TOP_COUNTRIES)["country"].tolist()
)

bin_df = bin_df[bin_df["country"].isin(top10)]

# domain counts per country
country_domain = (
    bin_df.groupby(["country","primary_domain"]).size()
    .reset_index(name="count")
)

# share_pct = a domain's count as a percentage of that country's total
country_domain = country_domain.merge(country_totals, on="country", how="left")
country_domain["share_pct"] = 100 * country_domain["count"] / country_domain["total"]

# -------------------------
# SAVE TABLE (TOP-10 ONLY) - LONG FORMAT
# -------------------------
# Sorted by country and then by descending share.
csv_long = os.path.join(OUTDIR, "top10_country_domain_binary_LONG.csv")
country_domain.sort_values(["country","share_pct"], ascending=[True,False]).to_csv(csv_long, index=False)
print(f"[OK] Wrote {csv_long}")

# -------------------------
# PICK TOP DOMAINS (global, within top10)
# -------------------------
# Ranked by summed count across the selected countries; "Unknown" is not
# excluded here.
top_domains = (
    country_domain.groupby("primary_domain")["count"].sum()
    .sort_values(ascending=False).head(TOP_DOMAINS).index.tolist()
)

plot_df = country_domain[country_domain["primary_domain"].isin(top_domains)].copy()

# -------------------------
# HEATMAP MATRIX (WIDE) + SAVE AS CSV
# -------------------------
# One row per country (in ranking order), one column per domain; country/domain
# pairs with no papers become 0.0.
pivot = (
    plot_df.pivot(index="country", columns="primary_domain", values="share_pct")
    .reindex(top10).fillna(0.0)
)

csv_wide = os.path.join(OUTDIR, "top10_country_domain_heatmap_matrix.csv")
pivot.to_csv(csv_wide)
print(f"[OK] Wrote {csv_wide}")

# -------------------------
# HEATMAP FIGURE
# -------------------------
# Draw the wide share matrix: rows are countries, columns are domains.
fig, ax = plt.subplots(figsize=(12,6))
im = ax.imshow(pivot.values, aspect="auto")

ax.set_yticks(range(len(pivot.index)))
ax.set_yticklabels(pivot.index)
ax.set_xticks(range(len(pivot.columns)))
ax.set_xticklabels(pivot.columns, rotation=45, ha="right")

cbar = plt.colorbar(im, ax=ax)
cbar.set_label("Share of country output (%)")

# The title follows TOP_COUNTRIES so it cannot disagree with the number of
# rows actually plotted.
ax.set_title(f"Top-{TOP_COUNTRIES} Countries × Digital Forensics Domains (Binary Counting)")
plt.tight_layout()

fig_path = os.path.join(OUTDIR, "top10_country_domain_heatmap.png")
plt.savefig(fig_path, dpi=300)
plt.close()

print(f"[OK] Saved {fig_path}")
print("[DONE]")
