In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import networkx as nx

# Locations: the cleaned email table to read, and the folder that receives
# every parquet artefact produced by this script.
INPUT_CLEAN = Path.home().joinpath("Downloads", "Emails_clean.parquet")   # where your clean input is
OUT_DIR = Path(r"C:\Users\Dylan\OneDrive - Swinburne University\COS70008\Scripts")
# Create the output folder (including parents); an existing folder is fine.
OUT_DIR.mkdir(exist_ok=True, parents=True)

# Load the cleaned emails through the pyarrow engine with memory mapping off.
emails = pd.read_parquet(path=INPUT_CLEAN, engine="pyarrow", memory_map=False)

import re, ast, numpy as np

def to_list_clean(x):
    """Return a clean, lower-cased ``list[str]`` for any recipient cell value.

    Accepted inputs:

    * ``None`` / NaN / ``pd.NA``           -> ``[]``
    * objects exposing ``as_py`` or ``to_pylist`` (unwrapped first)
    * ``list`` / ``tuple`` / ``set`` / ``numpy.ndarray``
    * strings: either a Python literal list/tuple (``"['a', 'b']"``) or a
      ``;`` / ``,`` delimited list of values
    * any other scalar, which is converted with ``str``

    Missing elements (``None``, NaN, ``pd.NA``) and blank strings are dropped
    so they never surface as fake addresses such as ``"none"`` or ``"nan"``.

    Args:
        x: The raw cell value.

    Returns:
        Stripped, lower-cased, non-empty strings in input order (the order is
        arbitrary when the input is a ``set``).
    """
    if x is None:
        return []
    # Unwrap Arrow-style scalars / arrays into plain Python objects.
    if hasattr(x, "as_py"):
        x = x.as_py()
    if hasattr(x, "to_pylist"):
        x = x.to_pylist()

    if isinstance(x, (list, tuple, set)):
        items = list(x)
    elif isinstance(x, np.ndarray):
        items = x.tolist()
        if not isinstance(items, list):
            # A 0-d array's tolist() returns a bare scalar; wrap it so a
            # string scalar is not iterated character by character below.
            items = [items]
    elif isinstance(x, str):
        text = x.strip()
        if not text:
            return []
        bracketed = (text[0], text[-1]) in (("[", "]"), ("(", ")"))
        if bracketed:
            try:
                parsed = ast.literal_eval(text)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # Not a valid literal (e.g. "[a@b.com, c@d.com]"): split the
                # inside of the brackets and drop stray quote characters so
                # the delimiters do not end up glued to the values.
                items = [
                    part.strip().strip("'\"")
                    for part in re.split(r"[;,]", text[1:-1])
                ]
            else:
                if isinstance(parsed, (list, tuple, set)):
                    items = list(parsed)
                else:
                    items = [parsed]
        else:
            items = re.split(r"[;,]", text)
    else:
        # Any other scalar; a missing value is filtered out below.
        items = [x]

    cleaned = []
    for item in items:
        if item is None:
            continue
        if not isinstance(item, (list, tuple, set, dict, np.ndarray)):
            try:
                if pd.isna(item):
                    continue
            except (TypeError, ValueError):
                # pd.isna gave a non-scalar / ambiguous answer: the item is
                # not a plain missing value, so keep it.
                pass
        token = str(item).strip().lower()
        if token:
            cleaned.append(token)
    return cleaned

# Make sure every recipient column exists and holds clean list[str] values.
for col in ["to_norm", "cc_norm", "bcc_norm"]:
    if col not in emails.columns:
        # Column absent from the input: every email gets its own empty list
        # (a fresh list per row, so no two rows share the same object).
        emails[col] = [[] for _ in range(len(emails))]
    emails[col] = emails[col].apply(to_list_clean)

# Derive the total recipient count (to + cc + bcc) only when the input does
# not already provide one. This check previously appeared twice in a row; the
# second copy could never run, so a single copy is kept.
if "recipient_count" not in emails.columns:
    emails["recipient_count"] = (
        emails["to_norm"].str.len()
        + emails["cc_norm"].str.len()
        + emails["bcc_norm"].str.len()
    ).astype("int32")


# --- Per-email edge list: one row per (email, sender, recipient) ---
# Concatenate the three recipient lists of each email, then explode so every
# recipient gets its own row next to the email's id, sender and timestamp.
# NOTE(review): the lists are concatenated without de-duplication, so a
# recipient present in more than one of to/cc/bcc yields repeated rows with
# the same edge_id -- confirm whether that is intended.
all_recipients = emails["to_norm"] + emails["cc_norm"] + emails["bcc_norm"]
edges = (
    emails[["email_id", "person_id", "dt_utc", "recipient_count"]]
    .assign(recipient=all_recipients)
    .explode("recipient", ignore_index=True)
)

# Emails with an empty recipient list explode to a missing value; normalise
# to lower-cased nullable strings and discard missing / empty recipients.
edges["recipient"] = edges["recipient"].astype("string").str.lower()
edges = edges.dropna(subset=["recipient"])
has_text = edges["recipient"].str.len() > 0
edges = edges[has_text]

# Drop rows whose sender identifier equals the recipient (self-addressed).
not_self = edges["person_id"] != edges["recipient"]
edges = edges[not_self]

edges = edges.rename(columns={"person_id": "src_person_id", "recipient": "dst_person_id"})
edges = edges.assign(
    edge_id=edges["email_id"] + "|" + edges["src_person_id"] + "|" + edges["dst_person_id"],
    # Every row counts as one unit ...
    weight_unit=1.0,
    # ... or as 1 / (1 + recipient_count), so widely addressed emails weigh less.
    weight_mass=1.0 / (1.0 + edges["recipient_count"].fillna(0).astype(float)),
)

# Persist the row-level edge table using the unit weight.
edges_out = edges[["edge_id", "src_person_id", "dst_person_id", "email_id", "dt_utc"]].assign(
    weight=edges["weight_unit"].astype("float32"),
    directed=True,
)
edges_out.to_parquet(OUT_DIR / "Edges.parquet", index=False)

# --- Aggregated edge tables ---
# Directed: sum the unit weights per ordered (sender, recipient) pair.
pair_keys = ["src_person_id", "dst_person_id"]
edges_dir = edges.groupby(pair_keys, as_index=False)["weight_unit"].sum()
edges_dir = edges_dir.rename(columns={"weight_unit": "weight"})
edges_dir["directed"] = True

# Undirected: order each pair so the smaller identifier is always "a", which
# makes A->B and B->A fall into the same group, then sum the mass weights.
u = edges[["src_person_id", "dst_person_id", "weight_mass"]].copy()
src_is_smaller = u["src_person_id"] < u["dst_person_id"]
u["a"] = np.where(src_is_smaller, u["src_person_id"], u["dst_person_id"])
u["b"] = np.where(src_is_smaller, u["dst_person_id"], u["src_person_id"])
edges_undir = u.groupby(["a", "b"], as_index=False)["weight_mass"].sum()
edges_undir = edges_undir.rename(
    columns={"a": "src_person_id", "b": "dst_person_id", "weight_mass": "weight"}
)
edges_undir["directed"] = False

edges_dir.to_parquet(OUT_DIR / "Edges_directed_agg.parquet", index=False)
edges_undir.to_parquet(OUT_DIR / "Edges_undirected_agg.parquet", index=False)

# --- Node index: one row per person seen as a sender or as a recipient ---
# Sender attributes are read from columns already present on the email table.
senders = (emails[["person_id","from_norm","internal_sender","domain_sender"]]
           .drop_duplicates()
           .rename(columns={"from_norm":"email_norm","internal_sender":"internal","domain_sender":"domain"}))

# Recipients only carry an identifier on the edge rows, so their remaining
# attributes are derived from that identifier.
recips = edges[["dst_person_id"]].drop_duplicates().rename(columns={"dst_person_id":"person_id"})
recips["email_norm"] = recips["person_id"]
# Domain = text after the LAST "@". The previous greedy "@(.+)$" captured from
# the first "@", so a value with two "@" produced a domain that still
# contained one.
recips["domain"]     = recips["email_norm"].str.extract(r'@([^@]+)$')[0].str.lower()
# A recipient without any "@" has a missing domain; on nullable string columns
# eq() can then yield <NA>. Such a recipient is not an enron.com address, so
# record an explicit False instead of a missing flag.
recips["internal"]   = recips["domain"].eq("enron.com").fillna(False)

# Senders are listed first so that, for a person seen in both roles,
# drop_duplicates keeps the sender-derived row.
node_index = pd.concat([senders[["person_id","email_norm","internal","domain"]],
                        recips[["person_id","email_norm","internal","domain"]]],
                       ignore_index=True).drop_duplicates("person_id")
node_index.to_parquet(OUT_DIR / "NodeIndex.parquet", index=False)

# --- Directed graph metrics: in/out/total degree and weighted PageRank ---
Gd = nx.DiGraph()
# Register every indexed person first so nodes without edges are included.
Gd.add_nodes_from(node_index["person_id"])
directed_triples = zip(
    edges_dir["src_person_id"], edges_dir["dst_person_id"], edges_dir["weight"]
)
for src, dst, w in directed_triples:
    Gd.add_edge(src, dst, weight=float(w))

in_deg = {node: d for node, d in Gd.in_degree()}
out_deg = {node: d for node, d in Gd.out_degree()}
# Total degree is the sum of incoming and outgoing edge counts.
deg = {node: in_deg.get(node, 0) + out_deg.get(node, 0) for node in Gd}

# PageRank is only run when there is at least one edge; otherwise every node
# gets a score of 0.0.
if Gd.number_of_edges():
    pagerank = nx.pagerank(Gd, alpha=0.85, weight="weight")
else:
    pagerank = dict.fromkeys(Gd, 0.0)

# --- Undirected graph metrics: clustering, k-core and weighted degree ---
Gu = nx.Graph()
# Register every indexed person first so nodes without edges are included.
Gu.add_nodes_from(node_index["person_id"])

# Weighted degree starts at 0.0 for every indexed person and accumulates the
# aggregated undirected weight of each incident edge.
wdeg = {n: 0.0 for n in node_index["person_id"]}
for r in edges_undir.itertuples(index=False):
    w = float(r.weight)
    wdeg[r.src_person_id] += w
    wdeg[r.dst_person_id] += w
    # Keep self loops out of the graph. core_number rejects graphs that
    # contain them, and the exception type it raises appears to differ
    # between NetworkX releases (TODO confirm against the installed version).
    # The previous `except nx.NetworkXError` fallback also hid the problem by
    # reporting a k-core of 0 for every node.
    if r.src_person_id != r.dst_person_id:
        Gu.add_edge(r.src_person_id, r.dst_person_id)

# With no edges at all, every node gets the neutral value instead.
has_edges = Gu.number_of_edges() > 0
clust = nx.clustering(Gu) if has_edges else {n: 0.0 for n in Gu.nodes()}
core = nx.core_number(Gu) if has_edges else {n: 0 for n in Gu.nodes()}

# --- Node metrics table: one row per indexed person ---
nm = pd.DataFrame({"person_id": node_index["person_id"]})

# (output column, per-node lookup, stored dtype); a person missing from a
# lookup gets 0 for that metric.
metric_specs = [
    ("degree", deg, "int32"),
    ("in_degree", in_deg, "int32"),
    ("out_degree", out_deg, "int32"),
    ("w_degree", wdeg, "float32"),
    ("pagerank", pagerank, "float32"),
    ("clustering_coef", clust, "float32"),
    ("kcore", core, "int32"),
]
for metric_name, metric_lookup, metric_dtype in metric_specs:
    nm[metric_name] = nm["person_id"].map(metric_lookup).fillna(0).astype(metric_dtype)
nm.to_parquet(OUT_DIR / "NodeMetrics.parquet", index=False)

# Report the main output files.
reported_paths = [
    OUT_DIR / file_name
    for file_name in ("Edges.parquet", "NodeIndex.parquet", "NodeMetrics.parquet")
]
print("Wrote:", *reported_paths)

✅ Wrote: C:\Users\Dylan\OneDrive - Swinburne University\COS70008\Scripts\Edges.parquet C:\Users\Dylan\OneDrive - Swinburne University\COS70008\Scripts\NodeIndex.parquet C:\Users\Dylan\OneDrive - Swinburne University\COS70008\Scripts\NodeMetrics.parquet
