In [None]:
from faker import Faker
import pandas as pd
import random
import numpy as np
from collections import deque

# Create the dataset

This notebook generates a synthetic dataset that simulates large company groups (conglomerates) and their internal corporate hierarchies. Note: this notebook is mostly AI-generated.

- **50 Groups**: Each representing a conglomerate or major client.
- **Company Trees**: Each group contains up to 200 companies structured as a general (n-ary) tree with at most 3 children per company.
- **Hierarchy Depth**: Up to 6 levels below the group root (`level` 0–6), simulating parent-subsidiary structures.
- **Columns**:
  - `Group`: Name of the group
  - `Name`: Company name
  - `id`: Unique company ID
  - `parent_id`: ID of the parent company (`None` if group root)
  - `turnover`: Estimated revenue (100k to 5M), loosely correlated with workforce size
  - `workers`: Number of employees (1 to 1000)
  - `level`: Depth in the hierarchy (0 = group root)

## Purpose

- Simulates real-world corporate networks for UI prototyping or testing tree-based aggregations.
- Enables interactive drill-down in web UIs via `id`/`parent_id` structure.


In [None]:
# Reproducibility: the generator draws from three independent random sources
# (the stdlib `random` module, NumPy's global RNG and Faker's shared RNG).
# Seeding all of them makes "Restart Kernel -> Run All" produce the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
Faker.seed(SEED)

# Initialize Faker (used for company and group names)
fake = Faker()

# Parameters
NUM_GROUPS = 50                # number of groups (one tree per group)
MAX_COMPANIES_PER_GROUP = 200  # hard cap on companies in a single tree, root included
MAX_CHILDREN_PER_NODE = 3      # each company gets 0..3 direct subsidiaries
MAX_DEPTH = 6                  # deepest `level` a company can have (root is level 0)
ID_COUNTER = 1  # Global counter for unique company IDs; re-run this cell to reset it


In [None]:

def generate_turnover_and_workers(revenue_per_worker=(5_000, 7_000)):
    """Draw a random ``(turnover, workers)`` pair for one company.

    ``workers`` is sampled from a normal distribution (mean 100, std 100) and
    clipped to the range 1..1000. ``turnover`` is ``workers`` times a uniformly
    drawn revenue-per-worker factor, clipped to 100_000..5_000_000, so it is
    loosely correlated with workforce size.

    The previous hard-coded factor of 500-700 could never produce a turnover
    above 700_000 and left every company with fewer than ~143 workers at the
    100_000 floor, so turnover was almost constant. The default factor is now
    5_000-7_000, which makes the whole 100k-5M range reachable. Pass
    ``revenue_per_worker=(500, 700)`` to get the old distribution back.

    Parameters:
        revenue_per_worker (tuple): ``(low, high)`` bounds of the uniform
            revenue-per-worker factor.

    Returns:
        tuple[int, int]: ``(turnover, workers)``.
    """
    low, high = revenue_per_worker
    # Same draw order as before (NumPy first, then `random`) so seeding behaves the same.
    workers = int(np.clip(np.random.normal(100, 100), 1, 1000))
    turnover = int(np.clip(workers * random.uniform(low, high), 100_000, 5_000_000))
    return turnover, workers

def generate_group_tree(group_name):
    """Build one group's company hierarchy breadth-first.

    The first record is the group root: it is named after the group, has
    ``parent_id`` None and ``level`` 0. Companies are then expanded in FIFO
    order; each one receives between 0 and ``MAX_CHILDREN_PER_NODE`` children
    until the tree holds ``MAX_COMPANIES_PER_GROUP`` records or no company is
    left to expand. No company is created below level ``MAX_DEPTH``.

    IDs are taken from the module-level ``ID_COUNTER``, which is advanced once
    per created company so IDs stay unique across groups.

    Parameters:
        group_name (str): Value stored in the ``Group`` column of every record
            and used as the root company's ``Name``.

    Returns:
        list[dict]: One record per company, root first, in creation order.
    """
    global ID_COUNTER

    # Root company: carries the group's own name and has no parent.
    root_id = ID_COUNTER
    ID_COUNTER += 1
    root_turnover, root_workers = generate_turnover_and_workers()
    companies = [
        {
            "Group": group_name,
            "Name": group_name,
            "id": root_id,
            "parent_id": None,
            "turnover": root_turnover,
            "workers": root_workers,
            "level": 0,
        }
    ]

    # FIFO of (company id, level its children would be placed on).
    pending = deque()
    pending.append((root_id, 1))

    while pending and len(companies) < MAX_COMPANIES_PER_GROUP:
        parent_id, child_level = pending.popleft()
        if child_level <= MAX_DEPTH:
            wanted = random.randint(0, MAX_CHILDREN_PER_NODE)
            # Never create more children than the group-size cap still allows.
            room = MAX_COMPANIES_PER_GROUP - len(companies)
            for _ in range(min(wanted, room)):
                child_id = ID_COUNTER
                ID_COUNTER += 1
                child_turnover, child_workers = generate_turnover_and_workers()
                companies.append(
                    {
                        "Group": group_name,
                        "Name": fake.company(),
                        "id": child_id,
                        "parent_id": parent_id,
                        "turnover": child_turnover,
                        "workers": child_workers,
                        "level": child_level,
                    }
                )
                pending.append((child_id, child_level + 1))

    return companies

def create_base_dataframe(num_groups):
    """Generate ``num_groups`` company trees and return them as one DataFrame.

    Each group gets a Faker company name with a " Group" suffix; the records
    produced by ``generate_group_tree`` for every group are concatenated in
    generation order.

    Parameters:
        num_groups (int): Number of groups (trees) to generate.

    Returns:
        pd.DataFrame: One row per generated company.
    """
    # Lazy generator: a name is drawn right before its tree is built.
    group_names = (f"{fake.company()} Group" for _ in range(num_groups))
    records = [
        company
        for name in group_names
        for company in generate_group_tree(name)
    ]
    return pd.DataFrame(records)

In [None]:
def add_list_children_column(df):
    """
    Attach a 'list_children' column holding the IDs of each company's direct children.

    The column is written onto ``df`` itself; the same object is returned.

    Parameters:
        df (pd.DataFrame): Company table with 'id' and 'parent_id' columns.

    Returns:
        pd.DataFrame: ``df`` with the extra 'list_children' column. Companies
        without children get an empty list.
    """
    def _as_child_list(value):
        # IDs that never appear as a parent come back from ``map`` as NaN.
        return value if isinstance(value, list) else []

    # parent_id -> [child ids]; rows with a missing parent_id are dropped by groupby.
    ids_by_parent = df.groupby("parent_id")["id"].apply(list)
    parent_to_children = ids_by_parent.to_dict()

    # Look up every company's own id in that mapping.
    looked_up = df["id"].map(parent_to_children)
    df["list_children"] = looked_up.apply(_as_child_list)
    return df


In [None]:
def add_total_metrics(df):
    """
    Adds 'total_turnover' and 'total_workers' columns to the DataFrame,
    where each row includes the sum of its own metrics + all recursive descendants'.

    The totals are accumulated iteratively from the leaves towards the roots,
    so arbitrarily deep hierarchies work (the earlier recursive version raised
    RecursionError once a chain exceeded the interpreter's recursion limit).

    Parameters:
        df (pd.DataFrame): Company table with 'id', 'parent_id', 'turnover'
            and 'workers' columns. 'id' values are assumed to be unique. A
            missing 'parent_id', or one that matches no 'id' in ``df``, marks
            the row as a root.

    Returns:
        pd.DataFrame: ``df`` itself, with the two new columns written onto it.

    Raises:
        ValueError: If the parent links contain a cycle.
    """
    ids = df["id"].tolist()

    # Running subtree totals, seeded with each company's own figures.
    turnover_totals = dict(zip(ids, df["turnover"].tolist()))
    workers_totals = dict(zip(ids, df["workers"].tolist()))

    # child id -> parent id, keeping only parents that are rows of ``df``.
    parent_of = {
        node_id: parent_id
        for node_id, parent_id in zip(ids, df["parent_id"].tolist())
        if pd.notna(parent_id) and parent_id in turnover_totals
    }

    # Number of children whose subtree total has not been folded into the parent yet.
    unresolved_children = dict.fromkeys(ids, 0)
    for parent_id in parent_of.values():
        unresolved_children[parent_id] += 1

    # Start from the leaves; a parent becomes ready once all its children are folded in.
    ready = [node_id for node_id, count in unresolved_children.items() if count == 0]
    resolved = 0
    while ready:
        node_id = ready.pop()
        resolved += 1
        parent_id = parent_of.get(node_id)
        if parent_id is None:
            continue  # root: nothing above it to update
        turnover_totals[parent_id] += turnover_totals[node_id]
        workers_totals[parent_id] += workers_totals[node_id]
        unresolved_children[parent_id] -= 1
        if unresolved_children[parent_id] == 0:
            ready.append(parent_id)

    # Nodes on a cycle never run out of unresolved children, so they are never visited.
    if resolved != len(unresolved_children):
        raise ValueError("parent_id links contain a cycle; cannot compute totals")

    df["total_turnover"] = [turnover_totals[node_id] for node_id in ids]
    df["total_workers"] = [workers_totals[node_id] for node_id in ids]

    return df


In [None]:
# Build the dataset in one pass: generate the raw company rows, then attach
# the direct-children lists and the rolled-up subtree totals.
df_groups = (
    create_base_dataframe(NUM_GROUPS)
    .pipe(add_list_children_column)
    .pipe(add_total_metrics)
)

In [None]:
# Inspect one complete group. Group names are randomly generated, so pick a
# group that exists in this run rather than hard-coding a name from an earlier
# session (which would silently display an empty frame).
example_groups = df_groups["Group"].head(1)
df_groups[df_groups["Group"].isin(example_groups)]

## Save as Parquet

In [None]:
# Persist the dataset as Parquet, without writing the DataFrame index.
df_groups.to_parquet(
    "../data/company_groups.parquet",
    index=False,
)
# CSV alternative (disabled):
# df_groups.to_csv("../data/company_groups.csv", index=False)