# Configuration & Parameters

In [2]:
%pip install faker pandas numpy
import pandas as pd
import numpy as np
from faker import Faker
import random
import uuid
from datetime import datetime, timedelta

Collecting pandas
  Downloading pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.4.1-cp314-cp314-macosx_14_0_arm64.whl.metadata (6.6 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading pandas-2.3.3-cp314-cp314-macosx_11_0_arm64.whl (10.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m28.6 MB/s[0m  [33m0:00:00[0mm0:00:01[0m0:01[0m
[?25hDownloading numpy-2.4.1-cp314-cp314-macosx_14_0_arm64.whl (5.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m30.6 MB/s[0m  [33m0:00:00[0m
[?25hDownloading pytz-2025.2-py2.py3-none-any.whl (509 kB)
Installing collected packages: pytz, numpy, pandas
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [pandas]2m2/3[0m [pandas]
[1A[2KSuccessfully installed numpy-2.4.1 pandas-2.3.3 pytz-2025.2
Note: you may need to restart the kerne

In [3]:
seed = 42 # for reproducible random output across runs
fake = Faker()
Faker.seed(seed)
np.random.seed(seed)
random.seed(seed)

## Volume Constraints

In [4]:
NUM_SUPPLIERS = 3000
NUM_MATERIALS = 7000  # Total material nodes across all tiers
TARGET_PO_COUNT = 80000

## Tier Distribution

Probabilities for a material falling into a specific tier

0.   Finished EV
1.   Battery Pack
2.   Module
3.   Cell
4.   Raw Material



In [5]:
TIER_DISTRIBUTION = [0.05, 0.10, 0.20, 0.30, 0.35]

## Country Distribution

Simulating realistic EV supply chain hubs

In [6]:
COUNTRY_WEIGHTS = {
    'CN': 0.45, # China dominates battery supply chain
    'KR': 0.15, # South Korea (LG, SK, Samsung)
    'JP': 0.10, # Japan (Panasonic)
    'DE': 0.10, # Germany (Auto tiers)
    'US': 0.10, # USA
    'XX': 0.10  # Rest of World
}

# Generate Supplier Nodes
Apply Power Law to create _Hub_ Suppliers

- TODO Review Attributes: Why are tier category / risk & capacity needed?

In [7]:
suppliers = []
countries = list(COUNTRY_WEIGHTS.keys())
weights = list(COUNTRY_WEIGHTS.values())

# We assign a 'capability_score' which determines how many materials they can supply
dominance_scores = np.random.zipf(a=1.5, size=NUM_SUPPLIERS)
# Normalize scores to a realistic capacity (max 50 materials per supplier for hubs)
dominance_scores = (dominance_scores / dominance_scores.max()) * 50
dominance_scores = np.maximum(dominance_scores, 1).astype(int)

for i in range(NUM_SUPPLIERS):
    country = random.choices(countries, weights=weights, k=1)[0]
    sup_id = f"SUP_{country}_{str(i+1).zfill(5)}"

    suppliers.append({
        "supplier_id": sup_id,
        "name": fake.company(),
        "country": country,
        "risk_score": round(random.uniform(0.1, 9.9), 2), # 10 is high risk
        "tier_category": random.choice(["Strategic", "Strategic", "Commodity", "Commodity", "Specialist"]),
        "capacity_score": int(dominance_scores[i]) # Hidden attribute for graph generation logic
    })

df_suppliers = pd.DataFrame(suppliers)
df_suppliers.head()

Unnamed: 0,supplier_id,name,country,risk_score,tier_category,capacity_score
0,SUP_JP_00001,"Rodriguez, Figueroa and Sanchez",JP,0.35,Commodity,1
1,SUP_CN_00002,Doyle Ltd,CN,1.47,Strategic,1
2,SUP_JP_00003,"Mcclain, Miller and Henderson",JP,8.84,Strategic,1
3,SUP_KR_00004,Davis and Sons,KR,0.41,Strategic,1
4,SUP_CN_00005,"Guzman, Hoffman and Baldwin",CN,5.05,Strategic,1


# Generate Material Nodes
- REVIEW: What are current limitations with this approach?

In [8]:
# Pre-define some semantic categories for realism
tier_names = {
    0: ["EV_Sedan", "EV_SUV", "EV_Truck"],
    1: ["Battery_Pack_HighRange", "Battery_Pack_Std", "Inverter_Assy", "Drive_Unit"],
    2: ["Module_LFP", "Module_NMC", "BMS_Circuit", "Cooling_Plate"],
    3: ["Cell_Prismatic", "Cell_Cylindrical_4680", "Cell_Pouch", "Anode_Sheet"],
    4: ["Lithium_Hydroxide", "Cobalt_Sulfate", "Nickel_Class1", "Graphite_Synth", "Copper_Foil"]
}

In [42]:
materials = []

for i in range(NUM_MATERIALS):
    tier = np.random.choice([0, 1, 2, 3, 4], p=TIER_DISTRIBUTION)

    # Semantic Naming
    base_name = random.choice(tier_names[tier])
    mat_id = f"MAT_T{tier}_{str(i+1).zfill(5)}"

    materials.append({
        "material_id": mat_id,
        "description": f"{base_name} - {fake.word().upper()} Variant",
        "tier_level": tier,
        "base_unit": "EA" if tier < 4 else "KG",
        "cost_estimate": round(random.lognormvariate(3, 1) * (5 - tier), 2) # Higher tiers = more expensive
    })

df_materials = pd.DataFrame(materials)
df_materials.head()

Unnamed: 0,material_id,description,tier_level,base_unit,cost_estimate
0,MAT_T3_00001,Cell_Pouch - STOP Variant,3,EA,13.85
1,MAT_T3_00002,Anode_Sheet - MOUTH Variant,3,EA,34.42
2,MAT_T4_00003,Graphite_Synth - COLLECTION Variant,4,KG,70.31
3,MAT_T4_00004,Nickel_Class1 - JUST Variant,4,KG,3.81
4,MAT_T2_00005,BMS_Circuit - OF Variant,2,EA,300.67


# Generate BOM Edges

Material -> Material

---
* BOM Type seems redundant
* should quantity be whole number?

In [39]:
bom_edges = []
# Group materials by tier for easy lookup
mats_by_tier = df_materials.groupby("tier_level")["material_id"].apply(list).to_dict()

# Logic: Iterate through Tiers 0 to 3 and assign children from Tier N+1
# We use a constrained random approach to ensure every item has children (except Raw Materials)
for tier in range(4): # 0, 1, 2, 3
    parents = mats_by_tier.get(tier, [])
    potential_children = mats_by_tier.get(tier+1, [])

    if not potential_children: continue

    for parent in parents:
        # Determine number of components (Fan-out)
        # Complex items (Tier 0) have many components; Raw parents (Tier 3) have few
        num_children = max(1, int(np.random.poisson(lam=4.0 - (tier * 0.5))))

        # Select children
        # "Nexus" Logic: We intentionally sample from a smaller subset of Tier 4 items
        # to ensure multiple Tier 3s depend on the SAME Tier 4s (creating bottlenecks).
        if tier == 3:
            # Heavily biased selection for Raw Materials to create dependency hubs
            children = np.random.choice(potential_children, size=num_children, replace=False)
        else:
            children = random.sample(potential_children, k=min(len(potential_children), num_children))

        for child in children:
            qty = round(random.uniform(1.0, 20.0), 2)
            if tier == 3: qty = round(random.uniform(0.5, 5.0), 3) # KG for raw materials

            bom_edges.append({
                "parent_material_id": parent,
                "child_material_id": child,
                "quantity": qty,
                "bom_type": "Production"
            })

df_bom = pd.DataFrame(bom_edges)
df_bom.head()

Unnamed: 0,parent_material_id,child_material_id,quantity,bom_type
0,MAT_T0_00011,MAT_T1_02667,17.57,Production
1,MAT_T0_00011,MAT_T1_05753,1.73,Production
2,MAT_T0_00011,MAT_T1_01594,2.95,Production
3,MAT_T0_00011,MAT_T1_06438,15.42,Production
4,MAT_T0_00011,MAT_T1_00430,10.17,Production


# Generate Purchase Order Edges

Supplier -> Material

---

In [40]:
po_records = []
supplier_list = df_suppliers.to_dict('records')
material_list = df_materials.to_dict('records')

## Assign _Approved Supplier List_ (ASL)

Not every supplier supplies every part. We link them first.

### Logic

* Higher tier items (Tier 0/1) usually have strategic partners (Tier 1 Suppliers)
* Raw materials (Tier 4) are bought from Commodity suppliers
* Pick candidate suppliers based on the 'capacity_score' we generated earlier
* High capacity suppliers are more likely to be chosen (Preferential Attachment)

In [41]:
mat_supplier_map = {} # material_id -> list of possible supplier_ids

# Iterate materials and assign 1-3 suppliers each
for mat in material_list:
    candidates = random.choices(
        supplier_list,
        weights=[s['capacity_score'] for s in supplier_list],
        k=random.randint(1, 3) # Multi-sourcing
    )
    mat_supplier_map[mat['material_id']] = [s['supplier_id'] for s in candidates]

## Generate POs based on relationships

In [43]:
current_po_count = 0
po_id_counter = 100000

while current_po_count < TARGET_PO_COUNT:
    # Pick a random material
    mat = random.choice(material_list)
    # Pick one of its valid suppliers
    valid_suppliers = mat_supplier_map[mat['material_id']]
    supplier_id = random.choice(valid_suppliers)

    # Generate Date
    po_date = fake.date_between(start_date='-2y', end_date='today')
    due_date = po_date + timedelta(days=random.randint(14, 90))

    # Pareto Volume: 20% of orders get 80% of volume
    is_bulk = random.random() < 0.20
    quantity = int(np.random.pareto(a=1.16) * 50) + 1 if is_bulk else random.randint(1, 100)

    # Unit Price with some noise
    unit_price = mat['cost_estimate'] * random.uniform(0.95, 1.05)

    po_records.append({
        "po_number": f"PO-{po_id_counter}",
        "supplier_id": supplier_id,
        "material_id": mat['material_id'],
        "order_date": po_date,
        "delivery_due_date": due_date,
        "quantity": quantity,
        "unit_price": round(unit_price, 2),
        "total_value": round(quantity * unit_price, 2),
        "status": random.choices(["Closed", "Open", "Delayed"], weights=[0.7, 0.2, 0.1])[0]
    })

    po_id_counter += 1
    current_po_count += 1

In [44]:
df_po = pd.DataFrame(po_records)
df_po.head()

Unnamed: 0,po_number,supplier_id,material_id,order_date,delivery_due_date,quantity,unit_price,total_value,status
0,PO-100000,SUP_CN_00421,MAT_T3_00097,2024-02-24,2024-04-24,8,9.71,77.71,Closed
1,PO-100001,SUP_XX_02928,MAT_T3_02359,2024-08-25,2024-10-12,54,137.65,7433.32,Closed
2,PO-100002,SUP_XX_00739,MAT_T0_05134,2024-11-24,2025-01-01,56,312.66,17508.9,Closed
3,PO-100003,SUP_US_02242,MAT_T3_00478,2025-12-12,2026-01-19,13,6.2,80.63,Closed
4,PO-100004,SUP_CN_01411,MAT_T3_03467,2025-07-15,2025-10-04,16,34.26,548.08,Open


# Export

In [45]:
df_suppliers.drop(columns=['capacity_score']).to_csv("suppliers.csv", index=False)
df_materials.to_csv("materials.csv", index=False)
df_bom.to_csv("bom_relationships.csv", index=False)
df_po.to_csv("purchase_orders.csv", index=False)

print("Done! Files generated:")
print(f" - suppliers.csv ({len(df_suppliers)} rows)")
print(f" - materials.csv ({len(df_materials)} rows)")
print(f" - bom_relationships.csv ({len(df_bom)} rows)")
print(f" - purchase_orders.csv ({len(df_po)} rows)")

Done! Files generated:
 - suppliers.csv (3000 rows)
 - materials.csv (7000 rows)
 - bom_relationships.csv (13664 rows)
 - purchase_orders.csv (80000 rows)
