In [None]:
#IMPORTS 
import pandas as pd
from floweaver import *
from mip import *
from functools import cmp_to_key
from attr import evolve
import statistics
import time
from ipysankeywidget import SankeyWidget
from ipywidgets import Layout, Output


In [None]:
# Load with Dataset.from_csv
dataset = Dataset.from_csv("test_flows.csv", "test_processes.csv")

In [None]:
# Creating the partitions for the processes
partition_farms = Partition.Simple('process', ['farm1', 'farm2', 'farm3'])
partition_eat = Partition.Simple('process', ['domestic', 'industry'])
partition_dest = Partition.Simple('process', ['compost', 'landfill'])
partition_fruit = Partition.Simple('material', ['apples', 'bananas', 'oranges'])

In [35]:

# Defining the nodes, ordering and bundles 
nodes = {
    'farms':    ProcessGroup('function == "farms"', partition=partition_farms, title='Farms'),
    'eat':      ProcessGroup('function == "consumers"', partition=partition_eat, title='Eat'),
    'end':      ProcessGroup('function == "destination"', partition=partition_dest, title='Destination'),
    'fruit':    Waypoint(partition_fruit, title='fruit type')
}

ordering = [
    [[], ['farms'], []],
    [[], ['fruit'], []],
    [[], ['eat'], []],
    [[], ['end'], []],
]

bundles = [
    Bundle('farms', 'eat', waypoints=['fruit']),
    Bundle('eat', 'end'),
    Bundle('farms', Elsewhere),
    Bundle('eat', Elsewhere),
    Bundle(Elsewhere,'eat'),
    Bundle(Elsewhere,'end'),
]

# Color palette
palette = {
    'oranges': 'orange',
    'bananas': 'yellow',
    'apples': 'green',
    '*': 'lightblue'
}

In [36]:
# 1. Create the SankeyDefinition
sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=dataset.partition('material'))

# 2. Generate SankeyData (no optimisation)
sankey_data = weave(sdd, dataset, palette=palette)

# 3. Plot
sankey_data.to_widget(width=700, height=450, margins=dict(left=70, right=90))

SankeyWidget(groups=[{'id': 'farms', 'type': 'process', 'title': 'Farms', 'nodes': ['farms^farm1', 'farms^farm…

In [37]:
# Plotting new dataset optimised node ordering sankey diagram


sankey_data_evolved = optimise_node_order(sankey_data, group_nodes=True)
sankey_data_evolved.to_widget(width=700, height=450, margins=dict(left=100, right=120), debugging=True)

Starting solution of the Linear programming relaxation problem using Primal Simplex

Coin0506I Presolve 36 (-24) rows, 24 (-28) columns and 108 (-56) elements
Clp1000I sum of infeasibilities 1.53309e-08 - average 4.25859e-10, 16 fixed columns
Coin0506I Presolve 20 (-16) rows, 8 (-16) columns and 44 (-64) elements
Clp0006I 0  Obj 0
Clp0029I End of values pass after 8 iterations
Clp0000I Optimal - objective value 0
Clp0000I Optimal - objective value 0
Coin0511I After Postsolve, objective 0, infeasibilities - dual 0 (0), primal 0 (0)
Clp0006I 0  Obj 0
Clp0000I Optimal - objective value 0
Clp0000I Optimal - objective value 0
Clp0000I Optimal - objective value 0
Coin0511I After Postsolve, objective 0, infeasibilities - dual 0 (0), primal 0 (0)
Clp0032I Optimal objective 0 - 0 iterations time 0.002, Presolve 0.00, Idiot 0.00

Starting MIP optimization
Cgl0004I processed model has 36 rows, 24 columns (24 integer (24 of which binary)) and 108 elements
Coin3009W Conflict graph built in 0.000 se

VBox(children=(SankeyWidget(groups=[{'id': 'farms', 'type': 'process', 'title': 'Farms', 'nodes': ['farms^farm…

In [38]:
sankey_data_evolved.to_widget(layout=optimise_node_positions(sankey_data_evolved, scale = 10,minimum_gap=10, margins=dict(left=100, right=120)), width=700, height=450)   

Starting solution of the Linear programming relaxation problem using Primal Simplex

Coin0506I Presolve 47 (-11) rows, 33 (-11) columns and 156 (-16) elements
Clp1000I sum of infeasibilities 4.91589e-05 - average 1.04593e-06, 16 fixed columns
Coin0506I Presolve 47 (0) rows, 17 (-16) columns and 120 (-36) elements
Clp0006I 0  Obj 81338.666 Primal inf 5.6475742e-06 (13) Dual inf 9.4443189e+12 (15)
Clp0029I End of values pass after 17 iterations
Clp0014I Perturbing problem by 0.001% of 27.409533 - largest nonzero change 0.00025901779 ( 0.00069123247%) - largest zero change 2.7080466e-05
Clp0000I Optimal - objective value 81338.695
Clp0000I Optimal - objective value 81338.695
Coin0511I After Postsolve, objective 81338.695, infeasibilities - dual 0 (0), primal 0 (0)
Clp0006I 0  Obj 81338.695 Dual inf 7619.4803 (2)
Clp0014I Perturbing problem by 0.001% of 17.533277 - largest nonzero change 0.00017573957 ( 0.00071648365%) - largest zero change 2.7080466e-05
Clp0000I Optimal - objective value 

SankeyWidget(groups=[{'id': 'farms', 'type': 'process', 'title': 'Farms', 'nodes': ['farms^farm1', 'farms^farm…

In [39]:
# 1. Create the SankeyDefinition

new_flows = dataset._flows.copy()
i_delete = 5
row1 = new_flows.iloc[i_delete].copy()
row1.update({'source': 'HIDDEN'})
row2 = new_flows.iloc[i_delete].copy()
row2.update({'target': 'HIDDEN'})
new_flows = pd.concat([
    new_flows.iloc[:i_delete],
    new_flows.iloc[i_delete + 1:],
    pd.DataFrame([row1, row2]),
], ignore_index=True)
#new_flows.loc[new_flows.source == 'eat1', 'source'] = 'HIDDEN'
#new_flows.loc[new_flows.target == 'eat1', 'target'] = 'HIDDEN'

new_dataset = Dataset(new_flows, dataset._dim_process, dataset._dim_material)
sdd = SankeyDefinition(nodes, bundles, ordering, flow_partition=new_dataset.partition('material'))

# 2. Generate SankeyData (no optimisation)
sankey_data = weave(sdd, new_dataset, palette=palette)

# 3. Plot
sankey_data.to_widget(width=700, height=450, margins=dict(left=70, right=90))

SankeyWidget(groups=[{'id': 'farms', 'type': 'process', 'title': 'Farms', 'nodes': ['farms^farm1', 'farms^farm…

In [None]:
def compute_crossing_score_from_dict(crossing_areas):
    crossing_count = len(crossing_areas)
    crossing_area = sum(crossing_areas.values())
    return {
        "crossing_count": crossing_count,              # ✅ renamed
        "crossing_area": crossing_area,                # ✅ renamed
        "crossing_score": crossing_count * crossing_area
    }

In [None]:
def compute_combined_vertical_score_final(flows_df, y_coords, sankey_definition):
    scores = []

    # Detect waypoint layers
    waypoint_groups = [key for key, val in sankey_definition.nodes.items() if "Waypoint" in str(type(val))]
    layers = sankey_definition.ordering.layers
    flat_ordering = [group[1][0] if group[1] else None for group in layers]

    # Identify waypoint structures (left → waypoint → right)
    waypoint_structures = {}
    for idx, node in enumerate(flat_ordering):
        if node in waypoint_groups and 0 < idx < len(flat_ordering) - 1:
            wp = node
            left = flat_ordering[idx - 1]
            right = flat_ordering[idx + 1]
            waypoint_structures[(left, right)] = wp

    # Build node-to-group mapping (e.g., farm3 → farms)
    node_to_group = {}
    for group_name, group_def in sankey_definition.nodes.items():
        if hasattr(group_def, "partition"):
            for subgroup in group_def.partition.groups:
                node_to_group[subgroup.label] = group_name

    for _, row in flows_df.iterrows():
        src = row["source"]
        tgt = row["target"]
        width = row["value"]
        material = row["material"]

        src_group = node_to_group.get(src, "")
        tgt_group = node_to_group.get(tgt, "")

        score = None
        span = None

        # Use material name as the waypoint node
        for (left, right), wp_group in waypoint_structures.items():
            if src_group == left and tgt_group == right:
                waypoint_node = material  # e.g., 'apples', 'bananas'
                if src in y_coords and tgt in y_coords and waypoint_node in y_coords:
                    dy1 = abs(y_coords[src] - y_coords[waypoint_node])
                    dy2 = abs(y_coords[waypoint_node] - y_coords[tgt])
                    score = (dy1 + dy2) / width if width > 0 else float("inf")
                    span = dy1 + dy2
                    break

        # Fallback to simple span
        if score is None and src in y_coords and tgt in y_coords:
            span = abs(y_coords[src] - y_coords[tgt])
            score = span / width if width > 0 else float("inf")

        if score is not None:
            scores.append({
                "source": src,
                "target": tgt,
                "material": material,
                "width": width,
                "span": span,
                "vertical_score": score
            })

    return pd.DataFrame(scores)

In [42]:
def normalize_scores(df, columns_to_normalize):
    """
    Normalize specified columns in a DataFrame using min-max scaling.

    Args:
        df (pd.DataFrame): DataFrame with raw scores.
        columns_to_normalize (list of str): Column names to normalize.

    Returns:
        pd.DataFrame: DataFrame with normalized columns added (suffix "_norm").
    """
    norm_df = df.copy()
    for col in columns_to_normalize:
        min_val = norm_df[col].min()
        max_val = norm_df[col].max()
        if max_val - min_val == 0:
            # Avoid divide-by-zero if all values are the same
            norm_df[f"{col}_norm"] = 0
        else:
            norm_df[f"{col}_norm"] = (norm_df[col] - min_val) / (max_val - min_val)
    return norm_df

In [None]:
def hide_flows_by_index(dataset, row_indices):
    
    # Step 1: Copy original flows table
    new_flows = dataset._flows.copy()
    rows_to_hide = new_flows.iloc[row_indices]

    # Step 2: Replace each flow with 2 hidden ones (source and target)
    hidden_rows = []
    for _, row in rows_to_hide.iterrows():
        row_source = row.copy()
        row_source["source"] = "HIDDEN"

        row_target = row.copy()
        row_target["target"] = "HIDDEN"

        hidden_rows.extend([row_source, row_target])

    # Step 3: Remove original and append hidden
    new_flows = pd.concat([
        new_flows.drop(index=row_indices),
        pd.DataFrame(hidden_rows)
    ], ignore_index=True)

    return new_flows

In [None]:
import random

def select_flows_by_width_band(flows_df, target_frac=0.25, tolerance=0.10):
    """
    Select a set of flows whose combined width falls within a target fraction ± tolerance.
    
    Parameters:
        flows_df (pd.DataFrame): The full flow DataFrame containing 'value' as width.
        target_frac (float): Desired percentage of total width to remove (e.g., 0.25 for 25%)
        tolerance (float): Percentage tolerance (e.g., 0.10 for ±10%)

    Returns:
        list: List of selected row indices to remove.
    """
    total_width = flows_df["value"].sum()
    min_width = total_width * (target_frac * (1 - tolerance))
    max_width = total_width * (target_frac * (1 + tolerance))
    
    remaining_indices = list(flows_df.index)
    random.shuffle(remaining_indices)

    selected = []
    current_sum = 0

    for idx in remaining_indices:
        flow_width = flows_df.loc[idx, "value"]
        if current_sum + flow_width > max_width:
            continue
        selected.append(idx)
        current_sum += flow_width
        if current_sum >= min_width:
            break

    return selected

# Example structure to confirm function works (we'll skip actually calling it here)
select_flows_by_width_band  # ready to integrate

<function __main__.select_flows_by_width_band(flows_df, target_frac=0.25, tolerance=0.1)>

In [None]:
from typing import Tuple

def update_results_table_with_full_metrics(
    dataset, sankey_definition, n_trials=10, removal_fraction=0.25, X1=1, X2=1
) -> Tuple[pd.DataFrame, dict, dict]:
    from floweaver import Dataset, weave
    from time import time

    results = []
    trial_details = {}

    original_width = dataset._flows["value"].sum()
    flow_df_full = dataset._flows.copy()

    # Compute original metrics
    sankey_original = weave(sankey_definition, dataset)
    sankey_orig_ordered, _, crossing_areas_orig = optimise_node_order(sankey_original, crossings__area=True)

    print("🧪 Node layers before layout:", sankey_orig_ordered.ordering.layers)

    # 🔧 FIX: Ensure minimum_gap is explicitly defined
    layout_orig = optimise_node_positions(sankey_orig_ordered, minimum_gap=20)

    y_coords_orig = {
        nid.split("^")[-1].split("*")[0]: pos[1]
        for nid, pos in layout_orig.node_positions.items()
    }

    vertical_df_orig = compute_combined_vertical_score_final(flow_df_full, y_coords_orig, sankey_definition)
    original_vertical_total = vertical_df_orig["vertical_score"].sum()

    crossing_metrics_orig = compute_crossing_score_from_dict(crossing_areas_orig)
    original_crossing_score = crossing_metrics_orig["crossing_score"]
    original_crossing_count = crossing_metrics_orig["crossing_count"]

    original_summary = {
        "original_crossings": int(original_crossing_count),
        "original_crossing_score": original_crossing_score,
        "original_vertical_score": original_vertical_total,
        "original_total_width": round(original_width, 2)
    }

    for trial in range(n_trials):
        start = time()

        selected_indices = select_flows_by_width_band(flow_df_full, removal_fraction)
        width_removed = flow_df_full.loc[selected_indices, "value"].sum()
        width_removed_pct = round(100 * width_removed / original_width, 2)

        modified_flows = hide_flows_by_index(dataset, selected_indices)
        new_dataset = Dataset(
            flows=modified_flows,
            dim_process=dataset._dim_process,
            dim_material=dataset._dim_material,
            dim_time=dataset._dim_time,
        )

        sankey_data = weave(sankey_definition, new_dataset)
        sankey_data_evolved, crossings, crossing_areas = optimise_node_order(
            sankey_data, crossings__area=True
        )
        ordered = optimise_node_order(sankey_data, group_nodes=True)

        # 🔧 FIX: Explicitly define minimum_gap
        layout = optimise_node_positions(ordered, minimum_gap=20)

        y_coords = {
            nid.split("^")[-1].split("*")[0]: pos[1]
            for nid, pos in layout.node_positions.items()
        }

        vertical_df = compute_combined_vertical_score_final(flow_df_full, y_coords, sankey_definition)
        vertical_df["flow_id"] = vertical_df.index
        vertical_total = vertical_df["vertical_score"].sum()

        crossing_metrics = compute_crossing_score_from_dict(crossing_areas)
        crossing_total = crossing_metrics["crossing_score"] // 2
        crossing_count = crossing_metrics["crossing_count"] // 2

        results.append({
            "trial": trial + 1,
            "crossing_count": crossing_count,
            "total_crossing_score": crossing_total,
            "total_vertical_score": vertical_total,
            "original_score": vertical_total + crossing_total,
            "total_width": round(original_width, 2),
            "width_removed": round(width_removed, 2),
            "width_removed_pct": width_removed_pct,
            "flow_count": len(flow_df_full),
            "flows_removed": list(selected_indices),
        })

        if trial < 3:
            trial_details[f"trial_{trial+1}"] = {
                "y_coords": y_coords,
                "vertical_table": vertical_df[[
                    "flow_id", "source", "target", "material", "width", "span", "vertical_score"
                ]],
                "crossing_metrics": crossing_metrics
            }

    results_df = pd.DataFrame(results)
    results_df["norm_vertical"] = (results_df["total_vertical_score"] - results_df["total_vertical_score"].min()) / (
        results_df["total_vertical_score"].max() - results_df["total_vertical_score"].min()
    )
    results_df["norm_crossing"] = (results_df["total_crossing_score"] - results_df["total_crossing_score"].min()) / (
        results_df["total_crossing_score"].max() - results_df["total_crossing_score"].min()
    )
    results_df["final_score"] = X1 * results_df["norm_vertical"] + X2 * results_df["norm_crossing"]

    summary = results_df.sort_values("final_score", ascending=True).reset_index(drop=True)

    return summary, trial_details, original_summary


In [46]:
summary_df, logs, original_metrics = update_results_table_with_full_metrics(
    dataset, sdd, n_trials=100, removal_fraction=0.3, X1=1, X2=1
)

# 🧾 Print original dataset metrics
print("\n📊 ORIGINAL DATASET METRICS:")
for key, val in original_metrics.items():
    print(f"{key}: {val}")

# 📊 Final Ranked Summary
print("\n📊 Final Ranked Summary:")
print(summary_df[[
    "trial", "final_score", "original_score", 
    "total_crossing_score", "total_vertical_score", 
    "crossing_count", "width_removed", "width_removed_pct", "flows_removed"
]])

# 📄 Top 3 Layouts (Row Format for Easy Copying)
print("\n📄 Top 3 Layouts Summary (Row Format):")
top3 = summary_df.head(3)
for i, row in top3.iterrows():
    print(
        f"Trial {row['trial']} | "
        f"Crossings: {row['crossing_count']} | "
        f"Area: {row['total_crossing_score']} | "
        f"Span: {row['total_vertical_score']} | "
        f"Width: {row['total_width']} | "
        f"Removed: {row['width_removed']} ({row['width_removed_pct']}%) | "
        f"Flows: {row['flows_removed']}"
    )


Starting solution of the Linear programming relaxation problem using Primal Simplex

Coin0506I Presolve 36 (-24) rows, 24 (-28) columns and 108 (-56) elements
Clp1000I sum of infeasibilities 1.53309e-08 - average 4.25859e-10, 16 fixed columns
Coin0506I Presolve 20 (-16) rows, 8 (-16) columns and 44 (-64) elements
Clp0006I 0  Obj 0
Clp0029I End of values pass after 8 iterations
Clp0000I Optimal - objective value 0
Clp0000I Optimal - objective value 0
Coin0511I After Postsolve, objective 0, infeasibilities - dual 0 (0), primal 0 (0)
Clp0006I 0  Obj 0
Clp0000I Optimal - objective value 0
Clp0000I Optimal - objective value 0
Clp0000I Optimal - objective value 0
Coin0511I After Postsolve, objective 0, infeasibilities - dual 0 (0), primal 0 (0)
Clp0032I Optimal objective 0 - 0 iterations time 0.002, Presolve 0.00, Idiot 0.00

Starting MIP optimization
Cgl0004I processed model has 36 rows, 24 columns (24 integer (24 of which binary)) and 108 elements
Coin3009W Conflict graph built in 0.000 se

In [None]:
# --- Step 1: Manually specify rows to hide ---

rows_to_hide =   [1, 0, 3, 13, 9]
# --- Step 2: Hide flows in dataset ---
def hide_flows_by_index(dataset, row_indices):
    flows = dataset._flows.copy()
    rows = flows.iloc[row_indices]

    hidden = []
    for _, row in rows.iterrows():
        r1, r2 = row.copy(), row.copy()
        r1["source"] = "HIDDEN"
        r2["target"] = "HIDDEN"
        hidden.extend([r1, r2])
    
    new_flows = pd.concat([
        flows.drop(index=row_indices),
        pd.DataFrame(hidden)
    ], ignore_index=True)
    
    print(flows)

    return Dataset(new_flows, dataset._dim_process, dataset._dim_material)

# --- Step 3: Build new dataset with hidden flows ---
sdd = SankeyDefinition(nodes, bundles, ordering,flow_partition=dataset.partition('material'))
updated_dataset = hide_flows_by_index(dataset, rows_to_hide)

# --- Step 4: Create Sankey layout ---
sankey_data = weave(sdd, updated_dataset,palette=palette)
sankey_data_manual_evolve = optimise_node_order(sankey_data, group_nodes=True)

#sankey_data_manual_evolve.to_widget(layout=optimise_node_positions(sankey_data_manual_evolve, scale = 10,minimum_gap=20, margins=dict(left=100, right=120)), width=700, height=450)   
sankey_data_manual_evolve.to_widget(layout=optimise_node_positions(sankey_data_manual_evolve, scale = 7,minimum_gap=10, margins=dict(left=50, right=220)), width=1000, height=800)   


IndexError: positional indexers are out-of-bounds