# PPI Flow Viz - Node 2-3

### Source: Producer
Value = total volume of polymers bound for single-use plastic applications, by producer
MFA matrix, ‘Conversion’ tab: sum of all identical ‘Producer’ entries (Column C) x ‘checksum’ (Column FZ)


### Edge
Value = volume of polymers bound for single-use plastic applications, by producer and country of production
MFA matrix, ‘Conversion’ tab: unique ‘Country’ (Column B)/‘Producer’ (Column C) combinations x ‘checksum’ (Column FZ)


### Sink: Country of Production

Value = volume of polymers bound for single-use plastic applications, by country of production
MFA matrix, ‘Conversion’ tab: sum of all identical ‘Country’ entries (Column B) x ‘checksum’ (Column FZ)

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

## Load data

For the Sankey creation function, the data must list the sources as the dataframe index and the targets as the dataframe columns.

In [None]:
# Read columns C:FY of the 'Conversion' sheet (skipping the first row), drop
# every row that contains a missing value, then total the remaining columns
# per producer so that producers form the index (the Sankey sources).
producer = (
    pd.read_excel(
        "../data/MFA matrix.xlsx",
        sheet_name="Conversion",
        usecols="C:FY",
        skiprows=1,
        engine="openpyxl",
    )
    .dropna()
    .groupby('Producer')
    .sum()
)

print("Producer Dataframe")
producer.head()

## Create the source -> target structure

In [None]:
def filter_df(df, num_sources=None, num_targets=None):
    """
    Keep only the largest targets (columns) and sources (rows) of a flow table.

    Targets are ranked by their column totals and filtered first; sources are
    then ranked by their row totals over the columns that remain.

    Args:
        df: DataFrame with the sources as its index and the targets as its
            columns.
        num_sources: number of top rows to keep; ``None`` keeps every row.
        num_targets: number of top columns to keep; ``None`` keeps every
            column.

    Returns:
        The filtered DataFrame. A filtered axis is ordered by descending
        total; an unfiltered axis keeps its original order.
    """
    # Compare against None (not truthiness) so an explicit 0 filters
    # everything out instead of being silently treated as "no filter".
    if num_targets is not None:
        target_totals = df.sum().sort_values(ascending=False)
        # .iloc makes the top-n selection positional whatever the label dtype.
        df = df[target_totals.iloc[:num_targets].index]

    if num_sources is not None:
        source_totals = df.sum(axis=1).sort_values(ascending=False)
        df = df.loc[source_totals.iloc[:num_sources].index]

    return df

def create_sankey_df(df, min_val=0):
    """
    Flatten a source-by-target table into one row per (source, target) link.

    Only cells strictly greater than ``min_val`` become links. Every link row
    repeats its source's value, which is the sum of that source's retained
    cells (cells at or below the threshold do not count towards it).

    Example output:
    | source | source_value | target | target_value |
    |   A    |      5       |   i    |      3       |
    |   A    |      5       |   j    |      2       |
    |   B    |      5       |   i    |      1       |
    |   B    |      5       |   k    |      4       |
    """
    columns = {
        'source': [],
        'source_value': [],
        'target': [],
        'target_value': [],
    }

    for label in df.index:
        flows = df.loc[label]
        # Apply the threshold once per source and reuse the result.
        kept = flows[flows > min_val]
        n_links = len(kept)

        columns['source'].extend([label] * n_links)
        columns['source_value'].extend([kept.sum()] * n_links)
        columns['target'].extend(kept.index)
        columns['target_value'].extend(kept.values)

    return pd.DataFrame(columns)

def create_sankey_dict(sankey_df):
    """
    Convert the human-readable link table into the index-based form Plotly needs.

    Each distinct source and target label is assigned a numerical index.
    Source labels are sorted and numbered from 0; target labels are sorted and
    numbered starting right after the last source index, so the concatenation
    ``source_labels + target_labels`` is addressed directly by these indices.

    Args:
        sankey_df: DataFrame with ``source``, ``source_value``, ``target`` and
            ``target_value`` columns (one row per link). May be empty.

    Returns:
        dict with keys ``source_labels``, ``source``, ``source_values``,
        ``target_labels``, ``target`` and ``target_values``; every value is a
        list, and the per-link lists follow the row order of ``sankey_df``.
    """
    source_nodes = sorted(sankey_df.source.unique())
    target_nodes = sorted(sankey_df.target.unique())

    source_node_dict = {label: pos for pos, label in enumerate(source_nodes)}

    # Source indices are 0..len-1, so the first free index is len(source_nodes).
    # Unlike max(...) + 1, this also works when there are no links at all.
    offset = len(source_nodes)
    target_node_dict = {
        label: offset + pos for pos, label in enumerate(target_nodes)
    }

    # .tolist() yields native Python scalars rather than numpy scalars.
    return {
        'source_labels': source_nodes,
        'source': sankey_df.source.map(source_node_dict).tolist(),
        'source_values': sankey_df.source_value.tolist(),
        'target_labels': target_nodes,
        'target': sankey_df.target.map(target_node_dict).tolist(),
        'target_values': sankey_df.target_value.tolist(),
    }

def plot_sankey(sankey_dict, title):
    """
    Build a Plotly Sankey figure from an index-based link dictionary.

    Node labels are the source labels followed by the target labels, and the
    link values are taken from ``sankey_dict['target_values']``. The figure is
    returned without being shown.
    """
    all_labels = sankey_dict['source_labels'] + sankey_dict['target_labels']

    node_spec = {
        'pad': 15,
        'thickness': 20,
        'line': {'color': "black", 'width': 0.5},
        'label': all_labels,
        'color': "black",
    }
    link_spec = {
        'source': sankey_dict['source'],
        'target': sankey_dict['target'],
        'value': sankey_dict['target_values'],
    }

    sankey_trace = go.Sankey(node=node_spec, link=link_spec)
    fig = go.Figure(data=[sankey_trace])
    fig.update_layout(title_text=title, font_size=10)
    return fig

## Create Producer -> Country Sankey

In [None]:
# Restrict the table to the 15 largest target columns, then to the 10 largest
# producers over those columns.
filtered_df = filter_df(
    producer,
    num_targets=15,
    num_sources=10,
)

In [None]:
# Build the link table, keeping only flows strictly greater than 100.
sankey_df = create_sankey_df(df=filtered_df, min_val=100)
sankey_df.head()

In [None]:
# Translate source/target labels into the numeric node indices Plotly expects.
sankey_dict = create_sankey_dict(sankey_df=sankey_df)

In [None]:
# Render the producer -> country Sankey diagram.
sankey_plot = plot_sankey(
    sankey_dict=sankey_dict,
    title='Producer by Country of Production',
)
sankey_plot.show()