In [60]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

## Flow diagrams
Ultimately trying to generate the *sankey* diagrams in d3.js, like [this one](https://observablehq.com/@d3/sankey-diagram). The API reference, which informs the data structures, is [found here](https://github.com/d3/d3-sankey) on GitHub.  Python implementations can be [found here](https://plotly.com/python/sankey-diagram/), with a [tutorial here](https://coderzcolumn.com/tutorials/data-science/how-to-plot-sankey-diagram-in-python-jupyter-notebook-holoviews-and-plotly).

### Production
A data frame that shows production of polymer (e.g., LDPE) by operator in 2019, with additional variables like Region, Country, and State Province.

In [11]:
# Load the "Production" sheet of the outputs workbook: the first three rows
# are skipped, only spreadsheet columns B-I and K are kept, at most 1205 data
# rows are read, and "Asset ID" is parsed as a 32-bit integer.
production_df = pd.read_excel(
    io="../data/outputs.xlsx",
    sheet_name="Production",
    usecols="B:I,K",
    skiprows=3,
    nrows=1205,
    dtype={"Asset ID": np.int32},
    engine="openpyxl",
)

# Persist the parsed frame as a pickle alongside the other saved dataframes.
production_df.to_pickle(path="../dataframes/production.pkl")

Create a list of producers ranked by 2019 production (largest first), just for reference in this demo notebook; the top ten are taken from the front of this list later.

In [13]:
# Rank operators by their total 2019 production, largest first.
#
# Only the "2019 Production" column is aggregated: summing every column of the
# frame would also "sum" the text columns (string concatenation, or a
# TypeError on recent pandas) and the asset identifiers, none of which is
# needed to build the ranking.
top_producers = (
    production_df
    .groupby("Operator")["2019 Production"]
    .sum()
    # A stable sort keeps operators with equal totals in the alphabetical
    # order produced by groupby, so the ranking is deterministic.
    .sort_values(ascending=False, kind="stable")
    .index
    .to_list()
)

### Resin Trade

In [14]:
# Load the resin-trade matrix from the sheet named "Sheet 1": the first three
# rows are skipped, spreadsheet columns C-GE are kept, at most 1205 data rows
# are read, and "Asset ID" is parsed as a 32-bit integer.
resin_trade_df = pd.read_excel(
    io="../data/outputs.xlsx",
    sheet_name="Sheet 1",
    usecols="C:GE",
    skiprows=3,
    nrows=1205,
    dtype={"Asset ID": np.int32},
    engine="openpyxl",
)

Create a dataframe in long-form indicating the trade from owners to countries, prepared for input into Holoviews (i.e., three columns with the source, destination, and value).  Note that each row is a separate *asset* so that there will be duplicate entries at the owner-country-polymer link level, which will have to be aggregated.

In [16]:
# Reshape the wide trade matrix (one row per asset, one column per destination
# country) into long form: one row per (asset, country) pair with a positive
# trade value.
#
# This is done in a single vectorised ``melt`` rather than by building one
# small frame per asset inside an ``iterrows`` loop.  The loop form assigned
# columns onto a filtered slice (SettingWithCopyWarning), left ``tradeval`` as
# an object column, and failed in ``pd.concat`` when the sheet had no rows.
trade_matrix_start = 7  # the trade matrix starts on the column with index 7

trade_links = (
    resin_trade_df
    .melt(
        id_vars=["Owner Name", "Polymer"],
        value_vars=resin_trade_df.columns[trade_matrix_start:].tolist(),
        var_name="country",
        value_name="tradeval",
        ignore_index=False,  # keep each asset's row label so row order can be restored
    )
    .rename(columns={"Owner Name": "owner", "Polymer": "polymer"})
    # Missing values compare as False here, so they are dropped as well.
    .loc[lambda links: links["tradeval"] > 0]
    # melt emits rows column-by-column; a stable sort on the original row
    # label restores asset-by-asset order with countries in sheet order.
    .sort_index(kind="stable")
    .reset_index(drop=True)
    .loc[:, ["owner", "country", "tradeval", "polymer"]]
)

# Each row is still a separate asset, so owner-country-polymer combinations
# can repeat; aggregation is left to the consumer of this frame.
trade_links.to_pickle("../dataframes/resin_trade_links.pkl")

### Conversion
Just plot in-scope volumes, not rigid plastics.  There are multiple matrices within this single sheet.  The other two (out-of-scope) matrices are the rigid plastics and flexible plastics that are used 

In [18]:
# Load the "Conversion" sheet: the first two rows are skipped, spreadsheet
# columns B-GB are kept, at most 1205 data rows are read, and "Asset ID" is
# parsed as a 32-bit integer.
conversion_df = pd.read_excel(
    io="../data/outputs.xlsx",
    sheet_name="Conversion",
    usecols="B:GB",
    skiprows=2,
    nrows=1205,
    dtype={"Asset ID": np.int32},
    engine="openpyxl",
)

In [20]:
# Reshape the wide conversion matrix (one row per asset, one column per
# destination country) into long form: one row per (asset, destination) pair
# with a positive value, carrying the asset's owner, polymer and source
# country.
#
# This is done in a single vectorised ``melt`` rather than by building one
# small frame per asset inside an ``iterrows`` loop.  The loop form assigned
# columns onto a filtered slice (SettingWithCopyWarning), left ``tradeval`` as
# an object column, and failed in ``pd.concat`` when the sheet had no rows.
conversion_matrix_start = 5  # the trade matrix starts on the column with index 5

conversion_links = (
    conversion_df
    .melt(
        id_vars=["Owner Name", "Country", "Polymer"],
        value_vars=conversion_df.columns[conversion_matrix_start:].tolist(),
        var_name="country",
        value_name="tradeval",
        ignore_index=False,  # keep each asset's row label so row order can be restored
    )
    .rename(
        columns={
            "Owner Name": "owner",
            "Country": "source_country",
            "Polymer": "polymer",
        }
    )
    # Missing values compare as False here, so they are dropped as well.
    .loc[lambda links: links["tradeval"] > 0]
    # melt emits rows column-by-column; a stable sort on the original row
    # label restores asset-by-asset order with countries in sheet order.
    .sort_index(kind="stable")
    .reset_index(drop=True)
    .loc[:, ["source_country", "country", "tradeval", "owner", "polymer"]]
)

conversion_links.to_pickle("../dataframes/conversion_links.pkl")

### Visualizations
Demonstrate the most basic flow diagram.

In [61]:
# Parameters of the demo diagram.
polymer = "LDPE"     # polymer whose flows are drawn
n_producer = 10      # how many of the largest producers to include
min_tradeval = 30    # aggregated links at or below this value are dropped

# Keep the links owned by the n_producer largest producers for the selected
# polymer.  The filter uses the `polymer` parameter (not a literal) so that
# changing the setting above actually changes the diagram.
is_selected = (
    conversion_links.owner.isin(top_producers[0:n_producer])
    & (conversion_links.polymer == polymer)
)
viz_links = conversion_links[is_selected]

# NOTE(review): tradeval may arrive as an object column depending on how
# conversion_links was built; coerce it so the aggregation is numeric.
viz_links = viz_links.assign(tradeval=pd.to_numeric(viz_links.tradeval))

# Aggregate the per-asset rows to one link per source/destination pair.  Only
# tradeval is summed; the text `owner` column has no meaningful sum.
viz_links = (
    viz_links
    .groupby(["source_country", "polymer", "country"], as_index=False)["tradeval"]
    .sum()
)
viz_links = viz_links[viz_links.tradeval > min_tradeval].reset_index(drop=True)

# Sankey links refer to nodes by integer position, so give every country that
# appears as a source or a destination an index in alphabetical order.
# NOTE(review): a country that is both source and destination maps to a single
# node, so links can loop back to their origin - confirm this is intended.
countries = set(viz_links.source_country).union(set(viz_links.country))
nodes = sorted(countries)
node_dict = {name: position for position, name in enumerate(nodes)}
viz_links["source"] = viz_links.source_country.map(node_dict)
viz_links["target"] = viz_links.country.map(node_dict)
viz_links["value"] = viz_links.tradeval
labels = list(node_dict.keys())

# Pass the country names as node labels; without them the nodes are drawn
# unlabelled and the diagram cannot be read.
node = dict(label=labels)
link = dict(source=viz_links.source, target=viz_links.target, value=viz_links.value)
data = go.Sankey(node=node, link=link)

fig = go.Figure(data)
fig.update_layout(title_text=f"{polymer} conversion flows, top {n_producer} producers")
fig.show()