In [2]:
import pandas as pd
import numpy as np
import holoviews as hv
hv.extension('bokeh')

## Flow diagrams
Ulimately trying to generate the *sankey* diagrams in d3.js, like [this one](https://observablehq.com/@d3/sankey-diagram). The API reference, which informs the data structures, is [found here](https://github.com/d3/d3-sankey) on Github.  Python implementations can be [found here](https://plotly.com/python/sankey-diagram/), with a [tutorial here](https://coderzcolumn.com/tutorials/data-science/how-to-plot-sankey-diagram-in-python-jupyter-notebook-holoviews-and-plotly).

### Production
A data frame that shows production of polymer (e.g., LDPE) by operator in 2019, with additional variables like Region, Country, and State Province.

In [3]:
production_df = pd.read_excel(
    "../data/outputs.xlsx",
    engine='openpyxl',
    sheet_name="Production",
    skiprows=3,
    usecols="B:I,K",
    nrows=1205,
    dtype={
        "Asset ID": np.int32
    }
)

production_df.to_pickle("../dataframes/production.pkl")

Create a list of the top ten producers, just for reference in this demo notebook

In [4]:
top_producers = production_df.groupby(["Operator"]).sum().reset_index()
top_producers = top_producers[["Operator", "2019 Production"]].sort_values(by=["2019 Production"], ascending=False)
top_producers = top_producers["Operator"].to_list()
top_producers

['ExxonMobil Chemical Company',
 'Braskem S.A.',
 'Reliance Industries Ltd',
 'Equistar Chemicals LP',
 'Abu Dhabi Polymers Company Ltd. (Borouge)',
 'LyondellBasell',
 'Dow Chemical Company',
 'Indorama',
 'Chevron Phillips Chemical Company LLC',
 'INEOS Olefins & Polymers Europe',
 'Borealis AG',
 'Lotte Chemical Corporation',
 'SABIC Europe',
 'Nova Chemicals Corporation',
 'Ineos Olefins & Polymers USA',
 'Formosa Plastics',
 'Total Petrochemicals',
 'Formosa Plastics Corporation USA',
 'Jiangsu Sanfangxiang',
 'PTT Global Chemical',
 'CNOOC and Shell Petrochemicals Company Limited (CSPC)',
 'PetroChina Dushanzi Petrochemical Company',
 'PetroChina Daqing Petrochemical Company',
 'Hanwha Total Petrochemicals Co.',
 'Rabigh Refining & Petrochemical Co.',
 'Far Eastern New Century',
 'ONGC Petro additions Ltd',
 'Saudi Polymers Company LLC (SPCo)',
 'Total Petrochemicals & Refining USA',
 'Saudi Yanbu Petrochemical Co - (Yanpet)',
 'Eastern Petrochemical Company - (Sharq)',
 'LG Chem

### Resin Trade

In [5]:
resin_trade_df = pd.read_excel(
    "../data/outputs.xlsx",
    engine='openpyxl',
    sheet_name="Resin Trade",
    skiprows=3,
    usecols="C:GE",
    nrows=1205,
    dtype={
        "Asset ID": np.int32
    }
)

Create a dataframe in long-form indicating the trade from owners to countries, prepared for input into Holoviews (i.e., three columns with the source, destination, and value).  Note that each row is a separate *asset* so that there will be duplicate entries at the owner-country-polymer link level, which will have to be aggregated.

In [6]:
res = []

for idx, record in resin_trade_df.iterrows():
    
    owner = record["Owner Name"]
    polymer = record["Polymer"]
    recs = record[7:].reset_index() # the trade matrix starts on the column with index 7

    recs.columns = ["country", "tradeval"]
    recs = recs[recs.tradeval > 0]
    recs["owner"] = owner
    recs["polymer"] = polymer
    recs = recs[["owner", "country", "tradeval", "polymer"]]

    res.append(recs)

trade_links = pd.concat(res, ignore_index=True)

### Conversion
Just plot in-scope volumes, not rigid plastics.  There are multiple matrices within this single sheet.  The other two (out-of-scope) matrices are the rigid plastics and flexible plastics that are used 

In [7]:
conversion_df = pd.read_excel(
    "../data/outputs.xlsx",
    engine='openpyxl',
    sheet_name="Conversion",
    skiprows=2,
    usecols="B:GB",
    nrows=1205,
    dtype={
        "Asset ID": np.int32
    }
)

In [8]:
res = []

for idx, record in conversion_df.iterrows():
    
    owner = record["Owner Name"]
    source_country = record["Country"]
    polymer = record["Polymer"]
    recs = record[5:].reset_index() # the trade matrix starts on the column with index 5

    recs.columns = ["country", "tradeval"]
    recs = recs[recs.tradeval > 0]
    recs["owner"] = owner
    recs["polymer"] = polymer
    recs["source_country"] = source_country
    recs = recs[["source_country", "country", "tradeval", "owner", "polymer"]]

    res.append(recs)

conversion_links = pd.concat(res, ignore_index=True)

### Visualizations
Demonstrate the most basic flow diagram.

In [9]:
viz_links = trade_links[trade_links.owner.isin(top_producers)]
viz_links = viz_links[viz_links.polymer == "LDPE"].groupby(["owner", "polymer", "country"]).sum()
viz_links = viz_links[viz_links.tradeval > 30].reset_index()
viz_links = viz_links[["owner", "country", "tradeval"]]

sankey = hv.Sankey(viz_links)

sankey.opts(
    cmap='RdYlBu',
    label_position='right',
    edge_line_width=0,
    node_alpha=1.0, 
    node_width=40, 
    node_sort=True,
    width=900, 
    height=1000,
    title="Resin Trade (LDPE)"
)

In [10]:
viz_links = conversion_links[conversion_links.owner.isin(top_producers)]
viz_links = viz_links[viz_links.polymer == "LDPE"].groupby(["source_country", "polymer", "country"]).sum()
viz_links = viz_links[viz_links.tradeval > 30].reset_index()
viz_links = viz_links[["source_country", "country", "tradeval"]]
viz_links["country"] = viz_links["country"].map(lambda x: '%s.' %x) # deal with acyclic graphs for holoviews
viz_links = viz_links[viz_links.source_country != viz_links.country] # TODO: check with Minderoo on domestic trade

sankey = hv.Sankey(viz_links)

sankey.opts(
    cmap='RdYlBu',
    node_fill_color="source_country",
    edge_fill_color="source_country",
    label_position='right',
    edge_line_width=0,
    node_alpha=1.0, 
    node_width=40, 
    node_sort=True,
    width=900, 
    height=1000,
    title="Converted plastic trade (LDPE)"
)