In [None]:
import json
import pandas as pd
import numpy as np
import plotly.graph_objects as go

In [None]:
def filter_df(df, num_sources=None, num_targets=None):
    """
    Trim a source-by-target matrix to its largest columns and rows.

    Targets (columns) are ranked by column sum and sources (rows) by row sum,
    both in descending order. Targets are trimmed first, so the source ranking
    is computed on the already-trimmed columns. A count that is None (or
    otherwise falsy, e.g. 0) leaves that axis unfiltered.
    """
    if num_targets:
        column_totals = df.sum().sort_values(ascending=False)
        kept_columns = column_totals[:num_targets].index
        df = df[kept_columns]

    if num_sources:
        row_totals = df.sum(axis=1).sort_values(ascending=False)
        kept_rows = row_totals[:num_sources].index
        df = df.loc[kept_rows]

    return df


def create_sankey_df(df, min_val=0):
    """
    Convert a source-by-target matrix into a long-form edge list for a Sankey chart.

    Every cell whose value is strictly greater than `min_val` becomes one row
    of the result; cells at or below the threshold are dropped (NaN never
    compares greater, so NaN cells are dropped too). Rows are emitted in matrix
    order: source by source, and within each source in column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Matrix with sources on the index and targets on the columns.
    min_val : number, default 0
        Exclusive lower bound a cell must exceed to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per retained cell, with a fresh 0..n-1 index:

        | source | target | value |
        |   A    |   i    |   3   |
        |   A    |   j    |   2   |
        |   B    |   i    |   1   |
        |   B    |   k    |   4   |
    """
    sources = []
    targets = []
    values = []
    # iterrows() yields one row at a time even when index labels repeat;
    # a label lookup with df.loc would return a DataFrame in that case.
    for source_name, row in df.iterrows():
        keep = (row > min_val).to_numpy()
        kept_targets = row.index[keep]
        sources.extend([source_name] * len(kept_targets))
        targets.extend(kept_targets)
        values.extend(row.to_numpy()[keep])

    return pd.DataFrame({
        'source': sources,
        'target': targets,
        'value': values
    })


def create_label_dict(node_df, start_idx=0):
    """
    Build a label -> index lookup for one section of the flow visualization.

    The labels are the distinct values found in the `source` and `target`
    columns of `node_df` (one section = two node groups and the edges between
    them). They are numbered in sorted order, beginning at `start_idx`.
    """
    unique_labels = set(node_df.source) | set(node_df.target)
    return {
        label: position + start_idx
        for position, label in enumerate(sorted(unique_labels))
    }


def create_final_list(node_df, node_label_dict):
    """
    Translate one section's edges from labels to indices.

    Returns a pair: a list of `{"source", "target", "value"}` dictionaries in
    which source and target have been replaced by their entries in
    `node_label_dict`, and the list of labels (the dictionary's keys, in the
    dictionary's own order).
    """
    def to_index(label_column):
        # Labels missing from the lookup come back as NaN from Series.map.
        return label_column.map(node_label_dict)

    indexed_edges = pd.DataFrame({
        "source": to_index(node_df.source),
        "target": to_index(node_df.target),
        "value": node_df.value,
    })
    records = indexed_edges.to_dict("records")
    return records, list(node_label_dict)

In [None]:
investor = pd.read_excel(
    "../data/Equity investor SUP matrix.xlsx",
    engine="openpyxl",
    skiprows=3,
    usecols="B, E:GG",
)

# The first column read holds the investor names; give it a stable name.
investor = investor.rename(columns={investor.columns[0]: "Ultimate Investor"})

# drop last row because it is a table summary
investor = investor.iloc[:-1]
investor = investor.set_index('Ultimate Investor')

# Replace the Kingdom of Saudi Arabia holding in Saudi Arabian Oil Co with the
# second-largest per-company maximum (presumably so this single outlier does
# not dominate the charts -- confirm with the data owner).
# The maxima Series is labelled by company name, so the second entry has to be
# taken by position with .iloc; plain [1] relies on a deprecated fallback.
second_largest = investor.max(axis=0).sort_values(ascending=False).iloc[1]
investor.loc['KINGDOM OF SAUDI ARABIA', 'Saudi Arabian Oil Co'] = second_largest

In [None]:
financer_raw = pd.read_excel(
    "../data/Financing SUP matrix.xlsx",
    engine="openpyxl",
    skiprows=4,
    usecols="A:AV",
)

# The final row of the sheet is null, so slice it off before indexing by bank.
financer = financer_raw[:-1].set_index('Bank')

In [None]:
# Read the "Conversion" sheet, discard rows with any missing cell, and total
# the remaining rows per producer.
producer = (
    pd.read_excel(
        "../data/MFA matrix.xlsx",
        sheet_name="Conversion",
        engine="openpyxl",
        skiprows=1,
        usecols="C:FY",
    )
    .dropna()
    .groupby('Producer')
    .sum()
)

In [None]:
# Read the "Waste" sheet (column B plus D:FY), discard rows with any missing
# cell, and total the remaining rows per country.
waste_sheet = pd.read_excel(
    "../data/MFA matrix.xlsx",
    sheet_name="Waste",
    engine="openpyxl",
    skiprows=1,
    usecols="B, D:FY",
)
waste_complete = waste_sheet.dropna()
waste = waste_complete.groupby('Country').sum()

In [None]:
# Same "Waste" sheet as above, but read from column C onwards and totalled per
# producer instead of per country.
destination_read_options = dict(
    sheet_name="Waste",
    engine="openpyxl",
    skiprows=1,
    usecols="C:FY",
)
destination = (
    pd.read_excel("../data/MFA matrix.xlsx", **destination_read_options)
    .dropna()
    .groupby('Producer')
    .sum()
)

In [None]:
# Flatten each matrix into long-form (source, target, value) edge rows.
investor_df, financer_df, producer_df, waste_df, destination_df = map(
    create_sankey_df,
    (investor, financer, producer, waste, destination),
)

## Circle Packing Diagram - Investor

In [None]:
import circlify as circ

# Total value per investor, smallest first. Only the numeric `value` column is
# aggregated, and the grouping is computed once and reused for the count.
investor_totals = investor_df.groupby('source')['value'].sum().sort_values(ascending=True)
num_circles = len(investor_totals)
circles = circ.circlify(list(investor_totals), show_enclosure=False)

In [None]:
# Number of investors to label; 0 draws every circle without a label.
num_labels = 0

# Rank investors largest first so that `top_labels` really holds the biggest
# ones. The labelled entries go at the end of the list, reversed, which is the
# same convention the "Countries of impact" cell uses.
# NOTE(review): this assumes circ.circlify returns circles in ascending size
# order -- confirm against the circlify documentation.
investor_ranking = investor_df.groupby('source')['value'].sum().sort_values(ascending=False)
top_labels = list(investor_ranking.index[:num_labels])
labels = [''] * (num_circles - num_labels)
labels += top_labels[::-1]
circ.bubbles(circles, labels)

In [None]:
# Countries of impact
# `df` is only assigned in the regional treemap section further down (as a copy
# of destination_df), so read destination_df directly here; otherwise a fresh
# top-to-bottom run stops on a NameError.
country_sum = (
    destination_df.groupby('target')['value'].sum()
    .sort_values(ascending=False)
    .reset_index()
)
num_circles = len(country_sum)
country_sum['percentage'] = country_sum['value'] / country_sum['value'].sum() * 100
circles = circ.circlify(list(country_sum['percentage']))

# Label only the largest countries. Blank labels come first and the top names
# are appended in reverse.
# NOTE(review): this assumes circ.circlify returns circles in ascending size
# order -- confirm against the circlify documentation.
num_labels = 10
labels = [''] * (num_circles - num_labels)
top_labels = list(country_sum['target'].head(num_labels))[::-1]
labels += top_labels
circ.bubbles(circles, labels=labels)

## Treemap Viz

In [None]:
# Work on a copy of the destination edges and make every target label unique
# by appending "_<row position>" to it.
# NOTE(review): despite the variable name, the data copied here is
# destination_df, not investor_df.
investor_df_mod = destination_df.copy()
investor_df_mod['target'] = [
    target + '_' + str(position)
    for position, target in enumerate(investor_df_mod['target'])
]

In [None]:
# Node labels: the empty-string root, each distinct source, then every target.
unique_investors = investor_df_mod['source'].unique().tolist()
labels = ['', *unique_investors, *investor_df_mod['target']]

# Parent of each node, in the same order: the root and the sources hang off '',
# and each target hangs off its own source.
investor_parents = [''] * len(unique_investors)
parents = ['', *investor_parents, *investor_df_mod['source']]

# Node values, in the same order: grand total, per-source totals, edge values.
total = []
for investor_name in unique_investors:
    rows_for_investor = investor_df_mod[investor_df_mod['source'] == investor_name]
    total.append(rows_for_investor.sum()['value'])
values = [investor_df_mod['value'].sum(), *total, *investor_df_mod['value']]

In [None]:
import plotly.graph_objects as go

# Treemap built from the labels / parents / values lists prepared above.
treemap_trace = go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
    branchvalues="total",
)
fig = go.Figure(data=treemap_trace)

# NOTE(review): the title mentions financers, but the lists above are built
# from destination_df and the chart is saved as destination_treemap.html --
# confirm which title is intended.
layout_options = dict(
    title_text="Financers of Plastics Producers",
    font_size=12,
    autosize=False,
    width=1500,
    height=1500,
    # One identical colour per node gives the whole chart a single flat colour.
    treemapcolorway=['#c9c3bc'] * len(values),
)
fig.update_layout(**layout_options)
fig.write_html('../docs/destination_treemap.html', include_plotlyjs='cdn')
fig.show()

## Make a regional treemap

In [None]:
# Load the list of country records and index it as country name -> region.
with open('../data/region_map.json', 'r') as region_file:
    region_dict = json.load(region_file)

country_to_region = dict(
    (entry['name'], entry['region']) for entry in region_dict
)

In [None]:
# Assign regions to countries
# Each target is looked up in country_to_region; a target that is missing from
# the lookup raises KeyError rather than being silently skipped.
region = [country_to_region[country] for country in destination_df['target']]
df = destination_df.assign(region=region)
df.head()

### Producers as regional subset

In [None]:
# Total value per (region, producer) pair. Only the numeric `value` column is
# aggregated. The result is ordered by region, then producer.
regional_df = df.groupby(['region', 'source'])[['value']].sum()
region_index = regional_df.index.get_level_values('region')
unique_regions = list(region_index.unique())
values_list = list(regional_df['value'])

# One entry per (region, producer) row, in the same order as values_list.
# The names are read straight from the index: a module-level `for producer in
# ...` loop would overwrite the `producer` DataFrame loaded earlier.
region_list = list(region_index)
producer_list = [
    f"{source_name} - {region_name}"
    for region_name, source_name in regional_df.index
]

In [None]:
# Node order used by all three lists: root, regions, "producer - region" leaves.
parent_group = [''] * len(unique_regions)
labels = ['', *unique_regions, *producer_list]
parents = ['', *parent_group, *region_list]

# Values in the same order: grand total, per-region totals, per-leaf values.
parent_total = df.groupby('region').sum()['value'].tolist()
values = [sum(parent_total), *parent_total, *values_list]

In [None]:
import plotly.graph_objects as go

# Colors from https://coolors.co/3c905f-f4b393-fc60a8-8acdea-2d728f
color_array = ["#3c905f","#f4b393","#fc60a8","#8acdea","#2d728f"]

# Region -> producer treemap from the lists prepared in the previous cell.
region_producer_trace = go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
    branchvalues="total",
)
fig = go.Figure(data=region_producer_trace)

# Tall, narrow layout (400 x 1500); text below the minimum size is hidden.
region_producer_layout = dict(
    title_text="Volume of Plastics by Region and Producer",
    font_size=12,
    autosize=False,
    width=400,
    height=1500,
    uniformtext=dict(minsize=8, mode='hide'),
    treemapcolorway=color_array,
)
fig.update_layout(**region_producer_layout)
fig.write_html('../docs/region_treemap-tall.html', include_plotlyjs='cdn')
fig.show()

### Countries as regional subset

In [None]:
# Total per (region, country) pair; the grouped index is ordered by region,
# then country, and every list below follows that row order.
regional_df = df.groupby(['region', 'target']).sum()
region_level = regional_df.index.get_level_values('region')
unique_regions = list(region_level.unique())
country_list = list(regional_df.index.get_level_values(1))
values_list = list(regional_df['value'])
# Parent region of each country row: the region level repeated once per row.
region_list = list(region_level)

In [None]:
# Node order used by all three lists: root, regions, countries.
parent_group = [''] * len(unique_regions)
labels = ['', *unique_regions, *country_list]
parents = ['', *parent_group, *region_list]

# Values in the same order: grand total, per-region totals, per-country totals.
parent_total = df.groupby('region').sum()['value'].tolist()
values = [sum(parent_total), *parent_total, *values_list]

In [None]:
import plotly.graph_objects as go

# Colors from https://coolors.co/3c905f-f4b393-fc60a8-8acdea-2d728f
color_array = ["#3c905f","#f4b393","#fc60a8","#8acdea","#2d728f"]

# Region -> country treemap from the lists prepared in the previous cell.
region_country_trace = go.Treemap(
    labels=labels,
    parents=parents,
    values=values,
    branchvalues="total",
)
fig = go.Figure(data=region_country_trace)

# Square layout (750 x 750); text below the minimum size is hidden.
region_country_layout = dict(
    title_text="Volume of Plastics by Region and Country",
    font_size=12,
    autosize=False,
    width=750,
    height=750,
    uniformtext=dict(minsize=8, mode='hide'),
    treemapcolorway=color_array,
)
fig.update_layout(**region_country_layout)
fig.write_html('../docs/region_country_treemap.html', include_plotlyjs='cdn')
fig.show()

### Region, Country, Producer Treemap

In [None]:
# Region and country levels of the three-level treemap: totals per
# (region, country) pair, ordered by region and then country.
regional_df = df.groupby(['region', 'target']).sum()
grouped_index = regional_df.index
unique_regions = list(grouped_index.get_level_values('region').unique())
country_list = list(grouped_index.get_level_values(1))
values_list = list(regional_df['value'])
# Parent region of each country row, in the same row order as country_list.
region_list = list(grouped_index.get_level_values('region'))

In [None]:
# Leaf level: one node per edge row of df. The row position is appended to the
# producer name so that every leaf label is distinct.
producer_list = [
    '{}_{}'.format(source, position)
    for position, source in enumerate(df['source'])
]
# Each leaf's parent is the country it points at; its size is the edge value.
producer_country_list = df['target'].tolist()
producer_values = df['value'].tolist()

In [None]:
# Node order used by all three lists: root, regions, countries, producer leaves.
parent_group = [''] * len(unique_regions)
labels = ['', *unique_regions, *country_list, *producer_list]
parents = ['', *parent_group, *region_list, *producer_country_list]

# Values in the same order: grand total, region totals, country totals, leaves.
parent_total = df.groupby('region').sum()['value'].tolist()
values = [sum(parent_total), *parent_total, *values_list, *producer_values]

In [None]:
import plotly.graph_objects as go

# Colors from https://coolors.co/3c905f-f4b393-fc60a8-8acdea-2d728f
color_array = ["#3c905f","#f4b393","#fc60a8","#8acdea","#2d728f"]

# Three-level treemap (region -> country -> producer) from the lists prepared
# in the previous cell.
fig = go.Figure(go.Treemap(
    branchvalues = "total",
    labels = labels,
    values = values,
    parents = parents,
))

fig.update_layout(
    # This chart has a producer level as well, so the title names all three
    # levels, in line with the section heading and the output file name.
    title_text="Volume of Plastics by Region, Country and Producer",
    font_size=12,
    autosize=False,
    width=400,
    height=1250,
    uniformtext=dict(minsize=8, mode='hide'),
    treemapcolorway = color_array
)
fig.write_html('../docs/region_country_producer_treemap-tall.html', include_plotlyjs='cdn')
# The figure is only written to disk; inline display is left switched off.
#fig.show()

# Bar Plot

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Share of the overall total for the 11 largest countries. The percentage is
# taken against the total over all countries, not just the 11 shown.
totals_by_country = df.groupby('target').sum()
country_sum = totals_by_country.sort_values(by='value', ascending=False)[:11].reset_index()
country_sum['percentage'] = country_sum['value'] /  totals_by_country['value'].sum() * 100

plt.figure(figsize=(8,5), dpi=150)
sns.barplot(x='target', y='percentage', data=country_sum)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Percentage of Global Plastics')
plt.xlabel('')
plt.show()

# Tabular Facts

In [None]:
# Per-country totals are computed once; the grand total is loop-invariant and
# is hoisted out of the print loop instead of being regrouped on every row.
country_totals = df.groupby('target')['value'].sum()
grand_total = country_totals.sum()
country_ranking = country_totals.sort_values(ascending=False)

country_sum = country_ranking.head(20).reset_index()
print("Top 20 Countries by Plastic Waste")
for country_name, country_value in zip(country_sum['target'], country_sum['value']):
    print(f"{country_name}, {country_value / grand_total:.1%}")
# Everything ranked below the top 20 is reported as a single "Other" share.
print(f"Other, {country_ranking.iloc[20:].sum() / grand_total:.1%}")

In [None]:
# Same ranking as the percentage table, printed as absolute volumes. The
# grouping is computed once and reused for both the top 20 and the remainder.
country_ranking = df.groupby('target')['value'].sum().sort_values(ascending=False)

country_sum = country_ranking.head(20).reset_index()
print("Top 20 Countries by Plastic Waste")
for country_name, country_value in zip(country_sum['target'], country_sum['value']):
    print(f"{country_name}, {country_value:.0f}")
# Everything ranked below the top 20 is reported as a single "Other" volume.
print(f"Other, {country_ranking.iloc[20:].sum():.0f}")

In [None]:
# The five countries with the largest total value.
top_countries = (
    df.groupby('target')['value'].sum()
    .sort_values(ascending=False)
    .head(5)
    .reset_index()['target']
)
print("Top producers supplying plastics to the top 5 countries\n")
# Loop names carry a _name/_value suffix so they do not overwrite the
# `producer` DataFrame (or other notebook-level names) when this cell runs.
for country_name in top_countries:
    print(country_name)
    # Value per producer for this country, largest first.
    producer_totals = (
        df[df['target'] == country_name]
        .groupby('source')['value'].sum()
        .sort_values(ascending=False)
    )
    total_producer_value = producer_totals.sum()
    top_producers = producer_totals.head(5)
    for producer_name, producer_value in top_producers.items():
        print(f"\t{producer_name}, {producer_value / total_producer_value:.1%}")
    # "Other" is the share held by every producer outside the top five.
    other_value = producer_totals.iloc[5:].sum()
    print(f"\tOther, {other_value / total_producer_value:.1%}\n")