In [342]:
#import packages

import numpy as np
import pandas as pd
import random
import pickle #importing file
import itertools
import folium #mapping
import seaborn as sns #color palette
import plotly
import chart_studio.plotly as py #!pip install chart_studi#o
#import pyplot_themes as themes

In [343]:
# import data

file_root = '../data/results/'
file_stem = 'full_detailed_results' #'results_pooled_cement_only_complexcost'
file_format = '.pkl'
file_name = file_root+file_stem+file_format #'../data/results/results_pooled_cement_only_complexcost.pkl' ##results_toydata_uniform.pkl'

# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# result files that were produced by this project.
# The context manager guarantees the handle is closed even if unpickling fails.
with open(file_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

In [344]:
#convert to dataframe and parse the geolocation

df = pd.DataFrame(loaded_list, columns = ['fac name','supplier', 'pool','receiver', 'distance', 'cost','volume','cost per ton','industry','timeframe'])

#'supplier' and 'receiver' each hold a [lat, lon] pair per row (object dtype);
#split every pair into two plain columns so they can be filtered and plotted
df['supplier_lat'] = [pair[0] for pair in df['supplier']]
df['supplier_lon'] = [pair[1] for pair in df['supplier']]
df['receiver_lat'] = [pair[0] for pair in df['receiver']]
df['receiver_lon'] = [pair[1] for pair in df['receiver']]


def _pool_label(pool_id):
    """Return 'Pool ID: <id>' for a pool id, dropping a trailing '.0' from float ids.

    Only a *trailing* '.0' is removed, so an id such as 12.05 is left intact,
    and every id gets the prefix regardless of its type.
    """
    text = str(pool_id)
    if text.endswith('.0'):
        text = text[:-2]
    return 'Pool ID: ' + text


#clean names: these prefixed labels are what the sankey diagram displays
df['fac name clean'] = ['Emitter: ' + str(item) for item in df['fac name']]
df['pool name clean'] = [_pool_label(item) for item in df['pool']]

df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,cost,volume,cost per ton,industry,timeframe,supplier_lat,supplier_lon,receiver_lat,receiver_lon,fac name clean,pool name clean
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",120018.0,"[37.68627, -121.674512]",331.5021,2003167.0,8.464567,236653.231091,cement,1,34.6222,-117.1001,37.68627,-121.674512,Emitter: CEMEX Construction Materials Pacific LLC,Pool ID: 120018
1,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1100144.0,"[39.06031418, -121.9646343]",407.764495,2837339.0,12.532808,226392.959735,cement,1,34.6222,-117.1001,39.060314,-121.964634,Emitter: CEMEX Construction Materials Pacific LLC,Pool ID: 1100144
2,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120185.0,"[39.127373, -121.961772]",411.076068,2148530.0,4.527559,474544.979838,cement,1,34.6222,-117.1001,39.127373,-121.961772,Emitter: CEMEX Construction Materials Pacific LLC,Pool ID: 1120185
3,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120194.0,"[39.15124715, -121.933119]",411.269679,2844537.0,5.905512,481674.992347,cement,1,34.6222,-117.1001,39.151247,-121.933119,Emitter: CEMEX Construction Materials Pacific LLC,Pool ID: 1120194
4,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120330.0,"[39.033785, -121.91461]",404.597346,2099694.0,14.566929,144141.143006,cement,1,34.6222,-117.1001,39.033785,-121.91461,Emitter: CEMEX Construction Materials Pacific LLC,Pool ID: 1120330


In [345]:
# Named colors used for map markers. The 17 base colors are tiled 100 times
# so that indexing by supplier number does not run out on large graphs.
# 'darkred' was left out because it looked too similar to other colors.
# NOTE: 'lightred' is still part of the list below.
palette = [
    'red',
    'green',
    'purple',
    'orange',
    'darkblue',
    'gray',
    'cadetblue',
    'darkpurple',
    'white',
    'pink',
    'lightblue',
    'lightgreen',
    'black',
    'lightgray',
    'lightred',
    'blue',
    'beige',
] * 100

In [346]:
#for testing
#df = pd.concat([df.tail(10), df.head(10)])
#df

## Sankey Diagram

In [347]:
# distinct industry / timeframe values present in the results; the export loop iterates over these
industries = set(df['industry'].tolist())
timeframes = set(df['timeframe'].tolist())

In [348]:

#df = df.sort_values('volume') sorting is done inside the sankey by some complex algo

In [349]:
#functionalize
def plot_sankey(plot_df,industry,timeframe):
    """Write an emitter -> pool sankey diagram for ``plot_df`` to an HTML file.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Must contain the columns 'fac name clean', 'pool name clean' and 'volume'.
        Volumes are summed per (emitter, pool) pair to size the links.
    industry, timeframe
        Used only to build the plot title and output file name; both are
        passed through ``str()`` so numeric values are accepted.

    Side effects
    ------------
    Prints the first rows of the aggregated source/target table and the output
    file name, then calls ``plotly.offline.plot`` to write
    ``sankey_<file_stem>_<industry>_<timeframe>_year.html`` (``file_stem`` is the
    module-level name of the loaded results file).
    """
    cat_cols = ['fac name clean', 'pool name clean']
    value_col = 'volume'
    title = 'sankey_' + file_stem + '_' + str(industry) + '_' + str(timeframe) + '_year'

    # node labels: unique values of each category column, in column order,
    # de-duplicated while keeping the first occurrence
    labelList = []
    for catCol in cat_cols:
        labelList.extend(set(plot_df[catCol].values))
    labelList = list(dict.fromkeys(labelList))

    # one (source, target, sum) frame per adjacent pair of category columns,
    # stacked and aggregated so every source/target pair appears exactly once
    pair_frames = [
        plot_df[[src, dst, value_col]].rename(
            columns={src: 'source', dst: 'target', value_col: 'sum'}
        )
        for src, dst in zip(cat_cols, cat_cols[1:])
    ]
    sourceTargetDf = (
        pd.concat(pair_frames)
        .groupby(['source', 'target'])
        .agg({'sum': 'sum'})
        .reset_index()
    )

    print (sourceTargetDf.head())

    # plotly links reference nodes by their position in the label list;
    # a dict lookup avoids a linear list.index() scan per row
    label_index = {label: position for position, label in enumerate(labelList)}
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(label_index)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(label_index)

    # creating the sankey diagram (all nodes are drawn in black)
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = 'black'
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['sum']
        )
      )

    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )

    fig = dict(data=[data], layout=layout)

    sankey_filename = title + '.html'
    print(sankey_filename)
    plotly.offline.plot(fig, validate=False, filename=sankey_filename)

In [327]:
#plot_sankey(df.head(),'cement',str(1))

Unnamed: 0,fac name,supplier,pool,receiver,distance,cost,volume,cost per ton,industry,timeframe,supplier_lat,supplier_lon,receiver_lat,receiver_lon,fac name clean,pool name clean
86677,TORRANCE REFINING COMPANY LLC,"[33.854967, -118.336907]",28320019.0,"[34.34238019, -119.5193214]",75.569208,2273484.0,845384.09,2.689292,oil_refining,10,33.854967,-118.336907,34.34238,-119.519321,Emitter: TORRANCE REFINING COMPANY LLC,Pool ID: 28320019
86683,TORRANCE REFINING COMPANY LLC,"[33.854967, -118.336907]",28320038.0,"[34.34072938, -119.53178909999998]",76.158704,2134278.0,653102.17,3.267908,oil_refining,10,33.854967,-118.336907,34.340729,-119.531789,Emitter: TORRANCE REFINING COMPANY LLC,Pool ID: 28320038
86684,TORRANCE REFINING COMPANY LLC,"[33.854967, -118.336907]",28320042.0,"[34.34071738, -119.5317141]",76.154491,2123003.0,643575.03,3.298766,oil_refining,10,33.854967,-118.336907,34.340717,-119.531714,Emitter: TORRANCE REFINING COMPANY LLC,Pool ID: 28320042
86673,TORRANCE REFINING COMPANY LLC,"[33.854967, -118.336907]",28303968.0,"[34.38864724, -119.5963449]",80.926133,3069276.0,883964.29,3.472172,oil_refining,10,33.854967,-118.336907,34.388647,-119.596345,Emitter: TORRANCE REFINING COMPANY LLC,Pool ID: 28303968
86685,TORRANCE REFINING COMPANY LLC,"[33.854967, -118.336907]",28320053.0,"[34.342342200000004, -119.51987340000002]",75.596326,2129420.0,564552.69,3.771872,oil_refining,10,33.854967,-118.336907,34.342342,-119.519873,Emitter: TORRANCE REFINING COMPANY LLC,Pool ID: 28320053


steel 1
           source             target        sum
0  Emitter: TAMCO  Pool ID: 11320211  174.01575
1  Emitter: TAMCO   Pool ID: 3700818  121.38030
2  Emitter: TAMCO   Pool ID: 3707067  109.21166
3  Emitter: TAMCO   Pool ID: 3712620  122.33097
4  Emitter: TAMCO   Pool ID: 3715396  129.30547
sankey_full_detailed_results_steel_1_year.html
steel 10
           source            target        sum
0  Emitter: TAMCO  Pool ID: 3715681  663.64829
1  Emitter: TAMCO  Pool ID: 3715710  646.39108
2  Emitter: TAMCO  Pool ID: 3715819  660.62992
3  Emitter: TAMCO  Pool ID: 3716266  681.23360
4  Emitter: TAMCO  Pool ID: 3716522  679.00262
sankey_full_detailed_results_steel_10_year.html
cement 1
                                 source            target         sum
0     Emitter: HANSON PERMANENTE CEMENT  Pool ID: 5920647  164.733310
1     Emitter: HANSON PERMANENTE CEMENT  Pool ID: 5920659  173.429540
2     Emitter: HANSON PERMANENTE CEMENT  Pool ID: 5920660   60.892388
3  Emitter: LEHIGH SOUTHWEST 

In [243]:
#colorList

## Map the Data

In [362]:
#pre-work for the map 

def plot_map(plot_df,industry,timeframe):
    """Save a folium map of suppliers (emitters) and their receivers (pools) as HTML.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Must contain 'supplier_lat', 'supplier_lon', 'fac name', 'receiver'
        (a [lat, lon] pair per row) and 'pool'.
    industry, timeframe
        Used only to build the output file name
        ``map_<file_stem>_<industry>_<timeframe>_year.html``.

    Each distinct supplier (lat, lon, name) gets one large circle and its own
    color from the module-level ``palette``; every row belonging to that
    supplier adds a small circle at the receiver location in the same color.
    Prints the number of receiver rows mapped.
    """
    #count of receiver rows plotted, printed at the end as a check
    receiver_count = 0

    # Create a map centered in CA
    mapit = folium.Map( location=[37.3427, -119.2244], zoom_start=6)

    #group rows by supplier; sort=False keeps suppliers in order of first
    #appearance. Grouping on the name as well as the coordinates keeps two
    #facilities that share a location from each claiming the other's rows.
    supplier_groups = plot_df.groupby(['supplier_lat', 'supplier_lon', 'fac name'], sort=False)

    for i, ((s_lat, s_lon, s_name), receivers) in enumerate(supplier_groups):

        #wrap around rather than raise if there are more suppliers than colors
        supplier_color = palette[i % len(palette)]

        #big circle for supplier
        folium.CircleMarker(
                            location=[ s_lat,s_lon ],
                            fill=True,
                            fill_color=supplier_color,
                            color=supplier_color,
                            tooltip = 'Emitter: ' + str(s_name),
                            radius=10
                            ).add_to( mapit )

        receiver_count += receivers.shape[0] #to print at the end as a check

        #small dot for each receiver of this supplier, in the supplier's color
        for latlon, pool_id in zip(receivers['receiver'], receivers['pool']):
            folium.CircleMarker( location=[ latlon[0],latlon[1] ],
                                fill=True,
                                fill_color=supplier_color,
                                color=supplier_color,
                                tooltip =  'Pool ID: '+ str(pool_id),
                                radius=3 ).add_to( mapit )

    #note: this is the number of rows plotted, not a de-duplicated receiver count
    print("mapped ", receiver_count, "unique receivers")
    #mapit #can run this in the notebook if you need to view it there. better to just open the .html in another browser tab

    #save the file
    mapit.save(outfile= 'map_'+file_stem+'_'+str(industry)+'_'+str(timeframe)+'_year.html')


In [363]:
# Quick check of plot_map on a small sample.
# The sample is built from df directly so this cell runs on a fresh kernel;
# it previously relied on plot_df / industry / timeframe left over from the
# export loop, which is defined further down in the notebook.
sample_industry = df['industry'].iloc[0]
sample_timeframe = df['timeframe'].iloc[0]
sample_plot_df = df[(df['industry'] == sample_industry) & (df['timeframe'] == sample_timeframe)].head(20)
plot_map(sample_plot_df, sample_industry, str(sample_timeframe))

mapped  20 unique receivers


In [373]:
def plot_html(plot_df, industry, timeframe):
    """Dump the main result columns of ``plot_df`` to an HTML table file.

    The file is named ``html_<file_stem>_<industry>_<timeframe>_year.html``,
    where ``file_stem`` is the module-level name of the loaded results file.
    """
    report_columns = ['fac name','pool','distance','cost','volume', 'cost per ton','industry','timeframe']
    table = plot_df[report_columns]

    out_path = '_'.join(['html', file_stem, industry, str(timeframe), 'year']) + '.html'

    with open(out_path, 'w') as handle:
        handle.write(table.to_html())

In [375]:
#sankey diagram, map and html table for each industry / timeframe combination

TOP_N = 20  #number of cheapest (lowest cost per ton) emitter->pool pairings to plot

for industry in industries:
    for timeframe in timeframes:
        #print(industry, timeframe)
        plot_df = df[(df['industry'] ==industry) & (df['timeframe'] == timeframe)]

        #not every industry necessarily has results for every timeframe;
        #skip empty combinations instead of exporting blank files
        if plot_df.empty:
            continue

        plot_df = plot_df.sort_values('cost per ton').head(TOP_N)

        #call plot functions
        plot_sankey(plot_df,industry, str(timeframe))
        plot_map(plot_df,industry, str(timeframe))
        plot_html(plot_df,industry, str(timeframe))

           source             target        sum
0  Emitter: TAMCO  Pool ID: 11320211  174.01575
1  Emitter: TAMCO   Pool ID: 3700818  121.38030
2  Emitter: TAMCO   Pool ID: 3707067  109.21166
3  Emitter: TAMCO   Pool ID: 3712620  122.33097
4  Emitter: TAMCO   Pool ID: 3715396  129.30547
sankey_full_detailed_results_steel_1_year.html
mapped  20 unique receivers
           source            target        sum
0  Emitter: TAMCO  Pool ID: 3715681  663.64829
1  Emitter: TAMCO  Pool ID: 3715710  646.39108
2  Emitter: TAMCO  Pool ID: 3715819  660.62992
3  Emitter: TAMCO  Pool ID: 3716266  681.23360
4  Emitter: TAMCO  Pool ID: 3716522  679.00262
sankey_full_detailed_results_steel_10_year.html
mapped  20 unique receivers
                                 source            target         sum
0     Emitter: HANSON PERMANENTE CEMENT  Pool ID: 5920647  164.733310
1     Emitter: HANSON PERMANENTE CEMENT  Pool ID: 5920659  173.429540
2     Emitter: HANSON PERMANENTE CEMENT  Pool ID: 5920660   60.892388