In [173]:
#import packages

import numpy as np
import pandas as pd
import random
import pickle #importing file
import itertools
import folium #mapping
import seaborn as sns #color palette
import plotly
import chart_studio.plotly as py #!pip install chart_studi#o
import datetime
from datetime import date
#import pyplot_themes as themes

In [174]:
import os
import time

# Name the output folder after the moment this run started, so each run of the
# notebook writes its HTML files into its own directory.
# NOTE(review): ':' is not a legal character in Windows paths -- confirm this
# notebook only needs to run on POSIX systems before relying on this format.
timestamp = time.strftime("%Y-%m-%d-%H:%M:%S") #+"/"

# exist_ok=True keeps the cell re-runnable: executing it twice within the same
# second no longer raises FileExistsError.
os.makedirs(timestamp, exist_ok=True)

In [175]:
# import data

# The results file path is assembled from three parts so that file_stem can be
# reused later when naming the generated HTML outputs.
file_root = '../data/results/'
file_stem = 'full_detailed_results' #'results_pooled_cement_only_complexcost'
file_format = '.pkl'
file_name = file_root + file_stem + file_format

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load result files that were produced by a trusted run of this project.
# The `with` block guarantees the handle is closed even if unpickling fails.
with open(file_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

In [176]:
#convert to dataframe and parse the geolocation

df = pd.DataFrame(
    loaded_list,
    columns=['fac name', 'supplier', 'pool', 'receiver', 'distance', 'cost',
             'volume', 'cost per ton', 'industry', 'timeframe'],
)

#convert the paired geolocations to plain lists to get around the numpy object datatype
supplier_split = df.supplier.to_list()
receiver_split = df.receiver.to_list()

#split each [lat, lon] pair into its two elements
supplier_lat = [pair[0] for pair in supplier_split]
supplier_lon = [pair[1] for pair in supplier_split]
receiver_lat = [pair[0] for pair in receiver_split]
receiver_lon = [pair[1] for pair in receiver_split]

#add back into the df
df['supplier_lat'] = supplier_lat
df['supplier_lon'] = supplier_lon
df['receiver_lat'] = receiver_lat
df['receiver_lon'] = receiver_lon


def _clean_pool_name(pool):
    """Return a display label for one value of the 'pool' column.

    Values whose text form ends in '.0' (e.g. the float 12.0) are rendered as
    'Pool ID: 12'. Every other value is returned unchanged.

    Only the trailing '.0' is removed. Removing every '.0' substring would
    corrupt values such as 10.05 (which would turn into 'Pool ID: 105').
    """
    text = str(pool)
    if text.endswith('.0'):
        return "Pool ID: " + text[:-2]
    return pool


#clean names
fac_names = list(df['fac name'])
fac_names_clean = ['Emitter: ' + str(name) for name in fac_names]

pool_names = [_clean_pool_name(pool) for pool in df['pool']]

df['fac name clean'] = fac_names_clean
df['pool name clean'] = pool_names

df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,cost,volume,cost per ton,industry,timeframe,supplier_lat,supplier_lon,receiver_lat,receiver_lon,fac name clean,pool name clean
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",Asphalto_Any Area_Antelope Shale,"[35.282426, -119.56323450000002]",146.754233,2301358.0,115.2231,19973.061442,cement,1,34.6222,-117.1001,35.282426,-119.563235,Emitter: CEMEX Construction Materials Pacific LLC,Asphalto_Any Area_Antelope Shale
1,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",Bowerbank_Any Area_No Pool Breakdown,"[35.421670000000006, -119.4112865]",141.956006,2001860.0,77.034121,25986.660643,cement,1,34.6222,-117.1001,35.42167,-119.411287,Emitter: CEMEX Construction Materials Pacific LLC,Bowerbank_Any Area_No Pool Breakdown
2,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",Buena Vista_Buena Vista Hills_Upper (To-Etcheg...,"[35.1898, -119.461314]",139.418375,2286294.0,5.577428,409919.059842,cement,1,34.6222,-117.1001,35.1898,-119.461314,Emitter: CEMEX Construction Materials Pacific LLC,Buena Vista_Buena Vista Hills_Upper (To-Etcheg...
3,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",Cache Creek Gas_Any Area_Starkey,"[38.735106815, -121.72364445000002]",382.50196,2785463.0,138.22491,20151.672983,cement,1,34.6222,-117.1001,38.735107,-121.723644,Emitter: CEMEX Construction Materials Pacific LLC,Cache Creek Gas_Any Area_Starkey
4,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",Calders Corner_Any Area_Stevens,"[35.392538, -119.255733]",133.094814,2273309.0,46.122788,49288.187095,cement,1,34.6222,-117.1001,35.392538,-119.255733,Emitter: CEMEX Construction Materials Pacific LLC,Calders Corner_Any Area_Stevens


In [177]:
# Base set of 17 colour names. 'darkred' is deliberately left out because it
# looked very similar to other colours in the set.
_base_colors = [
    'red', 'green', 'purple', 'orange', 'darkblue',
    'gray', 'cadetblue', 'darkpurple', 'white', 'pink', 'lightblue', 'lightgreen',
    'black', 'lightgray', 'lightred', 'blue', 'beige',
]

# Tile the base set 100 times so large graphs can keep indexing into the
# palette without running out of colours.
palette = _base_colors * 100

In [178]:
#for testing
#df = pd.concat([df.tail(10), df.head(10)])
#df

## Sankey Diagram

In [179]:
# Distinct industries and timeframes present in the results; the plotting loop
# further down iterates over every combination of the two.
industries = {value for value in df['industry']}
timeframes = {value for value in df['timeframe']}

In [180]:

#df = df.sort_values('volume')  # left disabled: the Sankey layout does its own ordering of nodes, so pre-sorting here has no effect

In [181]:
#functionalize
def plot_sankey(plot_df,industry,timeframe):
    """Write an emitter -> pool Sankey diagram for plot_df to an HTML file.

    Each distinct value of 'fac name clean' and 'pool name clean' becomes a
    node. Each (emitter, pool) pair becomes a link whose width is the summed
    'volume' of the matching rows. The first rows of the aggregated link table
    are printed as a quick check.

    The figure is written to
    ``<timestamp>//sankey_<file_stem>_<industry>_<timeframe>_year.html`` and is
    not opened in a browser. ``timestamp`` and ``file_stem`` are read from the
    notebook's global scope; the ``timestamp`` directory must already exist.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Rows to plot; must contain the columns 'fac name clean',
        'pool name clean' and 'volume'.
    industry : str
        Industry label, used only in the title and the file name.
    timeframe : str or int
        Timeframe label, used only in the title and the file name. It is passed
        through str(), matching how plot_map and plot_html build their names.

    Returns
    -------
    None
    """
    cat_cols = ['fac name clean', 'pool name clean']
    value_col = 'volume'
    title = 'sankey_' + file_stem + '_' + str(industry) + '_' + str(timeframe) + '_year'

    # Node labels: every emitter, then every pool, de-duplicated in order of
    # first appearance so node numbering is identical from run to run
    # (a set() would order string labels differently in each kernel session).
    labelList = []
    for catCol in cat_cols:
        labelList.extend(plot_df[catCol].tolist())
    labelList = list(dict.fromkeys(labelList))

    # One row per (source, target) pair with the total volume flowing along it.
    sourceTargetDf = (
        plot_df[cat_cols + [value_col]]
        .rename(columns={cat_cols[0]: 'source',
                         cat_cols[1]: 'target',
                         value_col: 'sum'})
        .groupby(['source', 'target'])
        .agg({'sum': 'sum'})
        .reset_index()
    )

    print (sourceTargetDf.head())

    # Plotly addresses nodes by position in the label list. A dict lookup
    # replaces repeated list.index() scans.
    label_to_id = {label: idx for idx, label in enumerate(labelList)}
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(label_to_id)
    sourceTargetDf['targetID'] = sourceTargetDf['target'].map(label_to_id)

    # creating the sankey diagram
    data = dict(
        type='sankey',
        node=dict(
            pad=15,
            thickness=20,
            line=dict(
                color="black",
                width=0.5
            ),
            label=labelList,
            color='black'
        ),
        link=dict(
            source=sourceTargetDf['sourceID'],
            target=sourceTargetDf['targetID'],
            value=sourceTargetDf['sum']
        )
    )

    layout = dict(
        title=title,
        font=dict(
            size=10
        )
    )

    fig = dict(data=[data], layout=layout)

    sankey_filename = title + '.html'
    plotly.offline.plot(fig, validate=False, filename= timestamp+'//'+sankey_filename,auto_open=False)

In [182]:
# Quick check of plot_sankey on the first five rows only, labelled as the
# cement industry with a timeframe of 1.
plot_sankey(df.head(), 'cement', '1')

                                              source  \
0  Emitter: CEMEX Construction Materials Pacific LLC   
1  Emitter: CEMEX Construction Materials Pacific LLC   
2  Emitter: CEMEX Construction Materials Pacific LLC   
3  Emitter: CEMEX Construction Materials Pacific LLC   
4  Emitter: CEMEX Construction Materials Pacific LLC   

                                              target         sum  
0                   Asphalto_Any Area_Antelope Shale  115.223100  
1               Bowerbank_Any Area_No Pool Breakdown   77.034121  
2  Buena Vista_Buena Vista Hills_Upper (To-Etcheg...    5.577428  
3                   Cache Creek Gas_Any Area_Starkey  138.224910  
4                    Calders Corner_Any Area_Stevens   46.122788  


In [183]:
#colorList

## Map the Data

In [184]:
#pre-work for the map 

def plot_map(plot_df,industry,timeframe):
    """Draw emitters, pools and the routes between them on a folium map.

    Every distinct (supplier_lat, supplier_lon, 'fac name') combination gets a
    large circle in its own palette colour. Each of that emitter's rows is
    drawn as a line to the receiver location plus a smaller circle there, in
    the same colour. The number of plotted rows is printed as a check.

    The map is saved to
    ``<timestamp>//map_<file_stem>_<industry>_<timeframe>_year.html``.
    ``timestamp``, ``file_stem`` and ``palette`` are read from the notebook's
    global scope; the ``timestamp`` directory must already exist.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Rows to plot; must contain 'supplier_lat', 'supplier_lon', 'fac name',
        'receiver' (a [lat, lon] pair) and 'pool'. An empty frame yields a map
        without markers.
    industry : str
        Industry label, used only in the file name.
    timeframe : str or int
        Timeframe label, used only in the file name.

    Returns
    -------
    folium.Map
        The map that was saved, so a notebook cell ending in a call to this
        function displays it inline.
    """
    #initialize the count of receivers to 0
    receiver_count = 0

    # Create a map centered in CA
    mapit = folium.Map( location=[37.3427, -119.2244], zoom_start=7)

    # Group on the facility name as well as the coordinates, so that two
    # facilities sharing a location each keep only their own rows.
    # sort=False keeps emitters in order of first appearance, which keeps
    # the colour assignment stable.
    supplier_groups = plot_df.groupby(
        ['supplier_lat', 'supplier_lon', 'fac name'], sort=False
    )

    #loop through unique suppliers (outer loop)
    #and receivers for each supplier (inner loop)
    for i, ((s_lat, s_lon, s_name), receivers) in enumerate(supplier_groups):

        # Wrap around instead of raising IndexError if there are ever more
        # emitters than palette entries.
        supplier_color = palette[i % len(palette)]
        emitter_label = 'Emitter: ' + str(s_name)

        #big circle for supplier
        folium.CircleMarker(
                            location=[ s_lat,s_lon ],
                            fill=True,
                            fill_color=supplier_color,
                            color=supplier_color,
                            tooltip = emitter_label,
                            radius=9
                            ).add_to( mapit )

        receiver_count += receivers.shape[0] #to print at the end as a check

        #for each receiver of this supplier
        for receiver_latlon, pool_id in zip(receivers['receiver'], receivers['pool']):

            #split apart into lat and lon
            lat = receiver_latlon[0]
            lon = receiver_latlon[1]

            # str() because pool ids are not guaranteed to be strings
            pool_label = 'Pool ID: ' + str(pool_id)

            #line from the supplier to this receiver
            my_PolyLine = folium.PolyLine(locations=[ [s_lat, s_lon], [lat, lon] ],
                                          color=supplier_color,
                                          tooltip = emitter_label + '<br>' + pool_label,
                                          weight=2)

            #add line to map
            mapit.add_child(my_PolyLine)

            #add dot for the receiver
            folium.CircleMarker(location=[ lat,lon ],
                                fill=True,
                                fill_color=supplier_color,
                                color=supplier_color,
                                tooltip = pool_label,
                                radius=5 ).add_to( mapit )

    print("mapped ", receiver_count, "unique receivers")

    #save the file
    mapit.save(outfile= timestamp+'//'+'map_'+file_stem+'_'+str(industry)+'_'+str(timeframe)+'_year.html')

    # Returned for inline viewing; opening the saved .html in a browser tab is
    # usually the better way to inspect it.
    return mapit


In [185]:
#plot_map(df.head(),'cement',str(1))

In [186]:
def plot_html(plot_df, industry, timeframe):
    """Write the rows of plot_df as a plain HTML table.

    Keeps eight columns, rounds them to two decimals and gives them
    presentation-friendly headers. The table is written without the index to
    ``<timestamp>//html_<file_stem>_<industry>_<timeframe>_year.html``.
    ``timestamp`` and ``file_stem`` are read from the notebook's global scope;
    the ``timestamp`` directory must already exist.

    Parameters
    ----------
    plot_df : pandas.DataFrame
        Must contain 'fac name', 'pool', 'distance', 'cost', 'volume',
        'cost per ton', 'industry' and 'timeframe'.
    industry : str
        Industry label, used only in the file name.
    timeframe : str or int
        Timeframe label, used only in the file name.

    Returns
    -------
    None
    """
    # Named `table` rather than `plot_html`, so the local does not shadow this
    # function's own name.
    table = plot_df[['fac name','pool','distance','cost','volume', 'cost per ton','industry','timeframe']].round(decimals=2)
    table.columns = ['Emitter','Pool','Distance','Cost','Volume', 'Cost per ton','Industry','Timeframe (Years)']

    filename = timestamp+'//'+'html_'+file_stem+'_'+str(industry)+'_'+str(timeframe)+'_year.html'

    # Explicit utf-8 so the output does not depend on the platform's default
    # encoding (names with non-ASCII characters could otherwise fail to write).
    with open(filename, 'w', encoding='utf-8') as fo:
        fo.write(table.to_html(index=False))

In [187]:
#sankey diagram, map and HTML table for each industry/timeframe combination

# Number of rows kept per combination: only the routes with the lowest
# cost per ton are plotted.
TOP_N_ROUTES = 20

for industry in industries:
    for timeframe in timeframes:
        plot_df = df[(df['industry'] == industry) & (df['timeframe'] == timeframe)]

        # industries and timeframes are collected independently, so a given
        # pair may have no rows at all. Skip those pairs rather than hand an
        # empty frame to the plotting functions and write blank output files.
        if plot_df.empty:
            continue

        plot_df = plot_df.sort_values('cost per ton').head(TOP_N_ROUTES)

        #call plot functions
        plot_sankey(plot_df,industry, str(timeframe))
        plot_map(plot_df,industry, str(timeframe))
        plot_html(plot_df,industry, str(timeframe))

                                              source  \
0           Emitter: CHEVRON PRODS.CO. RICHMOND REFY   
1                   Emitter: KERN OIL & REFINING CO.   
2                    Emitter: LUNDAY-THAGARD COMPANY   
3                    Emitter: LUNDAY-THAGARD COMPANY   
4  Emitter: Phillips 66 Los Angeles Refinery - Ca...   

                                              target        sum  
0                     Las Llajas_Any Area_Las Llajas  204524.24  
1                  Aliso Canyon_Any Area_Aliso, West  402145.38  
2  Coles Levee, North_Any Area_Stevens (Undiffere...  400917.48  
3                                Sansinena_East_pool  428088.92  
4                  Chico-Martinez_Any Area_Etchegoin  603963.10  
mapped  20 unique receivers
                                              source  \
0                   Emitter: KERN OIL & REFINING CO.   
1                   Emitter: KERN OIL & REFINING CO.   
2                    Emitter: LUNDAY-THAGARD COMPANY   
3             E

           source                                target          sum
0  Emitter: TAMCO     Aliso Canyon_Any Area_Sesnon-Frew    9702.6903
1  Emitter: TAMCO                 Cascade_Any Area_Deep   11349.9230
2  Emitter: TAMCO    Cascade_Any Area_No Pool Breakdown    9088.5687
3  Emitter: TAMCO  Castaic Junction (ABD)_Any Area_10-B    9599.4265
4  Emitter: TAMCO              Davis Southeast Gas_pool  177278.0100
mapped  20 unique receivers
                                              source  \
0  Emitter: CEMEX Construction Materials Pacific LLC   
1  Emitter: CEMEX Construction Materials Pacific LLC   
2  Emitter: CEMEX Construction Materials Pacific LLC   
3  Emitter: CEMEX Construction Materials Pacific LLC   
4  Emitter: CEMEX Construction Materials Pacific LLC   

                                    target        sum  
0  Cat Canyon_Gato Ridge_No Pool Breakdown  572.83465  
1           Chowchilla Gas_Any Area_Garzas  468.86543  
2     Cymric_Sheep Springs_Phacoides (ABD)  671.83561