In [1]:
#import packages

import numpy as np
import pandas as pd
import random
import pickle #importing file
import itertools
import folium #mapping
import seaborn as sns #color palette
import plotly
import chart_studio.plotly as py #!pip install chart_studi#o
import datetime
from datetime import date
from pyproj import Geod
import matplotlib
import os
import time
#import pyplot_themes as themes

In [2]:


DATADIR = 'D:/W210_Capstone/co2-storage-fall2021/Visualization/'
os.chdir(DATADIR)
timestamp = time.strftime("%Y-%m-%d-%H-%M-%S") #+"/"
timestamp


os.makedirs(timestamp)

In [3]:
# import data

file_root = '../data/results/'
file_stem = 'full_detailed_results' #'results_pooled_cement_only_complexcost'
file_format = '.pkl'
file_name = file_root+file_stem+file_format #'../data/results/results_pooled_cement_only_complexcost.pkl' ##results_toydata_uniform.pkl'
open_file = open(file_name, "rb")
loaded_list = pickle.load(open_file)
open_file.close()

In [4]:
#convert to dataframe and parse the geolocation

df = pd.DataFrame(loaded_list, columns = ['fac name','supplier', 'pool','receiver', 'distance', 'cost','n_trucks','industry','timeframe','volume','cost per ton'])

#convert suppliers to list to get around weird numpy object datatype
supplier_split = df.supplier.to_list()
receiver_split = df.receiver.to_list()

#split the lists of paired geolocations into two elements each, lat and lon
supplier_lat= [supplier_split[j][0] for j in range(0,len(supplier_split))]
supplier_lon = [supplier_split[j][1] for j in range(0,len(supplier_split))]
receiver_lat= [receiver_split[j][0] for j in range(0,len(receiver_split))]
receiver_lon = [receiver_split[j][1] for j in range(0,len(receiver_split))]

#add back into the df
df['supplier_lat'] = supplier_lat
df['supplier_lon'] = supplier_lon
df['receiver_lat'] = receiver_lat
df['receiver_lon'] = receiver_lon


#clean names
fac_names = list(df['fac name'])
fac_names_clean = ['Emitter: ' +str(item) for item in fac_names ]

pool_names = list(df['pool'])
pool_names= ["Pool ID: " + str(item).replace('.0','') if '.0' in str(item) else item for item in pool_names]

pool_names


df['fac name clean'] = fac_names_clean
df['pool name clean'] = pool_names

#df.head()

In [5]:
#create a color palette, repeat it 100 times so we don't run out of colors for large graphs
palette = ['red', 'green', 'purple', 'orange',  'darkblue',
           'gray','cadetblue', 'darkpurple', 'white', 'pink', 'lightblue', 'lightgreen', # 'darkred','lightred' these coolors looked very similar to other colors, commented out
           'black','lightgray','lightred','blue','beige']*100

## Sankey Diagram

In [6]:
industries = set(list(df['industry']))
timeframes = set(list(df['timeframe']))

In [7]:
#functionalize
def plot_sankey(plot_df,industry,timeframe):

    df = plot_df
    cat_cols=['fac name clean','pool name clean']
    value_cols='volume'
    title='sankey_'+file_stem+'_'+industry+'_'+timeframe+'_year'
    industry = industry
    timeframe = timeframe

    colorPalette = palette #['#4B8BBE', '#4B8BBE'] #,'#306998','#FFE873','#FFD43B','#646464']
    labelList = []
    colorNumList = []

    for catCol in cat_cols:
        labelListTemp =  list(set(df[catCol].values))
        colorNumList.append(len(labelListTemp))
        labelList = labelList + labelListTemp

    # remove duplicates from labelList
    labelList = list(dict.fromkeys(labelList))

    # # define colors based on number of levels
    # colorList = palette #[]
    # for idx, colorNum in enumerate(colorNumList): #count how many colors you need
    #     print(idx, colorNum)
    #     colorList = colorList + [colorPalette[idx]]*colorNum 

    colorList = []
    ix = 0

    for i in range(0,len(labelList)):
        #if the item contains emitter, increment the color
        if 'Emitter' in labelList[i]: 
            colorList.append(palette[ix])
            ix = ix+1
        else:
            colorList.append('Black')    

    #colorList

    # transform df into a source-target pair
    for i in range(len(cat_cols)-1):
        if i==0:
            sourceTargetDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            sourceTargetDf.columns = ['source','target','sum']
        else:
            tempDf = df[[cat_cols[i],cat_cols[i+1],value_cols]]
            tempDf.columns = ['source','target','sum']
            sourceTargetDf = pd.concat([sourceTargetDf,tempDf])
        sourceTargetDf = sourceTargetDf.groupby(['source','target']).agg({'sum':'sum'}).reset_index()

    #print (sourceTargetDf.head())

    # add index for source-target pair
    sourceTargetDf['sourceID'] = sourceTargetDf['source'].apply(lambda x: labelList.index(x))
    sourceTargetDf['targetID'] = sourceTargetDf['target'].apply(lambda x: labelList.index(x))

    ### set colors for the links
    # pick a spectrum to use from the matplotlib spectra
    cmap = matplotlib.cm.get_cmap('cool')
    # pick an attribute to color by
    link_colors = df['cost per ton'].to_list()
    # normalize attribute values to between 0 and 1
    maxlink = max(link_colors)
    minlink = min(link_colors)
    # If values do not form a range, all the same, use a single color (happens at least once)
    if (maxlink - minlink) == 0:
        link_colors = 'grey'
#   Otherwise extract a normalized color from the spectrum, convert to hex and add some transparency
    else:
        link_colors = [ cmap(float(i-minlink)/(maxlink-minlink)) for i in link_colors ]
        link_colors = [ matplotlib.colors.rgb2hex(i) for i in link_colors]
        link_colors = [ i + '66' for i in link_colors]
#   Create a colorbar for the title
    cbar = "<b>" + str(int(minlink)) + "  "
    for i in [x / 20.0 for x in range(1, 21, 1) ]:
        cbar += "<span style=\'color:" + matplotlib.colors.rgb2hex(cmap(i)) + "66\'>&#9608;</span>" 
    cbar += "  " + str(int(maxlink)) + "</span></b>  dollars per metric tonne"
#    print(cbar)

    # creating the sankey diagram
    data = dict(
        type='sankey',
        valueformat = ".2f",
        valuesuffix = "K tons",
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = 'black' # colorList #
        ),
        link = dict(
          valueformat = ".2f",
          valuesuffix = "K tons",
#         Apply color to the links
          color=link_colors,
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['sum']# ['Volume:' + str(item) for item in list(sourceTargetDf['sum'])]
        )
      )

    layout =  dict(
        title = cbar,
        font = dict(
          size = 8
        )
    )

    fig = dict(data=[data], layout=layout, font=dict(size=8, color='white'), pad=0,paper_bgcolor='#51504f')
    #fig.update_layout(margin_b=0)

#     fig.update_layout(title_text=, font_size=10)

    sankey_filename = 'sankey_'+file_stem+'_'+industry+'_'+timeframe+'_year.html'
    #print(sankey_filename)
    #plotly.offline.plot(fig, validate=False, filename= timestamp+'//'+sankey_filename)
    plotly.offline.plot(fig, validate=False, filename= timestamp+'//'+sankey_filename, auto_open=False)

In [8]:
#for testing
# #print(industry, timeframe)
# plot_df = df[(df['industry'] =='cement') & (df['timeframe'] == 1)]

# plot_df = plot_df.sort_values('cost per ton')
# plot_df = plot_df.head(20)
        
#         #call plot funciton 
# plot_sankey(plot_df,'cement', str(1))
# #plot_df

## Map the Data

In [9]:
#pre-work for the map 

def plot_map(plot_df,industry,timeframe):

    #find unique suppliers since we only need to plot them once
    unique_suppliers = plot_df[['supplier_lat', 'supplier_lon','fac name']]
    unique_suppliers = unique_suppliers.drop_duplicates()
    unique_suppliers =unique_suppliers.reset_index(drop=True)
    unique_suppliers

    #find max and quantile of volume for setting up thicker lines
    volumes = plot_df['volume']
    quantiles = np.quantile(volumes, [0.2,0.4,0.6,0.8])
    max_volume = np.max(volumes)

    #initialize the count of receivers to 0
    receiver_count = 0

    # Create a map centered in CA
    mapit = folium.Map( location=[36.3302, -119.2921], zoom_start=6) 

    #loop through unique suppliers (outer loop)
    #and receivers for each supplier (inner loop) 
    for i in range(0,unique_suppliers.shape[0]):

        #find all receivers associated with the supplier late and lon, filter the dataframe down to receivers
        #for this particular supplier 
        s_lat = unique_suppliers['supplier_lat'][i]
        s_lon = unique_suppliers['supplier_lon'][i]
        s_name = unique_suppliers['fac name'][i]

        #big circle for supplier
        folium.CircleMarker( #RegularPolygonMarker( #CircleMarker( 
                            location=[ s_lat,s_lon ], 
                            #fill=True, 
                            fill_color=palette[i], 
                            color=palette[i],
                            #number_of_sides=6,
                            tooltip = 'Emitter: ' + s_name, #+str(s_lat)+', '+str(s_lon),
                            radius=4
                            ).add_to( mapit ) #'


        receivers = plot_df[(plot_df['supplier_lat'] == s_lat) & (plot_df['supplier_lon'] == s_lon)]

        #plot the receivers with a new color each time
        latlon = receivers['receiver']
        pool_ids = receivers['pool']
        receiver_count += receivers.shape[0] #to print at the end as a check

        #for each receiver  
        for j in range(0,len(latlon)): 

            #split apart into lat and lon
            lat = latlon.iloc[j][0]
            lon = latlon.iloc[j][1]
            pool_id = pool_ids.iloc[j]

            volume = receivers.iloc[j]['volume']

            #line coordinates
            line_coords = [ [s_lat, s_lon], [lat, lon] ]

            line_tooltip = str('Emitter: ')+s_name+ str('<br>Pool ID: ')+ pool_id

            #create line
            my_PolyLine= folium.PolyLine(locations=line_coords,
                                         color=palette[i],
                                         tooltip = line_tooltip,
                                         weight=2) #color and supplier use i index

            #add line to map
            mapit.add_child(my_PolyLine)
            
            geodesic = Geod(ellps='WGS84')
            
            #rotation = [geodesic.inv(pair[0][1], pair[0][0], pair[1][1], pair[1][0])[0]+90 for pair in pairs]
            rot = geodesic.inv(lon, lat, s_lon, s_lat)[0]+90
            
            folium.RegularPolygonMarker(location=[ lat,lon], 
                                        fill_color=palette[i], 
                                        color=palette[i],  
                                        number_of_sides=3, 
                                        radius=4,
                                        tooltip =  'Pool ID: '+ str(pool_id)
                                        , rotation=rot
                                       ).add_to(mapit)

            
#             #add dot for the receiver
#             folium.CircleMarker(location=[ lat,lon ],
#                                 #bounds=[ lat,lon ],
#                                fill=True, 
#                                 fill_color=palette[i], 
#                                 color=palette[i], 
#                                 tooltip =  'Pool ID: '+ str(pool_id),#'receiver tooltip placeholder',
#                                 radius=5 ).add_to( mapit )

#             icon_square = folium.Icon(
#             icon_shape='rectangle-dot', 
#             border_color=palette[i], 
#             border_width=10)
        
#             folium.Marker( location=[ lat,lon ],  tooltip =  'Pool ID: '+ str(pool_id)#'receiver tooltip placeholder',
#                           , icon=icon_square).add_to(mapit)
            

    print("mapped ", receiver_count, "unique receivers")
    mapit #can run this in the notebook if you need to view it there. better to just open the .html in another browser tab

    #save the file
    mapit.save(outfile= timestamp+'//'+'map_'+file_stem+'_'+industry+'_'+str(timeframe)+'_year.html')
    
    mapit


In [10]:
#testing
#df[(df['industry'] =='cement') & (df['timeframe'] ==1) & (df['fac name'] =='LEHIGH SOUTHWEST CEMENT CO.')]

In [11]:
from math import floor, log10

# df = pd.DataFrame({'floats':[123.949, 23.87, 1.9865, 0.0129500]})

def smarter_round(sig):
    def rounder(x):
        offset = sig - floor(log10(abs(x)))
        initial_result = round(x, offset)
        if str(initial_result)[-1] == '5' and initial_result == x:
            return round(x, offset - 2)
        else:
            return round(x, offset - 1)
    return rounder

# print(df['floats'].apply(smarter_round(3)))


In [12]:
from pretty_html_table import build_table

def plot_html(plot_df, industry, timeframe):
    plot_html = plot_df[['fac name','pool','distance','volume', 'cost per ton']].round(decimals=2) 

    plot_html['distance'] = plot_html['distance'].apply(lambda x: int(x))
    plot_html['cost per ton'] = plot_html['cost per ton'].apply(lambda x: "${:.0f}".format((x)))
    plot_html['pool'] = plot_html['pool'].apply(lambda x: x.replace('_',' '))
    plot_html.columns = ['CO2 Emitter','Storage Pool Location','Distance (mi)','Volume (kt)', 'Cost per Ton']
    
    filename=  timestamp+'//'+'html_'+file_stem+'_'+industry+'_'+str(timeframe)+'_year.html'

    html_table_blue_light = build_table(plot_html, 'blue_light')
    with open(filename, 'w') as fo:
        fo.write(html_table_blue_light)

In [13]:
#sankey diagram of each

for industry in industries:
    for timeframe in timeframes:
        #print(industry, timeframe)
        plot_df = df[(df['industry'] ==industry) & (df['timeframe'] == timeframe)]

        plot_df = plot_df.sort_values('cost per ton')
        plot_df = plot_df.head(50)
        
        #call plot function
        plot_sankey(plot_df,industry, str(timeframe))
        plot_map(plot_df,industry, str(timeframe))
        plot_html(plot_df,industry, str(timeframe))

mapped  56 unique receivers
mapped  52 unique receivers
mapped  52 unique receivers
mapped  50 unique receivers
mapped  50 unique receivers
mapped  18 unique receivers
mapped  31 unique receivers
mapped  50 unique receivers
mapped  50 unique receivers
mapped  50 unique receivers
mapped  7 unique receivers
mapped  7 unique receivers
mapped  7 unique receivers
mapped  7 unique receivers
mapped  7 unique receivers
mapped  1 unique receivers
mapped  1 unique receivers
mapped  1 unique receivers
mapped  1 unique receivers
mapped  1 unique receivers
mapped  50 unique receivers
mapped  50 unique receivers
mapped  62 unique receivers
mapped  56 unique receivers
mapped  53 unique receivers


## Map all receivers and emitters

In [14]:

def plot_map_all(plot_df):

    #find unique suppliers since we only need to plot them once
    unique_suppliers = plot_df[['supplier_lat', 'supplier_lon','fac name']]
    unique_suppliers = unique_suppliers.drop_duplicates()
    unique_suppliers =unique_suppliers.reset_index(drop=True)
    unique_suppliers

    #find max and quantile of volume for setting up thicker lines
    volumes = plot_df['volume']
    quantiles = np.quantile(volumes, [0.2,0.4,0.6,0.8])
    max_volume = np.max(volumes)


    #initialize the count of receivers to 0
    receiver_count = 0

    # Create a map centered in CA
    mapit = folium.Map( location=[36.3302, -119.2921], zoom_start=6) 

    #loop through unique suppliers (outer loop)
    #and receivers for each supplier (inner loop) 
    for i in range(0,unique_suppliers.shape[0]):

        #find all receivers associated with the supplier late and lon, filter the dataframe down to receivers
        #for this particular supplier 
        s_lat = unique_suppliers['supplier_lat'][i]
        s_lon = unique_suppliers['supplier_lon'][i]
        s_name = unique_suppliers['fac name'][i]

        #big circle for supplier
        folium.CircleMarker( 
                            location=[ s_lat,s_lon ], 
                            fill=True, 
                            fill_color='blue', 
                            color='blue',
                            tooltip = 'Emitter: ' + s_name, #+str(s_lat)+', '+str(s_lon),
                            radius=1
                            ).add_to( mapit ) #'


        receivers = plot_df[(plot_df['supplier_lat'] == s_lat) & (plot_df['supplier_lon'] == s_lon)]

        #plot the receivers with a new color each time
        latlon = receivers['receiver']
        pool_ids = receivers['pool']
        receiver_count += receivers.shape[0] #to print at the end as a check

        #for each receiver  
        for j in range(0,len(latlon)): 

            #split apart into lat and lon
            lat = latlon.iloc[j][0]
            lon = latlon.iloc[j][1]
            pool_id = pool_ids.iloc[j]

            volume = receivers.iloc[j]['volume']

            #line coordinates
            #line_coords = [ [s_lat, s_lon], [lat, lon] ]

            #line_tooltip = str('Emitter: ')+s_name+ str('<br>Pool ID: ')+ pool_id

            #create line
            #my_PolyLine= folium.PolyLine(locations=line_coords,
             #                            color=palette[i],
             #                            tooltip = line_tooltip,
              #                           weight=2) #color and supplier use i index

            #add line to map
            #mapit.add_child(my_PolyLine)
            
            #add dot for the receiver
            folium.CircleMarker(location=[ lat,lon ], #bounds=[ lat,lon ],
                                fill=True, 
                                fill_color='black', 
                                color='black', 
                                tooltip =  'Pool ID: '+ str(pool_id),#'receiver tooltip placeholder',
                                radius=1 ).add_to( mapit )
            
           
##             icon_square = folium.Icon(
#             icon_shape='rectangle-dot', 
#             border_color=palette[i], 
#             border_width=10)
        
#             folium.Marker( location=[ lat,lon ],  tooltip =  'Pool ID: '+ str(pool_id)#'receiver tooltip placeholder',
#                           , icon=icon_square).add_to(mapit)
            

    print("mapped ", receiver_count, "unique receivers")
    mapit #can run this in the notebook if you need to view it there. better to just open the .html in another browser tab

    #save the file
    mapit.save(outfile= timestamp+'//'+'map_'+file_stem+'_'+'all'+'_'+'all_year.html')
    
    mapit

In [15]:
plot_map_all(df)

mapped  2584 unique receivers


In [16]:
plot_html(df.head(0),'all','all')