In [83]:
#import packages

import numpy as np
import pandas as pd
import random
import pickle #importing file
import itertools
import folium #mapping
import seaborn as sns #color palette
import plotly
import chart_studio.plotly as py #!pip install chart_studio
import plotly.graph_objects as go

In [84]:
# import data
#
# The results pickle is located as file_root + file_stem + file_format.
# file_stem is also reused further down to title the Sankey diagram and to name
# the exported .html files, so it must be a real (non-empty) string.
# Other stems mentioned in the original notes: 'results_pooled_cement_only_complexcost'
# and 'results_toydata_uniform'.

file_root = '../data/results/'
file_stem = 'results_pooled_cement_only_distance'
file_format = '.pkl'
file_name = file_root + file_stem + file_format

# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file.
# Only load result files produced by this project / a trusted source.
# The context manager closes the file even if unpickling raises.
with open(file_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

In [85]:
# Turn the loaded records into a dataframe and expand each paired geolocation
# into separate latitude / longitude columns.

result_columns = ['fac name','supplier', 'pool','receiver', 'distance', 'volume']
df = pd.DataFrame(loaded_list, columns = result_columns)

# Work on plain Python lists of [lat, lon] pairs to get around the numpy object datatype
supplier_split = df.supplier.to_list()
receiver_split = df.receiver.to_list()

# Element 0 of every pair is the latitude, element 1 the longitude
supplier_lat = [pair[0] for pair in supplier_split]
supplier_lon = [pair[1] for pair in supplier_split]
receiver_lat = [pair[0] for pair in receiver_split]
receiver_lon = [pair[1] for pair in receiver_split]

# Attach the scalar coordinates to the dataframe (same column order as before)
coordinate_columns = {
    'supplier_lat': supplier_lat,
    'supplier_lon': supplier_lon,
    'receiver_lat': receiver_lat,
    'receiver_lon': receiver_lon,
}
for column_name, column_values in coordinate_columns.items():
    df[column_name] = column_values

df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume,supplier_lat,supplier_lon,receiver_lat,receiver_lon
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",11320795.0,"[38.773042, -121.863128]",389.592677,12834.3,34.6222,-117.1001,38.773042,-121.863128
1,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920644.0,"[33.91736204, -117.8454482]",334.6985,88.300192,37.3181,-122.091,33.917362,-117.845448
2,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920647.0,"[33.90629635, -117.8173157]",336.371767,164.73331,37.3181,-122.091,33.906296,-117.817316
3,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920659.0,"[33.87178115, -117.8282233]",337.64833,211.86649,37.3181,-122.091,33.871781,-117.828223
4,LEHIGH SOUTHWEST CEMENT CO.,"[40.7369, -122.3223]",9500231.0,"[38.14384789, -121.7042643]",182.171147,1072.2,40.7369,-122.3223,38.143848,-121.704264


In [86]:
# Colour names used for the map, one per supplier (looked up by supplier position).
# 'darkred' is left out because it looked very similar to other colours.
# NOTE(review): the original note said 'lightred' was dropped as well, but it is
# still listed below -- confirm which is intended.
# NOTE(review): 'darkpurple' and 'lightred' do not look like standard CSS colour
# keywords -- TODO confirm they render when used as line / circle colours.
base_colors = [
    'red', 'green', 'purple', 'orange', 'darkblue',
    'gray', 'cadetblue', 'darkpurple', 'white', 'pink', 'lightblue', 'lightgreen',
    'black', 'lightgray', 'lightred', 'blue', 'beige',
]

# Repeat the base colours 100 times so we don't run out of colours for large graphs
palette = base_colors * 100

## Clean the data

### Review the dataframe

In [87]:
# (rows, columns) of the parsed results dataframe
df.shape

(9, 10)

In [88]:
# Preview the first five records.
# To inspect a single supplier instead, filter on its latitude, e.g.:
#df[df['supplier_lat'] == 35.399380]
df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume,supplier_lat,supplier_lon,receiver_lat,receiver_lon
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",11320795.0,"[38.773042, -121.863128]",389.592677,12834.3,34.6222,-117.1001,38.773042,-121.863128
1,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920644.0,"[33.91736204, -117.8454482]",334.6985,88.300192,37.3181,-122.091,33.917362,-117.845448
2,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920647.0,"[33.90629635, -117.8173157]",336.371767,164.73331,37.3181,-122.091,33.906296,-117.817316
3,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920659.0,"[33.87178115, -117.8282233]",337.64833,211.86649,37.3181,-122.091,33.871781,-117.828223
4,LEHIGH SOUTHWEST CEMENT CO.,"[40.7369, -122.3223]",9500231.0,"[38.14384789, -121.7042643]",182.171147,1072.2,40.7369,-122.3223,38.143848,-121.704264


### EDA: Mean distances

In [89]:
# Average the numeric columns (distance, volume, receiver coordinates) per supplier
# location and order the suppliers from shortest to longest mean distance.
#
# numeric_only is passed by keyword: the previous `.mean('distance')` handed the string
# to the positional `numeric_only` parameter -- it never selected the 'distance' column
# and only worked because a non-empty string is truthy.
df_grouped = (
    df.groupby(['supplier_lat', 'supplier_lon'])
    .mean(numeric_only=True)
    .reset_index()
    .sort_values('distance')
    # Renumber the rows in sorted order; the pre-sort row position is kept as an
    # 'index' column. No truncation happens here -- append .head(10) / .tail(10)
    # to keep only the 10 shortest / longest mean distances.
    .reset_index()
)
df_grouped
df_grouped

Unnamed: 0,index,supplier_lat,supplier_lon,distance,volume,receiver_lat,receiver_lon
0,3,34.819863,-118.748732,47.630607,2476.8,34.314863,-119.318594
1,4,35.029298,-118.316236,89.149867,712.0,35.612471,-119.726892
2,6,40.7369,-122.3223,182.171147,1072.2,38.143848,-121.704264
3,5,37.3181,-122.091,336.239532,154.966664,33.89848,-117.830329
4,1,34.6045,-117.3382,367.071774,270.5,38.643526,-121.641154
5,2,34.6222,-117.1001,389.592677,12834.3,38.773042,-121.863128
6,0,34.437557,-116.891034,407.03024,421.7,38.773042,-121.863128


### EDA on volumes

In [90]:
# Summary statistics (count, mean, std, quartiles, max) of the shipped volume per row
volumes = df['volume']
volumes.describe()

count        9.000000
mean      2028.044444
std       4120.444389
min         88.300192
25%        211.866490
50%        421.700000
75%       1072.200000
max      12834.300000
Name: volume, dtype: float64

## Sankey Diagram

In [91]:
# Reminder of the dataframe columns that feed the Sankey diagram below
df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume,supplier_lat,supplier_lon,receiver_lat,receiver_lon
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",11320795.0,"[38.773042, -121.863128]",389.592677,12834.3,34.6222,-117.1001,38.773042,-121.863128
1,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920644.0,"[33.91736204, -117.8454482]",334.6985,88.300192,37.3181,-122.091,33.917362,-117.845448
2,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920647.0,"[33.90629635, -117.8173157]",336.371767,164.73331,37.3181,-122.091,33.906296,-117.817316
3,HANSON PERMANENTE CEMENT,"[37.3181, -122.091]",5920659.0,"[33.87178115, -117.8282233]",337.64833,211.86649,37.3181,-122.091,33.871781,-117.828223
4,LEHIGH SOUTHWEST CEMENT CO.,"[40.7369, -122.3223]",9500231.0,"[38.14384789, -121.7042643]",182.171147,1072.2,40.7369,-122.3223,38.143848,-121.704264


In [92]:
# Sankey diagram of summed volume flowing between consecutive category columns
# (facility name -> pool). Written as flat cell code rather than a function.
#
# Inputs taken from earlier cells: df (results dataframe) and file_stem (used for
# the title and the output file name).

cat_cols = ['fac name', 'pool']   # node levels, left to right
value_cols = 'volume'             # column summed to size each link
title = file_stem

if len(cat_cols) < 2:
    raise ValueError("cat_cols needs at least two columns to form source-target links")

colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']

# Collect the node labels level by level, in order of first appearance so the node
# order is reproducible between runs. Each label is registered once, together with
# the colour of the first level it appears in; this keeps labelList and colorList the
# same length even when a label occurs in more than one column.
labelList = []
colorList = []
labelIndex = {}   # label -> position in labelList, for constant-time ID lookup
for level, catCol in enumerate(cat_cols):
    # wrap around instead of failing when there are more levels than colours
    levelColor = colorPalette[level % len(colorPalette)]
    for label in df[catCol].dropna().unique():
        if label not in labelIndex:
            labelIndex[label] = len(labelList)
            labelList.append(label)
            colorList.append(levelColor)

# transform df into source-target pairs: one frame per pair of adjacent levels,
# stacked and then summed per (source, target)
pairFrames = []
for sourceCol, targetCol in zip(cat_cols[:-1], cat_cols[1:]):
    pairDf = df[[sourceCol, targetCol, value_cols]].copy()
    pairDf.columns = ['source','target','sum']
    pairFrames.append(pairDf)

sourceTargetDf = (
    pd.concat(pairFrames)
    .groupby(['source','target'])
    .agg({'sum':'sum'})
    .reset_index()
)

print (sourceTargetDf.head())

# add the node index for each end of every source-target pair
sourceTargetDf['sourceID'] = sourceTargetDf['source'].map(labelIndex)
sourceTargetDf['targetID'] = sourceTargetDf['target'].map(labelIndex)

# creating the sankey diagram
data = dict(
    type='sankey',
    node = dict(
      pad = 15,
      thickness = 20,
      line = dict(
        color = "black",
        width = 0.5
      ),
      label = labelList,
      color = colorList
    ),
    link = dict(
      source = sourceTargetDf['sourceID'],
      target = sourceTargetDf['targetID'],
      value = sourceTargetDf['sum']
    )
  )

layout =  dict(
    title = title,
    font = dict(
      size = 10
    )
)

fig = dict(data=[data], layout=layout)

# Write the figure to an .html file named after the results file; the call's return
# value (the file name) is the cell output.
sankey_filename = 'sankey_'+file_stem+'.html'
plotly.offline.plot(fig, validate=False, filename=sankey_filename)

                                     source      target           sum
0  CEMEX Construction Materials Pacific LLC  11320795.0  12834.300000
1          CalPortland Company Mojave Plant   2921587.0    712.000000
2      CalPortland Company Oro Grande Plant  11320206.0    270.500000
3                  HANSON PERMANENTE CEMENT   5920644.0     88.300192
4                  HANSON PERMANENTE CEMENT   5920647.0    164.733310


'sankey_results_pooled_cement_only_distance.html'

In [93]:
# Spot check: rows whose pool id equals 1120330 (the recorded output has no matching rows)
df[df['pool'] ==1120330]

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume,supplier_lat,supplier_lon,receiver_lat,receiver_lon


In [94]:
#plotly.offline.plot?

## Map the Data

In [95]:
#pre-work for the map

# One row per distinct supplier location, renumbered from 0 -- each supplier
# only needs to be plotted once.
unique_suppliers = (
    df[['supplier_lat', 'supplier_lon']]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Volume statistics for setting up thicker lines:
# the 20th/40th/60th/80th percentiles and the overall maximum.
volumes = df['volume']
quantile_levels = [0.2,0.4,0.6,0.8]
quantiles = np.quantile(volumes, quantile_levels)
max_volume = np.max(volumes)


In [96]:
#for quick tests: uncomment the line below to limit the map to the first 10 rows
#df = df.head(10)


In [97]:
# Running total of plotted supplier -> receiver rows, printed at the end as a check
receiver_count = 0

# Base map centered in CA
mapit = folium.Map(location=[37.3427, -119.2244], zoom_start=6)

# Outer loop: one pass per unique supplier. Inner loop: every receiver row of that supplier.
# `i` is also the palette index, so a supplier and all of its lines and dots share a colour.
for i in range(len(unique_suppliers)):

    s_lat = unique_suppliers.at[i, 'supplier_lat']
    s_lon = unique_suppliers.at[i, 'supplier_lon']
    supplier_color = palette[i]

    # big circle for the supplier
    supplier_marker = folium.CircleMarker(
        location=[s_lat, s_lon],
        radius=10,
        color=supplier_color,
        fill=True,
        fill_color=supplier_color,
        tooltip=f'Supplier: {s_lat}, {s_lon}',
    )
    supplier_marker.add_to(mapit)

    # narrow the dataframe down to the rows shipped from this supplier location
    from_this_supplier = df['supplier_lat'].eq(s_lat) & df['supplier_lon'].eq(s_lon)
    receivers = df.loc[from_this_supplier]
    receiver_count += len(receivers)

    # walk the receiver coordinate pairs together with their shipped volumes
    for receiver_point, volume in zip(receivers['receiver'], receivers['volume']):

        # split apart into lat and lon
        lat = receiver_point[0]
        lon = receiver_point[1]

        # Line thickness is proportional to the volume's share of the largest volume
        # (20 for the largest shipment). An alternative that was tried is bucketing
        # by the precomputed `quantiles` and using bucket * 5 as the weight.
        line_weight = volume/max_volume * 20

        # straight line from supplier to receiver, coloured like its supplier
        shipment_line = folium.PolyLine(
            locations=[[s_lat, s_lon], [lat, lon]],
            color=supplier_color,
            weight=line_weight,
            tooltip=f'Supplier {s_lat}, {s_lon}<br>Receiver: {lat}, {lon}',
        )
        shipment_line.add_to(mapit)

        # small dot for the receiver
        receiver_marker = folium.CircleMarker(
            location=[lat, lon],
            radius=1,
            color=supplier_color,
            fill=True,
            fill_color=supplier_color,
            tooltip=f'Receiver: {lat}, {lon}',
        )
        receiver_marker.add_to(mapit)

# NOTE(review): receiver_count sums dataframe rows per supplier, so a receiver served
# by several suppliers is counted more than once -- confirm "unique" is the intended wording.
print("mapped ", receiver_count, "unique receivers")
#mapit #can run this in the notebook if you need to view it there. better to just open the .html in another browser tab

# save the map next to the notebook, named after the results file
mapit.save(outfile= 'map_'+file_stem+'.html')


mapped  9 unique receivers


In [98]:
# Scratch check of the receiver tooltip text. Relies on `lat` / `lon` left over from
# the last iteration of the mapping loop, so it only works after that cell has run.
str(lat)+', '+str(lon)

'38.64352562, -121.6411539'