In [40]:
#import packages

import numpy as np
import pandas as pd
import random
import pickle #importing file
import itertools
import folium #mapping
import seaborn as sns #color palette
import plotly
import chart_studio.plotly as py #!pip install chart_studio
import plotly.graph_objects as go

In [32]:
# import data
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point file_name at result files produced by a trusted run of this project.

file_name = '../data/results/results_pooled_cement_only_complexcost.pkl' ##results_toydata_uniform.pkl'

# the context manager closes the handle even if unpickling raises
with open(file_name, "rb") as open_file:
    loaded_list = pickle.load(open_file)

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",120018.0,"[37.68627, -121.674512]",2000792.0,8.464567
1,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1100144.0,"[39.06031418, -121.9646343]",2441070.0,12.532808
2,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120185.0,"[39.127373, -121.961772]",2037133.0,4.527559
3,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120194.0,"[39.15124715, -121.933119]",6272237.0,5.905512
4,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120330.0,"[39.033785, -121.91461]",2024923.0,14.566929


In [20]:
#build the dataframe, then split each [lat, lon] pair into its own columns

df = pd.DataFrame(loaded_list, columns = ['fac name','supplier', 'pool','receiver', 'distance', 'volume'])

#plain python lists sidestep the object-dtype column that holds the coordinate pairs
supplier_split = df['supplier'].tolist()
receiver_split = df['receiver'].tolist()

#element 0 of every pair is the latitude, element 1 the longitude
supplier_lat = [pair[0] for pair in supplier_split]
supplier_lon = [pair[1] for pair in supplier_split]
receiver_lat = [pair[0] for pair in receiver_split]
receiver_lon = [pair[1] for pair in receiver_split]

#append the four coordinate columns (in this order) to the dataframe
df = df.assign(
    supplier_lat=supplier_lat,
    supplier_lon=supplier_lon,
    receiver_lat=receiver_lat,
    receiver_lon=receiver_lon,
)

df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume,supplier_lat,supplier_lon,receiver_lat,receiver_lon
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",120018.0,"[37.68627, -121.674512]",2000792.0,8.464567,34.6222,-117.1001,37.68627,-121.674512
1,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1100144.0,"[39.06031418, -121.9646343]",2441070.0,12.532808,34.6222,-117.1001,39.060314,-121.964634
2,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120185.0,"[39.127373, -121.961772]",2037133.0,4.527559,34.6222,-117.1001,39.127373,-121.961772
3,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120194.0,"[39.15124715, -121.933119]",6272237.0,5.905512,34.6222,-117.1001,39.151247,-121.933119
4,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120330.0,"[39.033785, -121.91461]",2024923.0,14.566929,34.6222,-117.1001,39.033785,-121.91461


In [21]:
#create a color palette, repeat it 100 times so we don't run out of colors for large graphs
#the entries are used as `color` / `fill_color` of CircleMarker and PolyLine, so each one must be
#a valid CSS color name. 'darkpurple' and 'lightred' (folium.Icon names, not CSS keywords) were
#replaced by the closest CSS names, keeping every other color at its original position.
#'darkred' is intentionally left out: it looked very similar to other colors.
palette = ['red', 'green', 'purple', 'orange',  'darkblue',
           'gray','cadetblue', 'indigo', 'white', 'pink', 'lightblue', 'lightgreen',
           'black','lightgray','lightcoral','blue','beige']*100

## Clean the data

### Review the dataframe

In [22]:
#(number of rows, number of columns) of the dataframe after the coordinate columns were added
df.shape

(1841, 10)

In [23]:
#view a record or five
#to inspect the rows of a single supplier instead, filter on its latitude, e.g.:
#df[df['supplier_lat'] == 35.399380]
df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume,supplier_lat,supplier_lon,receiver_lat,receiver_lon
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",120018.0,"[37.68627, -121.674512]",2000792.0,8.464567,34.6222,-117.1001,37.68627,-121.674512
1,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1100144.0,"[39.06031418, -121.9646343]",2441070.0,12.532808,34.6222,-117.1001,39.060314,-121.964634
2,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120185.0,"[39.127373, -121.961772]",2037133.0,4.527559,34.6222,-117.1001,39.127373,-121.961772
3,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120194.0,"[39.15124715, -121.933119]",6272237.0,5.905512,34.6222,-117.1001,39.151247,-121.933119
4,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120330.0,"[39.033785, -121.91461]",2024923.0,14.566929,34.6222,-117.1001,39.033785,-121.91461


### EDA: Mean distances

In [24]:
#mean of every numeric column per supplier location, ordered by mean distance (ascending)
#note: the argument of GroupBy.mean is the `numeric_only` flag, not a column name, so it is
#spelled out here instead of passing 'distance'
df_grouped = (
    df.groupby(['supplier_lat','supplier_lon'])
      .mean(numeric_only=True)
      .reset_index()
      .sort_values('distance')
      .reset_index() #keeps the pre-sort row position as an 'index' column
)

#all suppliers are shown; append .head(10) / .tail(10) to the chain above to keep only 10
df_grouped

Unnamed: 0,index,supplier_lat,supplier_lon,distance,volume,receiver_lat,receiver_lon
0,4,35.029298,-118.316236,2021796.0,23.733333,35.445581,-119.60433
1,0,34.437557,-116.891034,2146734.0,21.085,34.44133,-118.546588
2,2,34.6222,-117.1001,2182669.0,10.803283,35.545885,-119.610428
3,5,37.3181,-122.091,2186905.0,66.414286,33.881513,-117.825201
4,6,40.7369,-122.3223,2346808.0,2.11063,35.456693,-119.575874
5,1,34.6045,-117.3382,2359245.0,30.055556,35.255004,-119.419531
6,3,34.819863,-118.748732,2379569.0,31.351899,36.374201,-120.211744


### EDA on volumes

In [25]:
#summary statistics (count, mean, std, min, quartiles, max) of the 'volume' column
volumes = df.loc[:, 'volume']
volumes.describe()

count    1841.000000
mean        9.914394
std        10.641432
min         0.060688
25%         2.362205
50%         7.414698
75%        14.632546
max       173.429540
Name: volume, dtype: float64

## Sankey Diagram

In [39]:
#https://medium.com/kenlok/how-to-create-sankey-diagrams-from-dataframes-in-python-e221c1b4d6b0
def genSankey(df,cat_cols=[],value_cols='',title='Sankey Diagram'):
    """Build a plotly-compatible Sankey figure dict from a dataframe.

    Every pair of consecutive columns in ``cat_cols`` becomes one layer of
    links; the link width is the sum of ``value_cols`` over the rows that
    share the same (source, target) pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the columns named in ``cat_cols`` and ``value_cols``.
    cat_cols : list of str
        At least two column names, ordered from left-most to right-most level.
        The list is only read, never modified.
    value_cols : str
        Name of the single numeric column that is summed per link.
    title : str
        Title placed in the figure layout.

    Returns
    -------
    dict
        ``{'data': [sankey_trace], 'layout': layout}``.

    Raises
    ------
    ValueError
        If fewer than two columns are given in ``cat_cols``.
    """
    # one color per level; reused cyclically when there are more levels than colors
    colorPalette = ['#4B8BBE','#306998','#FFE873','#FFD43B','#646464']

    cat_cols = list(cat_cols)
    if len(cat_cols) < 2:
        raise ValueError('genSankey needs at least two cat_cols to build source-target pairs')

    # collect node labels in order of first appearance and give each label the
    # color of the first level it appears in, so labels and colors always have
    # the same length even when a label occurs in more than one column
    labelColor = {}
    for level, catCol in enumerate(cat_cols):
        levelColor = colorPalette[level % len(colorPalette)]
        for label in df[catCol].unique():
            labelColor.setdefault(label, levelColor)
    labelList = list(labelColor)
    colorList = list(labelColor.values())

    # label -> node position, replaces a linear list.index() search per row
    labelIndex = {label: position for position, label in enumerate(labelList)}

    # transform df into source-target pairs, one frame per pair of adjacent levels
    pairFrames = []
    for sourceCol, targetCol in zip(cat_cols[:-1], cat_cols[1:]):
        pairDf = df[[sourceCol, targetCol, value_cols]].copy()
        pairDf.columns = ['source','target','count']
        pairFrames.append(pairDf)

    # total value per unique (source, target) link
    sourceTargetDf = (
        pd.concat(pairFrames)
          .groupby(['source','target'])
          .agg({'count':'sum'})
          .reset_index()
    )

    # add node index for each end of every link
    sourceTargetDf['sourceID'] = [labelIndex[label] for label in sourceTargetDf['source']]
    sourceTargetDf['targetID'] = [labelIndex[label] for label in sourceTargetDf['target']]

    # creating the sankey diagram
    data = dict(
        type='sankey',
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(
            color = "black",
            width = 0.5
          ),
          label = labelList,
          color = colorList
        ),
        link = dict(
          source = sourceTargetDf['sourceID'],
          target = sourceTargetDf['targetID'],
          value = sourceTargetDf['count']
        )
      )

    layout =  dict(
        title = title,
        font = dict(
          size = 10
        )
    )

    fig = dict(data=[data], layout=layout)
    return fig

In [35]:
#quick look at the columns before building the Sankey diagram
df.head()

Unnamed: 0,fac name,supplier,pool,receiver,distance,volume,supplier_lat,supplier_lon,receiver_lat,receiver_lon
0,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",120018.0,"[37.68627, -121.674512]",2000792.0,8.464567,34.6222,-117.1001,37.68627,-121.674512
1,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1100144.0,"[39.06031418, -121.9646343]",2441070.0,12.532808,34.6222,-117.1001,39.060314,-121.964634
2,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120185.0,"[39.127373, -121.961772]",2037133.0,4.527559,34.6222,-117.1001,39.127373,-121.961772
3,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120194.0,"[39.15124715, -121.933119]",6272237.0,5.905512,34.6222,-117.1001,39.151247,-121.933119
4,CEMEX Construction Materials Pacific LLC,"[34.6222, -117.1001]",1120330.0,"[39.033785, -121.91461]",2024923.0,14.566929,34.6222,-117.1001,39.033785,-121.91461


In [36]:
#build the Sankey figure dict: 'fac name' -> 'pool' links, link width = summed 'volume'
fig = genSankey(df,cat_cols=['fac name','pool'],value_cols='volume',title='Sankey Diagram')
#render the figure to an HTML file (the recorded output of this cell shows 'temp-plot.html')
plotly.offline.plot(fig, validate=False)
fig

'temp-plot.html'

## Map the Data

In [26]:
#pre-work for the map

#each supplier only needs to be plotted once: keep one row per distinct (lat, lon) pair
unique_suppliers = (
    df[['supplier_lat', 'supplier_lon']]
    .drop_duplicates()
    .reset_index(drop=True)
)

#volume statistics used to scale line thickness: 20/40/60/80% quantiles and the maximum
volumes = df['volume']
quantiles = np.quantile(volumes, [0.2,0.4,0.6,0.8])
max_volume = np.max(volumes)


In [16]:
#for testing quick things: uncomment to run the map on the first 10 rows only
#df = df.head(10)


In [17]:
#running total of plotted supplier -> receiver rows, printed at the end as a check
#NOTE(review): this counts dataframe rows, not distinct receiver locations - confirm the
#wording of the final message is what is intended
receiver_count = 0

# Create a map centered in CA
mapit = folium.Map(location=[37.3427, -119.2244], zoom_start=6)

#outer loop: one pass per unique supplier
#inner loop: one pass per receiver row belonging to that supplier
for i in range(len(unique_suppliers)):

    #coordinates of the current supplier; every shape drawn for it shares one palette color
    s_lat = unique_suppliers.at[i, 'supplier_lat']
    s_lon = unique_suppliers.at[i, 'supplier_lon']
    supplier_color = palette[i]

    #big circle for supplier
    folium.CircleMarker(
        location=[s_lat, s_lon],
        radius=10,
        color=supplier_color,
        fill=True,
        fill_color=supplier_color,
        tooltip='Supplier: '+str(s_lat)+', '+str(s_lon),
    ).add_to(mapit)

    #filter the dataframe down to the rows served by this supplier
    same_supplier = df['supplier_lat'].eq(s_lat) & df['supplier_lon'].eq(s_lon)
    receivers = df[same_supplier]

    latlon = receivers['receiver']
    receiver_count += len(receivers)

    #for each receiver
    for j, (point, volume) in enumerate(zip(latlon, receivers['volume'])):

        #split apart into lat and lon
        lat, lon = point[0], point[1]

        #line width scales linearly with volume: the largest volume gets a weight of 20
        #(a quantile-bucketed width based on `quantiles` was tried earlier and is disabled)
        line_weight = volume/max_volume * 20

        #line from the supplier to this receiver
        line_coords = [[s_lat, s_lon], [lat, lon]]
        line_tooltip = ('Supplier '+str(s_lat)+', '+str(s_lon)
                        +'<br>Receiver: '+str(lat)+', '+str(lon))

        my_PolyLine = folium.PolyLine(
            locations=line_coords,
            color=supplier_color,
            tooltip=line_tooltip,
            weight=line_weight,
        )
        my_PolyLine.add_to(mapit)

        #small dot for the receiver, in the supplier's color
        folium.CircleMarker(
            location=[lat, lon],
            radius=1,
            color=supplier_color,
            fill=True,
            fill_color=supplier_color,
            tooltip='Receiver: '+str(lat)+', '+str(lon),
        ).add_to(mapit)

print("mapped ", receiver_count, "unique receivers")
#mapit #can run this in the notebook if you need to view it there. better to just open the .html in another browser tab

#save the file
mapit.save(outfile= "visualization.html")


mapped  1841 unique receivers


In [18]:
#scratch check of the receiver tooltip text; `lat` and `lon` are the values left over from
#the last iteration of the map-building loop, so this only works after that cell has run
str(lat)+', '+str(lon)

'36.92397948, -120.4430548'