In [91]:
import os
import math
import json
import folium
import pickle
import warnings
import numpy as np
import pandas as pd
from dtw import dtw
import altair as alt
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from numpy import cos, sin, arcsin, sqrt
from sklearn.preprocessing import MinMaxScaler
from yellowbrick.cluster import KElbowVisualizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from yellowbrick.cluster import InterclusterDistance
from sklearn.metrics.pairwise import cosine_similarity, cosine_distances
# Silence every warning category for the whole notebook.
# NOTE(review): FutureWarning is a subclass of Warning, so the second filter is
# already covered by the first one.
warnings.simplefilter(action='ignore', category=Warning)
warnings.simplefilter(action='ignore', category=FutureWarning)
%matplotlib inline
# NOTE(review): this creates a new, empty 20x5 figure (see the recorded
# "Figure ... with 0 Axes" output); it does not change the default figure size.
# If a default size was intended, plt.rcParams['figure.figsize'] is the usual way.
plt.figure(figsize=(20,5))

# Colour palettes (hex strings).
# NOTE(review): "FF" is not a complete hex colour — confirm what PALETTE_DEG is for.
PALETTE_DEG = ["FF"]
PALETTE = ["#FFC30D", "#EF404B"]
PALETTE_ALT = ["#40EFE4", "#EF404B"]
# Seed for stochastic steps.
RANDOM_SEED = 2048
# presumably the mean Earth radius in km (haversine distance) — TODO confirm
R = 6371
# presumably a search radius in km and a time window — units not shown here, TODO confirm
KMS_AROUND = 3.20
WINDOW_TIME = 3.12

<Figure size 1440x360 with 0 Axes>

In [2]:
# Load the task dataset from a gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("../data/shippify_icd_task_new.pkl", compression='gzip')
# (The former `df['delta'] = df['delta']` self-assignment was a no-op and has been removed.)

# Display labels for the size codes 1-5.
size_dict = { 1: 'X Small', 2: 'Small', 3: 'Medium', 4: 'Large', 5: 'X Large' }
# Altair scale whose domain follows the label order above.
color_scale = alt.Scale(domain=list(size_dict.values()))
# Same label order, as a dict view.
hue_order = size_dict.values()

In [3]:
# Feature column names.
columns = ['total_size', 'distance', 'task_around', 'lat', 'long', 'delta', 'cost']
# Shared Altair axis encodings. Latitude is placed on x and longitude on y;
# the longitude domain is given from -70.4 down to -70.9.
LAT_AXIS = alt.X('lat', type='quantitative', scale=alt.Scale(domain=(-33.70, -33.15)), title='Latitude')
LONG_AXIS = alt.Y('long',type='quantitative', scale=alt.Scale(domain=(-70.4, -70.9)), title='Longitude')
# Previously written as `DEN_AXIS = y=alt.Y(...)`, which also bound a stray global `y`.
DEN_AXIS = alt.Y('density', type='quantitative', title='Density')

In [4]:
def create_quantiles(dfColumn, numQuantiles):
  """Bin ``dfColumn`` into quantile buckets and return the integer bucket codes.

  Delegates to ``pd.qcut`` with ``labels=False`` (integer codes instead of
  interval labels) and ``duplicates='drop'`` (repeated bin edges are merged,
  so fewer than ``numQuantiles`` buckets may come back).
  """
  bucket_codes = pd.qcut(x=dfColumn, q=numQuantiles, labels=False, duplicates='drop')
  return bucket_codes

def min_max_scaler(numArr):
  """Linearly rescale ``numArr`` so its minimum maps to 0 and its maximum to 1.

  Args:
    numArr: numeric array-like (NumPy array, pandas Series, ...).

  Returns:
    The rescaled values, ``(numArr - min) / (max - min)``. When every value is
    identical (``max == min``) the result is all zeros instead of the NaNs
    that a 0/0 division would produce.
  """
  minx = np.min(numArr)
  maxx = np.max(numArr)
  span = maxx - minx
  shifted = numArr - minx
  if span == 0:
    # Constant input: every element sits at the minimum. Multiplying by 0.0
    # yields float zeros and keeps the container type (and any NaN entries).
    return shifted * 0.0
  return shifted / span

def get_similarity(obj1, obj2):
  """Return ``1 - sqrt(sum((obj1 - obj2)**2) / n_features)`` for two DataFrames.

  Args:
    obj1, obj2: DataFrames with the same number of columns. They are
      subtracted with pandas alignment, so index and column labels should match.

  Returns:
    The similarity as a float. If the column counts differ, an error message
    is printed and 0 is returned.
  """
  len1 = len(obj1.columns)
  len2 = len(obj2.columns)
  if len1 != len2:
    print("Error: Compared objects must have same number of features")
    return 0
  difference = obj1 - obj2
  # Reduce over *all* cells to a scalar. np.sum on a DataFrame returns one
  # value per column, which math.sqrt cannot convert once there is more than
  # one feature.
  squared_sum = float(np.sum(np.square(difference.to_numpy(dtype=float))))
  return 1 - math.sqrt(squared_sum / len1)

def set_labels(title=None, x_label=None, y_label=None, fontsize=14, legend=None):
  """Apply optional title, axis labels and legend text to the current pyplot figure.

  Args:
    title, x_label, y_label: text to set; ``None`` leaves that element untouched.
    fontsize: font size used for the title and both axis labels.
    legend: optional dict with keys ``'object'`` (the legend to edit),
      ``'title'`` (new legend title) and ``'texts'`` (replacement entry labels,
      paired in order with the legend's existing texts).
  """
  for setter_name, text in (('title', title), ('xlabel', x_label), ('ylabel', y_label)):
    if text is not None:
      getattr(plt, setter_name)(text, fontsize=fontsize)
  if legend is None:
    return
  legend_obj = legend['object']
  legend_obj.set_title(legend['title'])
  for text_artist, new_label in zip(legend_obj.texts, legend['texts']):
    text_artist.set_text(new_label)

def with_hue(plot, feature, categories, hue_categories, diff=False, values=None):
  """Annotate each bar of a grouped (hue) bar plot with its share of its category.

  Bars are read from ``plot.patches`` and are expected in hue-major order:
  bar ``j * categories + i`` is category ``i`` at hue level ``j``. Every bar is
  labelled with ``100 * height / total`` where ``total`` is the summed height
  of that category across all hue levels. The figure is then shown.

  Args:
    plot: object exposing ``patches``; each patch must provide ``get_x``,
      ``get_y``, ``get_width`` and ``get_height``.
    feature: category names indexed by category position (used when ``diff``).
    categories: number of categories.
    hue_categories: number of hue levels.
    diff: when True, print and collect ``|share(hue 0) - share(hue 1)|`` for
      each category (requires at least two hue levels).
    values: optional hue index; when given, the percentage of that hue level
      is collected for every category.

  Returns:
    A list of percentages when ``values`` is not None; otherwise a DataFrame
    with ``class``/``value`` columns when ``diff`` is True; otherwise None.
  """
  patches = list(plot.patches)
  heights = [p.get_height() for p in patches]
  differences = {'class': [], 'value': []}
  return_value = []
  for i in range(categories):
    bar_indices = [j * categories + i for j in range(hue_categories)]
    # Sum over every hue level (the total used to be hard-coded to two levels).
    total = sum(heights[k] for k in bar_indices)
    shares = []
    for k in bar_indices:
      # An empty category (total == 0) is reported as 0% instead of dividing by zero.
      value = round(100 * heights[k] / total, 2) if total else 0.0
      percentage = '{:.1f}%'.format(value)
      x = patches[k].get_x() + patches[k].get_width() / 2 - 0.15
      y = patches[k].get_y() + patches[k].get_height()
      plt.annotate(percentage, (x, y), size = 12)
      # Recorded exactly once per bar so indices always equal the hue index,
      # even when both `diff` and `values` are requested.
      shares.append(value)
    if values is not None:
      return_value.append(shares[values])
    if diff:
      diff_value = abs(round(shares[0] - shares[1], 2))
      differences['class'].append(feature[i])
      differences['value'].append(diff_value)
      print('{0:16s}: |{1:>5} - {2:>5}| = {3:>5}'.format(feature[i], shares[0], shares[1], diff_value))
  plt.show()
  if values is not None:
    return return_value
  if diff:
    return pd.DataFrame.from_dict(differences)

def with_hue_unique(plot, total, feature, categories, hue_categories, diff=False):
  """Annotate each bar of a grouped bar plot with its share of a fixed ``total``.

  Bars come from ``plot.patches`` in hue-major order (bar
  ``hue * categories + category``). Every bar is labelled with
  ``100 * height / total`` and the figure is then shown.

  Args:
    plot: object exposing ``patches`` with ``get_x``/``get_y``/``get_width``/``get_height``.
    total: denominator used for every percentage.
    feature: category names indexed by category position (used when ``diff``).
    categories: number of categories.
    hue_categories: number of hue levels.
    diff: when True, print and collect ``|share(hue 0) - share(hue 1)|`` per category.

  Returns:
    A DataFrame with ``class``/``value`` columns when ``diff`` is True, else None.
  """
  bars = list(plot.patches)
  heights = [bar.get_height() for bar in bars]
  differences = {'class': [], 'value': []}
  for cat_idx in range(categories):
    shares = []
    for hue_idx in range(hue_categories):
      bar_idx = hue_idx * categories + cat_idx
      bar = bars[bar_idx]
      share = round(100 * heights[bar_idx] / total, 2)
      text = '{:.1f}%'.format(share)
      # Place the label just left of the bar's centre, at the bar's top edge.
      label_x = bar.get_x() + bar.get_width() / 2 - 0.15
      label_y = bar.get_y() + bar.get_height()
      plt.annotate(text, (label_x, label_y), size = 12)
      if diff:
        shares.append(share)
    if not diff:
      continue
    gap = abs(round(shares[0] - shares[1], 2))
    differences['class'].append(feature[cat_idx])
    differences['value'].append(gap)
    print('{0:16s}: |{1:>5} - {2:>5}| = {3:>5}'.format(feature[cat_idx], shares[0], shares[1], gap))
  plt.show()
  if diff:
    return pd.DataFrame.from_dict(differences)
  
class display(object):
  """Display HTML representation of multiple objects.

  Each constructor argument is a *string* holding a Python expression. The
  expression text is rendered as a caption and its evaluated value is rendered
  next to it.

  NOTE: the expressions are passed to ``eval``; never feed this class strings
  that come from an untrusted source.
  NOTE(review): in an IPython session this class name likely shadows the
  built-in ``display`` helper — confirm that is intended.
  """
  # HTML wrapper: {0} is the expression text, {1} the value's HTML representation.
  template = """<div style="float: left; padding: 10px;">
  <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
  </div>"""
  def __init__(self, *args):
    """Store the expression strings to be evaluated when the object is rendered."""
    self.args = args

  def _repr_html_(self):
    """Return one templated block per expression, joined by newlines.

    Every evaluated value must provide its own ``_repr_html_`` method.
    """
    return '\n'.join(self.template.format(a, eval(a)._repr_html_()) for a in self.args)

  def __repr__(self):
    """Return each expression followed by the ``repr`` of its value, separated by blank lines."""
    return '\n\n'.join(a + '\n' + repr(eval(a)) for a in self.args)

def remove_outlayers(dataframe: pd.DataFrame, feature: str, complete=False, limit=3):
  """Drop the rows whose ``feature`` value is a z-score outlier.

  Args:
    dataframe: input frame (not modified).
    feature: name of the numeric column the z-score is computed on.
    complete: when True return the filtered frame with all columns,
      otherwise return only the filtered ``feature`` column.
    limit: rows are kept when ``|z| < limit``.

  Returns:
    The filtered DataFrame (``complete=True``) or Series (``complete=False``).
    Rows with a missing ``feature`` value are always dropped.
  """
  column = dataframe[feature]
  observed = column.notna().to_numpy()
  # nan_policy='omit' computes mean/std from the observed values only; with the
  # default policy a single missing value turns every z-score into NaN and the
  # filter would silently return an empty result.
  z_scores = np.asarray(
      stats.zscore(column.to_numpy(dtype=float, na_value=np.nan), nan_policy='omit'),
      dtype=float,
  )
  # A constant column has zero spread (z = 0/0 = NaN) and therefore no
  # outliers: treat its observed values as z = 0 so they are kept.
  z_scores = np.where(observed & np.isnan(z_scores), 0.0, z_scores)
  filtered_entries = np.abs(z_scores) < limit
  if complete:
    return dataframe[filtered_entries]
  return dataframe[filtered_entries][feature]


In [None]:
## Visualización de delay a través del tiempo

In [7]:
# Print column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 226828 entries, 0 to 251822
Data columns (total 27 columns):
 #   Column               Non-Null Count   Dtype              
---  ------               --------------   -----              
 0   id                   226828 non-null  object             
 1   route_id             223343 non-null  object             
 2   creation_date        226828 non-null  datetime64[ns, UTC]
 3   delivery_type        226828 non-null  object             
 4   city                 226828 non-null  int8               
 5   cost                 226828 non-null  float64            
 6   total_size           226828 non-null  object             
 7   distance             226828 non-null  float64            
 8   company_id           226828 non-null  int64              
 9   company_type         226828 non-null  int64              
 10  network_id           226828 non-null  int16              
 11  pickup_dt            226828 non-null  datetime64[ns, UTC]
 12  pi

In [8]:
# Inspect the effective delivery timestamps (timezone-aware, UTC per the dtype below).
df['delivery_effective']

0        2020-08-07 21:05:16+00:00
1        2020-07-13 19:58:14+00:00
2        2020-08-14 21:27:06+00:00
4        2020-07-20 21:38:18+00:00
5        2020-07-10 20:41:07+00:00
                    ...           
251818   2020-07-30 19:03:27+00:00
251819   2020-07-28 19:36:56+00:00
251820   2020-07-24 22:00:28+00:00
251821   2020-08-26 22:28:35+00:00
251822   2020-07-08 20:52:15+00:00
Name: delivery_effective, Length: 226828, dtype: datetime64[ns, UTC]

In [28]:
# Summary statistics of the raw delta column (before any outlier removal).
df['delta'].describe()

count    226828.000000
mean         -3.702205
std          30.833342
min       -2947.410833
25%          -4.765069
50%          -1.624722
75%           0.425833
max        2386.768056
Name: delta, dtype: float64

In [24]:
# Drop rows whose delta has |z-score| >= 3 (the helper's default limit), then
# draw a random sample of 4999 rows for plotting. The sample is seeded with
# RANDOM_SEED so the chart is reproducible across kernel restarts.
# presumably 4999 keeps the data under Altair's default row limit — TODO confirm
simplified = remove_outlayers(dataframe= df, feature= 'delta', complete=True)
simplified = simplified[['delivery_effective', 'delta']].sample(4999, random_state=RANDOM_SEED)

In [25]:
# Calendar-style heatmap of the sampled rows: day of month on x, month on y,
# cell colour = mean delta of the rows falling on that day.
# The tooltip shows the month+day of the cell and the same mean delta.
alt.Chart(
    simplified,
    title="Daily mean delivery delta time in Santiago, CL"
).mark_rect().encode(
    x='date(delivery_effective):O',
    y='month(delivery_effective):O',
    color=alt.Color('mean(delta):Q', scale=alt.Scale(scheme="inferno")),
    tooltip=[
        alt.Tooltip('monthdate(delivery_effective):T', title='Date'),
        alt.Tooltip('mean(delta):Q', title='Delta')
    ]
).properties(width=550)

## Otro análisis teniendo en cuenta todos los valores

In [289]:
# Calendar day of each delivery: the timestamp floored to midnight.
df['delivery date'] = pd.to_datetime(df['delivery_effective']).dt.floor('d')
# Mean delta per calendar day, restricted to June, July and August.
groupedDf = (
    df.groupby('delivery date')['delta']
      .mean()
      .reset_index()
      .loc[lambda daily: daily['delivery date'].dt.month.isin([6, 7, 8])]
)

In [290]:
# Remove days whose mean delta is a z-score outlier, keeping only the two plotted columns.
simplified = remove_outlayers(
    dataframe=groupedDf, feature='delta', complete=True
).loc[:, ['delivery date', 'delta']]

In [291]:
# Summary statistics of the daily mean delta after outlier removal.
simplified['delta'].describe()

count    92.000000
mean     -6.486777
std       7.620146
min     -29.126401
25%      -6.276639
50%      -3.616482
75%      -2.619941
max      12.994017
Name: delta, dtype: float64

In [292]:
# Check row count and dtypes of the daily frame.
simplified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 92 entries, 0 to 91
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   delivery date  92 non-null     datetime64[ns, UTC]
 1   delta          92 non-null     float64            
dtypes: datetime64[ns, UTC](1), float64(1)
memory usage: 2.2 KB


In [293]:
# Display the daily frame (pandas truncates long output).
simplified

Unnamed: 0,delivery date,delta
0,2020-06-01 00:00:00+00:00,-1.606417
1,2020-06-02 00:00:00+00:00,-1.766317
2,2020-06-03 00:00:00+00:00,-2.346498
3,2020-06-04 00:00:00+00:00,-7.981150
4,2020-06-05 00:00:00+00:00,-4.605450
...,...,...
87,2020-08-27 00:00:00+00:00,-4.688168
88,2020-08-28 00:00:00+00:00,-9.946330
89,2020-08-29 00:00:00+00:00,-2.926470
90,2020-08-30 00:00:00+00:00,-22.719365


In [294]:
# Calendar-style heatmap of the daily mean delta: day of month on x, month on y.
# The colour and tooltip use `mean`, matching the chart title (they used `max`
# before). The date tooltip uses `monthdate` so it shows month and day; `date`
# alone carries only the day of month.
alt.Chart(
    simplified,
    title="Daily mean of delivery delta in Santiago, CL"
).mark_rect().encode(
    x='date(delivery date):O',
    y='month(delivery date):O',
    color=alt.Color('mean(delta):Q', scale=alt.Scale(scheme="inferno")),
    tooltip=[
        alt.Tooltip('monthdate(delivery date):T', title='Date'),
        alt.Tooltip('mean(delta):Q', title='Delta')
    ]
).properties(width=750, height=150)

In [295]:
# NOTE(review): repeats the describe() call made a few cells earlier on the same frame;
# the recorded output is identical, so this cell can probably be dropped.
simplified['delta'].describe()

count    92.000000
mean     -6.486777
std       7.620146
min     -29.126401
25%      -6.276639
50%      -3.616482
75%      -2.619941
max      12.994017
Name: delta, dtype: float64

### Otro análisis por rango de horas 

In [277]:
# Derive the calendar day (timestamp floored to midnight) and the hour of day
# from the effective delivery timestamp.
# NOTE(review): 'delivery_effective' is reported as datetime64[ns, UTC], so both
# derived values follow UTC rather than Santiago local time — confirm this is intended.
df['delivery date'] = pd.to_datetime(df['delivery_effective']).dt.floor('d')
df['delivery hour'] = pd.to_datetime(df['delivery_effective']).dt.hour

In [278]:
# Summary statistics of the delivery hour (0-23).
df['delivery hour'].describe()

count    226828.000000
mean         18.127824
std           4.588088
min           0.000000
25%          17.000000
50%          19.000000
75%          21.000000
max          23.000000
Name: delivery hour, dtype: float64

In [279]:
# Bucket the delivery hour into four named ranges: [0, 6], (6, 12], (12, 18], (18, 23].
# include_lowest=True keeps hour 0 in the first bin; with pd.cut's default
# right-closed bins it would fall outside every bin and become missing.
df['hour range'] = pd.cut(
    df['delivery hour'],
    [0, 6, 12, 18, 23],
    labels=['early morning','morning','afternoon','night'],
    include_lowest=True,
)
# Store the labels as plain strings instead of a categorical.
df['hour range'] = df['hour range'].astype('string')

In [280]:
# Inspect the assigned hour-range labels.
df['hour range'] 

0         night
1         night
2         night
4         night
5         night
          ...  
251818    night
251819    night
251820    night
251821    night
251822    night
Name: hour range, Length: 226828, dtype: string

In [281]:
# Mean delta per (hour range, calendar day) for deliveries made in June, July and August.
filteredDf = (
    df.loc[
        df['delivery date'].dt.month.isin([6, 7, 8]),
        ['hour range', 'delivery date', 'delta'],
    ]
    .groupby(['hour range', 'delivery date'])
    .mean()
    .reset_index()
)

In [282]:
# Keep only the groups that actually have a mean delta, then display the frame.
filteredDf = filteredDf.loc[filteredDf['delta'].notna()]
filteredDf

Unnamed: 0,hour range,delivery date,delta
0,afternoon,2020-06-01 00:00:00+00:00,-1.053978
1,afternoon,2020-06-02 00:00:00+00:00,-1.390038
2,afternoon,2020-06-03 00:00:00+00:00,-2.248167
3,afternoon,2020-06-04 00:00:00+00:00,-7.127405
4,afternoon,2020-06-05 00:00:00+00:00,-4.712169
...,...,...,...
321,night,2020-08-27 00:00:00+00:00,-1.940925
322,night,2020-08-28 00:00:00+00:00,-4.185506
323,night,2020-08-29 00:00:00+00:00,-5.391491
324,night,2020-08-30 00:00:00+00:00,-34.649764


In [283]:
# Drop the (hour range, day) rows whose mean delta has |z-score| >= 3
# (the helper's default limit), keeping all columns; then display the result.
simplified2 = remove_outlayers(dataframe=filteredDf, feature='delta', complete=True)
simplified2

Unnamed: 0,hour range,delivery date,delta
0,afternoon,2020-06-01 00:00:00+00:00,-1.053978
1,afternoon,2020-06-02 00:00:00+00:00,-1.390038
2,afternoon,2020-06-03 00:00:00+00:00,-2.248167
3,afternoon,2020-06-04 00:00:00+00:00,-7.127405
4,afternoon,2020-06-05 00:00:00+00:00,-4.712169
...,...,...,...
321,night,2020-08-27 00:00:00+00:00,-1.940925
322,night,2020-08-28 00:00:00+00:00,-4.185506
323,night,2020-08-29 00:00:00+00:00,-5.391491
324,night,2020-08-30 00:00:00+00:00,-34.649764


In [284]:
# Check row count and dtypes after outlier removal.
simplified2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 322 entries, 0 to 325
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   hour range     322 non-null    object             
 1   delivery date  322 non-null    datetime64[ns, UTC]
 2   delta          322 non-null    float64            
dtypes: datetime64[ns, UTC](1), float64(1), object(1)
memory usage: 10.1+ KB


In [288]:
# Heatmap of the mean delta per calendar day (x) and hour range (y).
# - x uses `monthdate` so each day of the three-month window gets its own column;
#   `date` alone is the day of month, which stacks June/July/August rectangles on
#   top of each other and hides all but the last one drawn.
# - y is sorted chronologically instead of alphabetically.
# - The tooltip labels the date as a date and also reports the hour range.
alt.Chart(
    simplified2,
    title="Daily mean delivery delta time by hour ranges in Santiago, CL"
).mark_rect().encode(
    x=alt.X('monthdate(delivery date):O', title='Delivery date'),
    y=alt.Y('hour range:O', sort=['early morning', 'morning', 'afternoon', 'night']),
    color=alt.Color('delta:Q', scale=alt.Scale(scheme="inferno")),
    tooltip=[
        alt.Tooltip('delivery date:T', title='Date'),
        alt.Tooltip('hour range:N', title='Hour range'),
        alt.Tooltip('delta:Q', title='Delta')
    ]
).properties(width=750, height=150)

In [298]:
# Load three named datasets through the `access` package's Datasets helper.
# presumably these are Chicago example data shipped with the package — TODO confirm
# NOTE(review): this import sits mid-notebook; consider moving it to the top import cell
# so a fresh "Run All" surfaces the dependency early.
from access import Access, Datasets
chi_docs_dents   = Datasets.load_data('chi_doc')

chi_population   = Datasets.load_data('chi_pop')

chi_travel_costs = Datasets.load_data('chi_times')


In [300]:
# Preview the travel-cost table (columns: origin, dest, cost).
chi_travel_costs.head()

Unnamed: 0,origin,dest,cost
0,17093890101,17031010100,91.2
1,17093890101,17031010201,92.82
2,17093890101,17031010202,92.95
3,17093890101,17031010300,89.4
4,17093890101,17031010400,84.97


In [301]:
# Build an Access object from the three tables loaded above:
# demand = population per geoid, supply = doc/dentist counts per geoid,
# cost = origin/dest pairs with their travel cost.
chicago_primary_care = Access(demand_df = chi_population,
                              demand_index = "geoid",
                              demand_value = "pop",
                              supply_df = chi_docs_dents,
                              supply_index = "geoid",
                              supply_value = ["doc", "dentist"],
                              cost_df = chi_travel_costs,
                              cost_origin  = "origin",
                              cost_dest = "dest",
                              cost_name = "cost")

In [303]:
# NOTE(review): this cell fails — the recorded output is
# "AttributeError: 'Access' object has no attribute 'plot'". Remove it or plot
# one of the object's underlying frames instead so the notebook survives "Run All".
chicago_primary_care.plot(figsize=(10,10))

AttributeError: 'Access' object has no attribute 'plot'

In [304]:
# NOTE(review): this cell fails — the recorded output is
# "AttributeError: module 'pandas' has no attribute 'plot'". Plotting in pandas is
# reached through a DataFrame/Series `.plot` accessor, not a module-level function.
pd.plot(chicago_primary_care)

AttributeError: module 'pandas' has no attribute 'plot'