In [None]:
import math
import time
import json
import scipy
import pickle
import numpy as np
from scipy import stats
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Opens a new, empty figure with figsize (20, 5).
# NOTE(review): presumably intended to set a default figure size; the plotting
# cells further down open their own figures, so this one stays empty - confirm.
plt.figure(figsize=(20,5))

In [None]:
# Load the gzip-compressed pickle and keep only the rows that have both the
# effective pickup and the effective delivery timestamps.
# NOTE: unpickling executes arbitrary code - only load files from a trusted source.
df = (
    pd.read_pickle('../data/shippify_icd.pkl', compression='gzip')
      .dropna(subset=['pickup_effective_dt', 'delivery_effective'])
)

### **Agregar codigo necesario debajo de su nombre**

## Enmanuel Magallanes

In [None]:
def with_hue(plot, feature, Number_of_categories, hue_categories):
    """Annotate every bar of a hue-grouped bar plot with its share of its category.

    Bars are read from ``plot.patches`` and are assumed to be ordered hue-major:
    the bar for category ``i`` and hue level ``j`` sits at index
    ``j * Number_of_categories + i``. The percentage written above a bar is its
    height divided by the summed height of all hue levels of the same category.

    Args:
        plot: object exposing ``patches`` (e.g. the Axes returned by a seaborn
            bar plot).
        feature: unused; kept so existing calls keep working.
        Number_of_categories: number of categories on the x axis.
        hue_categories: number of hue levels per category.
    """
    bars = list(plot.patches)
    heights = [bar.get_height() for bar in bars]
    for category in range(Number_of_categories):
        # Indices of all bars (one per hue level) belonging to this category.
        members = [hue * Number_of_categories + category
                   for hue in range(hue_categories)]
        total = sum(heights[k] for k in members)
        if not total:
            # Empty category: there is no meaningful percentage to show.
            continue
        for k in members:
            bar = bars[k]
            percentage = '{:.1f}%'.format(100 * heights[k] / total)
            # Roughly centre the label horizontally on top of the bar.
            x = bar.get_x() + bar.get_width() / 2 - 0.15
            y = bar.get_y() + bar.get_height()
            plt.annotate(percentage, (x, y), size=12)
    plt.show()

In [None]:
## Question: ¿cuáles son las franjas horarias en las que se registran más tareas atrasadas y de qué (tipo|peso) son?
## Question: Are the heaviest tasks the ones that register the most delays at the time of delivery?

In [None]:
def sum_total_weight(row):
  """Return the summed weight of the items encoded in a JSON string.

  Args:
    row: JSON text that decodes to an iterable of items, each expected to be
      a mapping with a 'weight' entry.

  Returns:
    The sum, as a float, of every weight that can be read as a finite number.
    Items with a missing, null, non-numeric or non-finite weight count as 0.

  Raises:
    Whatever ``json.loads`` raises when ``row`` is not valid JSON text, and
    TypeError when the decoded value is not iterable.
  """
  items = json.loads(row)
  total_weight = 0.0
  for item in items:
    try:
      weight = float(item['weight'])
    except (KeyError, IndexError, TypeError, ValueError):
      # Only bad or missing weights are skipped; anything else (for example
      # KeyboardInterrupt) must propagate instead of being swallowed.
      continue
    # A NaN or infinite weight would poison the sum and the statistics built on it.
    if math.isfinite(weight):
      total_weight += weight
  return total_weight

# Decode every task's JSON item list and store its summed weight as float32.
item_weight_sums = df['items'].apply(sum_total_weight)
df['total_weight'] = item_weight_sums.astype('float32')

In [None]:
# Remove outliers: keep only the rows whose total weight has an absolute
# z-score below 3.
z_scores = stats.zscore(df['total_weight'])
abs_z_scores = np.absolute(z_scores)
filtered_entries = abs_z_scores < 3
df = df.loc[filtered_entries]

In [None]:
# Kernel density estimate of the total weight, with integer ticks 0..15 on the x axis.
plt.figure(figsize=(10, 5))
plt.xticks(list(range(16)))
sns.kdeplot(x='total_weight', data=df, bw_adjust=2)

In [None]:
## Boxen (letter-value) plot of total_weight
sns.boxenplot(x='total_weight', data=df)

In [None]:
# Bucket the total weight into half-kilogram classes.
labels = ['[0,0.5) kg', '[0.5,1) kg', '[1,1.5) kg', '[1.5,2) kg', '[2,2.5) kg', '[2.5,3) kg', '[3,3.5) kg', '[3.5 kg,inf) kg']
# right=False makes every bin left-closed / right-open, which is what the labels
# state; with the default (right=True) a weight of exactly 0.5 kg would be filed
# under '[0,0.5) kg'. A weight of 0 still falls in the first bin.
df['class_weight'] = pd.cut(df['total_weight'], bins=[0, 0.5, 1, 1.5, 2, 2.5, 3, 3.5, np.inf], labels=labels, right=False)

In [None]:
# Number of tasks that fall in each weight class.
df['class_weight'].value_counts()

In [None]:
# Show the column names available in the frame.
df.columns

In [None]:
# A task is late when its effective delivery time is after its `delivery` time
# (presumably the scheduled time - confirm).
df['is_late'] = df['delivery_effective'] > df['delivery']

In [None]:
# Count tasks per (weight class, lateness) pair.
# `class_weight` is categorical; observed=False is passed explicitly so that
# combinations without any task keep a zero-count row instead of depending on
# the deprecated implicit default. with_hue() indexes the bars positionally and
# needs every combination to be present.
gwl = (
    df[['id', 'class_weight', 'is_late']]
    .groupby(['class_weight', 'is_late'], observed=False)['id']
    .count()
    .reset_index()
)

In [None]:
# Bar chart of task counts per weight class split by lateness, annotated with
# the share of each bar inside its weight class (8 classes, 2 hue levels).
plt.figure(figsize=(15, 7))
f = sns.barplot(x='class_weight', y='id', hue='is_late', data=gwl)
with_hue(f, df['class_weight'], 8, 2)

A primera vista, el porcentaje de paquetes retrasados no es ni directa ni inversamente proporcional al peso, ya que no se observa un aumento o una disminución sostenidos en el porcentaje de paquetes con atraso conforme aumenta el peso. Por tanto, el peso total de la entrega no parece afectar a su puntualidad.


In [None]:
## Question 2: ¿La intensidad del tráfico en el punto destino de una entrega influye en la puntualidad de la misma?
## cantidad de entregas en un radio específico
## considerar la densidad de entregas en un radio especifico, en una ventana de tiempo. 
# Basado en lat.long un radio de X km y basado en delivery date una ventana de X horas, cuantas entregas hay
## considerar cuantas entregas fueron creadas ese día en esa ciudad

In [None]:
def haversine_distance(x, y):
  """
  Great-circle distance between two points given in decimal degrees.

  Each point is indexed as (longitude, latitude). The result uses a sphere
  radius of 6371, i.e. it is expressed in kilometres.
  """
  # Work in radians from here on.
  lon_a, lat_a = math.radians(x[0]), math.radians(x[1])
  lon_b, lat_b = math.radians(y[0]), math.radians(y[1])

  # Haversine formula.
  half_dlon = (lon_b - lon_a) / 2
  half_dlat = (lat_b - lat_a) / 2
  chord = math.sin(half_dlat) ** 2 + math.cos(lat_a) * math.cos(lat_b) * math.sin(half_dlon) ** 2
  central_angle = 2 * math.asin(math.sqrt(chord))
  earth_radius_km = 6371
  return earth_radius_km * central_angle

In [None]:
def calcualte_distance(point, dataframe):
  """Haversine distance from every row of `dataframe` to `point`.

  `point` is passed on as given (the caller supplies (lon, lat)); each row is
  read through its 'lon' and 'lat' entries. Returns the result of a row-wise
  apply, one distance per row.
  """
  def _distance_to_point(record):
    return haversine_distance((record['lon'], record['lat']), point)

  return dataframe.apply(_distance_to_point, axis=1)

In [None]:
def task_around(row):
  """Count the deliveries that happened close to `row` in both time and space.

  A delivery of the global `df` is counted when its 'delivery_effective' is
  within WINDOW_TIME of `row['delivery_dt']` and its ('lon', 'lat') position
  is at most KMS_AROUND kilometres away from the position of `row`. When `row`
  itself is part of `df` and satisfies both conditions it is counted too.

  WINDOW_TIME may be a number (interpreted as hours) or a string that
  ``pd.Timedelta`` understands.

  NOTE(review): this reads 'delivery_dt', 'lon' and 'lat', while other cells of
  this notebook use 'delivery' and 'long' - confirm the column names against
  df.columns.

  Args:
    row: one record with 'delivery_dt', 'lon' and 'lat' entries.

  Returns:
    int: number of deliveries inside the time window and the radius.
  """
  if isinstance(WINDOW_TIME, str):
    window = pd.Timedelta(WINDOW_TIME)
  else:
    window = pd.Timedelta(hours=WINDOW_TIME)

  time_gap = (row['delivery_dt'] - df['delivery_effective']).abs()
  # Use a new local name: assigning to `df` here would make it local to the
  # function and the read above would raise UnboundLocalError.
  nearby_in_time = df[time_gap <= window]
  if nearby_in_time.empty:
    # A row-wise apply over an empty frame does not yield a distance Series.
    return 0

  distances = calcualte_distance(
    (row['lon'], row['lat']),
    nearby_in_time[['lon', 'lat']]
  )
  return int((distances <= KMS_AROUND).sum())


In [None]:
# Search radius, in kilometres, around each delivery point.
KMS_AROUND = 10
# Half-width of the time window, in hours. It has to be numeric: task_around
# compares it with a time difference expressed in hours, and the previous
# value '2H' was a string.
WINDOW_TIME = 2
# NOTE: task_around scans the whole frame for every row, so this is quadratic
# in the number of rows.
df.apply(task_around, axis=1)

### Hour late vs Total weight

In [None]:
# Absolute gap, in hours, between the `delivery` and the effective delivery
# timestamps; the gap is truncated to whole seconds before the conversion.
# NOTE(review): the absolute value makes early and late deliveries
# indistinguishable in `delay_delivery` - confirm this is intended.
diff = df['delivery'] - df['delivery_effective']
abs_gap_seconds = diff.apply(lambda delta: abs(int(delta.total_seconds())))
df['delay_delivery'] = abs_gap_seconds / 3600

## Josue Cobos

In [None]:
#Question: Are those deliveries that are further from the economic center of Belo Horizonte those with the longest delivery delay?

In [None]:
#calculate distance between 2 lat long points
def calculateDistance(point, anotherPoint):
    """Great-circle distance between two (lat, lon) pairs given in degrees.

    Uses the haversine formula with a sphere radius of 6373.0, so the result
    is in kilometres.
    """
    lat_a, lon_a = (math.radians(value) for value in point)
    lat_b, lon_b = (math.radians(value) for value in anotherPoint)

    half_dlat = (lat_b - lat_a) / 2
    half_dlon = (lon_b - lon_a) / 2

    chord = math.sin(half_dlat)**2 + math.cos(lat_a) * math.cos(lat_b) * math.sin(half_dlon)**2
    central_angle = 2 * math.atan2(math.sqrt(chord), math.sqrt(1 - chord))
    earth_radius_km = 6373.0
    return earth_radius_km * central_angle  # kilometers

In [None]:
#For this question we will assume Barreiro in the most important regional administration of Belo Horizonte
# Reference point (lat, lon) used for Barreiro.
BARREIROPOINT = (-19.977, -44.0145)
def getDistanceFromA(row):
  """Distance from BARREIROPOINT to the position stored in `row`.

  The position is read from the 'lat' and 'long' entries of `row` and the
  distance is whatever calculateDistance returns for the two points.
  """
  destination = (row['lat'], row['long'])
  return calculateDistance(BARREIROPOINT, destination)

In [None]:
# Distance from every delivery position to the Barreiro reference point.
delivery_positions = df[['lat', 'long']]
df['dst_to_barreiro'] = delivery_positions.apply(getDistanceFromA, axis=1)

In [None]:
# Scatter plot of the delivery delay against the distance to Barreiro.
plt.figure(figsize=(15, 7))
subdf = df[['dst_to_barreiro', 'delay_delivery']]
f2 = sns.scatterplot(data=subdf, x='dst_to_barreiro', y='delay_delivery')

In [None]:
# Pairwise correlation between the distance to Barreiro and the delivery delay
# (DataFrame.corr with its default method).
subdf.corr()

In [None]:
#Answer: Due to the non-existent correlation between the delay of shipments (delay_delivery) and distance to Barreiro (dst_to_barreiro), we can conclude that no matter how far your order is, it does not necessarily mean that it has a longer delay time.