# Picking rate analysis

In [0]:
import pandas as pd
import numpy as np
import os
import pickle
from plotly.offline import iplot
import plotly.graph_objects as go
from python_for_finance import plot_histogram

## Load data

In [55]:
# Read the per-order aggregates (one row per order) from the CSV export
# that sits next to the notebook.
orders_csv_path = "orders_aggregated_data_partial.csv"
orders_aggregated_data = pd.read_csv(orders_csv_path)

# Preview the first rows to sanity-check the columns that were loaded.
orders_aggregated_data.head()

Unnamed: 0,order_id,n_picks,touched_at_min,touched_at_max
0,1556102,29,2019-09-30 13:58:07,2019-09-30 14:18:53
1,1656552,11,2019-11-11 16:32:09,2019-11-11 16:53:46
2,1657300,9,2019-11-11 10:26:22,2019-11-11 10:31:28
3,1657820,46,2019-11-11 09:24:33,2019-11-11 10:13:00
4,1657916,42,2019-11-12 09:10:03,2019-11-12 09:41:27


Check for null values.

In [56]:
# Number of missing values in each column (`isnull` is an alias of `isna`).
orders_aggregated_data.isnull().sum()

order_id          0
n_picks           0
touched_at_min    0
touched_at_max    0
dtype: int64

## Compute derived features

For each order we have the number of picked items (`n_picks`), the time of the first scan (`touched_at_min`) and the time of the last scan (`touched_at_max`). From these we can derive quantities such as:
- Total picking time for order
- Picking rate for orders (number of picks per minute)

### Compute total picking time for orders

In [57]:
# The scan timestamps are parsed into proper datetimes so they can be subtracted.
for timestamp_column in ("touched_at_min", "touched_at_max"):
    orders_aggregated_data[timestamp_column] = pd.to_datetime(
        orders_aggregated_data[timestamp_column]
    )

# Picking time = last scan minus first scan, expressed in (fractional) minutes
# by dividing the timedelta by a one-minute Timedelta.
one_minute = pd.Timedelta(minutes=1)
scan_span = orders_aggregated_data["touched_at_max"] - orders_aggregated_data["touched_at_min"]
orders_aggregated_data["picking_time"] = scan_span / one_minute

# Show a random handful of rows to eyeball the new column.
orders_aggregated_data.sample(10)

Unnamed: 0,order_id,n_picks,touched_at_min,touched_at_max,picking_time
4348,1681190,39,2019-11-13 17:04:30,2019-11-13 17:46:59,42.483333
4071,1680576,44,2019-11-13 08:40:50,2019-11-13 09:32:53,52.05
4201,1680872,24,2019-11-13 16:43:17,2019-11-13 17:05:35,22.3
5384,1683694,43,2019-11-14 14:40:39,2019-11-14 15:29:04,48.416667
6574,1686624,29,2019-11-15 14:15:53,2019-11-15 14:34:32,18.65
7073,1687888,34,2019-11-16 10:38:13,2019-11-16 11:10:39,32.433333
6981,1687666,29,2019-11-16 15:13:57,2019-11-16 15:31:11,17.233333
5321,1683558,13,2019-11-14 09:55:39,2019-11-14 10:08:51,13.2
5218,1683300,15,2019-11-14 15:28:56,2019-11-14 15:50:02,21.1
4548,1681650,28,2019-11-16 12:19:35,2019-11-16 12:39:19,19.733333


Plot the distribution of picking times.

In [59]:
# Histogram of the raw (unfiltered) per-order picking times, in minutes.
raw_picking_times = orders_aggregated_data["picking_time"]
plot_histogram(raw_picking_times, xaxis_title="Picking time")

In [60]:
# Summary statistics of the unfiltered picking times.
# Series.std() is the sample standard deviation (ddof=1).
picking_time_column = orders_aggregated_data["picking_time"]
shortest_time = picking_time_column.min()
longest_time = picking_time_column.max()
average_time = picking_time_column.mean()
time_spread = picking_time_column.std()

print("Picking times (minutes)")
print("------------")
print(f"Min: {shortest_time} - Max: {longest_time}")
print(f"Mean: {average_time} - St. dev.: {time_spread}")

Picking times (minutes)
------------
Min: 0.03333333333333333 - Max: 1243.3
Mean: 29.488001497806685 - St. dev.: 23.63928092007642


The distribution clearly contains outliers in its tails: it's a good idea to eliminate them, e.g. by keeping only the values between the 5th and the 95th percentile. `numpy` offers the `percentile()` function to do just that, returning whatever percentile we want: we can then filter out all values below the lower percentile and above the upper one.

In [64]:
# Trim the tails of the picking-time distribution: keep only the orders whose
# picking time lies between the 5th and the 95th percentile (bounds inclusive).
picking_time_values = orders_aggregated_data["picking_time"]

# Both cut-offs are computed with a single np.percentile call instead of
# re-evaluating the percentile for each side of the mask.
picking_time_low, picking_time_high = np.percentile(picking_time_values, [5.0, 95.0])

# Kept as a plain NumPy array, as before, because later cells consume it directly.
filtered_picking_times = picking_time_values[
    (picking_time_values >= picking_time_low) & (picking_time_values <= picking_time_high)
].values

print("Filtered picking times")
print("----------------------")
print(f"Min: {filtered_picking_times.min()} - Max: {filtered_picking_times.max()}")
# ndarray.std() defaults to the population estimator (ddof=0) whereas the
# unfiltered summary uses Series.std() (ddof=1). Passing ddof=1 makes the
# "St. dev." figures before and after filtering directly comparable.
print(f"Mean: {filtered_picking_times.mean()} - St. dev.: {filtered_picking_times.std(ddof=1)}")

Filtered picking times
----------------------
Min: 5.816666666666666 - Max: 67.78333333333333
Mean: 27.576002297757707 - St. dev.: 14.074574435725644


Let's plot the distribution of the filtered values.

In [65]:
# Histogram of the picking times left after the percentile trimming.
plot_histogram(
    filtered_picking_times,
    xaxis_title="Picking time",
)

### Compute picking rate for orders

Picking rate (picks per minute) captures the difference between more and less efficiently picked orders. Again, we can compute it and store it in a new column of our dataframe.

In [69]:
# Picking rate = picks per minute for each order.
# NOTE(review): an order whose first and last scan coincide would have a
# picking_time of 0 and therefore an infinite rate; none appear in the
# recorded output (minimum picking time is above zero), but verify on new data.
picks_per_order = orders_aggregated_data["n_picks"]
minutes_per_order = orders_aggregated_data["picking_time"]
orders_aggregated_data["picking_rate"] = picks_per_order.div(minutes_per_order)

# Show a random handful of rows to eyeball the new column.
orders_aggregated_data.sample(10)

Unnamed: 0,order_id,n_picks,touched_at_min,touched_at_max,picking_time,picking_rate
5175,1683206,36,2019-11-14 18:27:48,2019-11-14 19:08:02,40.233333,0.89478
2639,1677294,2,2019-11-12 11:07:11,2019-11-12 11:08:00,0.816667,2.44898
6743,1687036,74,2019-11-15 13:27:14,2019-11-15 14:37:19,70.083333,1.055886
4242,1680962,49,2019-11-13 14:12:18,2019-11-13 14:47:04,34.766667,1.409396
3989,1680398,31,2019-11-14 10:22:55,2019-11-14 10:43:23,20.466667,1.514658
4666,1681940,5,2019-11-13 17:47:06,2019-11-13 17:49:25,2.316667,2.158273
4256,1680992,26,2019-11-13 13:22:36,2019-11-13 13:42:45,20.15,1.290323
2865,1677794,41,2019-11-14 14:08:58,2019-11-14 16:31:37,142.65,0.287417
97,1669174,20,2019-11-11 09:38:17,2019-11-11 09:49:53,11.6,1.724138
4091,1680620,17,2019-11-13 13:01:01,2019-11-13 13:07:29,6.466667,2.628866


In [73]:
# Summary statistics of the unfiltered picking rates, plus their reciprocals
# (minutes spent per pick) at the two extremes.
picking_rate_column = orders_aggregated_data["picking_rate"]
slowest_rate = picking_rate_column.min()
fastest_rate = picking_rate_column.max()
average_rate = picking_rate_column.mean()
rate_spread = picking_rate_column.std()

print("Picking rates (picks/minutes)")
print("-----------------------------")
print(f"Min: {slowest_rate} - Max: {fastest_rate}")
print(f"Mean: {average_rate} - St. dev.: {rate_spread}")
# The slowest rate gives the most minutes per pick, the fastest the fewest.
print(
    f"\nMax minutes per picks: {1. / slowest_rate} - "
    f"Min minute per picks: {1. / fastest_rate}"
)

Picking rates (picks/minutes)
-----------------------------
Min: 0.0106458481192335 - Max: 60.0
Mean: 1.0776547067089295 - St. dev.: 1.2334998797164518

Max minutes per picks: 93.93333333333334 - Min minute per picks: 0.016666666666666666


In [74]:
# Trim the tails of the picking-rate distribution: keep only the orders whose
# picking rate lies between the 5th and the 95th percentile (bounds inclusive).
picking_rate_values = orders_aggregated_data["picking_rate"]

# Both cut-offs are computed with a single np.percentile call instead of
# re-evaluating the percentile for each side of the mask.
picking_rate_low, picking_rate_high = np.percentile(picking_rate_values, [5.0, 95.0])

# Kept as a plain NumPy array, as before, because later cells consume it directly.
filtered_picking_rates = picking_rate_values[
    (picking_rate_values >= picking_rate_low) & (picking_rate_values <= picking_rate_high)
].values

print("Filtered picking rates (picks/minutes)")
print("--------------------------------------")
print(f"Min: {filtered_picking_rates.min()} - Max: {filtered_picking_rates.max()}")
# ndarray.std() defaults to the population estimator (ddof=0) whereas the
# unfiltered summary uses Series.std() (ddof=1). Passing ddof=1 makes the
# "St. dev." figures before and after filtering directly comparable.
print(f"Mean: {filtered_picking_rates.mean()} - St. dev.: {filtered_picking_rates.std(ddof=1)}")
# The slowest remaining rate gives the most minutes per pick, the fastest the fewest.
print(
    f"\nMax minutes per picks: {1./filtered_picking_rates.min()} - "
    f"Min minute per picks: {1./filtered_picking_rates.max()}"
)

Filtered picking rates (picks/minutes)
--------------------------------------
Min: 0.2553191489361702 - Max: 2.0370370370370368
Mean: 0.9922093161867692 - St. dev.: 0.38861821601604657

Max minutes per picks: 3.916666666666667 - Min minute per picks: 0.49090909090909096


In [75]:
# Histogram of the picking rates left after the percentile trimming.
plot_histogram(
    filtered_picking_rates,
    xaxis_title="Picking rate",
)