In [1]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 1.4 MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0


In [2]:
import numpy as np
import pandas as pd
from src import config
from geopy.distance import distance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

In [3]:
df = pd.read_parquet(config.INT_FILE_PATH / 'transactions.parquet')

In [7]:
df_geo = pd.read_csv(config.RAW_FILE_PATH / 'olist_geolocation_dataset.csv')

In [16]:
df_geo_clean = df_geo.drop_duplicates(subset=['geolocation_zip_code_prefix','geolocation_city','geolocation_state'],keep='first')

## Get Customer Lat Lng

In [27]:
df_merge = df.merge(df_geo_clean, how='left', left_on=['customer_zip_code_prefix', 'customer_city', 'customer_state'], right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])

In [28]:
df_merge.drop(['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'], axis=1,inplace=True)
df_merge.rename(columns={'geolocation_lat':'customer_lat', 'geolocation_lng':'customer_lng'},inplace=True)

## Get Seller Lat Lng

In [68]:
df_merge2 = df_merge.merge(df_geo_clean, how='left', left_on=['seller_zip_code_prefix', 'seller_city', 'seller_state'], right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])

In [69]:
df_merge2.drop(['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'], axis=1,inplace=True)
df_merge2.rename(columns={'geolocation_lat':'seller_lat', 'geolocation_lng':'seller_lng'},inplace=True)

## Get Distance between Customer and Seller

In [70]:
df_exclude = (df_merge2['customer_lat'].isna()) | (df_merge2['customer_lng'].isna()) | (df_merge2['seller_lat'].isna()) | (df_merge2['seller_lng'].isna())

In [71]:
df_merge3 = df_merge2[~df_exclude]

In [72]:
df_merge3['customer_location'] = list(zip(df_merge3['customer_lat'], df_merge3['customer_lng']))
df_merge3['seller_location'] = list(zip(df_merge3['seller_lat'], df_merge3['seller_lng']))

In [78]:
df_merge3['distance'] = df_merge3.apply(lambda row: distance(row['customer_location'], row['seller_location']).km, axis=1)

In [214]:
df_merge3

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,customer_lng,seller_lat,seller_lng,customer_location,seller_location,distance,date,seller_order_count,order_approval_duration,order_carrier_duration
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,...,-47.397866,-23.482623,-46.374490,"(-20.509897499999997, -47.3978655)","(-23.48262344063541, -46.37448952833195)",345.713577,16/5/2017,2,0.011539,6.821088
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,29150127e6685892b6eab3eec79f59c7,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-15 17:14:59,...,-46.545746,-26.912429,-48.677381,"(-23.72685273154166, -46.54574582941039)","(-26.912428672061186, -48.6773810666573)",413.009607,12/1/2018,1,0.007037,2.851794
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,b2059ed67ce144a36e2aa97d2c9e9ad2,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-11 14:31:00,...,-46.660310,-23.482623,-46.374490,"(-23.527788191788307, -46.66030962184773)","(-23.48262344063541, -46.37448952833195)",29.618246,19/5/2018,3,1.007928,22.932813
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,951670f92359f4fe4a63112aa7306eba,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-27 23:22:42,...,-46.185352,-23.482623,-46.374490,"(-23.49693002789165, -46.185351975305366)","(-23.48262344063541, -46.37448952833195)",19.385057,13/3/2018,3,0.057419,14.302824
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,6b7d50bd145f6fc7f33cebabd7e49d0f,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-07-30 15:16:00,...,-47.151073,-21.766477,-48.831547,"(-22.98722237101393, -47.151072819246686)","(-21.7664768468313, -48.83154737836604)",219.599199,29/7/2018,2,0.012951,1.225347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113420,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP,6760e20addcf0121e9d58f2f1ff14298,delivered,2018-04-07 15:48:17,2018-04-07 16:08:45,2018-04-11 02:08:36,...,-46.499590,-22.218989,-49.643623,"(-23.587179999999933, -46.49958999999996)","(-22.218988944556266, -49.64362334020589)",356.361343,7/4/2018,1,0.014213,3.430775
113421,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP,9ec0c8947d973db4f4e8dcf1fbfa8f1b,delivered,2018-04-04 08:20:22,2018-04-04 08:35:12,2018-04-05 18:42:35,...,-46.765787,-21.760806,-48.172285,"(-23.612294462020333, -46.76578686529368)","(-21.760806096824155, -48.17228519205648)",250.847050,4/4/2018,1,0.010301,1.432095
113422,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE,fed4434add09a6f332ea398efd656a5c,delivered,2018-04-08 20:11:50,2018-04-08 20:30:03,2018-04-09 17:52:17,...,-38.510859,-23.544897,-46.577772,"(-3.744127896474129, -38.51085902775693)","(-23.54489718096826, -46.57777159631251)",2356.429965,8/4/2018,1,0.012650,0.903090
113423,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,e31ec91cea1ecf97797787471f98a8c2,delivered,2017-11-03 21:08:33,2017-11-03 21:31:20,2017-11-06 18:24:41,...,-51.167614,-21.362358,-48.232976,"(-29.956390513994027, -51.16761355689882)","(-21.36235769739203, -48.232975701561735)",996.494404,3/11/2017,1,0.015822,2.886204


## Get Order Count per Day per Seller

In [97]:
df_merge3['date'] = df_merge3['order_purchase_timestamp'].apply(lambda x: '%d/%d/%d' % (x.day, x.month, x.year))

In [130]:
df_merge3['seller_order_count'] = df_merge3.groupby(['date','seller_id'])['order_id'].transform('count')

## Get Purchase Approval and Delivered Carrier Duration -> Continue From Here

In [219]:
df_merge3['order_approval_duration'] = (df_merge3['order_approved_at'] - df_merge3['order_purchase_timestamp']).dt.days #/np.timedelta64(1,'h')/24

In [220]:
df_merge3

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,customer_lng,seller_lat,seller_lng,customer_location,seller_location,distance,date,seller_order_count,order_approval_duration,order_carrier_duration
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP,00e7ee1b050b8499577073aeb2a297a1,delivered,2017-05-16 15:05:35,2017-05-16 15:22:12,2017-05-23 10:47:57,...,-47.397866,-23.482623,-46.374490,"(-20.509897499999997, -47.3978655)","(-23.48262344063541, -46.37448952833195)",345.713577,16/5/2017,2,0.0,6.821088
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP,29150127e6685892b6eab3eec79f59c7,delivered,2018-01-12 20:48:24,2018-01-12 20:58:32,2018-01-15 17:14:59,...,-46.545746,-26.912429,-48.677381,"(-23.72685273154166, -46.54574582941039)","(-26.912428672061186, -48.6773810666573)",413.009607,12/1/2018,1,0.0,2.851794
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP,b2059ed67ce144a36e2aa97d2c9e9ad2,delivered,2018-05-19 16:07:45,2018-05-20 16:19:10,2018-06-11 14:31:00,...,-46.660310,-23.482623,-46.374490,"(-23.527788191788307, -46.66030962184773)","(-23.48262344063541, -46.37448952833195)",29.618246,19/5/2018,3,1.0,22.932813
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP,951670f92359f4fe4a63112aa7306eba,delivered,2018-03-13 16:06:38,2018-03-13 17:29:19,2018-03-27 23:22:42,...,-46.185352,-23.482623,-46.374490,"(-23.49693002789165, -46.185351975305366)","(-23.48262344063541, -46.37448952833195)",19.385057,13/3/2018,3,0.0,14.302824
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP,6b7d50bd145f6fc7f33cebabd7e49d0f,delivered,2018-07-29 09:51:30,2018-07-29 10:10:09,2018-07-30 15:16:00,...,-47.151073,-21.766477,-48.831547,"(-22.98722237101393, -47.151072819246686)","(-21.7664768468313, -48.83154737836604)",219.599199,29/7/2018,2,0.0,1.225347
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113420,17ddf5dd5d51696bb3d7c6291687be6f,1a29b476fee25c95fbafc67c5ac95cf8,3937,sao paulo,SP,6760e20addcf0121e9d58f2f1ff14298,delivered,2018-04-07 15:48:17,2018-04-07 16:08:45,2018-04-11 02:08:36,...,-46.499590,-22.218989,-49.643623,"(-23.587179999999933, -46.49958999999996)","(-22.218988944556266, -49.64362334020589)",356.361343,7/4/2018,1,0.0,3.430775
113421,e7b71a9017aa05c9a7fd292d714858e8,d52a67c98be1cf6a5c84435bd38d095d,6764,taboao da serra,SP,9ec0c8947d973db4f4e8dcf1fbfa8f1b,delivered,2018-04-04 08:20:22,2018-04-04 08:35:12,2018-04-05 18:42:35,...,-46.765787,-21.760806,-48.172285,"(-23.612294462020333, -46.76578686529368)","(-21.760806096824155, -48.17228519205648)",250.847050,4/4/2018,1,0.0,1.432095
113422,5e28dfe12db7fb50a4b2f691faecea5e,e9f50caf99f032f0bf3c55141f019d99,60115,fortaleza,CE,fed4434add09a6f332ea398efd656a5c,delivered,2018-04-08 20:11:50,2018-04-08 20:30:03,2018-04-09 17:52:17,...,-38.510859,-23.544897,-46.577772,"(-3.744127896474129, -38.51085902775693)","(-23.54489718096826, -46.57777159631251)",2356.429965,8/4/2018,1,0.0,0.903090
113423,56b18e2166679b8a959d72dd06da27f9,73c2643a0a458b49f58cea58833b192e,92120,canoas,RS,e31ec91cea1ecf97797787471f98a8c2,delivered,2017-11-03 21:08:33,2017-11-03 21:31:20,2017-11-06 18:24:41,...,-51.167614,-21.362358,-48.232976,"(-29.956390513994027, -51.16761355689882)","(-21.36235769739203, -48.232975701561735)",996.494404,3/11/2017,1,0.0,2.886204


In [158]:
df_merge3['order_carrier_duration'] = (df_merge3['order_delivered_carrier_date'] - df_merge3['order_purchase_timestamp'])/np.timedelta64(1,'h')/24

## Data Preparation & Manipulation

In [210]:
# 'order_approval_duration', 'order_carrier_duration' -> improve estimated delivery date?
# what is order_item_id?

df_before_purchase = df_merge3[['seller_id', 'seller_order_count', 'distance', 'product_category_name', 'price', 'freight_value', 'shipping_cost_perc', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'product_size', 'purchase_dow', 'delivery_days']]
df_after_approval = df_merge3[['seller_id', 'seller_order_count', 'distance', 'product_category_name', 'price', 'freight_value', 'shipping_cost_perc', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'product_size', 'purchase_dow', 'order_approval_duration', 'delivery_days']]
df_carrier_received = df_merge3[['seller_id', 'seller_order_count', 'distance', 'product_category_name', 'price', 'freight_value', 'shipping_cost_perc', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'product_size', 'purchase_dow', 'order_approval_duration', 'order_carrier_duration', 'delivery_days']]

In [211]:
# Fill/drop blank column
df_carrier_received['product_category_name'].fillna('NC', inplace=True)
df_carrier_received = df_carrier_received[~(df_carrier_received['product_weight_g'].isna())]
df_carrier_received = df_carrier_received[~(df_carrier_received['order_carrier_duration'].isna())]
df_carrier_received = df_carrier_received[~(df_carrier_received['delivery_days'].isna())]

In [212]:
numerics = ['int', 'int64', 'float64']
numerical = df_carrier_received.select_dtypes(include=numerics).columns
categorical = df_carrier_received.drop(numerical,axis=1).columns

In [213]:
df_carrier_received

Unnamed: 0,seller_id,seller_order_count,distance,product_category_name,price,freight_value,shipping_cost_perc,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_size,purchase_dow,order_approval_duration,order_carrier_duration,delivery_days
0,7c67e1448b00f6e969d365cea6b010ab,2,345.713577,office_furniture,124.99,21.88,0.148975,8683.0,54.0,64.0,31.0,107136.0,1,0.011539,6.821088,8.0
1,b8bc237ba3788b23da09c0f1f3a3288c,1,413.009607,housewares,289.00,46.48,0.138548,10150.0,89.0,15.0,40.0,53400.0,4,0.007037,2.851794,16.0
2,7c67e1448b00f6e969d365cea6b010ab,3,29.618246,office_furniture,139.94,17.79,0.112788,8267.0,52.0,52.0,17.0,45968.0,5,1.007928,22.932813,26.0
3,7c67e1448b00f6e969d365cea6b010ab,3,19.385057,office_furniture,149.94,23.36,0.134795,12160.0,56.0,51.0,28.0,79968.0,1,0.057419,14.302824,14.0
4,4a3ca9315b744ce9f8e9374361493884,2,219.599199,home_confort,230.00,22.25,0.088206,5200.0,45.0,15.0,35.0,23625.0,6,0.012951,1.225347,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113420,527801b552d0077ffd170872eb49683b,1,356.361343,books_general_interest,74.90,13.88,0.156342,611.0,22.0,22.0,23.0,11132.0,5,0.014213,3.430775,6.0
113421,3fd1e727ba94cfe122d165e176ce7967,1,250.847050,sports_leisure,114.90,14.16,0.109716,1211.0,25.0,24.0,22.0,13200.0,2,0.010301,1.432095,7.0
113422,d9e7e7778b32987280a6f2cb9a39c57d,1,2356.429965,health_beauty,37.00,19.04,0.339757,870.0,25.0,20.0,18.0,9000.0,6,0.012650,0.903090,30.0
113423,4869f7a5dfa277a7dca6462dcf3b52b2,1,996.494404,watches_gifts,689.00,22.07,0.031038,710.0,19.0,13.0,14.0,3458.0,4,0.015822,2.886204,12.0


In [216]:
categorical

Index(['seller_id', 'product_category_name'], dtype='object')

In [None]:
## Train test split 80, 20

In [None]:
## One Hot Encoding

In [None]:
## GridSearchCV

In [None]:
# Path 1

In [None]:
## RandomForestRegressor -> Delivery Duration Prediction -> Check against actual delivery duration -> Late or Not Late

In [None]:
## MSE/MAPE

In [None]:
## Feature Importance

In [None]:
# Path 2

In [None]:
## LogReg -> Late or Not Late

In [None]:
## Accuracy

In [None]:
## Feature Importance