In [1]:
!pip install geopy

Collecting geopy
  Downloading geopy-2.2.0-py3-none-any.whl (118 kB)
[K     |████████████████████████████████| 118 kB 1.4 MB/s eta 0:00:01
[?25hCollecting geographiclib<2,>=1.49
  Downloading geographiclib-1.52-py3-none-any.whl (38 kB)
Installing collected packages: geographiclib, geopy
Successfully installed geographiclib-1.52 geopy-2.2.0


In [396]:
import numpy as np
import pandas as pd
from src import config
from geopy.distance import distance

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle

import warnings
warnings.filterwarnings("ignore")

In [149]:
def feature_engineering(df):
    # Get Shipping Cost Percentage
    df['shipping_cost_perc'] = df['freight_value']/df['order_total_price']
    
    # Get Purchase Day of Week
    df['purchase_dow'] = df['order_purchase_timestamp'].dt.dayofweek
    
    # Get Distance Between Customer and Seller
    df['customer_location'] = list(zip(df['customer_lat'], df['customer_lng']))
    df['seller_location'] = list(zip(df['seller_lat'], df['seller_lng']))
    df['distance'] = df.apply(lambda row: distance(row['customer_location'], row['seller_location']).km, axis=1)
    
    # Get Order Count per Day per Seller
    df['date'] = df['order_purchase_timestamp'].dt.date.nunique()
    df['seller_order_count'] = df.groupby(['date','seller_id'])['order_id'].transform('count')
    
    # Get Purchase Approval and Delivered Carrier Duration
    df['order_approval_duration'] = (df['order_approved_at'].dt.date - df['order_purchase_timestamp'].dt.date).dt.days
    df['order_carrier_duration'] = (df['order_delivered_carrier_date'].dt.date - df['order_purchase_timestamp'].dt.date).dt.days
    
    # Get Product Size
    df['product_size'] = df['product_length_cm'] * df['product_width_cm'] * df['product_height_cm']
    
    return df

In [93]:
def product_categorization(df):
    """Add 0/1 product-group flag columns derived from ``product_category_name``.

    Each flag is 1 when the category name matches the group's regular
    expression (a substring search) and 0 otherwise. Rows whose category name
    is missing get 0 in every flag: without ``na=False``, ``str.contains``
    yields NaN for them and ``np.where`` would treat that NaN as true.

    The frame is modified in place and the same object is returned. Flags are
    appended in the order listed below; a category can set more than one flag
    (for example ``console`` appears in two groups).
    """
    # Flag column -> pattern searched for in the category name.
    category_patterns = {
        'fashion': r'(?:fashio|perfumery)',
        'consumption': r'(?:food|drink|cuisine)',
        'garden': r'(?:garden|flower)',
        'electronics': r'(?:electronic|computer|telephony|console|audio)',
        'furniture_appliances': r'(?:furniture|appliance|bed|houseware|air_conditioning|table|kitchen|comfort|confort)',
        'construction': r'(?:construction)',
        'hobby_entertainment': r'(?:console|book|music|art|dvd)',
        'kids': r'(?:baby|toy|diaper|children)',
    }

    category_names = df['product_category_name']
    for flag_column, pattern in category_patterns.items():
        df[flag_column] = np.where(category_names.str.contains(pattern, na=False), 1, 0)

    return df

In [2]:
# Load the intermediate transactions table from the directory configured as INT_FILE_PATH.
df = pd.read_parquet(config.INT_FILE_PATH / 'transactions.parquet')

In [3]:
# Load the raw geolocation table (zip prefix, city, state, lat/lng) from RAW_FILE_PATH.
df_geo = pd.read_csv(config.RAW_FILE_PATH / 'olist_geolocation_dataset.csv')

In [4]:
# Keep only the first row seen for each zip prefix / city / state combination, so the
# left merges below attach at most one lat/lng pair to each transaction row.
df_geo_clean = df_geo.drop_duplicates(subset=['geolocation_zip_code_prefix','geolocation_city','geolocation_state'],keep='first')

In [5]:
df.columns

Index(['customer_id', 'customer_unique_id', 'customer_zip_code_prefix',
       'customer_city', 'customer_state', 'order_id', 'order_status',
       'order_purchase_timestamp', 'order_approved_at',
       'order_delivered_carrier_date', 'order_delivered_customer_date',
       'order_estimated_delivery_date', 'order_item_id', 'product_id',
       'seller_id', 'shipping_limit_date', 'price', 'freight_value',
       'product_name_length', 'product_description_length',
       'product_photos_qty', 'product_weight_g', 'product_length_cm',
       'product_height_cm', 'product_width_cm', 'seller_zip_code_prefix',
       'seller_city', 'seller_state', 'product_category_name',
       'order_total_price', 'shipping_cost_perc', 'purchase_dow',
       'late_delivery', 'delivery_days', 'days_late', 'product_size'],
      dtype='object')

In [6]:
# Drop the six previously engineered columns by name, then discard rows with any missing
# value. Naming them avoids relying on their being the last six columns of the parquet file.
engineered_columns = ['shipping_cost_perc', 'purchase_dow', 'late_delivery',
                      'delivery_days', 'days_late', 'product_size']
df_original = df.drop(columns=engineered_columns).dropna()

## Get Customer Lat Lng

In [7]:
# Left-join coordinates for the customer: match the customer's zip prefix, city and state
# against the de-duplicated geolocation table. Unmatched rows get NaN lat/lng.
df_merge = df_original.merge(df_geo_clean, how='left', left_on=['customer_zip_code_prefix', 'customer_city', 'customer_state'], right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])

In [8]:
# Discard the geolocation join keys and relabel the joined coordinates as the customer's.
df_merge = (
    df_merge
    .drop(['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'], axis=1)
    .rename(columns={'geolocation_lat':'customer_lat', 'geolocation_lng':'customer_lng'})
)

## Get Seller Lat Lng

In [9]:
# Same left join as for the customer, this time keyed on the seller's zip prefix, city and state.
df_merge2 = df_merge.merge(df_geo_clean, how='left', left_on=['seller_zip_code_prefix', 'seller_city', 'seller_state'], right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])

In [10]:
# Discard the geolocation join keys and relabel the joined coordinates as the seller's.
df_merge2 = (
    df_merge2
    .drop(['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'], axis=1)
    .rename(columns={'geolocation_lat':'seller_lat', 'geolocation_lng':'seller_lng'})
)

## Remove any Missing Lat Lng

In [12]:
# Boolean mask: True for rows where any of the four coordinates is missing.
coordinate_columns = ['customer_lat', 'customer_lng', 'seller_lat', 'seller_lng']
df_exclude = df_merge2[coordinate_columns].isna().any(axis=1)

In [188]:
# Keep only rows that have both coordinate pairs. The explicit copy makes df_merge3 an
# independent frame, so the column assignments in later cells write to it directly instead
# of to a slice of df_merge2 (which raises SettingWithCopyWarning).
df_merge3 = df_merge2[~df_exclude].copy()

## Train Test Split

In [189]:
# Whole-day differences between calendar dates (time of day is discarded by .dt.date):
#   delivery_days  = actual delivery date    - purchase date
#   estimated_days = estimated delivery date - purchase date
df_merge3['delivery_days'] = (df_merge3['order_delivered_customer_date'].dt.date - df_merge3['order_purchase_timestamp'].dt.date).dt.days
df_merge3['estimated_days'] = (df_merge3['order_estimated_delivery_date'].dt.date - df_merge3['order_purchase_timestamp'].dt.date).dt.days

In [190]:
# Feature frame: everything except the two day-count columns and the timestamps they were derived from.
X = df_merge3.drop(['delivery_days','estimated_days','order_delivered_customer_date','order_estimated_delivery_date'],axis=1)

In [191]:
# Targets: the actual delivery duration plus the platform's estimate (kept as a baseline to compare against).
y = df_merge3[['delivery_days','estimated_days']]

In [192]:
# Random 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=2022)

In [193]:
# Flag Top 10 Items sold in the app
# The ten categories with the largest summed order_total_price are ranked on the full `df`.
# NOTE(review): `df` includes the rows that end up in the test split, so this ranking uses
# test data; consider ranking on X_train only — confirm whether that is intended.
top_products = df.groupby('product_category_name').order_total_price.sum().nlargest(10).index

# 1 when the row's category is one of the top ten, else 0.
X_train['top_ten'] = np.where(X_train['product_category_name'].isin(top_products), 1, 0)
X_test['top_ten'] = np.where(X_test['product_category_name'].isin(top_products), 1, 0)

In [194]:
# Feature Engineering
# feature_engineering adds its columns to the frame it receives and returns that same frame,
# so X_train2 / X_test2 are the same objects as X_train / X_test.
X_train2 = feature_engineering(X_train)
X_test2 = feature_engineering(X_test)

In [195]:
# Get length before further product categorization
# Column counts must be captured here: the categorization step appends its flag columns to
# these same frames, and the difference is used later to locate those flags.
train_len = X_train2.shape[1]
test_len = X_test2.shape[1]

In [196]:
# Additional Feature Engineering on Further Product Categorization
# product_categorization appends the 0/1 product-group flags and returns the frame it was given.
X_train3 = product_categorization(X_train2)
X_test3 = product_categorization(X_test2)

In [197]:
# Get length of categorization columns
# Number of flag columns appended by product_categorization; later cells take this many
# trailing columns of X_train3 / X_test3 as the category flags.
prod_cat_len_train = X_train3.shape[1] - train_len
prod_cat_len_test = X_test3.shape[1] - test_len

In [275]:
# NOTE(review): this preview has execution count 275, while the cell that defines
# X_train_before_purchase (count 198) sits below it; on a top-to-bottom run the name is
# not defined yet at this point.
X_train_before_purchase

Unnamed: 0,seller_order_count,distance,price,freight_value,shipping_cost_perc,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_size,purchase_dow,top_ten,fashion,consumption,garden,electronics,furniture_appliances,construction,hobby_entertainment,kids
7662,11,1079.604268,51.00,1.20,0.022989,600.0,38.0,16.0,25.0,15200.0,0,1,0,0,1,0,0,0,0,0
104725,451,619.056752,19.90,15.10,0.431429,100.0,23.0,8.0,18.0,3312.0,5,0,0,0,0,1,0,0,0,0
39902,71,862.255119,359.99,48.82,0.119420,3000.0,105.0,16.0,16.0,26880.0,4,0,0,0,0,0,0,0,0,0
71449,1534,2.221859,53.90,11.86,0.180353,1550.0,30.0,22.0,30.0,19800.0,0,1,0,0,1,0,0,0,0,0
92180,4,31.098132,99.90,9.45,0.086420,750.0,22.0,7.0,22.0,3388.0,2,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48289,162,18.825386,29.99,8.29,0.216562,600.0,19.0,13.0,20.0,4940.0,0,1,0,0,0,1,0,0,0,0
106253,159,850.575649,215.00,27.59,0.113731,4650.0,37.0,20.0,34.0,25160.0,2,1,0,0,0,0,0,0,0,0
17105,610,348.531366,99.90,16.46,0.141458,427.0,15.0,17.0,14.0,3570.0,1,1,0,0,0,0,0,0,0,0
1292,99,735.166260,53.90,19.57,0.266367,950.0,28.0,14.0,14.0,5488.0,3,1,0,0,0,0,1,0,0,0


In [198]:
# Training feature sets for three points in the order lifecycle. Each later stage adds the
# duration column(s) known by then; the category flags are the trailing columns of X_train3.
train_core_features = ['seller_order_count', 'distance', 'price', 'freight_value', 'shipping_cost_perc', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'product_size', 'purchase_dow']
train_category_flags = list(X_train3.columns[-prod_cat_len_train:])

X_train_before_purchase = X_train3[train_core_features + ['top_ten'] + train_category_flags]
X_train_after_approval = X_train3[train_core_features + ['order_approval_duration', 'top_ten'] + train_category_flags]
X_train_carrier_received = X_train3[train_core_features + ['order_approval_duration', 'order_carrier_duration', 'top_ten'] + train_category_flags]

In [198]:
# Test feature sets, built with the same column layout as the training feature sets.
test_core_features = ['seller_order_count', 'distance', 'price', 'freight_value', 'shipping_cost_perc', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'product_size', 'purchase_dow']
test_category_flags = list(X_test3.columns[-prod_cat_len_test:])

X_test_before_purchase = X_test3[test_core_features + ['top_ten'] + test_category_flags]
X_test_after_approval = X_test3[test_core_features + ['order_approval_duration', 'top_ten'] + test_category_flags]
X_test_carrier_received = X_test3[test_core_features + ['order_approval_duration', 'order_carrier_duration', 'top_ten'] + test_category_flags]

## Train Model

In [200]:
# Random forest regressor with default hyper-parameters; the seed is fixed for reproducibility.
rfr = RandomForestRegressor(random_state=2022)

In [201]:
# Fit on the before-purchase feature set; the regression target is delivery_days.
rfr.fit(X_train_before_purchase, y_train['delivery_days'])

In [210]:
# Pickle trained model
# with open("RandomForestRegressor.pickle","wb") as f:
#     pickle.dump(rfr, f)

In [202]:
# Predicted delivery duration (days) for the test rows.
y_pred = rfr.predict(X_test_before_purchase)

In [227]:
# Side-by-side results per test row: actual days, the platform's estimate, and the model prediction.
# y_pred is a plain array, so it is aligned to y_test's index by position.
res = pd.DataFrame({'actual_days':y_test['delivery_days'], 'estimated_days':y_test['estimated_days'], 'pred_days':y_pred})

In [427]:
# MAE of the model predictions after adding a fixed 6-day offset.
# NOTE(review): the +6 is not derived in any visible cell — confirm how it was chosen
# (if it was tuned against the test split, this score is optimistic).
mean_absolute_error(y_test['delivery_days'],y_pred+6)

8.44007071042321

In [398]:
# Baseline: MAE of the platform's estimated_days against the actual delivery_days.
mean_absolute_error(y_test['delivery_days'],y_test['estimated_days'])

13.439713282065888

In [426]:
# RMSE of the model predictions with the 6-day offset applied.
# The square root is taken explicitly because the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test['delivery_days'],y_pred+6))

10.362088638205158

In [414]:
# Baseline RMSE of the platform's estimated_days against the actual delivery_days.
# The square root is taken explicitly because the `squared` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_test['delivery_days'],y_test['estimated_days']))

15.792274885976461

In [230]:
# Absolute error, in days, of the platform estimate and of the model prediction.
# Later cells filter and display these columns, so they are created here.
res['estimated_diff'] = (res['actual_days'] - res['estimated_days']).abs()
res['pred_diff'] = (res['actual_days'] - res['pred_days']).abs()

# An order counts as late (1) when it took longer than the respective estimate, else 0.
res['estimated_late'] = np.where(res['actual_days']>res['estimated_days'], 1, 0)
res['pred_late'] = np.where(res['actual_days']>res['pred_days'], 1, 0)

In [231]:
res.estimated_late.mean(), res.pred_late.mean()

(0.06474888445836893, 0.36765403968480015)

In [207]:
# Pair each feature name seen at fit time with the forest's importance score for it.
feature_importance = pd.DataFrame({'feature':rfr.feature_names_in_, 'score':rfr.feature_importances_})

In [208]:
feature_importance.sort_values('score',ascending=False)

Unnamed: 0,feature,score
1,distance,0.32711
3,freight_value,0.100801
4,shipping_cost_perc,0.081101
0,seller_order_count,0.070185
5,product_weight_g,0.070131
2,price,0.068287
9,product_size,0.062321
10,purchase_dow,0.049709
7,product_height_cm,0.046966
6,product_length_cm,0.043196


## Predict Late or Not Late

In [304]:
# Work on a copy so that adding the label column does not modify y_train.
y_train_late = y_train.copy()

In [305]:
# Binary classification target: 1 when the order took longer than the platform's estimate, else 0.
took_longer_than_estimate = y_train_late['delivery_days'] > y_train_late['estimated_days']
y_train_late['late'] = np.where(took_longer_than_estimate, 1, 0)

In [359]:
y_train_late

Unnamed: 0,delivery_days,estimated_days,late
7662,15,36,0
104725,23,30,0
39902,6,31,0
71449,3,11,0
92180,5,7,0
...,...,...,...
48289,2,15,0
106253,8,28,0
17105,15,23,0
1292,12,33,0


#### Using LogReg

In [364]:
# Logistic regression using the liblinear solver; the seed is fixed for reproducibility.
lr = LogisticRegression(solver='liblinear', random_state=2022)

In [338]:
# Standardises each feature using statistics learned in the fit step below.
scaler = StandardScaler()

In [339]:
# Learn the scaling statistics from the training features only.
scaler.fit(X_train_before_purchase)

In [340]:
# Apply the fitted scaling to the training features; the result is not a DataFrame yet
# (it is wrapped in one in the next cell).
X_train_before_purchase_scaler = scaler.transform(X_train_before_purchase)

In [341]:
# Wrap the scaled values in a DataFrame; column names and index are restored in the next cells.
X_train_before_purchase_scaler2 = pd.DataFrame(X_train_before_purchase_scaler)

In [342]:
# Restore the feature names the scaler recorded when it was fitted.
X_train_before_purchase_scaler2.columns = scaler.feature_names_in_

In [343]:
# Restore the original row index so the scaled features line up with y_train_late.
X_train_before_purchase_scaler2.set_index(X_train_before_purchase.index,inplace=True)

In [344]:
X_train_before_purchase_scaler2

Unnamed: 0,seller_order_count,distance,price,freight_value,shipping_cost_perc,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_size,purchase_dow,top_ten,fashion,consumption,garden,electronics,furniture_appliances,construction,hobby_entertainment,kids
7662,-0.744260,0.823023,-0.376663,-1.195367,-1.475603,-0.397040,0.488380,-0.035957,0.176491,0.007117,-1.396321,0.745185,-0.23374,-0.099438,4.607840,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
104725,0.233526,0.041573,-0.547856,-0.306114,1.687399,-0.532899,-0.440749,-0.637038,-0.431466,-0.511950,1.152111,-1.341948,-0.23374,-0.099438,-0.217021,2.331521,-0.636332,-0.138104,-0.167855,-0.261627
39902,-0.610925,0.454228,1.324198,1.851124,-0.728830,0.255083,4.638491,-0.035957,-0.605168,0.517103,0.642425,-1.341948,-0.23374,-0.099438,-0.217021,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
71449,2.640211,-1.005063,-0.360700,-0.513394,-0.256958,-0.138908,-0.007156,0.414854,0.610746,0.207967,-1.396321,0.745185,-0.23374,-0.099438,4.607840,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
92180,-0.759815,-0.956066,-0.107489,-0.667573,-0.984385,-0.356283,-0.502691,-0.712173,-0.084062,-0.508632,-0.376948,-1.341948,-0.23374,-0.099438,-0.217021,-0.428905,1.571506,-0.138104,-0.167855,-0.261627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48289,-0.408702,-0.976890,-0.492314,-0.741784,0.023451,-0.397040,-0.688517,-0.261362,-0.257764,-0.440867,-1.396321,0.745185,-0.23374,-0.099438,-0.217021,2.331521,-0.636332,-0.138104,-0.167855,-0.261627
106253,-0.415368,0.434410,0.526089,0.492934,-0.772884,0.703418,0.426438,0.264583,0.958150,0.442002,-0.376948,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
17105,0.586862,-0.417450,-0.107489,-0.219108,-0.558167,-0.444048,-0.936285,0.039178,-0.778869,-0.500685,-0.886634,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
1292,-0.548703,0.238586,-0.360700,-0.020146,0.409147,-0.301939,-0.131040,-0.186227,-0.778869,-0.416939,0.132739,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,1.571506,-0.138104,-0.167855,-0.261627


In [365]:
# Fit the late / not-late classifier on the standardised training features.
lr.fit(X_train_before_purchase_scaler2, y_train_late['late'])

In [347]:
# Scale the test features with the scaler that was fitted on the training features (no refit).
X_test_before_purchase_scaler = scaler.transform(X_test_before_purchase)

In [348]:
# Wrap the scaled values in a DataFrame; column names and index are restored in the next cells.
X_test_before_purchase_scaler2 = pd.DataFrame(X_test_before_purchase_scaler)

In [349]:
# Restore the feature names the scaler recorded when it was fitted.
X_test_before_purchase_scaler2.columns = scaler.feature_names_in_

In [350]:
# Restore the original row index of the test features.
X_test_before_purchase_scaler2.set_index(X_test_before_purchase.index,inplace=True)

In [351]:
X_test_before_purchase_scaler2

Unnamed: 0,seller_order_count,distance,price,freight_value,shipping_cost_perc,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_size,purchase_dow,top_ten,fashion,consumption,garden,electronics,furniture_appliances,construction,hobby_entertainment,kids
76333,-0.764260,-0.648953,0.162786,-0.372648,-0.985886,-0.220424,-0.131040,-0.336498,-0.257764,-0.363146,0.642425,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
37060,-0.690926,0.247188,-0.217581,-0.292680,-0.408358,-0.424212,-0.440749,1.241340,-1.039422,-0.292019,-1.396321,-1.341948,-0.23374,-0.099438,-0.217021,-0.428905,-0.636332,-0.138104,-0.167855,3.822236
55555,-0.764260,-0.176232,-0.382223,-0.146177,0.362886,-0.261181,-0.069098,-0.186227,-0.605168,-0.372927,-0.376948,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
48680,-0.742038,-0.539940,-0.404737,-0.431505,0.069907,-0.525291,-0.936285,-0.637038,-0.865720,-0.588448,-1.396321,-1.341948,-0.23374,-0.099438,-0.217021,2.331521,-0.636332,-0.138104,-0.167855,-0.261627
59686,-0.684259,-0.442036,-0.189507,-0.498679,-0.689305,-0.522574,-1.122111,-0.561903,-0.431466,-0.571682,0.642425,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,-0.636332,-0.138104,-0.167855,-0.261627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101706,-0.706482,-0.911729,-0.145471,-0.695722,-0.969633,1.491400,1.851103,-0.336498,1.479256,0.600936,1.152111,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,1.571506,-0.138104,-0.167855,-0.261627
47220,-0.739815,-0.973867,0.498566,-0.713635,-1.344544,-0.519314,-0.812401,-0.411633,-0.865720,-0.550418,-1.396321,-1.341948,-0.23374,-0.099438,-0.217021,2.331521,-0.636332,-0.138104,5.957515,-0.261627
1577,-0.746482,-0.419522,-0.564369,-1.164659,-0.953408,-0.505728,-0.874343,-0.336498,-1.039422,-0.564346,-0.886634,0.745185,-0.23374,-0.099438,-0.217021,-0.428905,1.571506,-0.138104,-0.167855,-0.261627
69363,-0.613148,-0.573238,-0.387672,-0.460294,-0.060620,-0.333730,-0.688517,-0.336498,-0.518317,-0.487324,0.642425,0.745185,-0.23374,-0.099438,-0.217021,2.331521,-0.636332,-0.138104,-0.167855,-0.261627


In [366]:
# Predicted late (1) / not late (0) label for each test row.
late_pred = lr.predict(X_test_before_purchase_scaler2)

In [367]:
# (number of test rows, number of rows predicted late)
len(late_pred), late_pred.sum()

(21066, 2)

#### Using RFC

In [369]:
# Random forest classifier with default hyper-parameters; the seed is fixed for reproducibility.
rfc = RandomForestClassifier(random_state=2022)

In [370]:
# Fit on the unscaled before-purchase features; the target is the late flag.
rfc.fit(X_train_before_purchase, y_train_late['late'])

In [371]:
# Predicted late (1) / not late (0) label for each test row from the classifier.
y_pred_late = rfc.predict(X_test_before_purchase)

In [374]:
# Pickle trained model
# with open("RandomForestClassifier.pickle","wb") as f:
#     pickle.dump(rfc, f)

In [377]:
# Store the classifier's label next to the regression-based flags. y_pred_late is a plain
# array, so it is assigned by position.
res['pred_late_flag'] = y_pred_late

In [383]:
# Orders that took longer than the model predicted, but with pred_diff under 5,
# longest deliveries first. Requires res['pred_diff'] to exist.
res[(res['pred_late']==1) & (res['pred_diff']<5)].sort_values('actual_days',ascending=False)

Unnamed: 0,actual_days,estimated_days,pred_days,estimated_diff,pred_diff,estimated_late,pred_late,pred_late_flag
59885,45,17,40.37,28,4.63,1,1,1
59888,45,17,40.37,28,4.63,1,1,1
29970,41,25,40.75,16,0.25,1,1,0
35992,40,47,35.88,7,4.12,0,1,0
62903,36,33,35.28,3,0.72,1,1,1
...,...,...,...,...,...,...,...,...
424,3,10,2.96,7,0.04,0,1,0
56974,3,10,2.92,7,0.08,0,1,0
87236,3,18,2.99,15,0.01,0,1,0
99674,3,16,2.78,13,0.22,0,1,0


In [380]:
# Orders the classifier flagged as late, longest deliveries first.
res[res['pred_late_flag']==1].sort_values('actual_days',ascending=False)

Unnamed: 0,actual_days,estimated_days,pred_days,estimated_diff,pred_diff,estimated_late,pred_late,pred_late_flag
25587,66,52,25.17,14,40.83,1,1,1
88344,66,30,25.82,36,40.18,1,1,1
17885,62,27,43.29,35,18.71,1,1,1
105348,59,24,48.15,35,10.85,1,1,1
72849,57,37,38.89,20,18.11,1,1,1
...,...,...,...,...,...,...,...,...
9845,8,22,23.98,14,15.98,0,0,1
29746,7,22,16.26,15,9.26,0,0,1
51111,6,26,18.35,20,12.35,0,0,1
32291,6,5,5.16,1,0.84,1,1,1


In [425]:
# If we add x days to predicted days
# Share of test orders that would still be late if the prediction, rounded up to whole
# days and padded by 6 days, were used as the estimate.
np.where(res['actual_days']>np.ceil(res['pred_days'])+6, 1, 0).mean()

0.10566790088293933

In [394]:
rfc.feature_names_in_, rfc.feature_importances_

(array(['seller_order_count', 'distance', 'price', 'freight_value',
        'shipping_cost_perc', 'product_weight_g', 'product_length_cm',
        'product_height_cm', 'product_width_cm', 'product_size',
        'purchase_dow', 'top_ten', 'fashion', 'consumption', 'garden',
        'electronics', 'furniture_appliances', 'construction',
        'hobby_entertainment', 'kids'], dtype=object),
 array([0.06220166, 0.20349596, 0.09286278, 0.10883738, 0.11733543,
        0.07929336, 0.04936198, 0.04993556, 0.04624691, 0.06884827,
        0.08271826, 0.00912718, 0.00351976, 0.00115326, 0.00289571,
        0.00579134, 0.00737709, 0.00210946, 0.00259321, 0.00429546]))

## GridSearchCV

In [182]:
# Fresh regressor to be tuned by the grid search below.
rfr2 = RandomForestRegressor(random_state=2022)

In [184]:
# Hyper-parameter grid. Every list holds a single value, so the search below evaluates
# exactly one configuration.
parameters = {
    "n_estimators":[50],
    "max_depth":[30],
    "min_samples_leaf":[30],
    "max_features":[10]
}

In [186]:
# 2-fold cross-validated grid search scored by R^2, run on the before-purchase features.
cv = GridSearchCV(rfr2, parameters, cv=2, scoring="r2")
cv.fit(X_train_before_purchase, y_train['delivery_days'])

In [187]:
# Pull the per-configuration mean and standard deviation of the test score, plus the parameters.
mean_score = cv.cv_results_['mean_test_score']
std_score = cv.cv_results_['std_test_score']
params = cv.cv_results_['params']

# Print one line per evaluated configuration.
for mean, std, param in zip(mean_score, std_score, params):
    print(mean, '+/-', std, 'with params', param)
    
# Show the best configuration as the cell output.
cv.best_params_

0.23910340238179495 +/- 0.0032973290136841804 with params {'max_depth': 30, 'max_features': 10, 'min_samples_leaf': 30, 'n_estimators': 50}


{'max_depth': 30,
 'max_features': 10,
 'min_samples_leaf': 30,
 'n_estimators': 50}

## With Hold-Out Data (Chronological Split)

In [None]:
# Number of rows in the training part: 80% of df_merge3, rounded down.
train_size = int(len(df_merge3) * 0.80)

In [49]:
# Chronological hold-out: sort once by purchase time, train on the earliest rows and test
# on the most recent ones. This rebinds X_train / y_train / X_test / y_test, replacing the
# random split made earlier in the notebook.
df_sorted = df_merge3.sort_values('order_purchase_timestamp')
df_train = df_sorted.iloc[:train_size]
df_test = df_sorted.iloc[train_size:]

# Columns removed from the features, and the columns kept as targets.
holdout_dropped_columns = ['delivery_days','order_delivered_customer_date','order_estimated_delivery_date']
holdout_target_columns = ['delivery_days','order_estimated_delivery_date']

X_train = df_train.drop(holdout_dropped_columns,axis=1)
y_train = df_train[holdout_target_columns]

X_test = df_test.drop(holdout_dropped_columns,axis=1)
y_test = df_test[holdout_target_columns]

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state,order_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,...,product_width_cm,seller_zip_code_prefix,seller_city,seller_state,product_category_name,order_total_price,customer_lat,customer_lng,seller_lat,seller_lng
23098,86dc2ffce2dfff336de2f386a786e574,830d5b7aaa3b6f1e9ad63703bec97d23,14600,sao joaquim da barra,SP,bfbd0f9bdef84302105ad712db648a6c,delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-07 17:11:53,...,16.0,81810.0,curitiba,PR,health_beauty,47.82,-20.581177,-47.858931,-25.507144,-49.272075
23099,86dc2ffce2dfff336de2f386a786e574,830d5b7aaa3b6f1e9ad63703bec97d23,14600,sao joaquim da barra,SP,bfbd0f9bdef84302105ad712db648a6c,delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-07 17:11:53,...,16.0,81810.0,curitiba,PR,health_beauty,47.82,-20.581177,-47.858931,-25.507144,-49.272075
23100,86dc2ffce2dfff336de2f386a786e574,830d5b7aaa3b6f1e9ad63703bec97d23,14600,sao joaquim da barra,SP,bfbd0f9bdef84302105ad712db648a6c,delivered,2016-09-15 12:16:38,2016-09-15 12:16:38,2016-11-07 17:11:53,...,16.0,81810.0,curitiba,PR,health_beauty,47.82,-20.581177,-47.858931,-25.507144,-49.272075
42152,355077684019f7f60a031656bd7262b8,32ea3bdedab835c3aa6cb68ce66565ef,4106,sao paulo,SP,3b697a20d9e427646d92567910af6d57,delivered,2016-10-03 09:44:50,2016-10-06 15:50:54,2016-10-23 14:02:13,...,16.0,85801.0,cascavel,PR,watches_gifts,45.46,-23.580204,-46.629783,-24.961401,-53.458166
60229,7ec40b22510fdbea1b08921dd39e63d8,2f64e403852e6893ae37485d5fcacdaf,98280,panambi,RS,be5bc2f0da14d8071e2d45451ad119d9,delivered,2016-10-03 16:56:50,2016-10-06 16:03:44,2016-10-21 16:33:46,...,16.0,15802.0,catanduva,SP,sports_leisure,39.09,-28.295275,-53.499593,-21.142131,-48.989022
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92417,e60df9449653a95af4549bbfcb18a6eb,5c58de6fb80e93396e2f35642666b693,80045,curitiba,PR,0b223d92c27432930dfe407c6aea3041,delivered,2018-08-29 14:18:23,2018-08-29 14:31:07,2018-08-29 15:29:00,...,49.0,82400.0,curitiba,PR,kitchen_dining_laundry_garden_furniture,255.48,-25.425530,-49.254480,-25.393345,-49.350986
92418,e60df9449653a95af4549bbfcb18a6eb,5c58de6fb80e93396e2f35642666b693,80045,curitiba,PR,0b223d92c27432930dfe407c6aea3041,delivered,2018-08-29 14:18:23,2018-08-29 14:31:07,2018-08-29 15:29:00,...,49.0,82400.0,curitiba,PR,kitchen_dining_laundry_garden_furniture,255.48,-25.425530,-49.254480,-25.393345,-49.350986
534,6e353700bc7bcdf6ebc15d6de16d7002,7febafa06d9d8f232a900a2937f04338,38600,paracatu,MG,168626408cb32af0ffaf76711caae1dc,delivered,2018-08-29 14:18:28,2018-08-29 14:30:23,2018-08-29 18:51:00,...,28.0,30111.0,belo horizonte,MG,health_beauty,61.29,-17.211338,-46.886436,-19.916247,-43.936234
54447,496630b6740bcca28fce9ba50d8a26ef,b701bebbdf478f5500348f03aff62121,9541,sao caetano do sul,SP,03ef5dedbe7492bdae72eec50764c43f,delivered,2018-08-29 14:52:00,2018-08-29 15:05:22,2018-08-29 20:01:00,...,19.0,1320.0,sao paulo,SP,party_supplies,33.23,-23.621055,-46.565101,-23.557338,-46.638777


## Get Distance between Customer and Seller

In [72]:
# Pack each party's coordinates into a (lat, lng) tuple per row, the form passed to
# `distance` in the next cell.
df_merge3['customer_location'] = list(zip(df_merge3['customer_lat'], df_merge3['customer_lng']))
df_merge3['seller_location'] = list(zip(df_merge3['seller_lat'], df_merge3['seller_lng']))

In [78]:
# Customer-to-seller distance in kilometres, computed row by row with geopy's `distance`.
df_merge3['distance'] = df_merge3.apply(lambda row: distance(row['customer_location'], row['seller_location']).km, axis=1)

## Get Order Count per Day per Seller

In [97]:
# Calendar date of each purchase, used as a grouping key for the per-day seller order count.
# Calling .nunique() on it would store one scalar (the number of distinct dates) in every
# row and make the per-day grouping in the next cell meaningless.
df_merge3['date'] = df_merge3['order_purchase_timestamp'].dt.date

In [130]:
# Number of rows sharing the same ('date', 'seller_id') pair, broadcast back to every row.
df_merge3['seller_order_count'] = df_merge3.groupby(['date','seller_id'])['order_id'].transform('count')

## Get Purchase Approval and Delivered Carrier Duration -> Continue From Here

In [230]:
# Whole calendar days between purchase and approval (time of day is discarded by .dt.date).
# The commented line below is an alternative that measures the gap in hours.
df_merge3['order_approval_duration'] = (df_merge3['order_approved_at'].dt.date - df_merge3['order_purchase_timestamp'].dt.date).dt.days
#df_merge3['order_approval_duration'] = (df_merge3['order_approved_at'] - df_merge3['order_purchase_timestamp'])/np.timedelta64(1,'h')

In [258]:
# Whole calendar days between purchase and hand-over to the carrier.
# The commented line below is an alternative that measures the gap in hours.
df_merge3['order_carrier_duration'] = (df_merge3['order_delivered_carrier_date'].dt.date - df_merge3['order_purchase_timestamp'].dt.date).dt.days
#df_merge3['order_carrier_duration'] = (df_merge3['order_delivered_carrier_date'] - df_merge3['order_purchase_timestamp'])/np.timedelta64(1,'h')

## Data Preparation & Manipulation

In [210]:
# 'order_approval_duration', 'order_carrier_duration' -> improve estimated delivery date?
# Three views of df_merge3, one per point in the order lifecycle; each ends with the target.
# NOTE(review): no visible cell adds 'shipping_cost_perc', 'product_size' or 'purchase_dow'
# to df_merge3 (feature_engineering is only applied to X_train / X_test) — confirm they exist.
stage_base_columns = ['seller_order_count', 'distance', 'product_category_name', 'price', 'freight_value', 'shipping_cost_perc', 'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm', 'product_size', 'purchase_dow']

df_before_purchase = df_merge3[stage_base_columns + ['delivery_days']]
df_after_approval = df_merge3[stage_base_columns + ['order_approval_duration', 'delivery_days']]
df_carrier_received = df_merge3[stage_base_columns + ['order_approval_duration', 'order_carrier_duration', 'delivery_days']]

In [211]:
# Fill/drop blank column
# Missing category names become the placeholder 'NC'; rows missing the weight, the carrier
# duration or the target are dropped. The frame is reassigned rather than using
# fillna(inplace=True) on a column of a sliced frame, which can act on a temporary copy
# and leave df_carrier_received unchanged.
df_carrier_received = (
    df_carrier_received
    .assign(product_category_name=lambda frame: frame['product_category_name'].fillna('NC'))
    .dropna(subset=['product_weight_g', 'order_carrier_duration', 'delivery_days'])
)

In [212]:
# Split the column names by dtype: columns whose dtype is in `numerics` are numerical,
# every other column is treated as categorical.
numerics = ['int', 'int64', 'float64']
numerical = df_carrier_received.select_dtypes(include=numerics).columns
categorical = df_carrier_received.drop(numerical,axis=1).columns

In [213]:
df_carrier_received

Unnamed: 0,seller_id,seller_order_count,distance,product_category_name,price,freight_value,shipping_cost_perc,product_weight_g,product_length_cm,product_height_cm,product_width_cm,product_size,purchase_dow,order_approval_duration,order_carrier_duration,delivery_days
0,7c67e1448b00f6e969d365cea6b010ab,2,345.713577,office_furniture,124.99,21.88,0.148975,8683.0,54.0,64.0,31.0,107136.0,1,0.011539,6.821088,8.0
1,b8bc237ba3788b23da09c0f1f3a3288c,1,413.009607,housewares,289.00,46.48,0.138548,10150.0,89.0,15.0,40.0,53400.0,4,0.007037,2.851794,16.0
2,7c67e1448b00f6e969d365cea6b010ab,3,29.618246,office_furniture,139.94,17.79,0.112788,8267.0,52.0,52.0,17.0,45968.0,5,1.007928,22.932813,26.0
3,7c67e1448b00f6e969d365cea6b010ab,3,19.385057,office_furniture,149.94,23.36,0.134795,12160.0,56.0,51.0,28.0,79968.0,1,0.057419,14.302824,14.0
4,4a3ca9315b744ce9f8e9374361493884,2,219.599199,home_confort,230.00,22.25,0.088206,5200.0,45.0,15.0,35.0,23625.0,6,0.012951,1.225347,11.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113420,527801b552d0077ffd170872eb49683b,1,356.361343,books_general_interest,74.90,13.88,0.156342,611.0,22.0,22.0,23.0,11132.0,5,0.014213,3.430775,6.0
113421,3fd1e727ba94cfe122d165e176ce7967,1,250.847050,sports_leisure,114.90,14.16,0.109716,1211.0,25.0,24.0,22.0,13200.0,2,0.010301,1.432095,7.0
113422,d9e7e7778b32987280a6f2cb9a39c57d,1,2356.429965,health_beauty,37.00,19.04,0.339757,870.0,25.0,20.0,18.0,9000.0,6,0.012650,0.903090,30.0
113423,4869f7a5dfa277a7dca6462dcf3b52b2,1,996.494404,watches_gifts,689.00,22.07,0.031038,710.0,19.0,13.0,14.0,3458.0,4,0.015822,2.886204,12.0


In [216]:
categorical

Index(['seller_id', 'product_category_name'], dtype='object')

In [None]:
## Train test split 80, 20

In [None]:
## One Hot Encoding

In [None]:
## GridSearchCV

In [None]:
# Path 1

In [None]:
## RandomForestRegressor -> Delivery Duration Prediction -> Check against actual delivery duration -> Late or Not Late

In [None]:
## MSE/MAPE

In [None]:
## Feature Importance

In [None]:
# Path 2

In [None]:
## LogReg -> Late or Not Late

In [None]:
## Accuracy

In [None]:
## Feature Importance