In [None]:
!pip install geopy

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src import config
from geopy.distance import distance

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, fbeta_score, recall_score, precision_score
import pickle

import warnings
warnings.filterwarnings("ignore")

In [None]:
def feature_engineering(df):
    """Derive the order-level features used by the models in this notebook.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``freight_value``, ``order_total_price``, ``order_id``,
        ``seller_id``, ``customer_lat``/``customer_lng``,
        ``seller_lat``/``seller_lng``, ``product_length_cm``,
        ``product_width_cm``, ``product_height_cm`` and the datetime columns
        ``order_purchase_timestamp``, ``order_approved_at`` and
        ``order_delivered_carrier_date``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with these columns added (or overwritten):
        ``shipping_cost_perc``, ``purchase_dow``, ``customer_location``,
        ``seller_location``, ``distance``, ``date``, ``seller_order_count``,
        ``order_approval_duration``, ``order_carrier_duration`` and
        ``product_size``.
    """
    df = df.copy()
    purchase_ts = df['order_purchase_timestamp']
    # Purchase timestamp floored to midnight. Subtracting two normalized
    # timestamps yields a timedelta Series of whole calendar days, so `.dt.days`
    # does not depend on pandas inferring a timedelta dtype from date objects.
    # NOTE(review): assumes tz-naive timestamps — confirm.
    purchase_day = purchase_ts.dt.normalize()

    # Get Shipping Cost Percentage (stored as a ratio, not multiplied by 100)
    df['shipping_cost_perc'] = df['freight_value'] / df['order_total_price']

    # Get Purchase Day of Week
    df['purchase_dow'] = purchase_ts.dt.dayofweek

    # Get Distance Between Customer and Seller (kilometres, via geopy `.km`)
    df['customer_location'] = list(zip(df['customer_lat'], df['customer_lng']))
    df['seller_location'] = list(zip(df['seller_lat'], df['seller_lng']))
    df['distance'] = [
        distance(customer, seller).km
        for customer, seller in zip(df['customer_location'], df['seller_location'])
    ]

    # Get Order Count per Day per Seller.
    # `date` must hold each row's own purchase date: a single scalar here would
    # turn the grouping below into a plain per-seller count.
    df['date'] = purchase_ts.dt.date
    df['seller_order_count'] = df.groupby(['date', 'seller_id'])['order_id'].transform('count')

    # Get Purchase Approval and Delivered Carrier Duration (calendar days)
    df['order_approval_duration'] = (df['order_approved_at'].dt.normalize() - purchase_day).dt.days
    df['order_carrier_duration'] = (df['order_delivered_carrier_date'].dt.normalize() - purchase_day).dt.days

    # Get Product Size (length * width * height)
    df['product_size'] = df['product_length_cm'] * df['product_width_cm'] * df['product_height_cm']

    return df

In [None]:
def product_categorization(df):
    """Add 0/1 flag columns for broad product groups.

    Each flag is 1 when ``product_category_name`` matches the group's regular
    expression anywhere in the string, else 0. A category may match several
    groups ('console' appears in both ``electronics`` and
    ``hobby_entertainment``). Rows with a missing category name get 0 for
    every flag.

    The input frame is copied first, so the caller's DataFrame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``product_category_name``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with eight flag columns appended in this order:
        fashion, consumption, garden, electronics, furniture_appliances,
        construction, hobby_entertainment, kids.
    """
    # Insertion order defines the column order; callers slice the trailing
    # columns of the result, so keep it stable.
    # NOTE(review): 'fashio' and 'confort' are kept verbatim — presumably they
    # match category names spelled that way in the data; verify.
    category_patterns = {
        'fashion': r'(?:fashio|perfumery)',
        'consumption': r'(?:food|drink|cuisine)',
        'garden': r'(?:garden|flower)',
        'electronics': r'(?:electronic|computer|telephony|console|audio)',
        'furniture_appliances': r'(?:furniture|appliance|bed|houseware|air_conditioning|table|kitchen|comfort|confort)',
        'construction': r'(?:construction)',
        'hobby_entertainment': r'(?:console|book|music|art|dvd)',
        'kids': r'(?:baby|toy|diaper|children)',
    }

    df = df.copy()
    category_name = df['product_category_name']
    for flag_column, pattern in category_patterns.items():
        # na=False: without it a missing name yields NaN, which np.where
        # treats as truthy and would set every flag to 1.
        df[flag_column] = np.where(category_name.str.contains(pattern, na=False), 1, 0)

    return df

## Read Dataset

In [None]:
df = pd.read_parquet(config.INT_FILE_PATH / 'transactions.parquet')

In [None]:
df_geo = pd.read_csv(config.RAW_FILE_PATH / 'olist_geolocation_dataset.csv')

In [None]:
df_geo_clean = df_geo.drop_duplicates(subset=['geolocation_zip_code_prefix','geolocation_city','geolocation_state'],keep='first')

In [None]:
df_review = pd.read_csv(config.RAW_FILE_PATH / 'olist_order_reviews_dataset.csv')

## Filter only original data

In [None]:
# Keep every column except the last six, then drop rows with any missing value.
# NOTE(review): which six columns are cut is not visible in this notebook —
# presumably fields appended upstream to transactions.parquet; confirm.
df_original = df.iloc[:,:-6].dropna()

## Get Customer Lat Lng

In [None]:
df_merge = df_original.merge(df_geo_clean, how='left', left_on=['customer_zip_code_prefix', 'customer_city', 'customer_state'], right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])

In [None]:
# Discard the geolocation join keys and label the coordinates as the customer's.
df_merge = (
    df_merge
    .drop(columns=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])
    .rename(columns={'geolocation_lat': 'customer_lat', 'geolocation_lng': 'customer_lng'})
)

## Get Seller Lat Lng

In [None]:
df_merge2 = df_merge.merge(df_geo_clean, how='left', left_on=['seller_zip_code_prefix', 'seller_city', 'seller_state'], right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])

In [None]:
# Discard the geolocation join keys and label the coordinates as the seller's.
df_merge2 = (
    df_merge2
    .drop(columns=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])
    .rename(columns={'geolocation_lat': 'seller_lat', 'geolocation_lng': 'seller_lng'})
)

## Remove any Missing Lat Lng

In [None]:
# Boolean mask: True where at least one of the four coordinates is missing.
df_exclude = df_merge2[['customer_lat', 'customer_lng', 'seller_lat', 'seller_lng']].isna().any(axis=1)

In [None]:
# Explicit copy: later cells assign new columns to df_merge3, which on a
# boolean-indexed slice triggers pandas' chained-assignment warning.
df_merge3 = df_merge2[~df_exclude].copy()

## Correlation

In [None]:
df_corr = feature_engineering(df_merge3)

In [None]:
# Calendar-day differences relative to the purchase date. Timestamps are
# floored to midnight with .dt.normalize() so the subtraction is a timedelta
# Series and `.dt.days` does not rely on dtype inference from date objects.
# NOTE(review): assumes tz-naive timestamps — confirm.
df_corr['delivery_days'] = (df_corr['order_delivered_customer_date'].dt.normalize() - df_corr['order_purchase_timestamp'].dt.normalize()).dt.days
df_corr['estimated_days'] = (df_corr['order_estimated_delivery_date'].dt.normalize() - df_corr['order_purchase_timestamp'].dt.normalize()).dt.days

In [None]:
df_corr['late_flag'] = np.where(df_corr['delivery_days']>df_corr['estimated_days'],1,0)

In [None]:
# Left-join the review score onto each row. No `on=` is given, so pandas joins
# on the columns the two frames have in common.
# NOTE(review): presumably that is only `order_id`; an order with several
# reviews would produce several rows here — confirm.
df_corr2 = df_corr.merge(df_review[['order_id','review_score']],how='left')

In [None]:
# Removing unnecessary columns for correlation
df_corr2.drop(['date','seller_lng','seller_lat','customer_lng','customer_lat','seller_zip_code_prefix','customer_zip_code_prefix'],axis=1,inplace=True)

In [None]:
# Correlation matrix over numeric/boolean columns only, in reversed column
# order; computed once and reused for both the mask and the heatmap.
heatmap_corr = df_corr2[df_corr2.columns[::-1]].select_dtypes(include=['number', 'bool']).corr()

# Mask the upper triangle (diagonal included) so each pair is drawn once.
# Built with the builtin `bool`: the `np.bool` alias no longer exists in NumPy.
mask = np.triu(np.ones(heatmap_corr.shape, dtype=bool))

plt.figure(figsize=(25, 15))
sns.heatmap(heatmap_corr, cmap=sns.diverging_palette(20, 220, n=200), annot=True, mask=mask, center=0)
plt.title("Features Correlation Heatmap", fontsize = 30)
plt.savefig('Features Correlation Heatmap.png',bbox_inches='tight',facecolor='white')
plt.show()

## Review Score Top 10 Correlation

In [None]:
# Restrict to numeric/boolean columns explicitly: recent pandas no longer drops
# non-numeric columns silently in DataFrame.corr().
corr = df_corr2[df_corr2.columns[::-1]].select_dtypes(include=['number', 'bool']).corr()

In [None]:
corr['review_score'].sort_values(ascending=True).head(10)

From the above correlation, we can see that late flag, delivery days, order carrier duration and estimated days are among the five features most negatively correlated with review score. Delivery-related components are the main drivers of low review scores.

## Estimated Days % share and MAE

In [None]:
df_corr2['late_flag'].sum()/df_corr2.shape[0]

In [None]:
mean_absolute_error(df_corr2['delivery_days'],df_corr2['estimated_days'])

We can see that roughly 6% of products are delivered late. This looks quite low, but if we look at the estimated delivery days, the MAE against the actual delivery days is 13-14 days, which is relatively high. It is likely that the company tries to avoid late deliveries by adding buffer days to the estimated delivery days.

## Estimated Delivery Review Score Trend

In [None]:
est_del_review = pd.DataFrame({'min_estimated_days':[0,10,20,30,40,50], 'max_estimated_days':[10,20,30,40,50,np.inf]})

In [None]:
est_del_review['review_score'] = est_del_review.apply(lambda x: df_corr2[(df_corr2['estimated_days'] > x['min_estimated_days']) & (df_corr2['estimated_days'] <= x['max_estimated_days'])].review_score.mean(), axis=1)

In [None]:
est_del_review

From the above table we could see that the longer the estimated delivery days, the lower the review score. Hence, the strategy to increase estimated delivery days might not be the best solution as it still impacts the review score.

Due to this reason we would like to propose a prediction model to better estimate the delivery duration as well as to avoid late delivery as much as possible. It is important to strike a balance between the estimated delivery duration and late delivery.

## Late Days Review Score Trend

In [None]:
late_days = df_corr2[df_corr2['late_flag']==1][['review_score','delivery_days','estimated_days']]

In [None]:
late_days['late_days'] = late_days['delivery_days'] - late_days['estimated_days']

In [None]:
bins = [0,10,20,30,40,50,60,70,80,90,100,np.inf]
late_days['late_days_bin'] = pd.cut(late_days['late_days'],bins)

In [None]:
late_days.groupby('late_days_bin').review_score.mean().reset_index()

In [None]:
late_days.late_days_bin.value_counts()

From the above we can see that the review score decreases as the delay grows from 0 to 40 days. Strangely, the trend turns upward for deliveries that are more than 40 days late. There are relatively few late deliveries beyond 40 days, so we assume that the average review score in those bins is not representative.

## Late Flag based on Delivery Days

In [None]:
df_corr2[df_corr2['late_flag']==0].delivery_days.max()

In [None]:
df_corr2[df_corr2['delivery_days']>70].shape[0]/df_corr2.shape[0]

In [None]:
df_corr2[df_corr2['delivery_days']>70].shape[0]/df_corr2[df_corr2['late_flag']==1].shape[0]

In [None]:
# x/y are passed by keyword: current seaborn accepts only `data` positionally.
sns.boxplot(x=df_corr2['late_flag'], y=df_corr2['delivery_days'])
plt.savefig('Late Days.png',bbox_inches='tight',facecolor='white')
plt.show()

In addition, we also see that deliveries taking 70 days or more were all estimated incorrectly. Hence, all products that are delivered beyond 70 days are considered late deliveries. <br>
There is also not enough information as to why there are cases with a very long delivery duration. Perhaps one data that could help with this is mode of delivery information (train, ship, flight, etc.). However, there is only 0.1% of such cases in the entire dataset or 2% of the entire late delivery.

## Train Test Split

In [None]:
# Targets as calendar-day differences from the purchase date. Timestamps are
# floored to midnight with .dt.normalize() so the subtraction is a timedelta
# Series and `.dt.days` does not rely on dtype inference from date objects.
# NOTE(review): assumes tz-naive timestamps — confirm.
df_merge3['delivery_days'] = (df_merge3['order_delivered_customer_date'].dt.normalize() - df_merge3['order_purchase_timestamp'].dt.normalize()).dt.days
df_merge3['estimated_days'] = (df_merge3['order_estimated_delivery_date'].dt.normalize() - df_merge3['order_purchase_timestamp'].dt.normalize()).dt.days

In [None]:
X = df_merge3.drop(['delivery_days','estimated_days','order_delivered_customer_date','order_estimated_delivery_date'],axis=1)

In [None]:
y = df_merge3[['delivery_days','estimated_days']]

We split the data into 80% training and 20% testing

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=2022)

In [None]:
# Flag Top 10 Items sold in the app
# NOTE(review): the ranking is computed on the full `df`, i.e. on rows from
# both the training and the test split — confirm this is acceptable.
revenue_by_category = df.groupby('product_category_name')['order_total_price'].sum()
top_products = revenue_by_category.nlargest(10).index

# 1 when the row's category is one of the ten highest-revenue categories.
for _split in (X_train, X_test):
    _split['top_ten'] = np.where(_split['product_category_name'].isin(top_products), 1, 0)

In [None]:
# Feature Engineering
X_train2 = feature_engineering(X_train)
X_test2 = feature_engineering(X_test)

In [None]:
# Get length before further product categorization
train_len = X_train2.shape[1]
test_len = X_test2.shape[1]

In [None]:
# Additional Feature Engineering on Further Product Categorization
X_train3 = product_categorization(X_train2)
X_test3 = product_categorization(X_test2)

In [None]:
# Get length of categorization columns
prod_cat_len_train = X_train3.shape[1] - train_len
prod_cat_len_test = X_test3.shape[1] - test_len

In [None]:
# Columns shared by both training feature sets.
_shared_cols = [
    'seller_order_count', 'distance', 'price', 'freight_value', 'shipping_cost_perc',
    'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm',
    'product_size', 'purchase_dow',
]
# Extra columns used only by the carrier-received feature set.
_carrier_cols = ['order_approval_duration', 'order_carrier_duration']
# Trailing columns added by product_categorization.
_category_cols = list(X_train3.columns[-prod_cat_len_train:])

X_train_before_purchase = X_train3[_shared_cols + ['top_ten'] + _category_cols]
X_train_carrier_received = X_train3[_shared_cols + _carrier_cols + ['top_ten'] + _category_cols]

In [None]:
# Columns shared by both test feature sets.
_shared_cols = [
    'seller_order_count', 'distance', 'price', 'freight_value', 'shipping_cost_perc',
    'product_weight_g', 'product_length_cm', 'product_height_cm', 'product_width_cm',
    'product_size', 'purchase_dow',
]
# Extra columns used only by the carrier-received feature set.
_carrier_cols = ['order_approval_duration', 'order_carrier_duration']
# Trailing columns added by product_categorization.
_category_cols = list(X_test3.columns[-prod_cat_len_test:])

X_test_before_purchase = X_test3[_shared_cols + ['top_ten'] + _category_cols]
X_test_carrier_received = X_test3[_shared_cols + _carrier_cols + ['top_ten'] + _category_cols]

## Train Estimated Delivery Duration Model

### Using Random Forest Regressor

#### Training Steps (run if pickled model is not available, else skip this section)

In [None]:
rfr = RandomForestRegressor(n_jobs=-1, random_state=2022)

In [None]:
rfr.fit(X_train_before_purchase, y_train['delivery_days'])

In [None]:
# Pickle trained model
with open("RandomForestRegressor.pickle","wb") as f:
    pickle.dump(rfr, f)

#### End of Training Steps

#### Load Pickled Training Model

In [None]:
# Context manager so the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load model files written by this notebook.
with open("RandomForestRegressor.pickle",'rb') as f:
    rfr = pickle.load(f)

#### End of Load Training Model

In [None]:
y_pred = rfr.predict(X_test_before_purchase)

In [None]:
res = pd.DataFrame({'actual_days':y_test['delivery_days'], 'estimated_days':y_test['estimated_days'], 'pred_days':y_pred})

### Compare MAE and RMSE between Prediction Model and Original Estimated Delivery Days

In [None]:
mean_absolute_error(y_test['delivery_days'],y_pred)

In [None]:
mean_absolute_error(y_test['delivery_days'],y_test['estimated_days'])

In [None]:
# RMSE as sqrt(MSE); avoids the `squared=` argument, which newer scikit-learn
# releases deprecate.
np.sqrt(mean_squared_error(y_test['delivery_days'], y_pred))

In [None]:
# RMSE as sqrt(MSE); avoids the `squared=` argument, which newer scikit-learn
# releases deprecate.
np.sqrt(mean_squared_error(y_test['delivery_days'], y_test['estimated_days']))

From the above result, we can see that our prediction model gives a tighter estimate of the delivery duration compared to the original estimated days. Our prediction is on average 8 days closer to the actual delivery days.

### Compare % share of Late Delivery between Prediction Model and Original Estimated Delivery Days

In [None]:
res['estimated_late'] = np.where(res['actual_days']>res['estimated_days'], 1, 0)
res['pred_late'] = np.where(res['actual_days']>res['pred_days'], 1, 0)

In [None]:
res.estimated_late.mean(), res.pred_late.mean()

From the above result, we can see that 37% of orders would be late relative to the delivery days predicted by the model. One reason is that the model tries to estimate the exact delivery days and errs in both directions, without distinguishing late from early delivery. Because of this, we add a buffer to the predicted days; in our case it is 6 days (the MAE of roughly 5 days rounded up, i.e. we assume the prediction is within +/- 5 days of the actual delivery days).

### Add Buffer to Model Prediction

In [None]:
buffer = np.ceil(mean_absolute_error(y_test['delivery_days'],y_pred))

In [None]:
res['pred_late'] = np.where(res['actual_days']>res['pred_days'] + buffer, 1, 0)

In [None]:
res.estimated_late.mean(), res.pred_late.mean()

In [None]:
mean_absolute_error(y_test['delivery_days'], y_pred+buffer)

In [None]:
# RMSE as sqrt(MSE); avoids the `squared=` argument, which newer scikit-learn
# releases deprecate.
np.sqrt(mean_squared_error(y_test['delivery_days'], y_pred+buffer))

With the addition of the buffer, we can see that the share of late deliveries is reduced to only around 12%. However, this is still considered high. Thus, we would like to improve on it by predicting late delivery after the item is received by the carrier.<br>
On the other hand, the MAE is still lower than that of the original estimated delivery days by 5 days.

### Feature Importance

In [None]:
feature_importance = pd.DataFrame({'feature':rfr.feature_names_in_, 'score':rfr.feature_importances_})

In [None]:
feature_importance.sort_values('score',ascending=False).head(10)

From the above feature importance, we can see that distance is the main factor in how the delivery duration is estimated. It is followed by freight value and shipping cost percentage. These two factors are related to the cost of freight: the more expensive it is, the more premium the delivery method.

### Using XGBoost Regressor

In [None]:
import xgboost as xgb

In [None]:
xg_reg = xgb.XGBRegressor()

In [None]:
xg_reg.fit(X_train_before_purchase, y_train['delivery_days'])

In [None]:
y_pred_xgb = xg_reg.predict(X_test_before_purchase)

### Compare MAE and RMSE between Prediction Model and Original Estimated Delivery Days

In [None]:
res['pred_days_xgb'] = y_pred_xgb

In [None]:
mean_absolute_error(y_test['delivery_days'],y_pred_xgb)

In [None]:
# RMSE as sqrt(MSE); avoids the `squared=` argument, which newer scikit-learn
# releases deprecate.
np.sqrt(mean_squared_error(y_test['delivery_days'], y_pred_xgb))

### Compare % share of Late Delivery between Prediction Model and Original Estimated Delivery Days

In [None]:
res['pred_late_xgb'] = np.where(res['actual_days']>res['pred_days_xgb'], 1, 0)

In [None]:
res.estimated_late.mean(), res.pred_late_xgb.mean()

### Add Buffer to Model Prediction

In [None]:
buffer = np.ceil(mean_absolute_error(y_test['delivery_days'],y_pred_xgb))

In [None]:
mean_absolute_error(y_test['delivery_days'],y_pred_xgb+buffer)

In [None]:
# RMSE as sqrt(MSE); avoids the `squared=` argument, which newer scikit-learn
# releases deprecate.
np.sqrt(mean_squared_error(y_test['delivery_days'], y_pred_xgb+buffer))

In [None]:
res['pred_late_xgb'] = np.where(res['actual_days']>res['pred_days_xgb'] + buffer, 1, 0)

In [None]:
res.estimated_late.mean(), res.pred_late_xgb.mean()

As we can see, the result is slightly worse than Random Forest model, hence we will use Random Forest model as our predictor.

In [None]:
feature_importance_xg_reg = pd.DataFrame({'feature':xg_reg.feature_names_in_, 'score':xg_reg.feature_importances_})

In [None]:
feature_importance_xg_reg.sort_values('score',ascending=False).head(10)

We can also see that distance is the main factor in how the delivery duration is estimated. However, hobby_entertainment comes next, and it is difficult to justify why this is so. Hence, we will proceed with Random Forest as our predictor.

## Late Prediction After Carrier Received

### Using Random Forest Classifier

In [None]:
# Assuming estimated days are the predicted days for now
y_train['late'] = np.where(y_train['delivery_days'] > y_train['estimated_days'], 1, 0)

#### Training Steps (run if pickled model is not available, else skip this section)

In [None]:
rfc = RandomForestClassifier(random_state=2022)

In [None]:
# NOTE(review): the classifier is fitted on the *test* split, using
# res['pred_late'] as labels, and the prediction cell further down scores these
# same rows — those predictions are therefore in-sample. Confirm this is intended.
# Alternative kept for reference: rfc.fit(X_train_carrier_received, y_train['late'])
rfc.fit(X_test_carrier_received, res['pred_late'])

In [None]:
# Pickle trained model
with open("RandomForestClassifier_carrier.pickle","wb") as f:
    pickle.dump(rfc, f)

#### End of Training Steps

#### Load Pickled Training Model

In [None]:
# Context manager so the file handle is closed after loading.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load model files written by this notebook.
with open("RandomForestClassifier_carrier.pickle",'rb') as f:
    rfc = pickle.load(f)

#### End of Load Training Model

### Predict Late Delivery

In [None]:
y_pred_late = rfc.predict(X_test_carrier_received)

In [None]:
y_pred_late.sum()

Note that the number of predicted late delivery is relatively low due to low number of deliveries from the source (imbalance data).

In the following case, we would like to implement a strategy to send early notification to inform customer regarding the late delivery. We could also send a voucher to the customer to minimize the damage. With this, we hope that the customer expectation could be readjusted and thus improve customer review score.

In [None]:
res['pred_late_flag'] = y_pred_late

In [None]:
# For all pred late flag, customer will be informed for the delay
# The global `buffer` was rebound from the XGBoost MAE in the XGBoost regressor
# section, so the Random Forest buffer is recomputed here from the Random
# Forest predictions it is added to.
rf_buffer = np.ceil(mean_absolute_error(y_test['delivery_days'], y_pred))
# NOTE(review): flagged rows take `actual_days` as the adjusted estimate, i.e.
# the true outcome is used for them — confirm this matches the intended scenario.
res['pred_days_adjusted'] = np.where(res['pred_late_flag']==1, res['actual_days'], res['pred_days']+rf_buffer)

In [None]:
# If we add x days to predicted days
np.where(res['actual_days']>np.ceil(res['pred_days_adjusted']), 1, 0).mean()

With the above implementation, we could achieve a late rate of just below 10%. Although this is still slightly higher than that of the original estimated delivery, we manage to strike a balance between the estimated delivery days and late orders.

### Using XGBoost Classifier

In [None]:
xg_cls = xgb.XGBClassifier()

In [None]:
xg_cls.fit(X_train_carrier_received, y_train['late'])

### Predict Late Delivery

In [None]:
y_pred_late_xgb = xg_cls.predict(X_test_carrier_received)

In [None]:
y_pred_late_xgb.sum()

In [None]:
res['pred_late_flag_xgb'] = y_pred_late_xgb

In [None]:
# For all pred late flag, customer will be informed for the delay
res['pred_days_adjusted_xgb'] = np.where(res['pred_late_flag_xgb']==1, res['actual_days'], res['pred_days_xgb']+buffer)

In [None]:
# If we add x days to predicted days
np.where(res['actual_days']>np.ceil(res['pred_days_adjusted_xgb']), 1, 0).mean()

Using XGBoost, the late rate is still hovering around 11%-12% and is thus inferior compared to the Random Forest model. Thus, we will use the Random Forest as our predictor for late prediction as well.

## F1 / F2 Score

In [None]:
rf_late = np.where(res['actual_days']>np.ceil(res['pred_days_adjusted']), 1, 0)

In [None]:
fbeta_score(res['estimated_late'], rf_late, beta=1)

In [None]:
fbeta_score(res['estimated_late'], rf_late, beta=2)

We also measure the F1 and F2 score. F1 score measures the Harmonic Mean where Precision and Recall are equally weighted, while F2 score measures Harmonic Mean where Recall is being weighted heavier compared to Precision. From the above, we can see that the F1 and F2 score is 59% and 71% respectively.

Precision: Measuring how many late prediction is correctly predicted<br>
Recall: Measuring how many actual late is predicted correctly

In [None]:
precision_score(res['estimated_late'], res['pred_late_flag'])

In [None]:
recall_score(res['estimated_late'], res['pred_late_flag'])

In [None]:
fbeta_score(res['estimated_late'], res['pred_late_flag'], beta=1)

In [None]:
fbeta_score(res['estimated_late'], res['pred_late_flag'], beta=2)

## Late Prediction After Carrier Received with Model Prediction Data (Check)

### Using Random Forest Classifier

In [None]:
new_train_cls = pd.concat([X_test_carrier_received, res['pred_late']],axis=1)

In [None]:
new_train_cls.columns

In [None]:
new_X = new_train_cls.drop('pred_late',axis=1)
new_y = new_train_cls['pred_late']

In [None]:
new_X_train, new_X_test, new_y_train, new_y_test = train_test_split(new_X, new_y, train_size=0.8, test_size=0.2, random_state=2022)

#### Training Steps (run if pickled model is not available, else skip this section)

In [None]:
rfc = RandomForestClassifier(random_state=2022)

In [None]:
#rfc.fit(X_train_carrier_received, y_train['late'])
rfc.fit(new_X_train, new_y_train)

In [None]:
# Pickle trained model
with open("RandomForestClassifier_carrier_new.pickle","wb") as f:
    pickle.dump(rfc, f)

#### End of Training Steps

#### Load Pickled Training Model

In [None]:
# Load the classifier pickled by this section's training cell
# ("RandomForestClassifier_carrier_new.pickle"). The file name used before,
# "RandomForestClassifier_carrier.pickle", belongs to the earlier section's
# model, which was fitted on all of X_test_carrier_received — the frame that
# new_X_test is split from.
# NOTE(review): pickle.load can execute arbitrary code from the file — only
# load model files written by this notebook.
with open("RandomForestClassifier_carrier_new.pickle",'rb') as f:
    rfc = pickle.load(f)

#### End of Load Training Model

### Predict Late Delivery

In [None]:
new_y_pred_late = rfc.predict(new_X_test)

In [None]:
new_y_pred_late.sum()

In [None]:
fbeta_score(new_y_test, new_y_pred_late, beta=1)

In [None]:
fbeta_score(new_y_test, new_y_pred_late, beta=2)

Note that the number of predicted late delivery is relatively low because the number of late deliveries are very low from the source (imbalance data).

In [None]:
new_res = pd.DataFrame({'pred_late':new_y_test,'new_pred_late':new_y_pred_late})

In [None]:
new_res[new_res['new_pred_late']==1].pred_late.value_counts()