In [None]:
!pip install geopy

In [None]:
import numpy as np
import pandas as pd
from src import config
from geopy.distance import distance

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# so any pandas warnings emitted by later cells are hidden as well.
warnings.filterwarnings("ignore")

In [None]:
# Load the transactions table from the intermediate data directory.
transactions_path = config.INT_FILE_PATH / 'transactions.parquet'
df = pd.read_parquet(transactions_path)

In [None]:
# Load the raw Olist geolocation CSV (zip prefix / city / state with lat-lng points).
geolocation_path = config.RAW_FILE_PATH / 'olist_geolocation_dataset.csv'
df_geo = pd.read_csv(geolocation_path)

In [None]:
# Keep a single row per (zip prefix, city, state): the first one encountered.
# Its lat/lng therefore stands in for every point recorded under that key.
geo_key_columns = [
    'geolocation_zip_code_prefix',
    'geolocation_city',
    'geolocation_state',
]
df_geo_clean = df_geo.drop_duplicates(subset=geo_key_columns, keep='first')

### Get Customer Lat Lng

In [None]:
# Left-join the de-duplicated geolocation rows onto the transactions using the
# customer's (zip prefix, city, state); unmatched customers get NaN coordinates.
df_merge = pd.merge(
    df,
    df_geo_clean,
    how='left',
    left_on=['customer_zip_code_prefix', 'customer_city', 'customer_state'],
    right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'],
)

In [None]:
# Remove the geolocation join keys brought in by the merge and label the joined
# coordinates as belonging to the customer.
df_merge = (
    df_merge
    .drop(columns=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])
    .rename(columns={'geolocation_lat': 'customer_lat', 'geolocation_lng': 'customer_lng'})
)

### Get Seller Lat Lng

In [None]:
# Same geolocation lookup as for customers, this time keyed on the seller's
# (zip prefix, city, state); unmatched sellers get NaN coordinates.
df_merge2 = pd.merge(
    df_merge,
    df_geo_clean,
    how='left',
    left_on=['seller_zip_code_prefix', 'seller_city', 'seller_state'],
    right_on=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'],
)

In [None]:
# Remove the geolocation join keys brought in by the merge and label the joined
# coordinates as belonging to the seller.
df_merge2 = (
    df_merge2
    .drop(columns=['geolocation_zip_code_prefix', 'geolocation_city', 'geolocation_state'])
    .rename(columns={'geolocation_lat': 'seller_lat', 'geolocation_lng': 'seller_lng'})
)

### Get Distance between Customer and Seller

In [None]:
# Boolean mask: True for rows where any of the four coordinates is missing.
coordinate_columns = ['customer_lat', 'customer_lng', 'seller_lat', 'seller_lng']
df_exclude = df_merge2[coordinate_columns].isna().any(axis=1)

In [None]:
# Keep only the rows where both customer and seller coordinates are known.
# .copy() makes df_merge3 an independent frame, so the column assignments in the
# following cells write to it directly rather than to a slice of df_merge2
# (which is what triggers pandas' SettingWithCopyWarning).
df_merge3 = df_merge2[~df_exclude].copy()

In [None]:
# Pack each party's coordinates into a (lat, lng) tuple per row, the point format
# passed to geopy's distance() in the next cell.
customer_points = df_merge3[['customer_lat', 'customer_lng']].itertuples(index=False, name=None)
seller_points = df_merge3[['seller_lat', 'seller_lng']].itertuples(index=False, name=None)
df_merge3['customer_location'] = list(customer_points)
df_merge3['seller_location'] = list(seller_points)

In [None]:
# Distance in kilometres between the customer and seller points of every row.
location_pairs = zip(df_merge3['customer_location'], df_merge3['seller_location'])
df_merge3['distance'] = [
    distance(customer_point, seller_point).km
    for customer_point, seller_point in location_pairs
]

In [None]:
# Inspect the frame now that the location tuples and the 'distance' column exist.
df_merge3

### Get Order Count per Day per Seller

In [None]:
def _purchase_day_label(timestamp):
    """Format a timestamp as an unpadded 'day/month/year' string, e.g. '3/11/2017'."""
    return '%d/%d/%d' % (timestamp.day, timestamp.month, timestamp.year)


# Calendar-day key of the purchase, used to group orders per day.
df_merge3['date'] = df_merge3['order_purchase_timestamp'].map(_purchase_day_label)

In [None]:
# For every row, the number of non-null order_id values sharing its (date, seller_id).
# NOTE(review): this counts rows, not distinct orders — confirm whether the frame
# can hold several rows per order.
orders_by_day_and_seller = df_merge3.groupby(['date', 'seller_id'])['order_id']
df_merge3['seller_order_count'] = orders_by_day_and_seller.transform('count')

### Get Purchase Approval and Delivered Carrier Durations (TODO: continue from here)

In [None]:
# Time from purchase to payment approval, in fractional days.
# Whole days (`.dt.days`) would floor every sub-day approval to 0 and would put this
# feature on a different scale from order_carrier_duration, which is computed as
# fractional days with the same hours / 24 arithmetic.
approval_delay = df_merge3['order_approved_at'] - df_merge3['order_purchase_timestamp']
df_merge3['order_approval_duration'] = approval_delay / np.timedelta64(1, 'h') / 24

In [None]:
# Inspect the frame after 'order_approval_duration' has been added.
df_merge3

In [None]:
# Time from purchase (not approval) until the order reached the carrier,
# converted to hours and then to fractional days.
carrier_handover_delay = (
    df_merge3['order_delivered_carrier_date'] - df_merge3['order_purchase_timestamp']
)
df_merge3['order_carrier_duration'] = carrier_handover_delay / np.timedelta64(1, 'h') / 24

### Data Preparation & Manipulation

In [None]:
# 'order_approval_duration', 'order_carrier_duration' -> improve estimated delivery date?
# what is order_item_id?

# Columns shared by all three feature sets.
base_feature_columns = [
    'seller_id',
    'seller_order_count',
    'distance',
    'product_category_name',
    'price',
    'freight_value',
    'shipping_cost_perc',
    'product_weight_g',
    'product_length_cm',
    'product_height_cm',
    'product_width_cm',
    'product_size',
    'purchase_dow',
]
# Kept as the last column of every frame (presumably the prediction target — confirm).
target_column = ['delivery_days']

# Each frame extends the previous one with the duration that becomes known at that stage.
df_before_purchase = df_merge3[base_feature_columns + target_column]
df_after_approval = df_merge3[
    base_feature_columns + ['order_approval_duration'] + target_column
]
df_carrier_received = df_merge3[
    base_feature_columns + ['order_approval_duration', 'order_carrier_duration'] + target_column
]

In [None]:
# Fill/drop blank columns.
# Missing categories become the placeholder 'NC'; rows lacking a weight, carrier
# duration or delivery_days value are dropped.
# The fill is assigned back explicitly: `df[col].fillna(..., inplace=True)` is chained
# assignment, which only modifies a temporary object under pandas copy-on-write and
# would leave the frame unchanged.
df_carrier_received = (
    df_carrier_received
    .assign(product_category_name=lambda frame: frame['product_category_name'].fillna('NC'))
    .dropna(subset=['product_weight_g', 'order_carrier_duration', 'delivery_days'])
)

In [None]:
# Dtypes treated as numeric; every other column is considered categorical.
numerics = ['int', 'int64', 'float64']
all_columns = df_carrier_received.columns
numerical = df_carrier_received.select_dtypes(include=numerics).columns
# Preserve the original column order while excluding the numeric ones.
categorical = all_columns[~all_columns.isin(numerical)]

In [None]:
# Inspect the carrier-received feature table after the fill/drop step.
df_carrier_received

In [None]:
# Show the names of the columns that were not selected as numeric.
categorical

In [None]:
## Train/test split (80/20)

In [None]:
## One Hot Encoding

In [None]:
## GridSearchCV

In [None]:
# Path 1

In [None]:
## RandomForestRegressor -> Delivery Duration Prediction -> Check against actual delivery duration -> Late or Not Late

In [None]:
## MSE/MAPE

In [None]:
## Feature Importance

In [None]:
# Path 2

In [None]:
## Logistic regression -> Late or Not Late

In [None]:
## Accuracy

In [None]:
## Feature Importance