# DEPENDENCIES

In [24]:
import numpy as np
import pandas as pd
import pandas_gbq
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (20, 12)

import plotly.express as px
import plotly.graph_objects as go
import plotly.offline as pyoff
import plotly.io as pio
pio.renderers.default = 'notebook_connected'

### Connect to BQ Kraken Database

In [25]:
import os

# Service-account key used by the Google client libraries.
# An already-exported GOOGLE_APPLICATION_CREDENTIALS takes precedence, so the
# notebook also runs on machines where the key lives elsewhere; the path below
# is only the fallback when the variable is not set.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    '/Users/asani/Downloads/octopus-data-prod-294206-8028c6b220ce.json',
)

In [26]:
# Project id handed to pandas_gbq.read_gbq; the same id is also hard-coded in the table path below.
project_id = 'octopus-data-prod-294206'
# Fetch every column of customer_orders; later cells select the columns they need.
sql = """
SELECT *
FROM `octopus-data-prod-294206.KRAKEN_temp_view.customer_orders`
"""
# Run the query and load the full result into a DataFrame.
df = pandas_gbq.read_gbq(sql, project_id=project_id)

Downloading: 100%|[32m██████████[0m|


In [27]:
# Preview the first rows of the raw query result
df.head()

Unnamed: 0,order_id,system,timestamp,total_quantity,unit,amount_per_qty,total_amount,total_weight,user_id,receiver_id,...,latitude,longitude,country,province,city,district,village,order_status,route_id,total_distance
0,db6a35c0-c64e-4864-87a8-6ec5144340b4,V2,2021-08-22 20:14:51.507747,19.0,,33.0,155.0,865.775,67197,1239,...,-6.941175,107.740361,Indonesia,Jawa Barat,Bandung,Cileunyi,Cileunyi Kulon,completed,5353,8.37059
1,7226a173-2d68-4590-bfed-083f972eb185,V2,2021-06-15 09:10:49.214557,20.0,,12.5,250.0,5000.0,61422,783,...,-6.976872,107.668886,Indonesia,Jawa Barat,Bandung,Kecamatan Bojongsoang,Buahbatu,completed,3501,0.0
2,b6193928-1782-47e4-a3dd-5457b5f4e6e9,V2,2021-05-19 20:50:04.808810,20.0,,12.5,250.0,5000.0,58766,796,...,-6.976072,107.655083,Indonesia,Jawa Barat,Bandung,Kecamatan Bojongsoang,Cipagalo,completed,2020,0.0
3,d60857d5-b189-4344-9fb4-de33160d181c,V2,2021-08-04 12:55:42.248596,16.0,,23.0,70.0,1454.04,46019,818,...,-6.976277,107.655804,Indonesia,Jawa Barat,Bandung,Kecamatan Bojongsoang,Cipagalo,completed,4997,7.582155
4,dbf9a628-23f4-42f3-b739-e7ff25c9c3d7,V2,2021-06-12 08:21:56.350409,5.0,,8.0,40.0,75.755,64708,806,...,-7.008113,107.644134,Indonesia,Jawa Barat,Bandung,Kecamatan Baleendah,Manggahang,completed,3388,0.0


In [28]:
# Column dtypes and non-null counts of the raw query result

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49717 entries, 0 to 49716
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   order_id        49717 non-null  object        
 1   system          49717 non-null  object        
 2   timestamp       49717 non-null  datetime64[ns]
 3   total_quantity  48963 non-null  object        
 4   unit            4657 non-null   object        
 5   amount_per_qty  48963 non-null  object        
 6   total_amount    48963 non-null  object        
 7   total_weight    48963 non-null  object        
 8   user_id         49717 non-null  Int64         
 9   receiver_id     33725 non-null  Int64         
 10  region_id       49717 non-null  Int64         
 11  region_name     48356 non-null  object        
 12  latitude        49717 non-null  float64       
 13  longitude       49717 non-null  float64       
 14  country         48382 non-null  object        
 15  pr

In [29]:
# Number of missing values in each column

df.isna().sum()

order_id              0
system                0
timestamp             0
total_quantity      754
unit              45060
amount_per_qty      754
total_amount        754
total_weight        754
user_id               0
receiver_id       15992
region_id             0
region_name        1361
latitude              0
longitude             0
country            1335
province           1361
city               1361
district           1433
village            1387
order_status          0
route_id          20364
total_distance    20364
dtype: int64

In [30]:
# How many orders fall into each order_status

df['order_status'].value_counts()

completed    37035
cancelled    12682
Name: order_status, dtype: int64

# DATA PREPROCESSING

In [31]:
# Preview the three measure columns cast to float (displayed only; df itself is not modified here)
df[['total_quantity','total_amount', 'total_weight']].astype(float)

Unnamed: 0,total_quantity,total_amount,total_weight
0,19.0,155.0,865.775
1,20.0,250.0,5000.000
2,20.0,250.0,5000.000
3,16.0,70.0,1454.040
4,5.0,40.0,75.755
...,...,...,...
49712,3.0,2300.0,3000.000
49713,3.0,2275.0,3000.000
49714,1.0,800.0,1000.000
49715,3.0,1850.0,3000.000


In [32]:
# Datetime Preprocessing

def date_preprocessing(df):
    """Return the analysis subset of ``df`` with calendar fields and float measures.

    Keeps the identifier, timestamp, measure, status and location columns,
    derives ``day``, ``day_name``, ``month``, ``year`` and ``month_year`` from
    ``timestamp``, and casts ``total_quantity``, ``total_amount`` and
    ``total_weight`` to float. The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column listed in ``keep_cols`` below. ``timestamp``
        may already be datetime64 or anything ``pd.to_datetime`` can parse
        (for example ISO-formatted strings).

    Returns
    -------
    pandas.DataFrame
        A new frame holding the kept columns followed by the five derived
        calendar columns.

    Raises
    ------
    KeyError
        If any of the required columns is missing from ``df``.
    """
    keep_cols = ['order_id', 'user_id', 'timestamp', 'total_quantity', 'total_amount',
                 'total_weight', 'order_status', 'province', 'city']
    measure_cols = ['total_amount', 'total_quantity', 'total_weight']

    # Select first, then copy: avoids duplicating the columns that are dropped anyway
    # while still guaranteeing that the caller's frame is never written to.
    out = df.loc[:, keep_cols].copy()

    # No-op for datetime64 input; makes the .dt accessors below work for string input too.
    out['timestamp'] = pd.to_datetime(out['timestamp'])

    stamp = out['timestamp'].dt
    out['day'] = stamp.day
    out['day_name'] = stamp.day_name()
    out['month'] = stamp.month
    out['year'] = stamp.year
    # Period string such as '2021-08', convenient as a sortable month label.
    out['month_year'] = stamp.to_period('M').astype(str)

    # The measures arrive as object dtype; cast them so numeric operations work.
    return out.astype({col: 'float' for col in measure_cols})

In [33]:
# Reduce df to the analysis columns, add the calendar fields derived from timestamp,
# and cast quantity / amount / weight to float.
# Note: this rebinds `df`, so the other raw columns are not available in later cells.

df = date_preprocessing(df)
df.head()

Unnamed: 0,order_id,user_id,timestamp,total_quantity,total_amount,total_weight,order_status,province,city,day,day_name,month,year,month_year
0,db6a35c0-c64e-4864-87a8-6ec5144340b4,67197,2021-08-22 20:14:51.507747,19.0,155.0,865.775,completed,Jawa Barat,Bandung,22,Sunday,8,2021,2021-08
1,7226a173-2d68-4590-bfed-083f972eb185,61422,2021-06-15 09:10:49.214557,20.0,250.0,5000.0,completed,Jawa Barat,Bandung,15,Tuesday,6,2021,2021-06
2,b6193928-1782-47e4-a3dd-5457b5f4e6e9,58766,2021-05-19 20:50:04.808810,20.0,250.0,5000.0,completed,Jawa Barat,Bandung,19,Wednesday,5,2021,2021-05
3,d60857d5-b189-4344-9fb4-de33160d181c,46019,2021-08-04 12:55:42.248596,16.0,70.0,1454.04,completed,Jawa Barat,Bandung,4,Wednesday,8,2021,2021-08
4,dbf9a628-23f4-42f3-b739-e7ff25c9c3d7,64708,2021-06-12 08:21:56.350409,5.0,40.0,75.755,completed,Jawa Barat,Bandung,12,Saturday,6,2021,2021-06


In [34]:
# Orders where any of total_quantity / total_amount / total_weight is missing, counted per order_status

order_missing = df.loc[
    df[['total_quantity', 'total_amount','total_weight']].isnull().any(axis='columns')
]
order_missing['order_status'].value_counts()

cancelled    730
completed     24
Name: order_status, dtype: int64

In [35]:
# Inspect the completed orders among the rows with missing quantity / amount / weight

order_missing.loc[order_missing['order_status'] == 'completed'].head()

Unnamed: 0,order_id,user_id,timestamp,total_quantity,total_amount,total_weight,order_status,province,city,day,day_name,month,year,month_year
2629,633dac7f-d76e-4b25-a2bf-69ac67e7fdc2,73886,2021-08-30 16:14:56.466087,,,,completed,Jawa Barat,Bandung,30,Monday,8,2021,2021-08
3916,33784908-0b0d-4d30-91da-84b685bffc84,74219,2021-09-11 11:09:23.129378,,,,completed,Jawa Barat,Kota Bandung,11,Saturday,9,2021,2021-09
9077,bb90deb0-ed18-4b1d-a667-0e933ad9e425,15744,2021-08-29 20:50:13.956088,,,,completed,Sulawesi Selatan,Kabupaten Gowa,29,Sunday,8,2021,2021-08
14159,ded041c9-8fc6-4729-ad47-d23f0a69a77f,15744,2021-08-30 05:00:20.483905,,,,completed,Bali,Kabupaten Badung,30,Monday,8,2021,2021-08
15999,509c5310-fd6f-4808-901a-09bb5e4bb253,75182,2021-10-01 15:37:42.956597,,,,completed,Jawa Barat,Kota Bandung,1,Friday,10,2021,2021-10


* There are 24 completed orders whose *total_quantity*, *total_amount*, and *total_weight* are missing (N/A).
* We investigated them in the backend production database (psql): the listed order_ids are not captured in v2.order_item, so we assume there is a bug in the backend process.
* Hence, we'll drop the rows with these missing values for now. (Already confirmed by the tech and product teams.)

In [36]:
# Drop every order (whatever its status) that lacks quantity, amount, or weight,
# then re-check the remaining null counts

df = df.dropna(subset=['total_quantity', 'total_amount','total_weight'], how='any')
df.isna().sum()

order_id             0
user_id              0
timestamp            0
total_quantity       0
total_amount         0
total_weight         0
order_status         0
province          1269
city              1269
day                  0
day_name             0
month                0
year                 0
month_year           0
dtype: int64

In [37]:
# Keep only the completed orders (cancelled ones are left out)

df_completed = df.loc[df['order_status'].eq('completed')]
df_completed.head()

Unnamed: 0,order_id,user_id,timestamp,total_quantity,total_amount,total_weight,order_status,province,city,day,day_name,month,year,month_year
0,db6a35c0-c64e-4864-87a8-6ec5144340b4,67197,2021-08-22 20:14:51.507747,19.0,155.0,865.775,completed,Jawa Barat,Bandung,22,Sunday,8,2021,2021-08
1,7226a173-2d68-4590-bfed-083f972eb185,61422,2021-06-15 09:10:49.214557,20.0,250.0,5000.0,completed,Jawa Barat,Bandung,15,Tuesday,6,2021,2021-06
2,b6193928-1782-47e4-a3dd-5457b5f4e6e9,58766,2021-05-19 20:50:04.808810,20.0,250.0,5000.0,completed,Jawa Barat,Bandung,19,Wednesday,5,2021,2021-05
3,d60857d5-b189-4344-9fb4-de33160d181c,46019,2021-08-04 12:55:42.248596,16.0,70.0,1454.04,completed,Jawa Barat,Bandung,4,Wednesday,8,2021,2021-08
4,dbf9a628-23f4-42f3-b739-e7ff25c9c3d7,64708,2021-06-12 08:21:56.350409,5.0,40.0,75.755,completed,Jawa Barat,Bandung,12,Saturday,6,2021,2021-06


In [38]:
# Summary statistics of the three measures for completed orders
df_completed[['total_quantity', 'total_amount','total_weight']].describe()

Unnamed: 0,total_quantity,total_amount,total_weight
count,37011.0,37011.0,37011.0
mean,39.214171,22799.74,6038.418
std,65.721626,186179.8,23259.06
min,0.0,0.0,0.0
25%,8.5,235.0,554.016
50%,21.0,296.0,2148.154
75%,48.0,1143.0,5780.302
max,2450.0,9815420.0,1540000.0


Here we can see that:
1. The minimum total_quantity, total_amount, and total_weight are zero, which makes no sense for a completed order. Thus I'll keep only the transactions with values greater than 0.
2. Since the unit for total_quantity is not the same across orders, I'll skip this column for now.
3. The median and mean of total_amount and total_weight are far apart. For instance, the median and mean total weight of customer orders are about 2,148 g and 6,038 g respectively. This wide gap might occur due to the type of waste that users assign (some might assign plastic waste and some might assign electronic waste such as PCs).

In [39]:
# Distribution of total_amount and total_weight Before Filtering Out Zero Values

fig = go.Figure()
fig.add_trace(go.Box(y=df_completed['total_amount'], name='Total Amount (points)'))
fig.add_trace(go.Box(y=df_completed['total_weight'], name='Total Weight (grams)'))
fig.update_layout(title='Distribution of Total Amount and Total Weight')
# The renderer is chosen through `pio.renderers.default` (configured in the setup cell).
# A former `pio.base_renderers.default = "browser"` assignment was removed here: it only
# attached an unused attribute to the `plotly.io.base_renderers` module and never
# affected how the figure is shown.
fig.show()

In [40]:
# Distribution of total_amount and total_weight After Filtering Out Zero Values
# (each trace keeps only the strictly positive values of its own column)

fig = go.Figure(data=[
    go.Box(
        y=df_completed.loc[df_completed['total_amount'] > 0, 'total_amount'],
        name='Total Amount (points)',
        boxmean=True,
    ),
    go.Box(
        y=df_completed.loc[df_completed['total_weight'] > 0, 'total_weight'],
        name='Total Weight (grams)',
        boxmean=True,
    ),
])
fig.update_layout(title='Distribution of Total Amount and Total Weight')
fig.show()

In [41]:
# Distribution of Amount and Weight After Transforming to Log Scale
# (natural log of the strictly positive values of each column)

fig = go.Figure(data=[
    go.Box(
        y=np.log(df_completed.loc[df_completed['total_amount'] > 0, 'total_amount']),
        name='Total Amount (points)',
        boxmean=True,
    ),
    go.Box(
        y=np.log(df_completed.loc[df_completed['total_weight'] > 0, 'total_weight']),
        name='Total Weight (grams)',
        boxmean=True,
    ),
])
fig.update_layout(title='Distribution of Total Amount and Total Weight (log)')
fig.show()

In [42]:
# Summary statistics for completed orders whose amount and weight are both strictly positive
df_completed.loc[
    (df_completed['total_amount'] > 0) & (df_completed['total_weight'] > 0),
    ['total_amount','total_weight'],
].describe()

Unnamed: 0,total_amount,total_weight
count,36417.0,36417.0
mean,23171.62,5648.78
std,187669.1,20349.06
min,1.0,0.02
25%,250.0,540.013
50%,300.0,2089.029
75%,1200.0,5736.546
max,9815420.0,1540000.0


In [43]:
# Number of completed orders with a zero total_amount, and with a zero total_weight
(
    len(df_completed.loc[df_completed['total_amount'].eq(0)]),
    len(df_completed.loc[df_completed['total_weight'].eq(0)]),
)

(593, 26)

What we can learn from the boxplots above:
1. There are many extreme values (outliers) in total amount and total weight. The cause might be the type of waste the customer assigned.
2. After the log transformation, total weight is approximately normally distributed.
3. Total amount and total weight are right-skewed. This means that the points earned per order are mostly below the mean of about 23,171 points.

In [44]:
# Keep only completed orders whose total_amount and total_weight are both strictly positive
df_completed = df_completed.loc[
    df_completed['total_amount'].gt(0) & df_completed['total_weight'].gt(0)
]

In [45]:
# Persist df_completed with IPython's %store so another notebook can reload it via `%store -r df_completed`
%store df_completed

Stored 'df_completed' (DataFrame)


In [46]:
# Persist df (preprocessed, missing-measure rows dropped, cancelled orders still included) via %store
%store df

Stored 'df' (DataFrame)
