In [164]:
# data processing
import numpy as np
import pandas as pd
from datetime import date

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

# pd.options.mode.chained_assignment = None

# ignore less critical warnings
#import warnings
#warnings.filterwarnings('ignore')

In [165]:
# directory that holds the Olist CSV extracts -- edit this one line to point at your local copy
data_dir = 'C:/Users/short/Desktop/python_vscode/631_Assignment3/data'

# full path of the orders dataset
orders_file = f'{data_dir}/olist_orders_dataset.csv'

# full path of the customer dataset
cust_file = f'{data_dir}/olist_customers_dataset.csv'

# tables parsed from the ISO 3166-2:BR Wikipedia page; used further down to map state codes to state names
# (idea from https://www.kaggle.com/olistbr/brazilian-ecommerce/discussion/74572)
# NOTE: this call needs network access, so the notebook cannot be re-run offline
states = pd.read_html('https://en.wikipedia.org/wiki/ISO_3166-2:BR')

In [166]:
def read_olist_data(file1, file2, use_cols, cutoff_date):
    """Load the orders and customers CSV files and return one row per order.

    Parameters
    ----------
    file1 : str or path-like
        Orders CSV. Must contain ``customer_id``, ``order_purchase_timestamp``,
        ``order_delivered_customer_date`` and ``order_estimated_delivery_date``.
    file2 : str or path-like
        Customers CSV. Must contain ``customer_id``, ``customer_unique_id`` and
        ``customer_state`` (only these three columns are read).
    use_cols : list of str
        Columns to keep in the result. Must include ``customer_unique_id`` and
        ``order_purchase_timestamp`` because the result is sorted by them.
    cutoff_date : datetime.date, str or pandas.Timestamp
        Last purchase day to keep (inclusive); any time-of-day part is ignored.

    Returns
    -------
    pandas.DataFrame
        Orders purchased on or before ``cutoff_date``, restricted to ``use_cols``
        and sorted by ``customer_unique_id`` then ``order_purchase_timestamp``.
        Besides the file columns, these derived columns are available to
        ``use_cols``: ``order_purchase_date`` (``datetime.date`` objects),
        ``order_date`` (datetime64 at midnight), ``weekday`` (day name) and the
        customer columns joined on ``customer_id``.
    """
    # read the orders data
    orders = pd.read_csv(file1)

    # Parse the three timestamp columns. No explicit ``format`` is given: the values look like
    # '2018-05-16 20:48:37', which a date-only pattern such as '%Y/%m/%d' does not describe
    # (strict parsers reject it). Missing values become NaT.
    for col in ('order_purchase_timestamp',
                'order_delivered_customer_date',
                'order_estimated_delivery_date'):
        orders[col] = pd.to_datetime(orders[col])

    # purchase day as plain datetime.date objects (kept so callers can still request it via use_cols)
    orders['order_purchase_date'] = orders['order_purchase_timestamp'].dt.date

    # purchase day as datetime64 (time set to midnight)
    orders['order_date'] = orders['order_purchase_timestamp'].dt.normalize()

    # extract weekday name from the purchase timestamp
    orders['weekday'] = orders['order_purchase_timestamp'].dt.day_name()

    # read the file that contains the unique customer identifier;
    # only these three columns are needed
    cust = pd.read_csv(file2, usecols = ['customer_id', 'customer_unique_id', 'customer_state'])

    # merge orders and cust dataframes using an inner join on customer_id
    orders_out = pd.merge(orders, cust, on = 'customer_id', how = 'inner')

    # discard (incomplete) data purchased AFTER the cutoff day; the cutoff day itself is kept.
    # Comparing datetime64 with a Timestamp keeps the filter vectorised and lets the
    # cutoff be given as a date, a string or a Timestamp.
    cutoff = pd.Timestamp(cutoff_date).normalize()
    orders_out = orders_out[orders_out['order_date'] <= cutoff]

    # keep the requested columns, ordered by purchase time within each customer
    return orders_out[use_cols].sort_values(['customer_unique_id', 'order_purchase_timestamp'])

In [167]:
# columns retained for this exercise
use_cols = [
    'customer_unique_id',
    'order_id',
    'order_status',
    'order_purchase_timestamp',
    'order_date',
    'order_estimated_delivery_date',
    'order_delivered_customer_date',
    'weekday',
    'customer_state',
]

# orders placed after 08/22/2018 are incomplete and get discarded
# (the rationale is in 06_pandas_wrangle.ipynb)
cutoff_date = date(year = 2018, month = 8, day = 22)

In [168]:
orders_df = read_olist_data(orders_file, cust_file, use_cols, cutoff_date)

In [169]:
orders_df.head()

Unnamed: 0,customer_unique_id,order_id,order_status,order_purchase_timestamp,order_date,order_estimated_delivery_date,order_delivered_customer_date,weekday,customer_state
52798,0000366f3b9a7992bf8c76cfdf3221e2,e22acc9c116caa3f2b7121bbb380d08e,delivered,2018-05-10 10:56:27,2018-05-10,2018-05-21,2018-05-16 20:48:37,Thursday,SP
73889,0000b849f77a49e4a4ce2b2a4ca5be3f,3594e05a005ac4d06a72673270ef9ec9,delivered,2018-05-07 11:11:27,2018-05-07,2018-05-15,2018-05-10 18:02:42,Monday,SP
26460,0000f46a3911fa3c0805444483337064,b33ec3b699337181488304f362a6b734,delivered,2017-03-10 21:05:03,2017-03-10,2017-04-07,2017-04-05 14:38:47,Friday,SC
98493,0000f6ccb0745a6a4b88665a16c9f078,41272756ecddd9a9ed0180413cc22fb6,delivered,2017-10-12 20:29:41,2017-10-12,2017-11-13,2017-11-01 21:23:05,Thursday,PA
41564,0004aac84e0df4da2b147fca70cf8255,d957021f1127559cd947b62533f484f7,delivered,2017-11-14 19:45:42,2017-11-14,2017-12-05,2017-11-27 23:08:56,Tuesday,SP


In [170]:
# states[0] is the first table parsed from the page; keep all of its rows and its first two columns.
# .copy() makes br_states an independent frame, so the column edits made to it afterwards
# do not operate on a slice of states[0] (which triggers SettingWithCopyWarning).
br_states = states[0].iloc[:, 0:2].copy()

br_states = br_states.reset_index(drop = True)

In [171]:
# work on an independent frame so the assignments below never write into a slice
br_states = br_states.copy()
br_states.columns = ['cust_state', 'state_name']

# strip the literal 'BR-' prefix so the codes match orders_df['customer_state']
br_states['customer_state'] = br_states['cust_state'].str.replace('BR-', '', regex = False)

br_states = br_states.drop('cust_state', axis = 1)

# make this a dictionary we can use as a lookup table: state code -> state name
repl = br_states.set_index('customer_state')['state_name'].to_dict()

# replace the state abbrevs with the names of each state.
# Only the customer_state column is translated: a frame-wide orders_df.replace(repl) would also
# rewrite any value in any other column that happens to equal a state code.
# Codes that are missing from the lookup are left as they are.
state_names = orders_df['customer_state'].map(repl)
orders_df = orders_df.assign(customer_state = state_names.fillna(orders_df['customer_state']))

In [172]:
orders_df.head()

Unnamed: 0,customer_unique_id,order_id,order_status,order_purchase_timestamp,order_date,order_estimated_delivery_date,order_delivered_customer_date,weekday,customer_state
52798,0000366f3b9a7992bf8c76cfdf3221e2,e22acc9c116caa3f2b7121bbb380d08e,delivered,2018-05-10 10:56:27,2018-05-10,2018-05-21,2018-05-16 20:48:37,Thursday,São Paulo
73889,0000b849f77a49e4a4ce2b2a4ca5be3f,3594e05a005ac4d06a72673270ef9ec9,delivered,2018-05-07 11:11:27,2018-05-07,2018-05-15,2018-05-10 18:02:42,Monday,São Paulo
26460,0000f46a3911fa3c0805444483337064,b33ec3b699337181488304f362a6b734,delivered,2017-03-10 21:05:03,2017-03-10,2017-04-07,2017-04-05 14:38:47,Friday,Santa Catarina
98493,0000f6ccb0745a6a4b88665a16c9f078,41272756ecddd9a9ed0180413cc22fb6,delivered,2017-10-12 20:29:41,2017-10-12,2017-11-13,2017-11-01 21:23:05,Thursday,Pará
41564,0004aac84e0df4da2b147fca70cf8255,d957021f1127559cd947b62533f484f7,delivered,2017-11-14 19:45:42,2017-11-14,2017-12-05,2017-11-27 23:08:56,Tuesday,São Paulo


## Create customer attributes

#### First, subset the data to the observation period (orders on or before the snapshot date)

In [173]:
# set the snapshot date (last day of the observation period); kept as a datetime.date
snapshot_date = date(2017, 12, 31)

# order_date is datetime64, so it is compared against a Timestamp rather than a datetime.date
# (recent pandas versions refuse the datetime64-vs-date comparison).
# .copy() gives df2 its own data: columns are added to df2 later, and that must not
# act on a filtered view of orders_df.
df2 = orders_df[orders_df['order_date'] <= pd.Timestamp(snapshot_date)].copy()

len(orders_df), len(df2)

(98906, 45430)

In [None]:
# mask = orders_df['order_purchase_timestamp'].dt.date <= snapshot_date

# mask.head()

# apply the filter
# df_raw = orders_df[mask]

# len(orders_df), len(df_raw)

In [174]:
# check the max date
# df_raw.order_purchase_timestamp.max()
df2.order_purchase_timestamp.max()

Timestamp('2017-12-31 23:29:31')

#### Customer Recency (replicated attribute)

In [175]:
# Recency (days since the most recent order) is measured from each customer's latest
# purchase, so pull the newest order timestamp per customer_unique_id.
cust_recency = df2.groupby('customer_unique_id', as_index = False)['order_purchase_timestamp'].max()

cust_recency.head()

Unnamed: 0,customer_unique_id,order_purchase_timestamp
0,0000f46a3911fa3c0805444483337064,2017-03-10 21:05:03
1,0000f6ccb0745a6a4b88665a16c9f078,2017-10-12 20:29:41
2,0004aac84e0df4da2b147fca70cf8255,2017-11-14 19:45:42
3,0005e1862207bf6ccc02e4228effd9a0,2017-03-04 23:32:12
4,0006fdc98a402fceb4eb0ee528f6a8d4,2017-07-18 09:23:10


In [176]:
# count the number of whole days between the most recent order date and the snapshot date.
# Both sides are datetime64 values at midnight, so the subtraction is vectorised and the
# result is guaranteed to be timedelta64 (date-object arithmetic goes through an object column).
delta = pd.Timestamp(snapshot_date) - cust_recency['order_purchase_timestamp'].dt.normalize()

delta.head()

0   296 days
1    80 days
2    47 days
3   302 days
4   166 days
Name: order_purchase_timestamp, dtype: timedelta64[ns]

In [177]:
# express recency in 30-day "months": whole days divided by 30, truncated to an integer
cust_recency['order_recency'] = (delta.dt.days / 30).astype(int)

cust_recency.head()

Unnamed: 0,customer_unique_id,order_purchase_timestamp,order_recency
0,0000f46a3911fa3c0805444483337064,2017-03-10 21:05:03,9
1,0000f6ccb0745a6a4b88665a16c9f078,2017-10-12 20:29:41,2
2,0004aac84e0df4da2b147fca70cf8255,2017-11-14 19:45:42,1
3,0005e1862207bf6ccc02e4228effd9a0,2017-03-04 23:32:12,10
4,0006fdc98a402fceb4eb0ee528f6a8d4,2017-07-18 09:23:10,5


In [178]:
# the timestamp has served its purpose; only the id and the recency are kept
cust_recency = cust_recency.drop(columns = ['order_purchase_timestamp'])

# how many records (unique customers) do we have for this timeframe?
# This is the number of customers included in the modeling dataset for this specific time-frame.
len(cust_recency)

44034

In [179]:
# Number of customers in each recency bucket (the input for a cumulative distribution).
recency_distr = cust_recency.groupby('order_recency').size().reset_index(name = 'cust_count')

recency_distr.head()

Unnamed: 0,order_recency,cust_count
0,0,5336
1,1,7548
2,2,4354
3,3,4160
4,4,3972


In [180]:
# calculate cumulative percentages 
# cumulative_pctg = recency_distr['cust_count'].cumsum() / sum(recency_distr['cust_count'])

# cumulative_pctg.head()

#### Days to delivery (experimental attribute)

In [None]:
# delta2 = df_raw['order_delivered_customer_date'] - df_raw['order_purchase_timestamp']

In [181]:
# NOTE: a bare `df2.fillna(0)` used to sit here. Its result was discarded, so it never changed df2.
# It is deliberately not turned into an assignment: that would write the integer 0 into the
# datetime columns, whereas missing delivery dates should stay missing (NaT).
df2.head()

Unnamed: 0,customer_unique_id,order_id,order_status,order_purchase_timestamp,order_date,order_estimated_delivery_date,order_delivered_customer_date,weekday,customer_state
26460,0000f46a3911fa3c0805444483337064,b33ec3b699337181488304f362a6b734,delivered,2017-03-10 21:05:03,2017-03-10,2017-04-07,2017-04-05 14:38:47,Friday,Santa Catarina
98493,0000f6ccb0745a6a4b88665a16c9f078,41272756ecddd9a9ed0180413cc22fb6,delivered,2017-10-12 20:29:41,2017-10-12,2017-11-13,2017-11-01 21:23:05,Thursday,Pará
41564,0004aac84e0df4da2b147fca70cf8255,d957021f1127559cd947b62533f484f7,delivered,2017-11-14 19:45:42,2017-11-14,2017-12-05,2017-11-27 23:08:56,Tuesday,São Paulo
71235,0005e1862207bf6ccc02e4228effd9a0,ae76bef74b97bcb0b3e355e60d9a6f9c,delivered,2017-03-04 23:32:12,2017-03-04,2017-04-06,2017-03-09 08:33:08,Saturday,Rio de Janeiro
26203,0006fdc98a402fceb4eb0ee528f6a8d4,6681163e3dab91c549952b2845b20281,delivered,2017-07-18 09:23:10,2017-07-18,2017-08-15,2017-08-03 18:42:49,Tuesday,Espírito Santo


In [185]:
# delivery lead time as a timedelta: delivered timestamp minus the order date
df2['days_to_delivery'] = df2['order_delivered_customer_date'] - df2['order_date']

In [186]:
# just curious
df2.dtypes

customer_unique_id                        object
order_id                                  object
order_status                              object
order_purchase_timestamp          datetime64[ns]
order_date                        datetime64[ns]
order_estimated_delivery_date     datetime64[ns]
order_delivered_customer_date     datetime64[ns]
weekday                                   object
customer_state                            object
days_to_delivery                 timedelta64[ns]
dtype: object

In [187]:
# dfView must be created here: its definition had been commented out, so on a fresh kernel
# the assignment below failed with NameError.
# assign() returns a new frame, so df2 keeps its timedelta column untouched.
# whole days / 7 -> delivery time expressed in weeks (float); the column name is kept
# because the aggregation further down selects 'days_to_delivery'.
dfView = df2.assign(days_to_delivery = df2['days_to_delivery'].dt.days / 7)

In [188]:
# make sure the column has a numeric dtype before it is averaged per customer
dfView['days_to_delivery'] = pd.to_numeric(dfView['days_to_delivery'])

In [189]:
dfView.head()

Unnamed: 0,customer_unique_id,order_id,order_status,order_purchase_timestamp,order_date,order_estimated_delivery_date,order_delivered_customer_date,weekday,customer_state,days_to_delivery
26460,0000f46a3911fa3c0805444483337064,b33ec3b699337181488304f362a6b734,delivered,2017-03-10 21:05:03,2017-03-10,2017-04-07,2017-04-05 14:38:47,Friday,Santa Catarina,3.714286
98493,0000f6ccb0745a6a4b88665a16c9f078,41272756ecddd9a9ed0180413cc22fb6,delivered,2017-10-12 20:29:41,2017-10-12,2017-11-13,2017-11-01 21:23:05,Thursday,Pará,2.857143
41564,0004aac84e0df4da2b147fca70cf8255,d957021f1127559cd947b62533f484f7,delivered,2017-11-14 19:45:42,2017-11-14,2017-12-05,2017-11-27 23:08:56,Tuesday,São Paulo,1.857143
71235,0005e1862207bf6ccc02e4228effd9a0,ae76bef74b97bcb0b3e355e60d9a6f9c,delivered,2017-03-04 23:32:12,2017-03-04,2017-04-06,2017-03-09 08:33:08,Saturday,Rio de Janeiro,0.714286
26203,0006fdc98a402fceb4eb0ee528f6a8d4,6681163e3dab91c549952b2845b20281,delivered,2017-07-18 09:23:10,2017-07-18,2017-08-15,2017-08-03 18:42:49,Tuesday,Espírito Santo,2.285714


In [190]:
dfView.dtypes

customer_unique_id                       object
order_id                                 object
order_status                             object
order_purchase_timestamp         datetime64[ns]
order_date                       datetime64[ns]
order_estimated_delivery_date    datetime64[ns]
order_delivered_customer_date    datetime64[ns]
weekday                                  object
customer_state                           object
days_to_delivery                        float64
dtype: object

In [191]:
# average delivery time per customer (the column holds weeks, see the division by 7 earlier)
weeks_to_delivery = dfView.groupby('customer_unique_id', as_index = False)['days_to_delivery'].mean()

weeks_to_delivery.head()

Unnamed: 0,customer_unique_id,days_to_delivery
0,0000f46a3911fa3c0805444483337064,3.714286
1,0000f6ccb0745a6a4b88665a16c9f078,2.857143
2,0004aac84e0df4da2b147fca70cf8255,1.857143
3,0005e1862207bf6ccc02e4228effd9a0,0.714286
4,0006fdc98a402fceb4eb0ee528f6a8d4,2.285714


In [192]:
# give the averaged column (second column of the frame) a name that says what it holds
avg_col = weeks_to_delivery.columns[1]
weeks_to_delivery = weeks_to_delivery.rename(columns = {avg_col: 'avg_weeks_to_delivery'})

In [193]:
weeks_to_delivery.dtypes

customer_unique_id        object
avg_weeks_to_delivery    float64
dtype: object

#### Total Customer Orders (replicated attribute)

In [96]:
# each row of df2 is one order, so the group size is the customer's total number of orders
cust_orders = df2.groupby('customer_unique_id').size().reset_index(name = 'total_orders')

cust_orders.total_orders.value_counts()

1     42770
2      1164
3        80
4        14
5         4
10        1
6         1
Name: total_orders, dtype: int64

#### Order Status (experimental attribute)

In [87]:
# get dummies: one indicator column per order status, named order_status_<value>;
# the original order_status column is replaced by these indicators.
# NOTE: df2 is rebound here, so this cell cannot be run twice in a row
# (order_status no longer exists after the first run).
df2 = pd.get_dummies(df2, columns = ['order_status'])

df2.head()

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp,order_date,order_estimated_delivery_date,order_delivered_customer_date,weekday,customer_state,days_to_delivery,order_status_approved,order_status_canceled,order_status_created,order_status_delivered,order_status_invoiced,order_status_processing,order_status_shipped,order_status_unavailable
26460,0000f46a3911fa3c0805444483337064,b33ec3b699337181488304f362a6b734,2017-03-10 21:05:03,2017-03-10,2017-04-07,2017-04-05 14:38:47,Friday,Santa Catarina,26 days 14:38:47,0,0,0,1,0,0,0,0
98493,0000f6ccb0745a6a4b88665a16c9f078,41272756ecddd9a9ed0180413cc22fb6,2017-10-12 20:29:41,2017-10-12,2017-11-13,2017-11-01 21:23:05,Thursday,Pará,20 days 21:23:05,0,0,0,1,0,0,0,0
41564,0004aac84e0df4da2b147fca70cf8255,d957021f1127559cd947b62533f484f7,2017-11-14 19:45:42,2017-11-14,2017-12-05,2017-11-27 23:08:56,Tuesday,São Paulo,13 days 23:08:56,0,0,0,1,0,0,0,0
71235,0005e1862207bf6ccc02e4228effd9a0,ae76bef74b97bcb0b3e355e60d9a6f9c,2017-03-04 23:32:12,2017-03-04,2017-04-06,2017-03-09 08:33:08,Saturday,Rio de Janeiro,5 days 08:33:08,0,0,0,1,0,0,0,0
26203,0006fdc98a402fceb4eb0ee528f6a8d4,6681163e3dab91c549952b2845b20281,2017-07-18 09:23:10,2017-07-18,2017-08-15,2017-08-03 18:42:49,Tuesday,Espírito Santo,16 days 18:42:49,0,0,0,1,0,0,0,0


In [88]:
# the indicator columns created by get_dummies all contain 'order_status' in their name
dummies = [col for col in df2.columns if 'order_status' in col]

# aggregate to the customer level (one record per customer): number of orders in each status
status = df2.groupby('customer_unique_id', as_index = False)[dummies].sum()

status.head()

Unnamed: 0,customer_unique_id,order_status_approved,order_status_canceled,order_status_created,order_status_delivered,order_status_invoiced,order_status_processing,order_status_shipped,order_status_unavailable
0,0000f46a3911fa3c0805444483337064,0,0,0,1,0,0,0,0
1,0000f6ccb0745a6a4b88665a16c9f078,0,0,0,1,0,0,0,0
2,0004aac84e0df4da2b147fca70cf8255,0,0,0,1,0,0,0,0
3,0005e1862207bf6ccc02e4228effd9a0,0,0,0,1,0,0,0,0
4,0006fdc98a402fceb4eb0ee528f6a8d4,0,0,0,1,0,0,0,0


#### Weekday (replicated attribute)

In [90]:
# get dummies: one indicator column per day name, named weekday_<day>;
# the original weekday column is replaced by these indicators.
# NOTE: df2 is rebound here, so this cell cannot be run twice in a row
# (weekday no longer exists after the first run).
df2 = pd.get_dummies(df2, columns = ['weekday'])

df2.head()

Unnamed: 0,customer_unique_id,order_id,order_purchase_timestamp,order_date,order_estimated_delivery_date,order_delivered_customer_date,customer_state,days_to_delivery,order_status_approved,order_status_canceled,...,order_status_processing,order_status_shipped,order_status_unavailable,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
26460,0000f46a3911fa3c0805444483337064,b33ec3b699337181488304f362a6b734,2017-03-10 21:05:03,2017-03-10,2017-04-07,2017-04-05 14:38:47,Santa Catarina,26 days 14:38:47,0,0,...,0,0,0,1,0,0,0,0,0,0
98493,0000f6ccb0745a6a4b88665a16c9f078,41272756ecddd9a9ed0180413cc22fb6,2017-10-12 20:29:41,2017-10-12,2017-11-13,2017-11-01 21:23:05,Pará,20 days 21:23:05,0,0,...,0,0,0,0,0,0,0,1,0,0
41564,0004aac84e0df4da2b147fca70cf8255,d957021f1127559cd947b62533f484f7,2017-11-14 19:45:42,2017-11-14,2017-12-05,2017-11-27 23:08:56,São Paulo,13 days 23:08:56,0,0,...,0,0,0,0,0,0,0,0,1,0
71235,0005e1862207bf6ccc02e4228effd9a0,ae76bef74b97bcb0b3e355e60d9a6f9c,2017-03-04 23:32:12,2017-03-04,2017-04-06,2017-03-09 08:33:08,Rio de Janeiro,5 days 08:33:08,0,0,...,0,0,0,0,0,1,0,0,0,0
26203,0006fdc98a402fceb4eb0ee528f6a8d4,6681163e3dab91c549952b2845b20281,2017-07-18 09:23:10,2017-07-18,2017-08-15,2017-08-03 18:42:49,Espírito Santo,16 days 18:42:49,0,0,...,0,0,0,0,0,0,0,0,1,0


In [91]:
len(df2)

45431

In [92]:
# the indicator columns created by get_dummies all contain 'weekday' in their name
dummies = [col for col in df2.columns if 'weekday' in col]

# aggregate to the customer level (one record per customer): number of orders per day of week
cust_weekday = df2.groupby('customer_unique_id', as_index = False)[dummies].sum()

cust_weekday.head()

Unnamed: 0,customer_unique_id,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday
0,0000f46a3911fa3c0805444483337064,1,0,0,0,0,0,0
1,0000f6ccb0745a6a4b88665a16c9f078,0,0,0,0,1,0,0
2,0004aac84e0df4da2b147fca70cf8255,0,0,0,0,0,1,0
3,0005e1862207bf6ccc02e4228effd9a0,0,0,1,0,0,0,0
4,0006fdc98a402fceb4eb0ee528f6a8d4,0,0,0,0,0,1,0


#### Weekend vs. Weekday (experimental attribute)

In [112]:
# 1 when the order was placed on a Saturday or a Sunday, otherwise 0
df2['weekend_purchase'] = (
    (df2['weekday_Saturday'] >= 1) | (df2['weekday_Sunday'] >= 1)
).astype('int64')

In [110]:
# 1 when the order was placed on neither Saturday nor Sunday, otherwise 0
df2['weekday_purchase'] = (
    (df2['weekday_Saturday'] == 0) & (df2['weekday_Sunday'] == 0)
).astype('int64')

In [139]:
# weekend_purchase has to exist before it can be inspected: build the customer-level
# aggregate here so this cell also runs on a fresh kernel (top-to-bottom execution)
weekend_purchase = df2.groupby('customer_unique_id')['weekend_purchase'].sum().reset_index()

weekend_purchase.dtypes

customer_unique_id    object
weekend_purchase       int64
dtype: object

In [140]:
# aggregate to the customer level (one record per customer): number of weekend orders
weekend_purchase = df2.groupby('customer_unique_id', as_index = False)['weekend_purchase'].sum()

weekend_purchase.head()

customer_unique_id    ffffd2657e2aad2907e67c3e9daecbeb
weekend_purchase                                     4
dtype: object

In [116]:
# aggregate to the customer level (one record per customer): number of Monday-Friday orders
weekday_purchase = df2.groupby('customer_unique_id', as_index = False)['weekday_purchase'].sum()

weekday_purchase.head()

Unnamed: 0,customer_unique_id,weekday_purchase
0,0000f46a3911fa3c0805444483337064,1
1,0000f6ccb0745a6a4b88665a16c9f078,1
2,0004aac84e0df4da2b147fca70cf8255,1
3,0005e1862207bf6ccc02e4228effd9a0,0
4,0006fdc98a402fceb4eb0ee528f6a8d4,1


### Perform a QA step: check that we get the same number of rows on our customer attributes

In [127]:
# replicated attributes: table name -> (rows, columns)
replicates = {
    'cust_recency': cust_recency.shape,
    'cust_orders': cust_orders.shape,
    'cust_weekday': cust_weekday.shape,
}

# new attributes: table name -> (rows, columns)
experimental = {
    'status': status.shape,
    'weeks_to_delivery': weeks_to_delivery.shape,
    'weekday_purchases': weekday_purchase.shape,
    'weekend_purchases': weekend_purchase.shape,
}

# the actual QA check: every attribute table must have the same number of rows.
# Fail loudly here instead of relying on someone comparing the printed shapes by eye.
row_counts = {shape[0] for shape in list(replicates.values()) + list(experimental.values())}
assert len(row_counts) == 1, f'customer attribute tables disagree on row count: {sorted(row_counts)}'

In [131]:
replicates

{'cust_recency': (44034, 2),
 'cust_orders': (44034, 2),
 'cust_weekday': (44034, 8)}

In [130]:
experimental

{'status': (44034, 9),
 'weeks_to_delivery': (44034, 2),
 'weekday_purchases': (44034, 2),
 'weekend_purchases': (44034, 2)}

## Create a mask and select orders placed between January and July 2018

In [145]:
# orders dated 2018-01-01 through 2018-07-31. `between` includes both end points by default;
# the boolean form `inclusive = True` is deprecated and rejected by pandas 2.x, so it is omitted.
mask2 = orders_df['order_date'].between('2018-01-01', '2018-07-31')

target_events_raw = orders_df[mask2]

len(target_events_raw)

47479

In [146]:
# confirm the min and max dates are between Jan and July 2018
print (target_events_raw['order_purchase_timestamp'].min(), target_events_raw['order_purchase_timestamp'].max())

2018-01-01 02:48:41 2018-07-31 23:54:20


In [None]:
## Merge all customer attributes

In [153]:
# merging customer attributes: start from recency and inner-join every other
# attribute table on customer_unique_id, one after the other
customer_attributes = cust_recency
for attr_df in (cust_orders, weekend_purchase, weekday_purchase, weeks_to_delivery, status):
    customer_attributes = customer_attributes.merge(attr_df, on = 'customer_unique_id')

customer_attributes.shape

(44034, 14)

In [154]:
customer_attributes.head()

Unnamed: 0,customer_unique_id,order_recency,total_orders,weekend_purchase,weekday_purchase,avg_weeks_to_delivery,order_status_approved,order_status_canceled,order_status_created,order_status_delivered,order_status_invoiced,order_status_processing,order_status_shipped,order_status_unavailable
0,0000f46a3911fa3c0805444483337064,9,1,0,1,3.714286,0,0,0,1,0,0,0,0
1,0000f6ccb0745a6a4b88665a16c9f078,2,1,0,1,2.857143,0,0,0,1,0,0,0,0
2,0004aac84e0df4da2b147fca70cf8255,1,1,0,1,1.857143,0,0,0,1,0,0,0,0
3,0005e1862207bf6ccc02e4228effd9a0,10,1,1,0,0.714286,0,0,0,1,0,0,0,0
4,0006fdc98a402fceb4eb0ee528f6a8d4,5,1,0,1,2.285714,0,0,0,1,0,0,0,0


In [155]:
customer_attributes.weekend_purchase.value_counts()

0    33639
1    10191
2      196
3        7
4        1
Name: weekend_purchase, dtype: int64

In [156]:
customer_attributes.weekday_purchase.value_counts()

1    33015
0    10156
2      801
3       51
4        6
5        3
7        1
6        1
Name: weekday_purchase, dtype: int64

In [157]:
# Exclude customers who have not placed an order within the past one year
# (order_recency counts 30-day periods, so the threshold is 12).
customer_attributes = customer_attributes.query('order_recency <= 12')

len(customer_attributes)

43713

In [158]:
# orders per customer inside the prediction window (turned into a binary flag later)
target_events = target_events_raw.groupby('customer_unique_id').size().reset_index(name = 'purch')

print(f'Number of customers who made at least one purchase during the prediction window: {len(target_events):,}')

Number of customers who made at least one purchase during the prediction window: 46,412


In [160]:
# left-join `target_events` onto the customer attributes: every customer is kept, and
# `purch` is missing for customers without an order in the prediction window
customer_attributes = customer_attributes.merge(target_events, how = 'left', on = 'customer_unique_id')

customer_attributes.shape

(43713, 15)

In [161]:
# create the binary target flag: 1 for at least one order in the prediction window, else 0
# (a missing count compares as False and therefore becomes 0)
customer_attributes['purch'] = (customer_attributes['purch'] > 0).astype('int64')

customer_attributes.purch.value_counts()

0    43091
1      622
Name: purch, dtype: int64

In [162]:
# % distribution of the target flag
customer_attributes.purch.value_counts() / len(customer_attributes)

0    0.985771
1    0.014229
Name: purch, dtype: float64