## This notebook loads the data, engineers features then saves the engineered dataset to a pickle file

In [1]:
import pandas as pd
import numpy as np
import datetime as dt

## Create customer demographic dataframe, add the features listed below

account_age - time since the account was created, in days. 'Now' is taken to be the most recent date in the receipt database, 30/3/2014

account_age_months - same as above but in months rather than days

customer_age - customer's age in years rounded down

Additionally:

Churn label is also converted from 2 and 1 to 1 and 0 (churn/not churn respectively)

Dates are converted to datetime objects

In [2]:
def create_customer_df(latest_date=None):
    """Load the customer demographic table and derive age features.

    Reads ``customer/000000_0`` (no header row, parsed with ``pd.read_table``),
    names its seven columns and adds:

    * ``account_age`` - whole days between ``date_created`` and ``latest_date``
    * ``account_age_months`` - the same span divided by ``365/12`` and rounded
      to the nearest integer (handy for plotting)
    * ``customer_age`` - days since 1 January of ``YOB`` divided by 365,
      rounded down

    ``churn_label`` is shifted down by one (the notebook text describes this as
    mapping 2/1 to 1/0) and ``date_created`` / ``YOB`` are parsed to datetimes.

    Parameters
    ----------
    latest_date : date, datetime, str or pandas.Timestamp, optional
        The reference "now" used for all age calculations. Defaults to
        30 March 2014.
        NOTE(review): the other feature helpers in this notebook default to
        30 April 2014 - confirm which reference date is intended.

    Returns
    -------
    pandas.DataFrame
        One row per line of the input file, with the derived columns appended.
    """
    if latest_date is None:
        latest_date = "2014-03-30"
    # Normalise to a Timestamp: a plain datetime.date cannot be subtracted from
    # a datetime64 column in recent pandas releases.
    latest_date = pd.Timestamp(latest_date)

    ## read customer demographic data and add header
    customers_df = pd.read_table("customer/000000_0",
                                 header=None,
                                 names=['customer_id', 'churn_label',
                                        'gender', 'country', 'date_created',
                                        'YOB', 'premier'])

    ## convert churn label to 0 and 1
    customers_df["churn_label"] = customers_df["churn_label"] - 1

    ## convert date_created and year of birth to datetime objects
    customers_df["date_created"] = pd.to_datetime(customers_df["date_created"])
    customers_df["YOB"] = pd.to_datetime(customers_df["YOB"], format='%Y')

    ## account age in days, computed once and reused for the monthly figure
    account_age_days = (latest_date - customers_df["date_created"]).dt.days
    customers_df["account_age"] = account_age_days

    ## account age in months (rounded to nearest month), better for plotting
    customers_df["account_age_months"] = np.rint(account_age_days/(365/12))

    ## customer age in years, rounded down
    customers_df["customer_age"] = np.floor((latest_date - customers_df["YOB"]).dt.days/365)

    return customers_df

## Read receipt data

Convert signal_datetime to datetime object

In [3]:
def read_receipts_data():
    """Read the three receipt part-files and stack them into one dataframe.

    The files ``receipts/000000_0`` .. ``receipts/000002_0`` have no header
    row, so the column names are supplied here. ``signal_datetime`` is parsed
    to datetimes. The row index of each part is kept as-is by the
    concatenation (index labels therefore repeat across parts).
    """
    receipt_columns = ['customer_id', 'product_id',
                       'source_id', 'division_id', 'item_qty',
                       'signal_datetime', 'receipt_id', 'price']

    # one dataframe per part-file, in part order
    parts = [
        pd.read_table("receipts/00000%d_0" % part_no, header=None, names=receipt_columns)
        for part_no in range(0, 3)
    ]

    combined = pd.concat(parts)
    combined["signal_datetime"] = pd.to_datetime(combined["signal_datetime"])

    return combined

## Create order summary dataframe

Dataframe contains a summary of each order. In the receipts data there is a row for every unique product in an order. These rows can be aggregated into a summary for each order by grouping them by receipt_id (the unique identifier of the order) and summing over price and number of items to give total order value and number of items bought for a given order.

In [4]:
def create_order_summary_df(receipts_df):
    """Collapse per-product receipt rows into one row per order.

    Rows are grouped by ``receipt_id``, ``signal_datetime`` and
    ``customer_id`` and the remaining numeric columns (``item_qty`` and
    ``price`` in the receipts data) are summed, giving the number of items and
    the total value of each order. A ``date`` column holding the calendar date
    of ``signal_datetime`` is appended for later group-by operations.

    Parameters
    ----------
    receipts_df : pandas.DataFrame
        Receipt rows; ``signal_datetime`` must already be a datetime column.

    Returns
    -------
    pandas.DataFrame
        Columns: the three grouping keys, the summed numeric columns, ``date``.
    """
    order_keys = ['receipt_id', 'signal_datetime', 'customer_id']

    # Identifier columns carry no meaning once added up, so remove them *before*
    # aggregating instead of summing them and discarding the result afterwards.
    id_columns = ['product_id', 'source_id', 'division_id']
    additive_df = receipts_df.drop(id_columns, axis=1, errors='ignore')

    # numeric_only=True keeps any leftover text column from being concatenated
    # (or raising) during the sum.
    order_sum_df = additive_df.groupby(order_keys).sum(numeric_only=True).reset_index()

    ## add in date (used later for groupby operations)
    order_sum_df["date"] = order_sum_df["signal_datetime"].dt.date

    return order_sum_df

## Create customer's orders summary dataframe

Dataframe contains a summary of all of a customer's orders.

In [5]:
def create_customer_order_summary_df(receipts_df):
    """Summarise all of a customer's purchases into one row per customer.

    Groups the receipt rows by ``customer_id`` and sums the numeric columns,
    then renames the summed ``price`` to ``total_spent`` and adds
    ``rounded_total_spent`` (``total_spent`` rounded to the nearest integer,
    used for plotting).

    Parameters
    ----------
    receipts_df : pandas.DataFrame
        Receipt rows as produced by ``read_receipts_data``.

    Returns
    -------
    pandas.DataFrame
        ``customer_id``, the summed numeric columns (``item_qty`` for the
        receipts data), ``total_spent`` and ``rounded_total_spent``.
    """
    # Identifier columns are meaningless when summed; drop them up front.
    id_columns = ['product_id', 'source_id', 'division_id', 'receipt_id']
    additive_df = receipts_df.drop(id_columns, axis=1, errors='ignore')

    # numeric_only=True is required because the frame still contains the
    # datetime column signal_datetime, which cannot be summed; pandas >= 2.0
    # no longer drops such columns silently.
    customer_order_sum_df = (
        additive_df.groupby('customer_id')
        .sum(numeric_only=True)
        .reset_index()
        .rename(columns={'price': 'total_spent'})
    )

    # add rounded price for plotting
    customer_order_sum_df["rounded_total_spent"] = np.rint(customer_order_sum_df["total_spent"])

    return customer_order_sum_df

## Read returns data and return dataframe

Add separate column for each return action

In [6]:
def read_returns_data():
    """Read the returns file and add one 0/1 indicator column per return action.

    ``returns/000000_0`` has no header row, so column names are supplied here.
    ``signal_datetime`` is parsed to datetimes. For each of "Refund",
    "Cancel", "Replacement" and "Reject" a column of that name is added that
    is 1 where ``return_action`` equals the action and 0 otherwise.
    """
    column_names = ['customer_id', 'product_id',
                    'source_id', 'division_id', 'item_qty',
                    'signal_datetime', 'receipt_id', 'return_id',
                    'return_action', 'return_reason']

    returns_df = pd.read_table("returns/000000_0", header=None, names=column_names)

    ## convert signal timestamp to datetime
    returns_df["signal_datetime"] = pd.to_datetime(returns_df["signal_datetime"])

    # one indicator column per return action, built with a vectorised comparison
    for action in ("Refund", "Cancel", "Replacement", "Reject"):
        is_action = returns_df["return_action"] == action
        returns_df[action] = is_action.astype("int64")

    return returns_df

## Create returns summary dataframe

Dataframe contains a summary of each return. Just like the receipts data there is a row for every unique product returned.

In [7]:
def create_returns_summary_df(returns_df):
    """Collapse per-product return rows into one row per return.

    Rows are grouped by ``return_id``, ``signal_datetime`` and ``customer_id``
    and the remaining numeric columns are summed (``item_qty`` plus the
    per-action indicator columns when the input comes from
    ``read_returns_data``).

    Parameters
    ----------
    returns_df : pandas.DataFrame
        Return rows, one per returned product.

    Returns
    -------
    pandas.DataFrame
        The three grouping keys followed by the summed numeric columns.
    """
    return_keys = ['return_id', 'signal_datetime', 'customer_id']

    # Identifier columns are meaningless when summed; remove them before
    # aggregating rather than summing and discarding them.
    id_columns = ['receipt_id', 'product_id', 'source_id', 'division_id']
    additive_df = returns_df.drop(id_columns, axis=1, errors='ignore')

    # numeric_only=True excludes the text columns (return_action,
    # return_reason), which would otherwise be string-concatenated by sum().
    returns_sum_df = additive_df.groupby(return_keys).sum(numeric_only=True).reset_index()

    return returns_sum_df

## Create customer's returns summary dataframe

Dataframe contains a summary of all of a customer's returns, with one row per customer. The number of returned items and the count of each return action are summed over every product the customer returned.

In [8]:
def create_customer_returns_summary_df(returns_df):
    """Summarise all of a customer's returns into one row per customer.

    Groups the return rows by ``customer_id`` and sums the numeric columns;
    the summed ``item_qty`` is renamed ``no_returned_items``.

    Parameters
    ----------
    returns_df : pandas.DataFrame
        Return rows, one per returned product.

    Returns
    -------
    pandas.DataFrame
        ``customer_id``, ``no_returned_items`` and the remaining summed numeric
        columns (the per-action indicator columns when the input comes from
        ``read_returns_data``).
    """
    # Identifier columns are meaningless when summed; drop them up front.
    id_columns = ['receipt_id', 'return_id', 'product_id', 'source_id', 'division_id']
    additive_df = returns_df.drop(id_columns, axis=1, errors='ignore')

    # numeric_only=True is required because the frame still holds a datetime
    # column (signal_datetime) and text columns, which cannot be meaningfully
    # summed; pandas >= 2.0 no longer drops such columns silently.
    customer_returns_sum_df = (
        additive_df.groupby('customer_id')
        .sum(numeric_only=True)
        .reset_index()
        .rename(columns={'item_qty': 'no_returned_items'})
    )

    return customer_returns_sum_df

## Load browsing data

Load browsing data, drop information to do with browser (large dataset)

In [9]:
def read_browsing_data():
    """Read the 18 session-summary part-files into one browsing dataframe.

    The files ``sessionsummary/000000_0`` .. ``sessionsummary/000017_0`` have
    no header row, so column names are supplied here. The website and browser
    columns (``site``, ``user_agent``, ``screen_res``, ``browser_size``) are
    dropped and ``start_time`` is parsed to datetimes. Each part keeps its own
    row index through the concatenation.
    """
    session_columns = ['customer_id', 'country',
                       'start_time', 'site', 'page_views',
                       'non_page_view_events', 'user_agent',
                       'screen_res', 'browser_size', 'product_views',
                       'product_views_dist',
                       'added_to_bag', 'saved_for_l8r_prod',
                       'saved_for_l8r_cat', 'purchased_distinct',
                       'purchased_total']

    # one dataframe per part-file, in part order
    parts = [
        pd.read_table("sessionsummary/0000%02d_0" % part_no,
                      header=None,
                      names=session_columns)
        for part_no in range(0, 18)
    ]

    # stack the parts, then discard website and browser info
    browser_columns = ["site", "user_agent", "screen_res", "browser_size"]
    sessions = pd.concat(parts).drop(browser_columns, axis=1)

    # convert start time timestamp to datetime object
    sessions["start_time"] = pd.to_datetime(sessions["start_time"])

    return sessions

## Feature engineering functions

Insert time elapsed since the customers last order

In [10]:
def insert_time_elapsed_since_last_order(customers_df, recipts_df, latest_date=None):
    """Add ``time_elapsed_since_last``: whole days since each customer's last order.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Customer table with a ``customer_id`` column.
    recipts_df : pandas.DataFrame
        Receipt rows with ``customer_id`` and a datetime ``signal_datetime``.
        (The parameter name is misspelled; it is kept so existing keyword
        callers continue to work.)
    latest_date : date, datetime, str or pandas.Timestamp, optional
        Reference "now". Defaults to 30 April 2014.

    Returns
    -------
    pandas.DataFrame
        ``customers_df`` left-merged with the new column. Customers that do not
        appear in the receipts get a missing value.
    """
    if latest_date is None:
        latest_date = dt.datetime(2014, 4, 30)
    latest_date = pd.Timestamp(latest_date)

    # Use the argument that was passed in. Previously the body referred to a
    # global named receipts_df, so the parameter was ignored entirely.
    receipts_df = recipts_df

    # group orders by customer id then find the max (latest) order datetime
    last_order_df = (
        receipts_df.groupby("customer_id")["signal_datetime"].max()
        .reset_index()
        .rename(columns={'signal_datetime': 'last_order_datetime'})
    )

    ## calculate elapsed time in whole days
    last_order_df["time_elapsed_since_last"] = (
        latest_date - last_order_df["last_order_datetime"]).dt.days

    customers_df = pd.merge(
        customers_df, last_order_df[["customer_id", "time_elapsed_since_last"]],
        on="customer_id", how="left")

    return customers_df

Insert the total number of orders a customer has made

In [11]:
def insert_number_of_orders(customers_df,order_sum_df):
    """Add ``no_of_orders``: the count of order rows per customer.

    ``order_sum_df`` is grouped by ``customer_id`` and the non-null
    ``receipt_id`` values are counted; the result is left-merged onto
    ``customers_df``, so customers absent from ``order_sum_df`` get a missing
    value.
    """
    # count order rows per customer (receipt_id is simply the column counted)
    per_customer_counts = order_sum_df.groupby(["customer_id"])["receipt_id"].count()

    # turn the counts into a two-column frame with a descriptive column name
    counts_df = per_customer_counts.reset_index()
    counts_df = counts_df.rename(columns={"receipt_id" : "no_of_orders"})

    merged = pd.merge(customers_df,
                      counts_df[["customer_id","no_of_orders"]],
                      on="customer_id",
                      how='left')

    return merged

In [12]:
def insert_number_of_orders_in_last_n_days(customers_df, order_sum_df, days, latest_date=None):
    """Add ``orders_last_<days>_days``: orders placed after ``latest_date - days``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Customer table with a ``customer_id`` column.
    order_sum_df : pandas.DataFrame
        One row per order with ``customer_id``, ``receipt_id`` and a datetime
        ``signal_datetime``.
    days : int
        Length of the look-back window in days.
    latest_date : date, datetime, str or pandas.Timestamp, optional
        Reference "now". Defaults to 30 April 2014.

    Returns
    -------
    pandas.DataFrame
        ``customers_df`` left-merged with the new column. Customers with no
        order in the window get a missing value rather than 0.
    """
    if latest_date is None:
        latest_date = dt.datetime(2014, 4, 30)
    latest_date = pd.Timestamp(latest_date)

    ## start of the window; only orders strictly after this instant are counted
    n_days_ago = latest_date - dt.timedelta(days)

    column_name = 'orders_last_' + str(days) + '_days'

    # keep orders inside the window, then count rows per customer
    # (any column could be counted; receipt_id is used)
    recent_orders = order_sum_df[order_sum_df.signal_datetime > n_days_ago]
    orders_last_n_days = (
        recent_orders.groupby(["customer_id"])["receipt_id"].count()
        .reset_index()
        .rename(columns={'receipt_id': column_name})
    )

    ## add to customer df
    customers_df = pd.merge(
        customers_df, orders_last_n_days[['customer_id', column_name]],
        on="customer_id", how='left')

    return customers_df

In [13]:
def insert_amount_spent_in_last_n_days(customers_df, order_sum_df, days, latest_date=None):
    """Add ``spent_last_<days>_days``: total order value after ``latest_date - days``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Customer table with a ``customer_id`` column.
    order_sum_df : pandas.DataFrame
        One row per order with ``customer_id``, ``price`` and a datetime
        ``signal_datetime``.
    days : int
        Length of the look-back window in days.
    latest_date : date, datetime, str or pandas.Timestamp, optional
        Reference "now". Defaults to 30 April 2014.

    Returns
    -------
    pandas.DataFrame
        ``customers_df`` left-merged with the new column. Customers with no
        order in the window get a missing value rather than 0.
    """
    if latest_date is None:
        latest_date = dt.datetime(2014, 4, 30)
    latest_date = pd.Timestamp(latest_date)

    ## start of the window; only orders strictly after this instant are summed
    n_days_ago = latest_date - dt.timedelta(days)

    column_name = 'spent_last_' + str(days) + '_days'

    # keep orders inside the window, then sum price per customer
    recent_orders = order_sum_df[order_sum_df.signal_datetime > n_days_ago]
    spent_last_n_days = (
        recent_orders.groupby(["customer_id"])["price"].sum()
        .reset_index()
        .rename(columns={'price': column_name})
    )

    customers_df = pd.merge(
        customers_df, spent_last_n_days[['customer_id', column_name]],
        on="customer_id", how='left')

    return customers_df

In [14]:
def insert_time_elapsed_since_last_browse(customers_df, browsing_df, latest_date=None):
    """Add ``days_since_last_browse``: whole days since each customer's last session.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Customer table with a ``customer_id`` column.
    browsing_df : pandas.DataFrame
        Session rows with ``customer_id`` and a datetime ``start_time``.
    latest_date : date, datetime, str or pandas.Timestamp, optional
        Reference "now". Defaults to 30 April 2014.

    Returns
    -------
    pandas.DataFrame
        ``customers_df`` left-merged with the new column. Customers with no
        browsing sessions get a missing value.
    """
    if latest_date is None:
        latest_date = dt.datetime(2014, 4, 30)
    latest_date = pd.Timestamp(latest_date)

    # latest session start per customer
    last_browse_df = (
        browsing_df.groupby("customer_id")["start_time"].max()
        .reset_index()
        .rename(columns={'start_time': 'last_browse'})
    )

    last_browse_df["days_since_last_browse"] = (
        latest_date - last_browse_df["last_browse"]).dt.days

    customers_df = pd.merge(
        customers_df, last_browse_df[['customer_id', 'days_since_last_browse']],
        on="customer_id", how='left')

    return customers_df

In [15]:
def insert_site_visits_last_n_days(customers_df, browsing_df, days, latest_date=None):
    """Add ``visits_last_<days>_days``: sessions started after ``latest_date - days``.

    Parameters
    ----------
    customers_df : pandas.DataFrame
        Customer table with a ``customer_id`` column.
    browsing_df : pandas.DataFrame
        Session rows with ``customer_id`` and a datetime ``start_time``.
    days : int
        Length of the look-back window in days.
    latest_date : date, datetime, str or pandas.Timestamp, optional
        Reference "now". Defaults to 30 April 2014.

    Returns
    -------
    pandas.DataFrame
        ``customers_df`` left-merged with the new column. Customers with no
        session in the window get a missing value rather than 0.
    """
    if latest_date is None:
        latest_date = dt.datetime(2014, 4, 30)
    latest_date = pd.Timestamp(latest_date)

    ## start of the window; only sessions strictly after this instant are counted
    n_days_ago = latest_date - dt.timedelta(days)

    column_name = 'visits_last_' + str(days) + '_days'

    # keep sessions inside the window, then count them per customer
    recent_sessions = browsing_df[browsing_df.start_time > n_days_ago]
    visits_last_n_days = (
        recent_sessions.groupby(["customer_id"])["start_time"].count()
        .reset_index()
        .rename(columns={'start_time': column_name})
    )

    customers_df = pd.merge(
        customers_df, visits_last_n_days[['customer_id', column_name]],
        on="customer_id", how='left')

    return customers_df

## Load datasets and engineer features

In [16]:
"""

Load the datasets

"""

## Load customer data
customers_df = create_customer_df()

## Load receipts data
receipts_df = read_receipts_data()

## Create order summary (one row per order) and customer order summary
## (one row per customer) dataframes
order_sum_df = create_order_summary_df(receipts_df)
customer_order_sum_df = create_customer_order_summary_df(receipts_df)

## Load returns data
returns_df = read_returns_data()

## Create returns summary (one row per return) and customer returns summary
## (one row per customer) dataframes
returns_sum_df = create_returns_summary_df(returns_df)
customer_returns_sum_df = create_customer_returns_summary_df(returns_df)

## Load browsing data
browsing_df = read_browsing_data()

## Create customer browsing summary dataframe (one row per customer).
## numeric_only=True restricts the sum to numeric columns: browsing_df also
## holds the datetime column start_time, which cannot be summed, and
## pandas >= 2.0 no longer drops such columns silently.
customer_browsing_df = browsing_df.groupby("customer_id").sum(numeric_only=True).reset_index()

"""

Engineer features and add them to customer_df

"""

## First add order, return and browsing summaries.
## Left merges keep every customer, including those without any
## orders, returns or browsing sessions.
customers_df = pd.merge(
    customers_df, customer_order_sum_df, on="customer_id", how='left')

customers_df = pd.merge(
    customers_df, customer_returns_sum_df, on="customer_id", how='left')

customers_df = pd.merge(
    customers_df, customer_browsing_df, on="customer_id", how='left')

## Add bought/return ratio (number of returned items divided by number of
## items bought)
customers_df = customers_df.rename(columns={"item_qty" : "no_items_bought"})

customers_df["bought_return_ratio"] = customers_df["no_returned_items"]/customers_df["no_items_bought"]

## Add time elapsed since last order
customers_df = insert_time_elapsed_since_last_order(customers_df, receipts_df)

## Add total number of orders
customers_df = insert_number_of_orders(customers_df, order_sum_df)

## Look-back windows in days: last year, last 6 months, last 3 months and
## last month
time_periods = [365, 182, 92, 31]

## Add number of orders in each window
for days in time_periods:
    customers_df = insert_number_of_orders_in_last_n_days(customers_df, order_sum_df, days)

## Add amount spent in each window
for days in time_periods:
    customers_df = insert_amount_spent_in_last_n_days(customers_df, order_sum_df, days)

## Add time elapsed since last browse
customers_df = insert_time_elapsed_since_last_browse(customers_df, browsing_df)

## Insert site visits in each window
for days in time_periods:
    customers_df = insert_site_visits_last_n_days(customers_df, browsing_df, days)

## Insert total site visits: a 10000-day window is used so that, in effect,
## every session is counted
customers_df = insert_site_visits_last_n_days(customers_df, browsing_df, 10000)

customers_df = customers_df.rename(columns={'visits_last_10000_days' : 'total_site_visits'})

  if self.run_code(code, result):
  if self.run_code(code, result):
  if self.run_code(code, result):


## Save to pickle file

In [17]:
# Persist the engineered customer table as a pickle file in the current
# working directory so it can be reloaded without re-running the pipeline.
customers_df.to_pickle('./customer_df_with_engineered_features.pkl')

In [18]:
# Display (rows, columns) of the engineered table as a quick sanity check
customers_df.shape

(470169, 41)