## This notebook loads the data, engineers features then saves the engineered dataset to a pickle file

In [2]:
import pandas as pd
import numpy as np
import datetime as dt

## Create customer demographic dataframe, add the features listed below

account_age - time since the account was created, in days. 'Now' is taken to be the most recent date in the receipt database, 30/3/2014

account_age_months - same as above but in months rather than days

customer_age - customer's age in years rounded down

Additionally:

Churn label is also converted from 2 and 1 to 1 and 0 (churn/not churn respectively)

Dates are converted to datetime objects

In [11]:
def create_customer_df(customer_path="customer/000000_0", latest_date=None):
    """Load the customer demographic table and add age-related features.

    Parameters
    ----------
    customer_path : str, optional
        Path of the tab-separated, header-less customer file. Defaults to
        the location the notebook originally hard-coded.
    latest_date : date-like, optional
        Reference "now" used to compute the ages. Anything accepted by
        ``pd.Timestamp`` works. Defaults to 30/3/2014, which the original
        notebook comments give as the most recent order in the receipts data.

    Returns
    -------
    pandas.DataFrame
        One row per line of the input file with columns ``customer_id``,
        ``churn_label`` (original value minus 1), ``gender``, ``country``,
        ``date_created`` and ``YOB`` (both parsed to datetimes), ``premier``,
        plus the engineered ``account_age`` (days), ``account_age_months``
        (rounded to the nearest month) and ``customer_age`` (whole years,
        rounded down).
    """

    ## reference date; kept as a pandas Timestamp because a plain datetime.date
    ## cannot be subtracted from a datetime64 column in current pandas
    if latest_date is None:
        latest_date = pd.Timestamp("2014-03-30")
    else:
        latest_date = pd.Timestamp(latest_date)

    ## read customer demographic data and add header
    customers_df = pd.read_table(customer_path,
                                 header=None,
                                 names=['customer_id', 'churn_label',
                                        'gender', 'country', 'date_created',
                                        'YOB', 'premier'])

    ## convert churn label from 2/1 to 1/0
    customers_df["churn_label"] = customers_df["churn_label"] - 1

    ## convert date_created to datetime object
    customers_df["date_created"] = pd.to_datetime(customers_df["date_created"])

    ## convert year of birth to datetime object (1st of January of that year)
    customers_df["YOB"] = pd.to_datetime(customers_df["YOB"], format='%Y')

    ## account age in days, computed once and reused for the monthly version
    account_age_days = (latest_date - customers_df["date_created"]).dt.days
    customers_df["account_age"] = account_age_days

    ## account age in months (rounded to nearest month), better for plotting;
    ## float literal keeps the average month length at 30.42 days on any Python
    customers_df["account_age_months"] = np.rint(account_age_days / (365.0 / 12))

    ## customer age in whole years (365-day years, rounded down)
    customers_df["customer_age"] = np.floor(
        (latest_date - customers_df["YOB"]).dt.days / 365.0)

    return customers_df

## Read receipt data and return data frame

Convert signal_datetime to datetime object

In [18]:
def read_receipts_data():
    """Read the three receipt part-files and return them as one dataframe.

    Each part (``receipts/000000_0`` .. ``receipts/000002_0``) is read as a
    header-less table, the parts are concatenated in order (each part keeps
    its own row index), and ``signal_datetime`` is parsed to datetimes.
    """
    column_names = ['customer_id', 'product_id',
                    'source_id', 'division_id', 'item_qty',
                    'signal_datetime', 'receipt_id', 'price']

    ## one dataframe per part-file
    parts = [
        pd.read_table("receipts/00000%d_0" % part_no, header=None, names=column_names)
        for part_no in range(0, 3)
    ]

    receipts_df = pd.concat(parts)

    ## convert signal_datetime to datetime object
    receipts_df["signal_datetime"] = pd.to_datetime(receipts_df["signal_datetime"])

    return receipts_df

## Create order summary dataframe

Dataframe contains a summary of each order. In the receipts data there is a row for every unique product in an order. These rows can be aggregated into a summary for each order by grouping them by receipt_id (the unique identifier of the order) and summing over price and number of items to give total order value and number of items bought for a given order.

In [13]:
def create_order_summary_df(receipts_df):
    """Collapse the per-product receipt rows into one row per order.

    Rows are grouped by ``receipt_id``, ``signal_datetime`` and
    ``customer_id``; ``item_qty`` and ``price`` are summed to give the total
    number of items and the total order value.

    Only ``item_qty`` and ``price`` are aggregated. The identifier columns
    (``product_id``, ``source_id``, ``division_id``) are meaningless once
    summed, so they are left out of the aggregation instead of being summed
    and dropped afterwards; this also keeps the function working when those
    identifiers are not numeric.

    Parameters
    ----------
    receipts_df : pandas.DataFrame
        Receipt rows with at least the columns ``receipt_id``,
        ``signal_datetime`` (datetime dtype), ``customer_id``, ``item_qty``
        and ``price``.

    Returns
    -------
    pandas.DataFrame
        Columns ``receipt_id``, ``signal_datetime``, ``customer_id``,
        ``item_qty``, ``price`` and ``date`` (calendar date of
        ``signal_datetime``).
    """

    ##group all orders (groupby columns are not aggregated) and sum over item_qty and price
    ##to give total number of items and total order value
    order_keys = ['receipt_id', 'signal_datetime', 'customer_id']
    order_sum_df = (
        receipts_df
        .groupby(order_keys)[['item_qty', 'price']]
        .sum()
        .reset_index()
    )

    ##add in date (used later for groupby operations)
    order_sum_df["date"] = order_sum_df["signal_datetime"].dt.date

    return order_sum_df

## Create customer's orders summary dataframe

Dataframe contains a summary of all of a customer's orders.

In [21]:
def create_customer_order_summary_df(receipts_df):
    """Summarise all of a customer's orders into a single row per customer.

    ``item_qty`` and ``price`` are summed per ``customer_id``. Only these two
    columns are aggregated: the identifier columns are meaningless as sums,
    and ``signal_datetime`` is a datetime column, which pandas >= 2.0 refuses
    to sum (TypeError) when the whole frame is aggregated.

    Parameters
    ----------
    receipts_df : pandas.DataFrame
        Receipt rows with at least the columns ``customer_id``, ``item_qty``
        and ``price``.

    Returns
    -------
    pandas.DataFrame
        Columns ``customer_id``, ``item_qty``, ``price`` and
        ``rounded_price`` (``price`` rounded to the nearest integer value).
    """

    ##group orders by customer id and sum only the meaningful numeric columns
    customer_order_sum_df = (
        receipts_df
        .groupby('customer_id')[['item_qty', 'price']]
        .sum()
        .reset_index()
    )

    #add rounded price for plotting
    customer_order_sum_df["rounded_price"] = np.rint(customer_order_sum_df["price"])

    return customer_order_sum_df

In [None]:


# Reference "now" for every recency feature built in this cell.
# NOTE(review): create_customer_df() treats 30/3/2014 as the latest receipt date,
# whereas this cell uses 30/4/2014 -- confirm which of the two is intended.
latest_date = dt.datetime(2014,4,30)


def _orders_since(cutoff, feature_name):
    """Count each customer's orders placed strictly after `cutoff`."""
    recent_orders = order_sum_df.loc[order_sum_df["signal_datetime"] > cutoff]
    counts = recent_orders.groupby(["customer_id"])["receipt_id"].count().reset_index()
    return counts.rename(columns={'receipt_id': feature_name})


def _spend_since(cutoff, feature_name):
    """Sum each customer's order value for orders placed strictly after `cutoff`."""
    recent_orders = order_sum_df.loc[order_sum_df["signal_datetime"] > cutoff]
    totals = recent_orders.groupby(["customer_id"])["price"].sum().reset_index()
    return totals.rename(columns={'price': feature_name})


# days elapsed between each customer's most recent order and latest_date
last_order_df = (
    receipts_df.groupby("customer_id")["signal_datetime"]
    .max()
    .reset_index()
    .rename(columns={'signal_datetime': 'last_order_datetime'})
)

last_order_df["time_elapsed_since_last"] = (
    latest_date - last_order_df["last_order_datetime"]).dt.days

customer_order_sum_df = customer_order_sum_df.merge(
    last_order_df[["customer_id","time_elapsed_since_last"]], on="customer_id", how="left")

# total number of orders a customer has made
num_of_orders = order_sum_df.groupby(["customer_id"])["receipt_id"].count().reset_index()
num_of_orders.columns = ["customer_id","no_of_orders"]

customer_order_sum_df = customer_order_sum_df.merge(
    num_of_orders[["customer_id","no_of_orders"]], on="customer_id")

# look-back cut-offs: 1, 2, 3 and 6 months and one year before latest_date
one_month_ago = latest_date - dt.timedelta(365/12)
two_months_ago = latest_date - dt.timedelta(365/6)
three_months_ago = latest_date - dt.timedelta(365/4)
six_months_ago = latest_date - dt.timedelta(365/2)
one_year_ago = latest_date - dt.timedelta(365)

# separate dataframes holding the order counts for each look-back window
orders_last_month = _orders_since(one_month_ago, 'orders_last_month')
orders_last_2_months = _orders_since(two_months_ago, 'orders_last_2_month')
orders_last_3_months = _orders_since(three_months_ago, 'orders_last_3_month')
orders_last_6_months = _orders_since(six_months_ago, 'orders_last_6_month')
orders_last_year = _orders_since(one_year_ago, 'orders_last_year')

# add them to the customer dataframe, shortest window first
for window_counts in (orders_last_month, orders_last_2_months, orders_last_3_months,
                      orders_last_6_months, orders_last_year):
    customer_order_sum_df = customer_order_sum_df.merge(
        window_counts, on="customer_id", how='left')

# same thing as above but for the amount spent
spent_last_month = _spend_since(one_month_ago, 'spent_last_month')
spent_last_6_months = _spend_since(six_months_ago, 'spent_last_6_month')
spent_last_year = _spend_since(one_year_ago, 'spent_last_year')

for window_spend in (spent_last_month, spent_last_6_months, spent_last_year):
    customer_order_sum_df = customer_order_sum_df.merge(
        window_spend, on="customer_id", how='left')

# customers with no orders in a window come out of the left merges as nulls; make them zero
customer_order_sum_df = customer_order_sum_df.fillna(0)

In [19]:
# Build the base dataframes. The feature-engineering cell that reads receipts_df,
# order_sum_df and customer_order_sum_df needs these names to exist, so this cell
# has to be executed before it.

# customer demographics with account/customer age features
customers_df = create_customer_df()

# raw receipt rows (one row per product in an order)
receipts_df = read_receipts_data()

# one row per order
order_sum_df = create_order_summary_df(receipts_df)

# one row per customer
customer_order_sum_df = create_customer_order_summary_df(receipts_df)