In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load Data

In [2]:
path = "input/Retail_Data_Transactions.csv"
df = pd.read_csv(path)
df.head()

Unnamed: 0,customer_id,trans_date,tran_amount
0,CS5295,11-Feb-13,35
1,CS4768,15-Mar-15,39
2,CS2122,26-Feb-13,52
3,CS1217,16-Nov-11,99
4,CS1850,20-Nov-13,78


# Preprocess Data

In [5]:
def preprocess_df(df):
    return (df
        .assign(
            customer_id = lambda df_: df_["customer_id"].astype(str), 
            transaction_date = lambda df_: pd.to_datetime(df_["trans_date"], format='%d-%b-%y'), 
        )
        .loc[:, ["transaction_date", "customer_id"]]
        .dropna(how="any")
        .reset_index(drop=True)
    )

df_clean = preprocess_df(df)

# Cohort Analysis

In [87]:
(df_clean
    .sort_values(["transaction_date", "customer_id"])
    .assign(
        # customer first purchase
        first_purchase = lambda df_: (df_
                                      .groupby(["customer_id"])["transaction_date"]
                                      .transform("min") + 
                                      pd.offsets.MonthEnd(0) - 
                                      pd.offsets.MonthBegin(1)
                                      ),
        # distance betweeen first purchase and transaction date (in month)
        months_after_first_transaction = lambda df_: (
            # convert transaction date to first day of the month
            (df_["transaction_date"] + pd.offsets.MonthEnd(0) - pd.offsets.MonthBegin(1)).dt.to_period("M")
            - df_["first_purchase"].dt.to_period("M")
        ).apply(lambda x: x.n) # cast to int 
    )
    .groupby(["first_purchase", "months_after_first_transaction"])
    .agg(
        num_customer = ("customer_id", "nunique")
    )
    .assign(
        percentage_to_first_purchase = lambda df_: df_["num_customer"] / (df_
                                                                          .groupby(["first_purchase"])
                                                                          ["num_customer"].transform("sum")
                                                                          )
    )
    .reset_index()
    .pivot(
        index = "first_purchase",
        columns= "months_after_first_transaction", 
        values="percentage_to_first_purchase"
    )
    .fillna(0)
)

months_after_first_transaction,0,1,2,3,4,5,6,7,8,9,...,37,38,39,40,41,42,43,44,45,46
first_purchase,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2011-05-01,0.061873,0.019611,0.021177,0.022604,0.020624,0.022512,0.018553,0.021085,0.021039,0.020486,...,0.021039,0.021959,0.020302,0.020164,0.021407,0.019934,0.019565,0.021177,0.018921,0.012062
2011-06-01,0.064337,0.020685,0.021826,0.020685,0.021648,0.020078,0.020899,0.021648,0.020221,0.022539,...,0.021113,0.022218,0.020007,0.020649,0.020899,0.021612,0.020578,0.018545,0.011591,0.0
2011-07-01,0.065277,0.02235,0.022189,0.022834,0.020792,0.02278,0.020953,0.020792,0.021329,0.021007,...,0.021652,0.021007,0.021329,0.021813,0.02047,0.023263,0.020093,0.011175,0.0,0.0
2011-08-01,0.068049,0.020896,0.022079,0.020423,0.022394,0.023104,0.022236,0.021842,0.020817,0.023734,...,0.020186,0.021763,0.023025,0.024286,0.021448,0.022788,0.014272,0.0,0.0,0.0
2011-09-01,0.069185,0.019704,0.020593,0.025037,0.024889,0.021037,0.022222,0.022222,0.021333,0.019111,...,0.025333,0.022222,0.021778,0.021333,0.019704,0.01437,0.0,0.0,0.0,0.0
2011-10-01,0.070643,0.023113,0.023858,0.021249,0.025722,0.022181,0.022367,0.023858,0.02069,0.023486,...,0.024604,0.023299,0.02013,0.018639,0.015284,0.0,0.0,0.0,0.0,0.0
2011-11-01,0.073691,0.022229,0.02162,0.01827,0.021924,0.02162,0.024056,0.025274,0.029233,0.025883,...,0.02162,0.028319,0.024361,0.012789,0.0,0.0,0.0,0.0,0.0,0.0
2011-12-01,0.076109,0.024013,0.025234,0.02035,0.021571,0.029304,0.022385,0.026455,0.023199,0.026048,...,0.024827,0.025234,0.014652,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-01-01,0.083007,0.021143,0.024276,0.023493,0.025842,0.02036,0.028974,0.027408,0.025842,0.025059,...,0.023493,0.014879,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-02-01,0.084806,0.027091,0.018846,0.022379,0.027091,0.020024,0.027091,0.023557,0.037691,0.027091,...,0.018846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
