In [1]:
import pandas as pd
from pathlib import Path

# Folder holding the processed tables, given relative to the current working
# directory.
DATA_PROCESSED = Path("../data_processed")

# Read the order-level revenue table. The purchase timestamp is parsed to
# datetime on load because later cells use its `.dt` accessor.
df = pd.read_csv(
    filepath_or_buffer=DATA_PROCESSED.joinpath("fact_orders_revenue.csv"),
    parse_dates=["order_purchase_timestamp"],
)

df.head()


Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,purchase_month,is_delivered,is_canceled,is_late,delivery_days,ship_days_to_carrier,items_revenue,items_count,payment_value,payment_types
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18,2017-10,True,False,False,8.436574,2.366493,38.71,1.0,38.71,"credit_card,voucher"
1,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13,2018-07,True,False,False,13.782037,0.462882,141.46,1.0,141.46,boleto
2,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04,2018-08,True,False,False,9.394213,0.204595,179.12,1.0,179.12,credit_card
3,949d5b44dbf5de918fe9c16f97b45f8a,f88197465ea7920adcdbec7375364d82,delivered,2017-11-18 19:28:06,2017-11-18 19:45:59,2017-11-22 13:39:59,2017-12-02 00:28:42,2017-12-15,2017-11,True,False,False,13.20875,3.745833,72.2,1.0,72.2,credit_card
4,ad21c59c0840e6cb83a9ceb5573f8159,8ab97904e6daea8866dbdbc4fb7aad2c,delivered,2018-02-13 21:18:39,2018-02-13 22:20:29,2018-02-14 19:46:34,2018-02-16 18:17:02,2018-02-26,2018-02,True,False,False,2.873877,0.893113,28.62,1.0,28.62,credit_card


In [2]:
# Order rows by customer, then by purchase time. The `min` aggregation below
# does not depend on this ordering, but `df` is reassigned here, so later
# cells work on the sorted frame.
df = df.sort_values(by=["customer_id", "order_purchase_timestamp"])

# One row per customer_id: the earliest purchase timestamp, plus the calendar
# month of that purchase as a "YYYY-MM" string (the customer's cohort).
#
# NOTE(review): the saved outputs of this notebook show a repeat rate of 0.0
# and a retention table with only cohort_index 0, which suggests customer_id
# is unique per order in this file. If so, every order is its own "first
# purchase"; confirm whether a person-level key should be used instead.
first_purchase = (
    df.groupby("customer_id")
    .agg(first_purchase_date=("order_purchase_timestamp", "min"))
    .reset_index()
    .assign(
        cohort_month=lambda t: t["first_purchase_date"].dt.to_period("M").astype(str)
    )
)

first_purchase.head()


Unnamed: 0,customer_id,first_purchase_date,cohort_month
0,00012a2ce6f8dcda20d059ce98491703,2017-11-14 16:08:26,2017-11
1,000161a058600d5901f007fab4c27140,2017-07-16 09:40:32,2017-07
2,0001fd6190edaaf884bcaf3d49edf079,2017-02-28 11:06:43,2017-02
3,0002414f95344307404f0ace7a26f1d5,2017-08-16 13:09:20,2017-08
4,000379cdec625522490c315e70c7a9fb,2018-04-02 13:42:17,2018-04


In [3]:
# Attach each order's cohort month (the month of its customer's first purchase).
#
# A `cohort_month` column left over from a previous run of this cell is dropped
# first. Without that, re-running the cell makes `merge` emit `cohort_month_x`
# and `cohort_month_y`, and every later `df["cohort_month"]` lookup raises
# KeyError.
#
# `validate="many_to_one"` makes the merge fail loudly if `first_purchase` ever
# holds more than one row per customer_id, which would silently duplicate orders.
df = df.drop(columns=["cohort_month"], errors="ignore").merge(
    first_purchase[["customer_id", "cohort_month"]],
    on="customer_id",
    how="left",
    validate="many_to_one",
)

# Calendar month of each order as a "YYYY-MM" string, the same format as
# cohort_month.
df["order_month"] = df["order_purchase_timestamp"].dt.to_period("M").astype(str)

df[["customer_id","cohort_month","order_month"]].head()


Unnamed: 0,customer_id,cohort_month,order_month
0,00012a2ce6f8dcda20d059ce98491703,2017-11,2017-11
1,000161a058600d5901f007fab4c27140,2017-07,2017-07
2,0001fd6190edaaf884bcaf3d49edf079,2017-02,2017-02
3,0002414f95344307404f0ace7a26f1d5,2017-08,2017-08
4,000379cdec625522490c315e70c7a9fb,2018-04,2018-04


In [4]:
# Whole calendar months between the cohort month and the order month
# (0 = the order falls in the customer's first-purchase month).
#
# This is computed from the year and month components rather than from
# `days // 30`. A day-based approximation is wrong whenever a month is shorter
# than 30 days: 2017-02 -> 2017-03 is 28 days, so `28 // 30` gives 0 instead
# of 1. It also drifts upward over multi-year spans.
order_month_start = pd.to_datetime(df["order_month"])
cohort_month_start = pd.to_datetime(df["cohort_month"])

cohort_index = (
    (order_month_start.dt.year - cohort_month_start.dt.year) * 12
    + (order_month_start.dt.month - cohort_month_start.dt.month)
)

df["cohort_index"] = cohort_index
df[["cohort_month","order_month","cohort_index"]].head()


Unnamed: 0,cohort_month,order_month,cohort_index
0,2017-11,2017-11,0
1,2017-07,2017-07,0
2,2017-02,2017-02,0
3,2017-08,2017-08,0
4,2018-04,2018-04,0


In [5]:
# Distinct customer_ids seen in each (cohort month, months-since-cohort) pair.
cohort_data = (
    df.groupby(["cohort_month", "cohort_index"])
    .agg(customer_id=("customer_id", "nunique"))
    .reset_index()
)

# Cohort size = distinct customers active at index 0, keyed by cohort month.
cohort_sizes = (
    cohort_data.query("cohort_index == 0")
    .set_index("cohort_month")["customer_id"]
)

# Wide table (rows: cohort month, columns: cohort index), with each row divided
# by its cohort size so the values are fractions of the original cohort.
retention = cohort_data.pivot(
    index="cohort_month", columns="cohort_index", values="customer_id"
).div(cohort_sizes, axis="index")

retention.round(3)


cohort_index,0
cohort_month,Unnamed: 1_level_1
2016-09,1.0
2016-10,1.0
2016-12,1.0
2017-01,1.0
2017-02,1.0
2017-03,1.0
2017-04,1.0
2017-05,1.0
2017-06,1.0
2017-07,1.0


In [6]:
# Number of distinct order_ids recorded under each customer_id.
orders_per_customer = df.groupby("customer_id")["order_id"].nunique()

# Share of customer_ids that have more than one distinct order.
# NOTE(review): the saved output of this cell is 0.0, i.e. no customer_id has a
# second order. That suggests customer_id is unique per order in this file;
# confirm the key before reading this as a real repeat rate.
repeat_rate = orders_per_customer.gt(1).mean()
repeat_rate


np.float64(0.0)

In [7]:
# Keep only the rows flagged as delivered.
delivered = df.loc[df["is_delivered"]]

# Simple LTV estimate: mean orders per customer x mean item revenue per order.
# NOTE(review): the order count is averaged over all orders in `df`, while the
# revenue is averaged over delivered orders only; confirm this mix is intended.
avg_orders_per_customer = orders_per_customer.mean()
avg_revenue_per_order = delivered.loc[:, "items_revenue"].mean()

simple_ltv = avg_orders_per_customer * avg_revenue_per_order
simple_ltv


np.float64(159.82683876116835)

In [8]:
# Per cohort month, over delivered orders only: distinct customers, total item
# revenue, and revenue per customer (`ltv`).
cohort_ltv = (
    delivered
    .groupby("cohort_month")
    .agg(
        customers=("customer_id", "nunique"),
        revenue=("items_revenue", "sum"),
    )
    .assign(ltv=lambda t: t["revenue"] / t["customers"])
)

cohort_ltv.head()


Unnamed: 0_level_0,customers,revenue,ltv
cohort_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2016-09,1,143.46,143.46
2016-10,265,46490.66,175.436453
2016-12,1,19.62,19.62
2017-01,750,127482.37,169.976493
2017-02,1653,271239.32,164.089123


In [9]:
# Write both cohort tables next to the input data. `cohort_month` is moved from
# the index into a regular column first, so it is kept even though the index
# itself is not written.
retention.reset_index().to_csv(
    path_or_buf=DATA_PROCESSED.joinpath("cohort_retention.csv"),
    index=False,
)
cohort_ltv.reset_index().to_csv(
    path_or_buf=DATA_PROCESSED.joinpath("cohort_ltv.csv"),
    index=False,
)

print("Saved cohort_retention.csv and cohort_ltv.csv")


Saved cohort_retention.csv and cohort_ltv.csv
