In [1]:
# !pip install lifetimes

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.metrics import mean_squared_log_error

from lifetimes import BetaGeoFitter
from lifetimes.utils import summary_data_from_transaction_data
from lifetimes import GammaGammaFitter

data = pd.read_csv('transaction_log.csv')
data.date = pd.to_datetime(data.date)

# constant
startDate = dt.date(1997, 1, 1) + dt.timedelta(208)
# fixCustDate = dt.date(1997, 3, 25) 
trainEndDate = startDate + dt.timedelta(56) # dt.date(1997, 4, 22) 
testStartDate = trainEndDate + dt.timedelta(1) #
testEndDate = testStartDate + dt.timedelta(28) 
endDate =  dt.date(1998, 6, 30) 

data = data.sort_values('date')
data.index = data.date

# train'n'test split
train = data[startDate:trainEndDate].reset_index(drop=True)
test = data[testStartDate:testEndDate].reset_index(drop=True)
# train.shape, test.shape

In [3]:
# Агрегированнные сведения о покупках клиентов

trainWithMonetary = train.groupby('cust').sales.mean().rename('monetary_value')
testWithMonetary = test.groupby('cust').sales.mean().rename('monetary_value')

In [4]:
# to RFM (аналог)

summary = summary_data_from_transaction_data(train, 'cust', 'date', observation_period_end=trainEndDate)
summary.head(3)

Unnamed: 0_level_0,frequency,recency,T
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4,0.0,0.0,51.0
114,0.0,0.0,14.0
228,1.0,21.0,46.0


In [5]:
# BG/NBD model. Предсказывает вероятность покупки 
# bgf.predict(t, summary['frequency'], summary['recency'], summary['T'])

bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(summary['frequency'], summary['recency'], summary['T'])

<lifetimes.BetaGeoFitter: fitted with 298 subjects, a: 0.00, alpha: 48.81, b: 7500.21, r: 0.69>

In [6]:
# к summary добавим данные о покупках (нужно для оценки LTV), клиенты сделали > 1 покупки

returning_customers_summary = summary.merge(trainWithMonetary, right_on='cust', left_on='cust')
returning_customers_summary = returning_customers_summary[returning_customers_summary['frequency']>0]
returning_customers_summary.head(1)

Unnamed: 0_level_0,frequency,recency,T,monetary_value
cust,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
228,1.0,21.0,46.0,30.355


In [7]:
# Gamma-Gamma model

# We can now estimate the average transaction value:
# ggf.conditional_expected_average_profit(
#         returning_customers_summary['frequency'],
#         returning_customers_summary['monetary_value']
#     ).head(10))


ggf = GammaGammaFitter(penalizer_coef = 0)
ggf.fit(returning_customers_summary['frequency'],
        returning_customers_summary['monetary_value'])

<lifetimes.GammaGammaFitter: fitted with 84 subjects, p: 13.56, q: 4.32, v: 8.77>

In [8]:
y_pred = ggf.customer_lifetime_value(
    bgf, #the model to use to predict the number of future transactions
    returning_customers_summary['frequency'],
    returning_customers_summary['recency'],
    returning_customers_summary['T'],
    returning_customers_summary['monetary_value'],
    time=1, # months
    discount_rate=0.01 # monthly discount rate ~ 12.7% annually
)
y_pred

cust
228      16.611715
441      17.865465
564      31.609776
1108     25.528175
1251     20.176509
           ...    
22355    13.942215
22471    19.281780
23147    33.411120
23507    12.248153
23551    60.264414
Name: clv, Length: 84, dtype: float64

In [9]:
y_true = train.groupby('cust').sum().rename(columns={'sales':'clv_true'})

In [10]:
# Оценки на train

d = pd.DataFrame(data=trainWithMonetary)
d = pd.DataFrame(data=y_pred)
d =d.merge(pd.DataFrame(y_true), left_on='cust', right_on='cust', how='outer')
d.sales = d.clv_true.apply(lambda x: x if x > 0 else 0)
d.clv = d.clv.apply(lambda x: x if x > 0 else 0)
ytrue = d.clv_true
ypred = d.clv
np.sqrt(mean_squared_log_error(ytrue, ypred))

  


2.986508086467258

In [11]:
d.sum()

clv          2313.314878
clv_true    15930.310000
dtype: float64

In [None]:
# оценки на тесте

In [12]:
summary = summary_data_from_transaction_data(test, 'cust', 'date', observation_period_end=trainEndDate)
returning_customers_summary = summary.merge(testWithMonetary, right_on='cust', left_on='cust')
returning_customers_summary = returning_customers_summary[returning_customers_summary['frequency']>0]

y_pred = ggf.customer_lifetime_value(
    bgf, #the model to use to predict the number of future transactions
    returning_customers_summary['frequency'],
    returning_customers_summary['recency'],
    returning_customers_summary['T'],
    returning_customers_summary['monetary_value'],
    time=1, # months
    discount_rate=0.01 # monthly discount rate ~ 12.7% annually
)

d = pd.DataFrame(data=trainWithMonetary)
d = pd.DataFrame(data=y_pred)
d =d.merge(pd.DataFrame(y_true), left_on='cust', right_on='cust', how='outer')
d.sales = d.clv_true.apply(lambda x: x if x > 0 else 0)
d.clv = d.clv.apply(lambda x: x if x > 0 else 0)
ytrue = d.clv_true
ypred = d.clv
np.sqrt(mean_squared_log_error(ytrue, ypred))



3.7363837999227063

In [13]:
d.sum()

clv             0.00
clv_true    15930.31
dtype: float64

In [14]:
d

Unnamed: 0_level_0,clv,clv_true
cust,Unnamed: 1_level_1,Unnamed: 2_level_1
4,0,14.96
114,0,22.97
228,0,60.71
441,0,61.29
564,0,117.94
...,...,...
23205,0,13.77
23385,0,68.63
23500,0,13.99
23507,0,37.11
