In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lifetimes

%matplotlib inline

Read data file

In [2]:
transactions_df = pd.read_csv('data.csv',encoding='ISO-8859-1')

In [3]:
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


Drop Transactions with missing customer IDs

In [4]:
transactions_df = transactions_df[transactions_df['CustomerID'].notnull()].copy()

Convert Invoice Date column from a string column to a datetime column

In [5]:
transactions_df['InvoiceDate'] = pd.to_datetime(transactions_df['InvoiceDate'])

In [6]:
transactions_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      406829 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 27.9+ MB


In [7]:
transactions_df[['Quantity','UnitPrice']].describe()

Unnamed: 0,Quantity,UnitPrice
count,406829.0,406829.0
mean,12.061303,3.460471
std,248.69337,69.315162
min,-80995.0,0.0
25%,2.0,1.25
50%,5.0,1.95
75%,12.0,3.75
max,80995.0,38970.0


Drop transactions with negative quantities

In [8]:
transactions_df = transactions_df[transactions_df['Quantity'] > 0]

In [9]:
transactions_df[['Quantity','UnitPrice']].describe()

Unnamed: 0,Quantity,UnitPrice
count,397924.0,397924.0
mean,13.021823,3.116174
std,180.42021,22.096788
min,1.0,0.0
25%,2.0,1.25
50%,6.0,1.95
75%,12.0,3.75
max,80995.0,8142.75


Create a dataframe containing the number of transactions made for each user

In [10]:
# Calculate the number of invididual invoices generated for each customer
num_transactions_per_user = transactions_df.groupby('CustomerID')['InvoiceNo'].unique().apply(lambda lst: len(lst))

frequency_df = num_transactions_per_user.reset_index().rename(columns={'InvoiceNo':'frequency'})

In [11]:
frequency_df

Unnamed: 0,CustomerID,frequency
0,12346.0,1
1,12347.0,7
2,12348.0,4
3,12349.0,1
4,12350.0,1
...,...,...
4334,18280.0,1
4335,18281.0,1
4336,18282.0,2
4337,18283.0,16


Create a dataframe containing the total amount spent for each customer

In [12]:
# Create a new column containing the total amount spent for each item
transactions_df['Total'] = transactions_df['Quantity'] * transactions_df['UnitPrice']

# Now, aggregate total spend by customer
monetary_df = transactions_df.groupby('CustomerID')['Total'].sum().reset_index().rename(columns={'Total':'monetary'})

In [13]:
monetary_df

Unnamed: 0,CustomerID,monetary
0,12346.0,77183.60
1,12347.0,4310.00
2,12348.0,1797.24
3,12349.0,1757.55
4,12350.0,334.40
...,...,...
4334,18280.0,180.60
4335,18281.0,80.82
4336,18282.0,178.05
4337,18283.0,2094.88


Merge the two dataframes into one dataframe containing each figure

In [14]:
combined_df = pd.merge(frequency_df, monetary_df, on='CustomerID')
combined_df = combined_df[combined_df['monetary'] > 0]

In [15]:
combined_df

Unnamed: 0,CustomerID,frequency,monetary
0,12346.0,1,77183.60
1,12347.0,7,4310.00
2,12348.0,4,1797.24
3,12349.0,1,1757.55
4,12350.0,1,334.40
...,...,...,...
4334,18280.0,1,180.60
4335,18281.0,1,80.82
4336,18282.0,2,178.05
4337,18283.0,16,2094.88


In [16]:
combined_df[['frequency','monetary']].corr()

Unnamed: 0,frequency,monetary
frequency,1.0,0.554086
monetary,0.554086,1.0


In [17]:
gg_mdl = lifetimes.GammaGammaFitter()
gg_mdl.fit(combined_df['frequency'], combined_df['monetary'])

<lifetimes.GammaGammaFitter: fitted with 4338 subjects, p: 1.25, q: 1.50, v: 907.55>

In [18]:
gg_mdl.summary

Unnamed: 0,coef,se(coef),lower 95% bound,upper 95% bound
p,1.254894,0.060261,1.136782,1.373006
q,1.501907,0.045192,1.41333,1.590483
v,907.549026,83.608506,743.676355,1071.421698


In [19]:
combined_df['average_profit'] = gg_mdl.conditional_expected_average_profit(combined_df['frequency'],combined_df['monetary'])

In [20]:
combined_df

Unnamed: 0,CustomerID,frequency,monetary,average_profit
0,12346.0,1,77183.60,55781.002334
1,12347.0,7,4310.00,4199.691722
2,12348.0,4,1797.24,1840.132589
3,12349.0,1,1757.55,1903.697114
4,12350.0,1,334.40,887.132055
...,...,...,...,...
4334,18280.0,1,180.60,777.271741
4335,18281.0,1,80.82,705.998255
4336,18282.0,2,178.05,526.529294
4337,18283.0,16,2094.88,2099.128894


## Calculating Customer Lifetime Value

Gamma Gamma can be combined with either Pareto/NBD or BG/NBD to calculate customer lifetime value. This notebook will use the Pareto/NBD for the full customer lifetime value calculation 

Calculate the age and recency values for the Pareto/NDB model. First start by getting the most recent transaction made by a customer in the dataset

In [21]:
most_recent_transaction = transactions_df['InvoiceDate'].max()

In [22]:
most_recent_transaction

Timestamp('2011-12-09 12:50:00')

Create a dataframe containing the number of days elasped since the most recent transaction for each user

In [23]:
# Compute the lastest transaction for each user
latest_transactions_per_user = transactions_df.groupby('CustomerID')['InvoiceDate'].max()

recency_df = latest_transactions_per_user.reset_index()
recency_df['recency'] = recency_df['InvoiceDate'].apply(lambda date: (most_recent_transaction - date).days)

In [24]:
recency_df.head()

Unnamed: 0,CustomerID,InvoiceDate,recency
0,12346.0,2011-01-18 10:01:00,325
1,12347.0,2011-12-07 15:52:00,1
2,12348.0,2011-09-25 13:13:00,74
3,12349.0,2011-11-21 09:51:00,18
4,12350.0,2011-02-02 16:01:00,309


Calculate the age of the customer

In [25]:
first_transactions = transactions_df.groupby('CustomerID')['InvoiceDate'].min().reset_index()
first_transactions['age'] = first_transactions['InvoiceDate'].apply(lambda date: (most_recent_transaction - date).days)

In [26]:
recency_frequency_df = pd.merge(pd.merge(recency_df, frequency_df, on='CustomerID').drop('InvoiceDate',axis=1), 
                                first_transactions, on='CustomerID').drop('InvoiceDate', axis=1)

In [27]:
recency_frequency_df

Unnamed: 0,CustomerID,recency,frequency,age
0,12346.0,325,1,325
1,12347.0,1,7,366
2,12348.0,74,4,357
3,12349.0,18,1,18
4,12350.0,309,1,309
...,...,...,...,...
4334,18280.0,277,1,277
4335,18281.0,180,1,180
4336,18282.0,7,2,125
4337,18283.0,3,16,336


Fit Pareto/NDB Model

In [28]:
pareto_mbd_mdl = lifetimes.ParetoNBDFitter()

In [29]:
pareto_mbd_mdl.fit(recency_frequency_df['frequency'], recency_frequency_df['recency'], recency_frequency_df['age'])

  np.max(np.abs(fsim[0] - fsim[1:])) <= fatol):


<lifetimes.ParetoNBDFitter: fitted with 4339 subjects, alpha: 11.72, beta: 396.02, r: 1.49, s: 1.22>

Recombine all the calculated data into one dataframe

In [30]:
combined_df = pd.merge(recency_frequency_df, monetary_df, on='CustomerID')
combined_df = combined_df[combined_df['monetary'] > 0]
combined_df['average_profit'] = gg_mdl.conditional_expected_average_profit(combined_df['frequency'],combined_df['monetary'])

In [31]:
combined_df

Unnamed: 0,CustomerID,recency,frequency,age,monetary,average_profit
0,12346.0,325,1,325,77183.60,55781.002334
1,12347.0,1,7,366,4310.00,4199.691722
2,12348.0,74,4,357,1797.24,1840.132589
3,12349.0,18,1,18,1757.55,1903.697114
4,12350.0,309,1,309,334.40,887.132055
...,...,...,...,...,...,...
4334,18280.0,277,1,277,180.60,777.271741
4335,18281.0,180,1,180,80.82,705.998255
4336,18282.0,7,2,125,178.05,526.529294
4337,18283.0,3,16,336,2094.88,2099.128894


In [32]:
combined_df['cltv'] = gg_mdl.customer_lifetime_value(pareto_mbd_mdl,
                                                    combined_df['frequency'],
                                                    combined_df['recency'],
                                                    combined_df['age'],
                                                    combined_df['monetary'],
                                                    time=3,
                                                    freq='D')

  tmp = b * np.exp(a - a_max)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


In [33]:
combined_df

Unnamed: 0,CustomerID,recency,frequency,age,monetary,average_profit,cltv
0,12346.0,325,1,325,77183.60,55781.002334,3.382355e+04
1,12347.0,1,7,366,4310.00,4199.691722,2.135967e-07
2,12348.0,74,4,357,1797.24,1840.132589,9.351334e+00
3,12349.0,18,1,18,1757.55,1903.697114,1.245091e+04
4,12350.0,309,1,309,334.40,887.132055,5.638816e+02
...,...,...,...,...,...,...,...
4334,18280.0,277,1,277,180.60,777.271741,5.469882e+02
4335,18281.0,180,1,180,80.82,705.998255,7.391288e+02
4336,18282.0,7,2,125,178.05,526.529294,3.445497e+01
4337,18283.0,3,16,336,2094.88,2099.128894,1.457321e-18
