# Waterfall

<p> This notebook explores how customers buy seasons by looking at the purchase funnel: how many of the customers who bought season 1 also bought all subsequent seasons. </p>
<p> It also looks into the median time between purchases of consecutive seasons. </p>


In [1]:
import pandas as pd
import numpy as np

In [2]:
# Tab-separated transaction export; column names are supplied explicitly via `names`.
df = pd.read_csv(
    "../GOT/data/waterfall_4.txt",
    sep="\t",
    names=[
        'encrypted_customer_id',
        'transaction_date',
        'asin',
        'content_type',
        'title',
    ],
)

In [3]:
# (rows, columns) of the loaded frame.
print(df.shape)

(1229481, 5)


In [4]:
# Season number = first run of digits in the title (e.g. "... Staffel 7 ..." -> 7).
# Raw string avoids the invalid "\d" escape; expand=False makes extract return a
# Series, so no warning is raised and the assignment is unambiguous.
df['season_number'] = pd.to_numeric(
    df['title'].str.extract(r'(\d+)', expand=False),
    errors='coerce',  # titles without digits become NaN instead of raising
)
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
# Whole days since 1970-01-01. pd.Timestamp replaces pd.datetime, which was
# deprecated in pandas 1.0 and removed in 2.0.
df['days_since_epoch'] = (df['transaction_date'] - pd.Timestamp("1970-01-01")).dt.days
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,encrypted_customer_id,transaction_date,asin,content_type,title,season_number,days_since_epoch
0,A3PCW9WC0GTF2Q,2017-08-15,B073VVVKNB,TV Season,Game of Thrones - Staffel 7 [dt./OV],7,17393
1,A1A77UBUPOPZI1,2017-08-21,B00M3LG74C,TV Season,Game of Thrones - Staffel 4 [dt./OV],4,17399
2,A3GKVH6E99LI3V,2017-07-30,B00I8ZYTRO,TV Season,Game of Thrones - Staffel 1 [dt./OV],1,17377
3,A2R45O6WIV592C,2014-08-05,B00M3LG74C,TV Season,Game of Thrones - Staffel 4 [dt./OV],4,16287
4,A9E4FJS969OCG,2016-09-18,B00I8ZVHDI,TV Season,Game of Thrones - Staffel 3 [dt./OV],3,17062


In [5]:
# Show the dtype of every column in df.
df.dtypes

encrypted_customer_id            object
transaction_date         datetime64[ns]
asin                             object
content_type                     object
title                            object
season_number                     int64
days_since_epoch                  int64
dtype: object

# Make proper waterfall

In [6]:
# Customer ids of every season-1 purchase row (the base cohort of the waterfall).
base_customers = df.loc[df['season_number'].eq(1), 'encrypted_customer_id']
base_customers.nunique()

147453

In [7]:
# For each season after the first: distinct buyers that are also in the season-1 cohort.
# NOTE: the name `list` shadows the builtin; it is kept because the next cell displays it.
from_base_cohort = df['encrypted_customer_id'].isin(base_customers)
list = [
    df.loc[(df['season_number'] == season) & from_base_cohort, 'encrypted_customer_id'].nunique()
    for season in range(df['season_number'].min() + 1, df['season_number'].max() + 1)
]

In [8]:
# Display the per-season counts built in the previous cell (`list` here is that variable, not the builtin).
list

[100049, 85744, 80044, 74546, 75911, 78532]

# Waterfall including all previous cohorts
## Brute force

In [9]:
# Season-2 purchase rows whose customer is in the season-1 cohort.
s2 = df.loc[
    df['season_number'].eq(2) & df['encrypted_customer_id'].isin(base_customers),
    'encrypted_customer_id',
]
s2.nunique()

100049

In [10]:
# Season-3 purchase rows whose customer is in s2.
s3 = df.loc[
    df['season_number'].eq(3) & df['encrypted_customer_id'].isin(s2),
    'encrypted_customer_id',
]
s3.nunique()

83421

In [11]:
# Season-4 purchase rows whose customer is in s3.
s4 = df.loc[
    df['season_number'].eq(4) & df['encrypted_customer_id'].isin(s3),
    'encrypted_customer_id',
]
s4.nunique()

74739

In [12]:
# Season-5 purchase rows whose customer is in s4.
s5 = df.loc[
    df['season_number'].eq(5) & df['encrypted_customer_id'].isin(s4),
    'encrypted_customer_id',
]
s5.nunique()

66353

In [13]:
# Season-6 purchase rows whose customer is in s5.
s6 = df.loc[
    df['season_number'].eq(6) & df['encrypted_customer_id'].isin(s5),
    'encrypted_customer_id',
]
s6.nunique()

60163

In [14]:
# Season-7 purchase rows whose customer is in s6.
s7 = df.loc[
    df['season_number'].eq(7) & df['encrypted_customer_id'].isin(s6),
    'encrypted_customer_id',
]
s7.nunique()

53340

## Recursive implementation

In [15]:
def recursive_call_funnel(df, staffel, start=1):
    """Customer ids of season ``staffel`` buyers who also bought every season since ``start``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``season_number`` and ``encrypted_customer_id``.
    staffel : int
        Last season of the funnel (inclusive).
    start : int, default 1
        First season of the funnel, i.e. the cohort the funnel starts from.
        The default reproduces the original season-1 funnel.

    Returns
    -------
    pandas.Series
        ``encrypted_customer_id`` values of the rows of ``df`` for season
        ``staffel`` whose customer appears in every season from ``start`` to
        ``staffel``. There is one entry per matching row, so use ``.nunique()``
        to count customers.

    Raises
    ------
    ValueError
        If ``staffel`` is smaller than ``start`` (the former recursive version
        never terminated for ``staffel < 1``).
    """
    if staffel < start:
        raise ValueError(
            "staffel (%r) must not be smaller than start (%r)" % (staffel, start)
        )

    customer_ids = df['encrypted_customer_id']
    seasons = df['season_number']

    # Start with the cohort of the first season, then narrow it season by season.
    survivors = customer_ids[seasons == start]
    for season in range(start + 1, staffel + 1):
        survivors = customer_ids[(seasons == season) & customer_ids.isin(survivors)]
    return survivors

In [16]:
# Funnel sizes for seasons 1-7, starting from the season-1 cohort.
# NOTE: the name `list` shadows the builtin; it is kept because the next cell displays it.
list = [recursive_call_funnel(df, season).nunique() for season in range(1, 8)]

In [17]:
# Display the funnel sizes built in the previous cell (`list` here is that variable, not the builtin).
list

[147453, 100049, 83421, 74739, 66353, 60163, 53340]

In [18]:
def recursive_call_funnel2(df, staffel):
    """Funnel that starts from the season-2 cohort.

    Returns the ``encrypted_customer_id`` values of the season ``staffel`` rows
    whose customer also has a row in every season from 2 up to ``staffel``.
    For ``staffel <= 2`` the ids of all season-2 rows are returned.
    """
    customer_ids = df['encrypted_customer_id']
    seasons = df['season_number']

    # Collect the seasons above 2, walking down from `staffel`.
    pending = []
    level = staffel
    while level > 2:
        pending.append(level)
        level = level - 1

    # Apply them from the lowest season upwards, narrowing the cohort each time.
    survivors = customer_ids[seasons == 2]
    for level in reversed(pending):
        survivors = customer_ids[(seasons == level) & customer_ids.isin(survivors)]
    return survivors

In [19]:
# Funnel sizes for seasons 2-7, starting from the season-2 cohort.
list2 = [recursive_call_funnel2(df, season).nunique() for season in range(2, 8)]
list2

[117905, 94358, 83429, 73141, 65761, 57774]

In [20]:
def recursive_call_funnel3(df, staffel):
    """Funnel that starts from the season-3 cohort.

    Returns the ``encrypted_customer_id`` values of the season ``staffel`` rows
    whose customer also has a row in every season from 3 up to ``staffel``.
    For ``staffel <= 3`` the ids of all season-3 rows are returned.
    """
    customer_ids = df['encrypted_customer_id']
    seasons = df['season_number']

    # Collect the seasons above 3, walking down from `staffel`.
    pending = []
    level = staffel
    while level > 3:
        pending.append(level)
        level = level - 1

    # Apply them from the lowest season upwards, narrowing the cohort each time.
    survivors = customer_ids[seasons == 3]
    for level in reversed(pending):
        survivors = customer_ids[(seasons == level) & customer_ids.isin(survivors)]
    return survivors

In [21]:
# Funnel sizes for seasons 3-7, starting from the season-3 cohort.
list3 = [recursive_call_funnel3(df, season).nunique() for season in range(3, 8)]
list3

[109499, 93199, 79971, 71174, 62044]

In [22]:
def recursive_call_funnel4(df, staffel):
    """Funnel that starts from the season-4 cohort.

    Returns the ``encrypted_customer_id`` values of the season ``staffel`` rows
    whose customer also has a row in every season from 4 up to ``staffel``.
    For ``staffel <= 4`` the ids of all season-4 rows are returned.
    """
    customer_ids = df['encrypted_customer_id']
    seasons = df['season_number']

    # Collect the seasons above 4, walking down from `staffel`.
    pending = []
    level = staffel
    while level > 4:
        pending.append(level)
        level = level - 1

    # Apply them from the lowest season upwards, narrowing the cohort each time.
    survivors = customer_ids[seasons == 4]
    for level in reversed(pending):
        survivors = customer_ids[(seasons == level) & customer_ids.isin(survivors)]
    return survivors

In [23]:
# Funnel sizes for seasons 4-7, starting from the season-4 cohort.
list4 = [recursive_call_funnel4(df, season).nunique() for season in range(4, 8)]
list4

[127494, 96520, 83780, 71852]

In [24]:
def recursive_call_funnel5(df, staffel):
    """Funnel that starts from the season-5 cohort.

    Returns the ``encrypted_customer_id`` values of the season ``staffel`` rows
    whose customer also has a row in every season from 5 up to ``staffel``.
    For ``staffel <= 5`` the ids of all season-5 rows are returned.
    """
    customer_ids = df['encrypted_customer_id']
    seasons = df['season_number']

    # Collect the seasons above 5, walking down from `staffel`.
    pending = []
    level = staffel
    while level > 5:
        pending.append(level)
        level = level - 1

    # Apply them from the lowest season upwards, narrowing the cohort each time.
    survivors = customer_ids[seasons == 5]
    for level in reversed(pending):
        survivors = customer_ids[(seasons == level) & customer_ids.isin(survivors)]
    return survivors

In [25]:
# Funnel sizes for seasons 5-7, starting from the season-5 cohort.
list5 = [recursive_call_funnel5(df, season).nunique() for season in range(5, 8)]
list5

[142427, 113024, 93841]

In [26]:
def recursive_call_funnel6(df, staffel):
    """Funnel that starts from the season-6 cohort.

    Returns the ``encrypted_customer_id`` values of the season ``staffel`` rows
    whose customer also has a row in every season from 6 up to ``staffel``.
    For ``staffel <= 6`` the ids of all season-6 rows are returned.
    """
    customer_ids = df['encrypted_customer_id']
    seasons = df['season_number']

    # Collect the seasons above 6, walking down from `staffel`.
    pending = []
    level = staffel
    while level > 6:
        pending.append(level)
        level = level - 1

    # Apply them from the lowest season upwards, narrowing the cohort each time.
    survivors = customer_ids[seasons == 6]
    for level in reversed(pending):
        survivors = customer_ids[(seasons == level) & customer_ids.isin(survivors)]
    return survivors

In [27]:
# Funnel sizes for seasons 6-7, starting from the season-6 cohort.
list6 = [recursive_call_funnel6(df, season).nunique() for season in range(6, 8)]
list6

[214738, 161195]

In [28]:
# Check the number of unique customers who bought each season (1-7),
# independent of any other season they bought.
for season in range(1, 8):
    season_rows = df['season_number'] == season
    print(df.loc[season_rows, 'encrypted_customer_id'].nunique())

147453
117905
109499
127494
142427
214738
367947


# Median time between purchases


In [29]:
# One row per customer, one column per title, holding the purchase day
# (days since epoch). 'mean' is pivot_table's default aggregation and is
# spelled out here: several purchases of the same title by one customer
# are averaged.
df_wide = (
    df.pivot_table(
        index='encrypted_customer_id',
        columns='title',
        values='days_since_epoch',
        aggfunc='mean',
    )
    .reset_index()
)

In [38]:
#df_wide.head()

In [31]:
# Days between a customer's purchase of each season and of the previous one,
# stored in integer-named columns (2 = second title column minus first, ...).
# Only the original string-named title columns are used as inputs, so re-running
# this cell recomputes the same columns instead of treating the derived integer
# columns as further seasons and appending bogus ones.
# NOTE(review): assumes the title columns are already in season order (true
# while the titles sort alphabetically by season, i.e. seasons 1-9) - confirm
# before adding a season 10+.
title_cols = [
    col for col in df_wide.columns
    if isinstance(col, str) and col != 'encrypted_customer_id'
]
for i in range(1, len(title_cols)):
    df_wide[i + 1] = df_wide[title_cols[i]] - df_wide[title_cols[i - 1]]

In [39]:
# print(df_wide.shape)
# df_wide[df_wide['encrypted_customer_id'].isin(['A002922237RQVABI2X7ZZ', 'A0061406BM43X1RS4ADM'])]

In [37]:
#Check what customers actually bought
# df[df['encrypted_customer_id'].isin(['A002922237RQVABI2X7ZZ',
#                                      'A0061406BM43X1RS4ADM'])].sort_values(by=['encrypted_customer_id','transaction_date'])

## Days between purchases of seasons, in percentiles

In [34]:
# Deciles 0.1 ... 0.9 (endpoint excluded) of the gap in days, per season,
# over customers that have a value for that gap.
deciles = np.linspace(.1, 1, num=9, endpoint=False)
for season in range(2, 7 + 1):
    print("Percentiles from season %d to previous one" % season)
    gaps = df_wide[season].dropna()
    print(gaps.quantile(deciles, interpolation='lower'))

Percentiles from season 2 to previous one
0.1     0.0
0.2     1.0
0.3     2.0
0.4     3.0
0.5     4.0
0.6     6.0
0.7     9.0
0.8    15.0
0.9    40.0
Name: 2, dtype: float64
Percentiles from season 3 to previous one
0.1     0.0
0.2     1.0
0.3     2.0
0.4     3.0
0.5     5.0
0.6     6.0
0.7     9.0
0.8    14.0
0.9    29.0
Name: 3, dtype: float64
Percentiles from season 4 to previous one
0.1     0.0
0.2     1.0
0.3     2.0
0.4     3.0
0.5     4.0
0.6     6.0
0.7     9.0
0.8    14.0
0.9    30.0
Name: 4, dtype: float64
Percentiles from season 5 to previous one
0.1      0.0
0.2      1.0
0.3      2.0
0.4      3.0
0.5      4.0
0.6      6.0
0.7      9.0
0.8     17.0
0.9    101.0
Name: 5, dtype: float64
Percentiles from season 6 to previous one
0.1      0.0
0.2      1.0
0.3      3.0
0.4      5.0
0.5      7.0
0.6     13.0
0.7     36.0
0.8    147.0
0.9    285.0
Name: 6, dtype: float64
Percentiles from season 7 to previous one
0.1      1.0
0.2      4.0
0.3      9.0
0.4     97.0
0.5    317.0
0.6  

In [35]:
# Mean gap in days between each season and the previous one,
# over customers that have a value for that gap.
for season in range(2, 7 + 1):
    print("Average time of purchase between season %d and previous one" % season)
    gaps = df_wide[season].dropna()
    print(gaps.mean())

Average time of purchase between season 2 and previous one
20.6877413400767
Average time of purchase between season 3 and previous one
13.288081099750487
Average time of purchase between season 4 and previous one
2.2985018822420504
Average time of purchase between season 5 and previous one
28.17579776212184
Average time of purchase between season 6 and previous one
59.33450122687807
Average time of purchase between season 7 and previous one
237.20601859032024


In [36]:
# Earliest transaction date recorded for each title.
df.groupby("title")['transaction_date'].agg('min')

title
Game of Thrones - Staffel 1 [dt./OV]   2014-02-26
Game of Thrones - Staffel 2 [dt./OV]   2014-02-26
Game of Thrones - Staffel 3 [dt./OV]   2014-02-17
Game of Thrones - Staffel 4 [dt./OV]   2014-08-05
Game of Thrones - Staffel 5 [dt./OV]   2015-06-30
Game of Thrones - Staffel 6 [dt./OV]   2016-04-26
Game of Thrones - Staffel 7 [dt./OV]   2017-07-18
Name: transaction_date, dtype: datetime64[ns]