# Waterfall

<p> This notebook explores how customers buy seasons by looking at the purchase funnel: how many of the customers who bought season 1 also went on to buy all subsequent seasons. </p>
<p> It also looks into the median time between purchases of consecutive seasons. </p>


In [1]:
import pandas as pd
import numpy as np

import math
from collections import Counter

In [2]:
# The file is read as tab-separated and headerless: the five column names
# are supplied explicitly, in file order.
df = pd.read_csv(
    "../GOT/data/waterfall_4.txt",
    sep="\t",
    names=[
        'encrypted_customer_id',
        'transaction_date',
        'asin',
        'content_type',
        'title',
    ],
)

In [3]:
# Sanity check: (rows, columns) of the freshly loaded frame.
print(df.shape)

(1229481, 5)


In [4]:
# Season number = first run of digits found in the title
# (e.g. "... Staffel 7 ..." -> 7). The pattern is a raw string so that `\d`
# is not parsed as an (invalid) string escape, and expand=False makes
# str.extract return a Series regardless of the pandas version's default.
df['season_number'] = pd.to_numeric(
    df['title'].str.extract(r'(\d+)', expand=False),
    errors='coerce',  # titles without any digits become NaN
)
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
# Whole days elapsed since 1970-01-01. pd.Timestamp replaces the pd.datetime
# alias, which newer pandas releases no longer provide.
df['days_since_epoch'] = (df['transaction_date'] - pd.Timestamp(1970, 1, 1)).dt.days
df.head()

  """Entry point for launching an IPython kernel.


Unnamed: 0,encrypted_customer_id,transaction_date,asin,content_type,title,season_number,days_since_epoch
0,A3PCW9WC0GTF2Q,2017-08-15,B073VVVKNB,TV Season,Game of Thrones - Staffel 7 [dt./OV],7,17393
1,A1A77UBUPOPZI1,2017-08-21,B00M3LG74C,TV Season,Game of Thrones - Staffel 4 [dt./OV],4,17399
2,A3GKVH6E99LI3V,2017-07-30,B00I8ZYTRO,TV Season,Game of Thrones - Staffel 1 [dt./OV],1,17377
3,A2R45O6WIV592C,2014-08-05,B00M3LG74C,TV Season,Game of Thrones - Staffel 4 [dt./OV],4,16287
4,A9E4FJS969OCG,2016-09-18,B00I8ZVHDI,TV Season,Game of Thrones - Staffel 3 [dt./OV],3,17062


# Overall stats

In [5]:
# Distinct customer ids across every row of the data set, regardless of season.
print("Overall number of unique customers who bought: " , df['encrypted_customer_id'].nunique())

Overall number of unique customers who bought:  515182


In [6]:
# Unique buyers per season, for seasons 1 through 7 (one count per line).
for season in range(1, 8):
    bought_this_season = df['season_number'] == season
    print(df.loc[bought_this_season, 'encrypted_customer_id'].nunique())

147453
117905
109499
127494
142427
214738
367947


# Waterfall including all previous cohorts
## Brute force

In [7]:
def _buyers(season, among):
    """Customer ids of the rows for `season` whose customer also appears in `among`."""
    keep = (df['season_number'] == season) & df['encrypted_customer_id'].isin(among)
    return df.loc[keep, 'encrypted_customer_id']


# Top of the funnel: everyone with a season-1 purchase.
base_customers = df.loc[df['season_number'] == 1, 'encrypted_customer_id']
print(base_customers.nunique())

# Every further step keeps only customers who were still present in the
# previous step, so the printed counts can only shrink.
s2 = _buyers(2, base_customers)
print(s2.nunique())

s3 = _buyers(3, s2)
print(s3.nunique())

s4 = _buyers(4, s3)
print(s4.nunique())

s5 = _buyers(5, s4)
print(s5.nunique())

s6 = _buyers(6, s5)
print(s6.nunique())

s7 = _buyers(7, s6)
print(s7.nunique())

147453
100049
83421
74739
66353
60163
53340


In [8]:
# For every season after the lowest one, count the customers in
# `base_customers` who also bought that season (whether or not they bought
# the seasons in between).
# Series.min()/max() are vectorised and skip NaN, unlike the Python builtins;
# int() guarantees range() receives integers even if the column is float.
first_season = int(df['season_number'].min())
last_season = int(df['season_number'].max())
# Loop-invariant: membership in the base cohort is computed once.
in_base = df['encrypted_customer_id'].isin(base_customers)
some_list = [
    df.loc[in_base & (df['season_number'] == season), 'encrypted_customer_id'].nunique()
    for season in range(first_season + 1, last_season + 1)
]

In [9]:
# One count per season after the first: base-cohort customers who bought it.
some_list

[100049, 85744, 80044, 74546, 75911, 78532]

# Make proper waterfall

## Recursive implementation

In [10]:
def recursive_call_funnel(df, staffel):
    """Return the customer ids that survive the purchase funnel up to a season.

    The funnel starts with every row for season 1; each later season keeps
    only the rows for that season whose customer was still in the funnel at
    the previous season.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'season_number' and 'encrypted_customer_id'.
    staffel : int
        Season ("Staffel") up to which the funnel is evaluated. Values of 1
        or below return the season-1 rows.

    Returns
    -------
    pandas.Series
        'encrypted_customer_id' values of the matching rows (one entry per
        row, so callers use .nunique() to count customers).
    """
    # `<=` (as in the sibling funnels) rather than `==`: with `== 1`, a call
    # with staffel < 1 never reaches the base case and recurses until Python
    # raises RecursionError.
    if staffel <= 1:
        return df.loc[df['season_number'] == 1, 'encrypted_customer_id']

    previous_step = recursive_call_funnel(df, staffel - 1)
    keep = (df['season_number'] == staffel) & df['encrypted_customer_id'].isin(previous_step)
    return df.loc[keep, 'encrypted_customer_id']

In [11]:
# Funnel sizes for seasons 1..7, starting from all season-1 buyers.
list1 = [recursive_call_funnel(df, season).nunique() for season in range(1, 8)]

In [12]:
# Unique customers remaining at each funnel step (season 1 first).
list1

[147453, 100049, 83421, 74739, 66353, 60163, 53340]

In [13]:
def recursive_call_funnel2(df, staffel):
    """Purchase funnel that starts at season 2.

    For `staffel` of 2 or less, returns the 'encrypted_customer_id' values of
    all season-2 rows. For a later season, returns the ids of that season's
    rows whose customer is still in the funnel for the preceding season.
    """
    customer_ids = df['encrypted_customer_id']
    seasons = df['season_number']

    # Base case: the cohort is everyone with a season-2 row.
    if staffel <= 2:
        return customer_ids[seasons == 2]

    survivors = recursive_call_funnel2(df, staffel - 1)
    still_buying = (seasons == staffel) & customer_ids.isin(survivors)
    return df.loc[still_buying, 'encrypted_customer_id']

In [14]:
# Funnel sizes for seasons 2..7, starting from all season-2 buyers.
list2 = [recursive_call_funnel2(df, season).nunique() for season in range(2, 8)]
list2

[117905, 94358, 83429, 73141, 65761, 57774]

In [15]:
def recursive_call_funnel3(df, staffel):
    """Purchase funnel that starts at season 3.

    For `staffel` of 3 or less, returns the 'encrypted_customer_id' values of
    all season-3 rows. For a later season, returns the ids of that season's
    rows whose customer is still in the funnel for the preceding season.
    """
    if staffel <= 3:
        # Base case: the cohort is everyone with a season-3 row.
        is_season_three = df['season_number'] == 3
        return df.loc[is_season_three, 'encrypted_customer_id']

    earlier_cohort = recursive_call_funnel3(df, staffel - 1)
    is_this_season = df['season_number'] == staffel
    was_in_cohort = df['encrypted_customer_id'].isin(earlier_cohort)
    return df.loc[is_this_season & was_in_cohort, 'encrypted_customer_id']

In [16]:
# Funnel sizes for seasons 3..7, starting from all season-3 buyers.
list3 = [recursive_call_funnel3(df, season).nunique() for season in range(3, 8)]
list3

[109499, 93199, 79971, 71174, 62044]

In [17]:
def recursive_call_funnel4(df, staffel):
    """Purchase funnel that starts at season 4.

    For `staffel` of 4 or less, returns the 'encrypted_customer_id' values of
    all season-4 rows. For a later season, returns the ids of that season's
    rows whose customer is still in the funnel for the preceding season.
    """
    at_base = staffel <= 4
    season_mask = df['season_number'] == (4 if at_base else staffel)
    if at_base:
        return df.loc[season_mask, 'encrypted_customer_id']

    # Narrow this season's rows to customers carried over from the step before.
    carried_over = recursive_call_funnel4(df, staffel - 1)
    row_filter = season_mask & df['encrypted_customer_id'].isin(carried_over)
    return df.loc[row_filter, 'encrypted_customer_id']

In [18]:
# Funnel sizes for seasons 4..7, starting from all season-4 buyers.
list4 = [recursive_call_funnel4(df, season).nunique() for season in range(4, 8)]
list4

[127494, 96520, 83780, 71852]

In [19]:
def recursive_call_funnel5(df, staffel):
    """Purchase funnel that starts at season 5.

    For `staffel` of 5 or less, returns the 'encrypted_customer_id' values of
    all season-5 rows. For a later season, returns the ids of that season's
    rows whose customer is still in the funnel for the preceding season.
    """
    ids = df['encrypted_customer_id']

    if not staffel > 5:
        # Base case: the cohort is everyone with a season-5 row.
        return ids[df['season_number'] == 5]

    prior = recursive_call_funnel5(df, staffel - 1)
    selected = (df['season_number'] == staffel) & ids.isin(prior)
    return df.loc[selected, 'encrypted_customer_id']

In [20]:
# Funnel sizes for seasons 5..7, starting from all season-5 buyers.
list5 = [recursive_call_funnel5(df, season).nunique() for season in range(5, 8)]
list5

[142427, 113024, 93841]

In [21]:
def recursive_call_funnel6(df, staffel):
    """Purchase funnel that starts at season 6.

    For `staffel` of 6 or less, returns the 'encrypted_customer_id' values of
    all season-6 rows. For a later season, returns the ids of that season's
    rows whose customer is still in the funnel for the preceding season.
    """
    if staffel <= 6:
        # Base case: the cohort is everyone with a season-6 row.
        return df.loc[df['season_number'] == 6, 'encrypted_customer_id']

    previous_cohort = recursive_call_funnel6(df, staffel - 1)
    in_previous_cohort = df['encrypted_customer_id'].isin(previous_cohort)
    return df.loc[(df['season_number'] == staffel) & in_previous_cohort,
                  'encrypted_customer_id']

In [22]:
# Funnel sizes for seasons 6..7, starting from all season-6 buyers.
list6 = [recursive_call_funnel6(df, season).nunique() for season in range(6, 8)]
list6

[214738, 161195]

# Median time between purchases


In [23]:
def from_long_to_wide(df, dimension1, dimension2):
    """Pivot a long table into one row per customer (long -> wide).

    Rows are keyed by 'encrypted_customer_id'; every distinct value of the
    `dimension1` column becomes its own column holding the corresponding
    `dimension2` values. pivot_table's default aggregation (the mean) applies
    when a customer has several rows for the same `dimension1` value.
    The customer id is returned as an ordinary column, not as the index.
    """
    wide = pd.pivot_table(
        df,
        values=dimension2,
        index='encrypted_customer_id',
        columns=dimension1,
    )
    return wide.reset_index()


def add_intervals(df):
    """Pivot purchases to one row per customer and append day gaps.

    The frame is pivoted with one 'days_since_epoch' column per title. For
    each pivoted column position ``i`` (from 2 up to the last pivoted
    column) a new column labelled with the integer ``i`` is appended,
    holding column ``i`` minus column ``i - 1``.
    NOTE(review): this presumably yields "days between season i and season
    i-1" because the title columns come out in season order - confirm.
    """
    wide = from_long_to_wide(df, "title", "days_since_epoch")

    # Fix the column count before appending, so only the pivoted columns
    # (never the freshly added gap columns) take part in the subtraction.
    pivoted_width = wide.shape[1]
    for position in range(2, pivoted_width):
        later = wide.iloc[:, position]
        earlier = wide.iloc[:, position - 1]
        wide[position] = later - earlier
    return wide

In [24]:
# One row per customer: per-title purchase day plus the appended gap columns.
df_wide = add_intervals(df)

In [25]:
# (rows, columns) of the wide frame.
print(df_wide.shape)

(515182, 14)


In [26]:
def get_combinations(df):
    """Count how many customers bought each distinct set of seasons.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'encrypted_customer_id' and 'season_number'.

    Returns
    -------
    df_output : pandas.DataFrame
        Columns 'combination' (tuple of season numbers in ascending order)
        and 'count' (number of customers with exactly that set of seasons).
    t : pandas.DataFrame
        One row per customer: 'encrypted_customer_id' and 'combination'
        (the set of season numbers that customer bought).
    """
    # No sort by transaction date is needed: the purchases are collapsed into
    # a set, which discards their order anyway.
    seasons_per_customer = (
        df.groupby("encrypted_customer_id")['season_number']
          .agg(lambda x: x.tolist())
    )
    t = seasons_per_customer.reset_index()
    t.columns = ['encrypted_customer_id', 'combination']
    t['combination'] = t['combination'].apply(set)

    # Sort before turning a set into a tuple key: the iteration order of a
    # set can depend on insertion order, so without sorting the same
    # combination could be counted under two different tuples.
    d = Counter(tuple(sorted(seasons)) for seasons in t['combination'])

    # Build the columns explicitly; tuple keys of different lengths are not a
    # reliable input for DataFrame.from_dict(orient='index').
    df_output = pd.DataFrame({
        'combination': pd.Series(list(d.keys()), dtype=object),
        'count': list(d.values()),
    })
    return df_output, t

In [27]:
# final: count per season combination; t: the set of seasons per customer.
final, t = get_combinations(df)

In [28]:
# Row count = number of distinct season combinations observed.
print(final.shape)
# Order the combinations from most to least frequent.
final = final.sort_values(by = "count", ascending=False)

(127, 2)


In [38]:
# Most frequent season combinations (relies on `final` being sorted by count).
final.head()

Unnamed: 0,combination,count
0,"(7,)",183903
8,"(6, 7)",53593
6,"(1, 2, 3, 4, 5, 6, 7)",53340
4,"(1,)",32802
11,"(6,)",30050


## Check some sets

In [30]:
# Customers whose purchased seasons are exactly {1, ..., 6} and nothing else.
print(
    t.loc[
        t['combination'].apply(lambda seasons: seasons == {1, 2, 3, 4, 5, 6}),
        'encrypted_customer_id',
    ].nunique()
)

6823


In [31]:
# Count the customers whose set of seasons contains all of seasons 1-5
# (they may have bought further seasons as well).
t.loc[
    t['combination'].apply(lambda seasons: seasons >= {1, 2, 3, 4, 5}),
    'encrypted_customer_id',
].nunique()

66353

## Days between purchases of seasons, in percentiles

In [32]:
# for i in range(2,7+1):
#     print("Percentiles from season %d to previous one" %i)   
#     print(df_wide[ pd.isnull(df_wide[i]) == False ][i].quantile(np.linspace(.1, 1, 9, 0), 'lower'))

In [33]:
# for i in range(2,7+1):
#     print("Average time of purchase between season %d and previous one" %i)   
#     print(df_wide[ pd.isnull(df_wide[i]) == False ][i].mean())

# Restrict mean time analysis to a particular timeframe

In [34]:
# Keep transactions dated from 2018-08-01 up to, but not including, 2019-01-01,
# then recompute the per-customer purchase gaps on that subset only.
df_restricted = df.loc[
    df['transaction_date'].ge('2018-08-01') & df['transaction_date'].lt('2019-01-01')
]

df_wide_restricted = add_intervals(df_restricted)

In [35]:
# for i in range(2,7+1):
#     print("Percentiles from season %d to previous one" %i)   
#     print(df_wide_restricted[ pd.isnull(df_wide_restricted[i]) == False ][i].quantile(np.linspace(.1, 1, 9, 0), 'lower'))
#     print(df_wide_restricted[pd.isnull(df_wide_restricted[i]) == False ]['encrypted_customer_id'].nunique())

In [36]:
# for i in range(2,7+1):
#     print("Average time of purchase between season %d and previous one" %i)   
#     print(df_wide_restricted[ pd.isnull(df_wide_restricted[i]) == False ][i].mean())

In [37]:
# Earliest transaction date present in the data for each title.
df.groupby("title")['transaction_date'].min()

title
Game of Thrones - Staffel 1 [dt./OV]   2014-02-26
Game of Thrones - Staffel 2 [dt./OV]   2014-02-26
Game of Thrones - Staffel 3 [dt./OV]   2014-02-17
Game of Thrones - Staffel 4 [dt./OV]   2014-08-05
Game of Thrones - Staffel 5 [dt./OV]   2015-06-30
Game of Thrones - Staffel 6 [dt./OV]   2016-04-26
Game of Thrones - Staffel 7 [dt./OV]   2017-07-18
Name: transaction_date, dtype: datetime64[ns]