# Waterfall 

<p> This notebook calculates how many customers who bought a given season of a series went on to buy every subsequent season, in sequence. </p>

In [19]:
import pandas as pd
import numpy as np

import math
from collections import Counter

In [2]:
# Load the tab-separated purchase export. Column names are supplied via
# `names=` and no `header=` is given, so the first line of the file is read
# as data (assumes the export has no header row -- TODO confirm).
df = pd.read_csv("../GOT/data/waterfall_4.txt", sep="\t",
                 names = ['encrypted_customer_id',
                          'transaction_date','asin',
                          'content_type','title'])

In [3]:
# Sanity check: print (rows, columns), then preview the first rows.
print(df.shape)
df.head()

(1229481, 5)


Unnamed: 0,encrypted_customer_id,transaction_date,asin,content_type,title
0,A3PCW9WC0GTF2Q,2017-08-15 00:00:00,B073VVVKNB,TV Season,Game of Thrones - Staffel 7 [dt./OV]
1,A1A77UBUPOPZI1,2017-08-21 00:00:00,B00M3LG74C,TV Season,Game of Thrones - Staffel 4 [dt./OV]
2,A3GKVH6E99LI3V,2017-07-30 00:00:00,B00I8ZYTRO,TV Season,Game of Thrones - Staffel 1 [dt./OV]
3,A2R45O6WIV592C,2014-08-05 00:00:00,B00M3LG74C,TV Season,Game of Thrones - Staffel 4 [dt./OV]
4,A9E4FJS969OCG,2016-09-18 00:00:00,B00I8ZVHDI,TV Season,Game of Thrones - Staffel 3 [dt./OV]


In [4]:
def to_preprocess(df):
    """Add parsed season and date columns to the purchases frame.

    The frame is modified IN PLACE and the same object is returned, so the
    caller's original variable also gains the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column ``title`` and a ``transaction_date``
        column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with these columns added or overwritten:

        * ``season_number``    - first run of digits found in ``title``,
          converted to a number (NaN when the title contains no digits).
        * ``transaction_date`` - converted to datetime.
        * ``days_since_epoch`` - whole days between ``transaction_date``
          and 1970-01-01.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape.
    # expand=False makes str.extract return a Series rather than a
    # one-column DataFrame.
    season_text = df['title'].str.extract(r'(\d+)', expand=False)
    df['season_number'] = pd.to_numeric(season_text, errors='coerce')

    # Parse the dates once and reuse the parsed column below.
    df['transaction_date'] = pd.to_datetime(df['transaction_date'])

    # pd.Timestamp replaces the pd.datetime alias, which no longer exists in
    # current pandas releases.
    epoch = pd.Timestamp("1970-01-01")
    df['days_since_epoch'] = (df['transaction_date'] - epoch).dt.days
    return df
    
# to_preprocess assigns the new columns onto `df` itself and returns that
# same object, so `new_df` and `df` refer to the same DataFrame.
new_df = to_preprocess(df)

  


In [5]:
# Headline counts: distinct customers and distinct ASINs in the data set.
print("Overall number of unique customers who bought: " , new_df['encrypted_customer_id'].nunique())
print("Unique asins: ", new_df['asin'].nunique())

Overall number of unique customers who bought:  515182
Unique asins:  14


In [6]:
def recursive_call_funnel(df,staffel, base):
    """Return the customer ids that bought every season from `base` to `staffel`.

    Starts with the rows for season `base`, then, one season at a time up to
    `staffel`, keeps only the rows of that season whose customer also survived
    the previous step. When `staffel <= base` the buyers of season `base` are
    returned unchanged.

    The result is the `encrypted_customer_id` column of the matching rows
    (original row index, duplicates not removed).
    """
    # Walk down from `staffel` to just above `base` to get the seasons that
    # must be checked, then apply them in ascending order.
    pending = []
    season = staffel
    while season > base:
        pending.append(season)
        season = season - 1

    buyers = df.loc[df['season_number'] == base, 'encrypted_customer_id']
    for season in reversed(pending):
        in_season = df['season_number'] == season
        still_in_funnel = df['encrypted_customer_id'].isin(buyers)
        buyers = df.loc[in_season & still_in_funnel, 'encrypted_customer_id']
    return buyers


In [9]:
def create_matrix(df,range_start,range_end):
    """Build the retention ("waterfall") matrix between two seasons.

    For every base season ``b`` in ``range_start..range_end`` (inclusive) one
    row is produced. Entry ``k`` of that row is the share of distinct
    customers who bought season ``b`` and also every season up to ``b + k``,
    relative to the distinct buyers of season ``b``, rounded to 2 decimals.
    The first entry of a row is therefore 1.0 and rows get shorter as ``b``
    grows.

    Each step applies the same filter as ``recursive_call_funnel`` (rows of
    the next season whose customer is still in the funnel), but the buyer set
    is narrowed incrementally instead of being rebuilt from the base season
    for every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``season_number`` and ``encrypted_customer_id``.
    range_start, range_end : int
        First and last season to include (both inclusive).

    Returns
    -------
    list[list[float]]
        One list per base season. If a base season has no buyers at all, its
        row is filled with NaN instead of raising ``ZeroDivisionError``.
    """
    seasons = df['season_number']
    customers = df['encrypted_customer_id']

    output = []
    for base in range(range_start, range_end + 1):
        # Buyers of the base season start the funnel.
        buyers = customers[seasons == base]
        counts = [buyers.nunique()]

        # Narrow the funnel one season at a time.
        for season in range(base + 1, range_end + 1):
            buyers = customers[(seasons == season) & customers.isin(buyers)]
            counts.append(buyers.nunique())

        base_count = counts[0]
        if base_count == 0:
            # No buyers of the base season: the ratios are undefined.
            output.append([float('nan')] * len(counts))
        else:
            output.append([round(count / base_count, 2) for count in counts])

    return output

In [11]:
# Retention matrix for seasons 1 through 7 (both bounds are inclusive).
new_table_total = create_matrix(new_df, 1, 7)

In [12]:
# One row per base season (first row = season 1). Within a row, each entry is
# the share of that base season's buyers who also bought every season up to
# that column; the first entry is the base season itself.
new_table_total

[[1.0, 0.68, 0.57, 0.51, 0.45, 0.41, 0.36],
 [1.0, 0.8, 0.71, 0.62, 0.56, 0.49],
 [1.0, 0.85, 0.73, 0.65, 0.57],
 [1.0, 0.76, 0.66, 0.56],
 [1.0, 0.79, 0.66],
 [1.0, 0.75],
 [1.0]]

# Median time between purchases

In [13]:
def from_long_to_wide(df,dimension1, dimension2):
    """Pivot the long purchase table into one row per customer.

    The result has one column per distinct value of `dimension1`, filled
    with the values of `dimension2` (combined with pivot_table's default
    aggregation when a customer has several rows for the same column).
    `encrypted_customer_id` is returned as a regular column, not the index.
    """
    pivoted = df.pivot_table(
        values = dimension2,
        index = 'encrypted_customer_id',
        columns = dimension1,
    )
    return pivoted.reset_index()


def add_intervals(df):
    """Pivot purchases per customer and append day gaps between adjacent titles.

    The frame is pivoted to one row per customer with one `days_since_epoch`
    column per title. For every column position i >= 2 a new column labelled
    with the integer i is appended, holding column i minus column i-1.

    NOTE(review): the gaps are taken between positionally adjacent columns,
    so they are season-to-previous-season gaps only if the pivoted title
    columns come out in season order -- confirm against the data.
    """
    wide = from_long_to_wide(df, "title", "days_since_epoch")

    # Fix the column count before the loop; the appended interval columns
    # must not be iterated over themselves.
    n_columns = wide.shape[1]
    for position in range(2, n_columns):
        current = wide.iloc[:, position]
        previous = wide.iloc[:, position - 1]
        wide[position] = current - previous
    return wide

In [15]:
# One row per customer: purchase day per title plus the appended gap columns.
df_wide = add_intervals(new_df)

In [20]:
def get_combinations(df):
    """Count how many customers bought each distinct combination of seasons.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``encrypted_customer_id`` and ``season_number``.
        Row order is irrelevant because each customer's seasons are reduced
        to a set.

    Returns
    -------
    (df_output, t) : tuple of pandas.DataFrame
        ``df_output`` has one row per distinct combination, with columns
        ``combination`` (tuple of season numbers in ascending order) and
        ``count`` (number of customers with exactly that combination), in
        order of first appearance.
        ``t`` has one row per customer, with columns
        ``encrypted_customer_id`` and ``combination`` (set of season numbers).
    """
    per_customer = (
        df.groupby("encrypted_customer_id")['season_number']
          .agg(lambda seasons: seasons.tolist())
          .reset_index()
    )
    per_customer.columns = ['encrypted_customer_id', 'combination']

    # Collapse repeat purchases of the same season.
    per_customer['combination'] = per_customer['combination'].apply(set)

    # Sort before converting to a tuple: set iteration order is not part of
    # any contract, and equal sets must always map to the same key.
    keys = per_customer['combination'].apply(lambda seasons: tuple(sorted(seasons)))
    tally = Counter(keys)

    # Build the frame from explicit columns. Passing tuple keys through
    # from_dict(orient='index') can be interpreted as a MultiIndex, and it
    # fails on empty input.
    df_output = pd.DataFrame({
        'combination': list(tally.keys()),
        'count': list(tally.values()),
    })
    return df_output, per_customer

In [21]:
# Pass the preprocessed frame explicitly: get_combinations needs the
# 'season_number' column that to_preprocess adds. Passing `df` only worked
# because to_preprocess modifies its argument in place.
final, t = get_combinations(new_df)

# Most common season combinations first.
final = final.sort_values(by = "count", ascending=False)
final.head()

Unnamed: 0,combination,count
0,"(7,)",183903
8,"(6, 7)",53593
6,"(1, 2, 3, 4, 5, 6, 7)",53340
4,"(1,)",32802
11,"(6,)",30050


# Days between purchases

In [22]:
# for i in range(2,7+1):
#     print("Percentiles from season %d to previous one" %i)   
#     print(df_wide[ pd.isnull(df_wide[i]) == False ][i].quantile(np.linspace(.1, 1, 9, 0), 'lower'))