In [1]:
import pandas as pd
import numpy as np

In [None]:
def rename_cols(df):
    """Name the columns of a headerless cohort extract and parse its cohort month.

    The frame is modified in place and the very same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns appear in the order of ``column_names`` below.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with labelled columns and ``cohort_start_month``
        converted by ``pd.to_datetime``.
    """
    column_names = [
        'marketplace', 'encrypted_customer_id', 'segment',
        'is_pmd_ever', 'is_pmd_in_this_one', 'cohort_start_month',
        'revenue', 'units', 'is_promo', 'is_pmd', 'what_period',
    ]
    # pandas raises here if the frame does not have exactly len(column_names) columns.
    df.columns = column_names

    month = pd.to_datetime(df['cohort_start_month'])
    df['cohort_start_month'] = month
    return df

# Load cohorts by PMD engagement 

In [None]:
# First cohort: headerless, tab-separated extract; every row is tagged pmd='pmd'.
df1 = rename_cols(
    pd.read_csv("../data/pmd_customers_in_one_campaign.txt", sep="\t", header=None)
)
df1['pmd'] = 'pmd'

# Second cohort: same layout from the sibling extract; every row is tagged pmd='non_pmd'.
df2 = rename_cols(
    pd.read_csv("../data/non_pmd_customers.txt", sep="\t", header=None)
)
df2['pmd'] = 'non_pmd'

In [None]:
# Per cohort (df1 first, then df2): distinct customer count, then the frame's (rows, columns).
for cohort in (df1, df2):
    print(cohort['encrypted_customer_id'].nunique(), cohort.shape)

In [None]:
# Per cohort (df1 first, then df2): the distinct values found in 'is_pmd_in_this_one'.
for cohort in (df1, df2):
    print(cohort['is_pmd_in_this_one'].unique())

In [None]:
# Stack the two cohort frames row-wise into a single frame; the 'pmd' column
# added at load time tells the cohorts apart.
# NOTE(review): concat is called without ignore_index, so each frame keeps its
# own row labels and labels can repeat in combined_df.
frames = [df1,df2]
combined_df = pd.concat(frames)

In [None]:
# Enforce (rather than merely display) that stacking the cohorts kept every row:
# a failed check now stops a Run All instead of showing an easy-to-miss `False`.
assert len(combined_df) == len(df1) + len(df2), \
    "combined_df row count differs from len(df1) + len(df2)"

In [None]:
# Preview the first rows of the stacked cohort frame.
combined_df.head()

In [None]:
def agg_top_level(df, metric, parameter = None):
    """Aggregate revenue and units per cohort and period, then compare post vs. pre.

    Rows are grouped by 'pmd' and 'what_period' (plus ``parameter`` when given).
    For each group the function computes revenue/units sum and median and the
    number of distinct 'encrypted_customer_id' values, then derives:

    * ``asp``  = revenue_sum / units_sum
    * ``upc``  = units_sum / distinct_customers
    * ``arpu`` = revenue_sum / distinct_customers

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'pmd', 'what_period', 'revenue', 'units',
        'encrypted_customer_id', and the column named by ``parameter`` if any.
    metric : str
        Column of the aggregate table to pivot by period, e.g. 'revenue_sum',
        'units_sum', 'distinct_customers', 'asp', 'upc' or 'arpu'.
    parameter : str, optional
        Extra column to break the results down by (e.g. 'segment'). Any column
        name is honoured; previously every value other than 'segment' was
        silently ignored. ``None`` gives the cohort-level view.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``aggregates_top_level``: one row per group with the aggregates above.
        ``out``: ``metric`` pivoted to one column per 'what_period' value, plus
        ``change`` = post_period / pre_period - 1.

    Raises
    ------
    KeyError
        If the data has no 'post_period' or no 'pre_period' rows, because the
        ``change`` column needs both pivoted columns.
    """
    group_keys = ['pmd', 'what_period']
    pivot_index = ['pmd']
    # Skip a breakdown column that is already a grouping key so the old
    # (ignored-parameter) behaviour for 'pmd'/'what_period' keeps working.
    if parameter is not None and parameter not in group_keys:
        group_keys.append(parameter)
        pivot_index.append(parameter)

    aggregates_top_level = df.groupby(group_keys).agg({
        'revenue': ['sum', 'median'],
        'units': ['sum', 'median'],
        'encrypted_customer_id': ['nunique'],
    }).reset_index()

    # Flatten the two-level column index; the order follows the agg spec above.
    aggregates_top_level.columns = group_keys + [
        'revenue_sum', 'revenue_median',
        'units_sum', 'units_median', 'distinct_customers',
    ]

    aggregates_top_level['asp']  = aggregates_top_level['revenue_sum'] / aggregates_top_level['units_sum']
    aggregates_top_level['upc']  = aggregates_top_level['units_sum'] / aggregates_top_level['distinct_customers']
    aggregates_top_level['arpu'] = aggregates_top_level['revenue_sum'] / aggregates_top_level['distinct_customers']

    # Each (index, what_period) pair occurs once after the groupby, so the
    # pivot's default mean aggregation simply passes the value through.
    out = pd.pivot_table(aggregates_top_level, index = pivot_index,
                         columns = 'what_period', values = metric).reset_index()
    out['change'] = out['post_period'] / out['pre_period'] - 1

    return aggregates_top_level, out

# Top Level PMD

In [None]:
def to_pivot(df):
    """Reshape to one row per customer / segment / cohort month.

    'revenue' and 'units' are spread across columns keyed by
    ('what_period', 'is_promo', 'is_pmd') using ``pd.pivot_table`` with its
    default aggregation; the row keys are turned back into ordinary columns
    and every missing cell is replaced by 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the row-key, column-key, 'revenue' and 'units' columns used below.

    Returns
    -------
    pandas.DataFrame
        The wide frame with multi-level column labels.
    """
    row_keys = ['encrypted_customer_id', 'segment', 'cohort_start_month']
    column_keys = ['what_period', 'is_promo', 'is_pmd']
    wide = pd.pivot_table(df, index=row_keys, columns=column_keys,
                          values=['revenue', 'units'])
    return wide.reset_index().fillna(0)

In [None]:
# Cohort-level aggregates, and the 'arpu' metric pivoted by period with its post/pre change.
agg, out = agg_top_level(combined_df, metric="arpu")

In [None]:
# Preview the first rows of the aggregate table returned by agg_top_level.
agg.head()

In [None]:
# Display the per-period pivot (with its 'change' column) returned by agg_top_level.
out

# Testing PMD

In [None]:
# For each cohort, print the post-period rows and then the pre-period rows whose
# customer has no campaign-period row in that same cohort. Nothing is asserted:
# an empty frame in the output means no such customer exists.
for cohort in (df1, df2):
    campaign_ids = cohort.loc[cohort['what_period'] == 'campaign_period',
                              'encrypted_customer_id']
    seen_in_campaign = cohort['encrypted_customer_id'].isin(campaign_ids)
    for period in ('post_period', 'pre_period'):
        print(cohort[(cohort['what_period'] == period) & ~seen_in_campaign])

In [None]:
# Same 'arpu' comparison as the cohort-level run, broken down by 'segment'.
agg_deep, out_deep = agg_top_level(combined_df, metric="arpu", parameter="segment")

In [None]:
# Preview the first rows of the per-segment pivot returned by agg_top_level.
out_deep.head()

In [None]:
def max_levels(df, metric):
    """Aggregate at the finest grain and return a fixed selection of pivoted columns.

    Rows are grouped by 'pmd', 'what_period', 'segment', 'is_promo' and
    'is_pmd'. Each group gets revenue/units sum and median, the number of
    distinct 'encrypted_customer_id' values, and the derived ratios
    asp (revenue_sum / units_sum), upc (units_sum / distinct_customers) and
    arpu (revenue_sum / distinct_customers).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the five grouping columns plus 'revenue', 'units' and
        'encrypted_customer_id'.
    metric : str
        Column of the aggregate table to pivot.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The long aggregate table, and the pivot of ``metric`` reduced to the
        columns at positions 0, 1, 13, 12, 11, 10, 9, 8 (in that order).
    """
    group_keys = ['pmd', 'what_period', 'segment', 'is_promo', 'is_pmd']
    stat_names = ['revenue_sum', 'revenue_median',
                  'units_sum', 'units_median', 'distinct_customers']

    d = df.groupby(group_keys).agg({
        'revenue': ['sum', 'median'],
        'units': ['sum', 'median'],
        'encrypted_customer_id': 'nunique',
    }).reset_index()
    # Flat labels; their order mirrors the aggregation spec above.
    d.columns = group_keys + stat_names

    revenue_total = d['revenue_sum']
    units_total = d['units_sum']
    customers = d['distinct_customers']
    d['asp'] = revenue_total / units_total
    d['upc'] = units_total / customers
    d['arpu'] = revenue_total / customers

    wide = pd.pivot_table(
        d,
        index=['pmd', 'segment'],
        columns=['what_period', 'is_promo', 'is_pmd'],
        values=[metric],
    ).reset_index()

    # Drop the outermost label level (the metric name).
    # NOTE(review): this appears to strip the 'pmd'/'segment' labels from the
    # first two columns as well, leaving them with empty labels - confirm.
    wide.columns = wide.columns.droplevel(0)

    # Positional selection: needs at least 14 pivot columns, otherwise iloc raises.
    # NOTE(review): which period/flag combination sits at each position depends
    # on the values present in the data - TODO confirm the intended columns.
    final = wide.iloc[:, [0, 1, 13, 12, 11, 10, 9, 8]]

    return d, final

In [None]:
# Finest-grain aggregates, and the fixed column selection of the 'revenue_sum' pivot.
d, m = max_levels(combined_df, metric="revenue_sum")

In [None]:
def max_levels_all(df):
    """Aggregate at the finest grain and pivot three metrics side by side.

    Grouping and derived ratios match ``max_levels``: rows are grouped by
    'pmd', 'what_period', 'segment', 'is_promo' and 'is_pmd'; each group gets
    revenue/units sum and median, the distinct 'encrypted_customer_id' count,
    and asp (revenue_sum / units_sum), upc (units_sum / distinct_customers),
    arpu (revenue_sum / distinct_customers).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the five grouping columns plus 'revenue', 'units' and
        'encrypted_customer_id'.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The long aggregate table, and a pivot of 'revenue_sum', 'units_sum'
        and 'distinct_customers' with one row per ('pmd', 'segment') and
        multi-level columns over ('what_period', 'is_promo', 'is_pmd').
    """
    group_keys = ['pmd', 'what_period', 'segment', 'is_promo', 'is_pmd']
    stat_names = ['revenue_sum', 'revenue_median',
                  'units_sum', 'units_median', 'distinct_customers']

    d = df.groupby(group_keys).agg({
        'revenue': ['sum', 'median'],
        'units': ['sum', 'median'],
        'encrypted_customer_id': 'nunique',
    }).reset_index()
    # Flat labels; their order mirrors the aggregation spec above.
    d.columns = group_keys + stat_names

    revenue_total = d['revenue_sum']
    units_total = d['units_sum']
    customers = d['distinct_customers']
    d['asp'] = revenue_total / units_total
    d['upc'] = units_total / customers
    d['arpu'] = revenue_total / customers

    out = pd.pivot_table(
        d,
        index=['pmd', 'segment'],
        columns=['what_period', 'is_promo', 'is_pmd'],
        values=['revenue_sum', 'units_sum', 'distinct_customers'],
    ).reset_index()

    return d, out

In [None]:
# Finest-grain aggregates, and the three-metric pivot per ('pmd', 'segment').
r, t = max_levels_all(df=combined_df)