In [1]:
import pandas as pd
import numpy as np

In [2]:
def rename_cols(df):
    """Label the eleven raw columns and parse the cohort month as a datetime.

    The frame is modified in place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Header-less frame whose column count matches the name list below.

    Returns
    -------
    pandas.DataFrame
        The input frame, relabelled, with ``cohort_start_month`` converted
        by ``pd.to_datetime``.
    """
    column_names = (
        'marketplace', 'encrypted_customer_id', 'segment',
        'is_pmd_ever', 'is_pmd_in_this_one', 'cohort_start_month',
        'revenue', 'units', 'is_promo', 'is_pmd', 'what_period',
    )
    df.columns = list(column_names)

    month_col = 'cohort_start_month'
    df[month_col] = pd.to_datetime(df[month_col])

    return df

# Load cohorts by PMD engagement 

In [3]:
# Read both tab-separated, header-less cohort extracts, label their columns with
# rename_cols, and tag every row with the cohort it came from in a 'pmd' column.
df1, df2 = [
    rename_cols(pd.read_csv(path, sep="\t", header=None)).assign(pmd=label)
    for label, path in (
        ('pmd', "../data/pmd_customers_in_one_campaign.txt"),
        ('non_pmd', "../data/non_pmd_customers.txt"),
    )
]

In [4]:
# Distinct customers and (rows, columns) for each cohort, PMD first.
for cohort in (df1, df2):
    print(cohort['encrypted_customer_id'].nunique(), cohort.shape)

124745 (779864, 12)
241875 (913271, 12)


In [5]:
# Distinct values of the is_pmd_in_this_one flag in each cohort, PMD first.
for cohort in (df1, df2):
    print(cohort['is_pmd_in_this_one'].unique())

['pmd_customer_this_one']
['not_pmd_customer']


In [6]:
# Stack the two cohorts row-wise; each frame keeps its own row labels.
frames = [df1, df2]
combined_df = pd.concat(frames, axis=0, ignore_index=False)

In [7]:
# The stacked frame must have exactly as many rows as the two cohorts together.
len(combined_df) == len(df1) + len(df2)

True

In [8]:
# Preview the first five rows of the stacked frame.
combined_df.head(5)

Unnamed: 0,marketplace,encrypted_customer_id,segment,is_pmd_ever,is_pmd_in_this_one,cohort_start_month,revenue,units,is_promo,is_pmd,what_period,pmd
0,3,A3SMIHKSMSJ0K4,not_bought_at_least_year_before_event,pmd_customer_ever,pmd_customer_this_one,2018-11-01,37.42,8,non_promo,non_pmd,not_known,pmd
1,3,A1Q63VA5JZ830O,highest_engaged,pmd_customer_ever,pmd_customer_this_one,2016-07-01,912.18,375,non_promo,non_pmd,not_known,pmd
2,3,AAWV0AOQYL27C,highest_engaged,pmd_customer_ever,pmd_customer_this_one,2016-04-01,6.65,2,promo,is_pmd,pre_period,pmd
3,3,A37HG9LI65G2D8,highest_engaged,pmd_customer_ever,pmd_customer_this_one,2018-10-01,87.7,25,non_promo,non_pmd,post_period,pmd
4,3,A4DIHMK5GCK73,highest_engaged,pmd_customer_ever,pmd_customer_this_one,2018-05-01,3.32,2,promo,is_pmd,campaign_period,pmd


In [9]:
def agg_top_level(df, metric, parameter = None):
    """Aggregate revenue and units per cohort and period, then pivot one metric by period.

    Rows are grouped by ``pmd`` and ``what_period`` (plus ``parameter`` when one
    is given). For every group the function computes the sum and median of
    ``revenue`` and ``units``, the number of distinct ``encrypted_customer_id``
    values, and three ratios:

    * ``asp``  = revenue_sum / units_sum
    * ``upc``  = units_sum / distinct_customers
    * ``arpu`` = revenue_sum / distinct_customers

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pmd``, ``what_period``, ``revenue``, ``units`` and
        ``encrypted_customer_id``, plus the ``parameter`` column if one is given.
    metric : str
        Name of a column of the aggregate table (e.g. ``'arpu'``) to spread
        across the ``what_period`` values.
    parameter : str, optional
        Extra column to break the aggregation down by. Any column name works,
        not only ``'segment'``; ``None`` aggregates at the cohort level only.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``aggregates_top_level`` holds one row per group with all metrics.
        ``out`` holds one row per ``pmd`` (and ``parameter``) value, one column
        per period, and ``change`` = post_period / pre_period - 1.

    Raises
    ------
    KeyError
        If the pivot has no ``post_period`` or ``pre_period`` column, or
        ``metric`` is not a column of the aggregate table.
    """
    # The breakdown column is added to both the grouping and the pivot index, so
    # the two always agree. Previously the pivot index was hard-coded to 'segment'.
    breakdown = [] if parameter is None else [parameter]
    group_keys = ['pmd', 'what_period'] + breakdown
    pivot_index = ['pmd'] + breakdown

    # Named aggregation yields flat, explicitly named columns, so no positional
    # relabelling of a MultiIndex is needed.
    aggregates_top_level = df.groupby(group_keys).agg(
        revenue_sum=('revenue', 'sum'),
        revenue_median=('revenue', 'median'),
        units_sum=('units', 'sum'),
        units_median=('units', 'median'),
        distinct_customers=('encrypted_customer_id', 'nunique'),
    ).reset_index()

    aggregates_top_level['asp']  = aggregates_top_level['revenue_sum'] / aggregates_top_level['units_sum']
    aggregates_top_level['upc']  = aggregates_top_level['units_sum'] / aggregates_top_level['distinct_customers']
    aggregates_top_level['arpu'] = aggregates_top_level['revenue_sum'] / aggregates_top_level['distinct_customers']

    # Each (index, period) cell has exactly one aggregate row, so pivot_table's
    # default mean simply passes the value through.
    out = pd.pivot_table(aggregates_top_level, index = pivot_index,
                         columns = 'what_period', values = metric).reset_index()
    out['change'] = out['post_period'] / out['pre_period'] - 1

    return aggregates_top_level, out

# Top Level PMD

In [10]:
def to_pivot(df):
    """Reshape rows into one wide row per customer, segment and cohort month.

    ``revenue`` and ``units`` are spread over every observed combination of
    ``what_period``, ``is_promo`` and ``is_pmd`` (pivot_table's default mean is
    used when several rows share a cell). Missing cells are filled with 0 and
    the row keys are returned as ordinary columns.
    """
    row_keys = ['encrypted_customer_id', 'segment', 'cohort_start_month']
    spread_over = ['what_period', 'is_promo', 'is_pmd']

    wide = df.pivot_table(index=row_keys, columns=spread_over,
                          values=['revenue', 'units'])
    return wide.reset_index().fillna(0)

In [11]:
# Cohort-level aggregates, with arpu spread across the periods.
agg, out = agg_top_level(combined_df, metric="arpu")

In [12]:
# Show the full aggregate table: one row per (pmd, what_period) group.
agg

Unnamed: 0,pmd,what_period,revenue_sum,revenue_median,units_sum,units_median,distinct_customers,asp,upc,arpu
0,non_pmd,campaign_period,1604176.0,4.14,348539,1,241875,4.602573,1.440988,6.632253
1,non_pmd,not_known,18022260.0,18.23,3709608,4,192570,4.858265,19.263686,93.588087
2,non_pmd,post_period,2657998.0,7.48,569654,2,141762,4.665987,4.018383,18.749724
3,non_pmd,pre_period,2518304.0,8.32,515160,2,128876,4.888391,3.997331,19.540517
4,pmd,campaign_period,640316.1,1.66,242446,1,124745,2.641067,1.943533,5.133
5,pmd,not_known,19583210.0,14.55,4852448,5,116034,4.035737,41.819191,168.771273
6,pmd,post_period,2327562.0,5.82,621211,2,96891,3.746814,6.411442,24.022477
7,pmd,pre_period,2074148.0,6.65,500427,2,88341,4.144757,5.66472,23.478885


In [13]:
# Show arpu per period for each cohort, with change = post_period / pre_period - 1.
out

what_period,pmd,campaign_period,not_known,post_period,pre_period,change
0,non_pmd,6.632253,93.588087,18.749724,19.540517,-0.040469
1,pmd,5.133,168.771273,24.022477,23.478885,0.023152


# Testing PMD

In [14]:
# Sanity check: print any pre- or post-period rows whose customer has no
# campaign-period row in the same cohort. An empty frame means nobody is missing.
for cohort in (df1, df2):
    campaign_ids = cohort.loc[cohort['what_period'] == 'campaign_period',
                              'encrypted_customer_id']
    in_campaign = cohort['encrypted_customer_id'].isin(campaign_ids)
    for period in ('post_period', 'pre_period'):
        print(cohort[(cohort['what_period'] == period) & ~in_campaign])

Empty DataFrame
Columns: [marketplace, encrypted_customer_id, segment, is_pmd_ever, is_pmd_in_this_one, cohort_start_month, revenue, units, is_promo, is_pmd, what_period, pmd]
Index: []
Empty DataFrame
Columns: [marketplace, encrypted_customer_id, segment, is_pmd_ever, is_pmd_in_this_one, cohort_start_month, revenue, units, is_promo, is_pmd, what_period, pmd]
Index: []
Empty DataFrame
Columns: [marketplace, encrypted_customer_id, segment, is_pmd_ever, is_pmd_in_this_one, cohort_start_month, revenue, units, is_promo, is_pmd, what_period, pmd]
Index: []
Empty DataFrame
Columns: [marketplace, encrypted_customer_id, segment, is_pmd_ever, is_pmd_in_this_one, cohort_start_month, revenue, units, is_promo, is_pmd, what_period, pmd]
Index: []


In [15]:
# Same aggregation as the cohort-level one, broken down further by segment.
agg_deep, out_deep = agg_top_level(combined_df, metric="arpu", parameter="segment")

In [16]:
# Show arpu per period for each (pmd, segment) pair, with the post/pre change.
out_deep

what_period,pmd,segment,campaign_period,not_known,post_period,pre_period,change
0,non_pmd,highest_engaged,9.209994,241.859598,30.537278,35.557445,-0.141185
1,non_pmd,low_engaged,6.311772,47.359573,14.009607,10.180426,0.376132
2,non_pmd,lowest_engaged,5.737542,28.932554,11.957628,5.968779,1.003362
3,non_pmd,mid_engaged,6.982217,79.242348,16.857233,15.537756,0.084921
4,non_pmd,not_bought_at_least_year_before_event,5.461684,21.718307,11.531567,,
5,pmd,highest_engaged,7.448476,301.24801,33.875184,33.136002,0.022307
6,pmd,low_engaged,3.300726,44.122512,11.600526,7.628718,0.520639
7,pmd,lowest_engaged,2.994973,28.305297,10.766843,4.945645,1.177035
8,pmd,mid_engaged,3.853449,72.236787,13.862187,10.796435,0.28396
9,pmd,not_bought_at_least_year_before_event,2.710608,19.999504,10.562113,,


In [136]:
def max_levels(df, metric):
    """Aggregate at the finest grain and pivot one metric over period/promo/PMD flags.

    Rows are grouped by ``pmd``, ``what_period``, ``segment``, ``is_promo`` and
    ``is_pmd``. Each group gets the sum and median of ``revenue`` and ``units``,
    the distinct ``encrypted_customer_id`` count, and the ratios
    ``asp`` (revenue_sum / units_sum), ``upc`` (units_sum / distinct_customers)
    and ``arpu`` (revenue_sum / distinct_customers).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five grouping columns plus ``revenue``, ``units`` and
        ``encrypted_customer_id``.
    metric : str
        Column of the aggregate table to pivot (e.g. ``'revenue_sum'``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``d`` is the full aggregate table. ``final`` is a positional selection
        of the pivot: the two identifier columns followed by pivot columns
        13, 12, 11, 10, 9, 8, in that order.

    Raises
    ------
    IndexError
        If the pivot has fewer than 14 columns, because the selection is positional.
    """
    d = df.groupby(['pmd', 'what_period', 'segment', 'is_promo', 'is_pmd']).agg(
        revenue_sum=('revenue', 'sum'),
        revenue_median=('revenue', 'median'),
        units_sum=('units', 'sum'),
        units_median=('units', 'median'),
        distinct_customers=('encrypted_customer_id', 'nunique'),
    ).reset_index()

    d['asp']  = d['revenue_sum'] / d['units_sum']
    d['upc']  = d['units_sum'] / d['distinct_customers']
    d['arpu'] = d['revenue_sum'] / d['distinct_customers']

    out = pd.pivot_table(d, index = ['pmd', 'segment'],
                         columns = ['what_period', 'is_promo', 'is_pmd'],
                         values = [metric])

    # Drop the metric level BEFORE reset_index. Doing it afterwards strips the
    # 'pmd' and 'segment' labels, leaving two identically blank-named columns.
    out.columns = out.columns.droplevel(0)
    out = out.reset_index()

    # Positions 0 and 1 are the pmd and segment identifiers. Positions 13..8 are
    # six pivoted columns taken in reverse of their sorted order.
    # NOTE(review): presumably these are the pre_period then post_period columns,
    # which holds only if the data has four period labels with three
    # (is_promo, is_pmd) combinations each -- confirm against the data.
    final = out.iloc[:, [0, 1, 13, 12, 11, 10, 9, 8]]

    return d, final

In [144]:
# Finest-grain aggregates, with total revenue pivoted over period/promo/PMD flags.
d, m = max_levels(combined_df, metric="revenue_sum")

In [145]:
def max_levels_all(df):
    """Aggregate at the finest grain and pivot revenue, units and customer counts.

    Groups by ``pmd``, ``what_period``, ``segment``, ``is_promo`` and ``is_pmd``
    and computes the sum/median of ``revenue`` and ``units``, the distinct
    ``encrypted_customer_id`` count, and the ratios ``asp``, ``upc`` and ``arpu``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The aggregate table, and a pivot with one row per (pmd, segment) whose
        columns spread ``revenue_sum``, ``units_sum`` and ``distinct_customers``
        over every observed (what_period, is_promo, is_pmd) combination.
    """
    grain = ['pmd', 'what_period', 'segment', 'is_promo', 'is_pmd']
    summary = df.groupby(grain).agg(
        revenue_sum=('revenue', 'sum'),
        revenue_median=('revenue', 'median'),
        units_sum=('units', 'sum'),
        units_median=('units', 'median'),
        distinct_customers=('encrypted_customer_id', 'nunique'),
    ).reset_index()

    revenue_total = summary['revenue_sum']
    units_total = summary['units_sum']
    customers = summary['distinct_customers']
    summary['asp'] = revenue_total / units_total
    summary['upc'] = units_total / customers
    summary['arpu'] = revenue_total / customers

    wide = summary.pivot_table(
        index=['pmd', 'segment'],
        columns=['what_period', 'is_promo', 'is_pmd'],
        values=['revenue_sum', 'units_sum', 'distinct_customers'],
    ).reset_index()

    return summary, wide

In [146]:
# Finest-grain aggregates plus the wide revenue/units/customer-count pivot.
r, t = max_levels_all(df=combined_df)