In [37]:
import pandas as pd
from scipy.stats import ttest_ind_from_stats

In [38]:
# Load the general metrics export, drop the 'holdoutgroup_braze' cohort, and
# split `audiencecohort` on "_" into new columns: the first piece becomes
# `Targetable`, the second becomes `Cohort`. The split columns are attached
# back to the filtered frame by index.
general_results = (
    pd.read_csv("../data/GeneralMetrics.csv")
    .loc[lambda frame: frame.audiencecohort != 'holdoutgroup_braze']
    .pipe(
        lambda frame: frame.merge(
            frame.audiencecohort.str.split("_", expand=True).rename(
                columns={0: "Targetable", 1: "Cohort"}
            ),
            left_index=True,
            right_index=True,
        )
    )
)

def audience_cohort_parsing(row):
    """Translate a raw cohort code into its display label.

    'paiddis' maps to 'Paid Display' and 'paidsoc' maps to 'Paid Social';
    every other value (including missing values) maps to 'All Paid Media'.
    """
    known_codes = (
        ('paiddis', 'Paid Display'),
        ('paidsoc', 'Paid Social'),
    )
    for code, label in known_codes:
        if row == code:
            return label
    # Anything that is not a recognised code falls into the catch-all bucket.
    return 'All Paid Media'

def targetable_bool(row):
    """Label a raw group code as 'Targetable' or 'Holdout'.

    Only the exact value 'targetablegroup' yields 'Targetable'; any other
    value (including missing values) yields 'Holdout'.
    """
    return 'Targetable' if row == 'targetablegroup' else 'Holdout'

# Swap the raw codes for display labels, keep only the reporting columns in a
# fixed order, sort by cohort / tenure / targetable flag, renumber the rows,
# and rename `repertoire_stddev` to `repertoire_stdv` so it follows the same
# `<stat>_stdv` pattern as `usage_stdv`.
general_results = (
    general_results
    .assign(
        Cohort=lambda frame: frame.Cohort.apply(audience_cohort_parsing),
        Targetable=lambda frame: frame.Targetable.apply(targetable_bool),
    )
    .loc[:, [
        'Cohort',
        'Targetable',
        'tenure_classification',
        'all_completes',
        'all_starts',
        'audience_size',
        'audiencecohort',
        'avg_completion_rate',
        'avg_starts',
        'conversions_watched_video',
        'repertoire',
        'repertoire_stddev',
        'usage',
        'usage_stdv',
    ]]
    .sort_values(['Cohort', 'tenure_classification', 'Targetable'])
    .reset_index(drop=True)
    .rename(columns={'repertoire_stddev': 'repertoire_stdv'})
)

In [145]:
# For every (Cohort, tenure_classification) group, run a two-sample t-test
# from summary statistics (mean, standard deviation, audience size) comparing
# the Targetable row against the non-Targetable row, and record whether the
# p-value is below 0.05 for each metric.
# NOTE(review): ttest_ind_from_stats is called with its default settings,
# which per the scipy docs assume equal population variances -- confirm that
# is intended for these groups.
signficance_test = dict()

for name, group in general_results.groupby(['Cohort', 'tenure_classification']):
    # Split the group once instead of re-filtering it for every argument.
    # `.values[0]` below takes the first row of each side and raises
    # IndexError if a side is missing from the group.
    targetable_rows = group[group['Targetable'] == 'Targetable']
    holdout_rows = group[group['Targetable'] != 'Targetable']

    stat_dict = dict()
    for stat in ['repertoire', 'usage']:
        # Element [1] of the returned (statistic, pvalue) pair is the p-value.
        p_value = ttest_ind_from_stats(
            mean1=targetable_rows[stat].values[0],
            mean2=holdout_rows[stat].values[0],
            std1=targetable_rows[f'{stat}_stdv'].values[0],
            std2=holdout_rows[f'{stat}_stdv'].values[0],
            nobs1=targetable_rows['audience_size'].values[0],
            nobs2=holdout_rows['audience_size'].values[0],
        )[1]
        # Test `p < 0.05` rather than `p >= 0.05`: every comparison with NaN
        # is False, so an undefined p-value must land in the 'No' branch
        # instead of being reported as significant.
        stat_dict[f"{stat}_significance"] = 'Yes' if p_value < 0.05 else 'No'
    signficance_test[name] = stat_dict

In [147]:
# One row per (Cohort, tenure_classification) key, one column per metric flag;
# the two key parts are moved out of the index into regular columns.
pd.DataFrame(signficance_test).transpose().reset_index()

Unnamed: 0,level_0,level_1,repertoire_significance,usage_significance
0,All Paid Media,SECOND_MONTH,No,No
1,All Paid Media,SECOND_WEEK,No,No
2,All Paid Media,THIRD_FOURTH_WEEK,No,No
3,All Paid Media,THREE_PLUS_MONTHS,Yes,No
4,Paid Display,SECOND_MONTH,No,No
5,Paid Display,SECOND_WEEK,No,Yes
6,Paid Display,THIRD_FOURTH_WEEK,Yes,Yes
7,Paid Display,THREE_PLUS_MONTHS,Yes,Yes
8,Paid Social,SECOND_MONTH,No,No
9,Paid Social,SECOND_WEEK,No,No


In [149]:
from statsmodels.stats.proportion import proportions_ztest

In [152]:
# For every (Cohort, tenure_classification) group, run a two-sample
# proportions z-test on `conversions_watched_video` out of `audience_size`,
# Targetable row vs. non-Targetable row, and record whether the p-value is
# below 0.05.
signficance_test = dict()

for name, group in general_results.groupby(['Cohort', 'tenure_classification']):
    # Split the group once instead of re-filtering it for every argument.
    # `.values[0]` below takes the first row of each side and raises
    # IndexError if a side is missing from the group.
    targetable_rows = group[group['Targetable'] == 'Targetable']
    holdout_rows = group[group['Targetable'] != 'Targetable']

    stat_dict = dict()
    for stat in ['conversions_watched_video']:
        # Element [1] of the returned (z statistic, p-value) pair is the p-value.
        p_value = proportions_ztest(
            count=[targetable_rows[stat].values[0], holdout_rows[stat].values[0]],
            nobs=[targetable_rows['audience_size'].values[0],
                  holdout_rows['audience_size'].values[0]],
        )[1]
        # Test `p < 0.05` rather than `p >= 0.05`: every comparison with NaN
        # is False, so an undefined p-value (e.g. no successes in either
        # sample) must land in the 'No' branch instead of being reported as
        # significant.
        stat_dict[f"{stat}_significance"] = 'Yes' if p_value < 0.05 else 'No'
    signficance_test[name] = stat_dict

In [153]:
# One row per (Cohort, tenure_classification) key, one column per metric flag;
# the two key parts are moved out of the index into regular columns.
pd.DataFrame(signficance_test).transpose().reset_index()

Unnamed: 0,level_0,level_1,conversions_watched_video_significance
0,All Paid Media,SECOND_MONTH,Yes
1,All Paid Media,SECOND_WEEK,Yes
2,All Paid Media,THIRD_FOURTH_WEEK,No
3,All Paid Media,THREE_PLUS_MONTHS,Yes
4,Paid Display,SECOND_MONTH,Yes
5,Paid Display,SECOND_WEEK,No
6,Paid Display,THIRD_FOURTH_WEEK,No
7,Paid Display,THREE_PLUS_MONTHS,Yes
8,Paid Social,SECOND_MONTH,No
9,Paid Social,SECOND_WEEK,No


In [155]:
# Load the save-rate export, drop the 'holdoutgroup_braze' cohort, split
# `audiencecohort` on "_" into `Targetable` / `Cohort`, replace the raw codes
# with display labels, and keep the reporting columns sorted by
# cohort / tenure / targetable flag.
save_df = (
    pd.read_csv("../data/SaveRates.csv")
    .loc[lambda frame: frame.audiencecohort != 'holdoutgroup_braze']
    .pipe(
        lambda frame: frame.merge(
            frame.audiencecohort.str.split("_", expand=True).rename(
                columns={0: 'Targetable', 1: 'Cohort'}
            ),
            left_index=True,
            right_index=True,
        )
    )
    .drop(columns='audiencecohort')
    .reset_index(drop=True)
    .assign(
        Cohort=lambda frame: frame['Cohort'].apply(audience_cohort_parsing),
        Targetable=lambda frame: frame['Targetable'].apply(targetable_bool),
    )
    .loc[:, ['Cohort', 'Targetable', 'tenure_classification',
             'saves', 'save_rate_denom', 'save_rate']]
    .sort_values(['Cohort', 'tenure_classification', 'Targetable'])
    .reset_index(drop=True)
)

# For every (Cohort, tenure_classification) group, run a two-sample
# proportions z-test on `saves` out of `save_rate_denom`, Targetable row vs.
# non-Targetable row, and record whether the p-value is below 0.05.
signficance_test = dict()

for name, group in save_df.groupby(['Cohort', 'tenure_classification']):
    # Split the group once instead of re-filtering it for every argument.
    # `.values[0]` below takes the first row of each side and raises
    # IndexError if a side is missing from the group.
    targetable_rows = group[group['Targetable'] == 'Targetable']
    holdout_rows = group[group['Targetable'] != 'Targetable']

    stat_dict = dict()
    for stat in ['saves']:
        # Element [1] of the returned (z statistic, p-value) pair is the p-value.
        p_value = proportions_ztest(
            count=[targetable_rows[stat].values[0], holdout_rows[stat].values[0]],
            nobs=[targetable_rows['save_rate_denom'].values[0],
                  holdout_rows['save_rate_denom'].values[0]],
        )[1]
        # Test `p < 0.05` rather than `p >= 0.05`: every comparison with NaN
        # is False, so an undefined p-value (e.g. no successes in either
        # sample) must land in the 'No' branch instead of being reported as
        # significant.
        stat_dict[f"{stat}_significance"] = 'Yes' if p_value < 0.05 else 'No'
    signficance_test[name] = stat_dict

In [162]:
# One row per (Cohort, tenure_classification) key, one column per metric flag;
# the two key parts are moved out of the index into regular columns.
pd.DataFrame(signficance_test).transpose().reset_index()

Unnamed: 0,level_0,level_1,saves_significance
0,All Paid Media,SECOND_MONTH,Yes
1,All Paid Media,THIRD_FOURTH_WEEK,No
2,All Paid Media,THREE_PLUS_MONTHS,No
3,Paid Display,SECOND_MONTH,No
4,Paid Display,THIRD_FOURTH_WEEK,No
5,Paid Display,THREE_PLUS_MONTHS,No
6,Paid Social,SECOND_MONTH,No
7,Paid Social,THIRD_FOURTH_WEEK,No
8,Paid Social,THREE_PLUS_MONTHS,No


In [164]:
# Load the winback-rate export, drop the 'holdoutgroup_braze' cohort, split
# `audiencecohort` on "_" into `Targetable` / `Cohort`, replace the raw codes
# with display labels, and keep the reporting columns sorted by
# cohort / tenure / targetable flag.
# Note: the result is stored under the same `save_df` name used for the
# save-rate data, replacing it.
save_df = (
    pd.read_csv("../data/WinbackRates.csv")
    .loc[lambda frame: frame.audiencecohort != 'holdoutgroup_braze']
    .pipe(
        lambda frame: frame.merge(
            frame.audiencecohort.str.split("_", expand=True).rename(
                columns={0: 'Targetable', 1: 'Cohort'}
            ),
            left_index=True,
            right_index=True,
        )
    )
    .drop(columns='audiencecohort')
    .reset_index(drop=True)
    .assign(
        Cohort=lambda frame: frame['Cohort'].apply(audience_cohort_parsing),
        Targetable=lambda frame: frame['Targetable'].apply(targetable_bool),
    )
    .loc[:, ['Cohort', 'Targetable', 'tenure_classification',
             'winbacks', 'winback_rate_denom', 'winback_rate']]
    .sort_values(['Cohort', 'tenure_classification', 'Targetable'])
    .reset_index(drop=True)
)

# For every (Cohort, tenure_classification) group, run a two-sample
# proportions z-test on `winbacks` out of `winback_rate_denom`, Targetable row
# vs. non-Targetable row, and record whether the p-value is below 0.05.
signficance_test = dict()

for name, group in save_df.groupby(['Cohort', 'tenure_classification']):
    # Split the group once instead of re-filtering it for every argument.
    # `.values[0]` below takes the first row of each side and raises
    # IndexError if a side is missing from the group.
    targetable_rows = group[group['Targetable'] == 'Targetable']
    holdout_rows = group[group['Targetable'] != 'Targetable']

    stat_dict = dict()
    for stat in ['winbacks']:
        # Element [1] of the returned (z statistic, p-value) pair is the p-value.
        p_value = proportions_ztest(
            count=[targetable_rows[stat].values[0], holdout_rows[stat].values[0]],
            nobs=[targetable_rows['winback_rate_denom'].values[0],
                  holdout_rows['winback_rate_denom'].values[0]],
        )[1]
        # Test `p < 0.05` rather than `p >= 0.05`: every comparison with NaN
        # is False, so an undefined p-value (e.g. no successes in either
        # sample) must land in the 'No' branch instead of being reported as
        # significant.
        stat_dict[f"{stat}_significance"] = 'Yes' if p_value < 0.05 else 'No'
    signficance_test[name] = stat_dict

In [165]:
# One row per (Cohort, tenure_classification) key, one column per metric flag;
# the two key parts are moved out of the index into regular columns.
pd.DataFrame(signficance_test).transpose().reset_index()

Unnamed: 0,level_0,level_1,winbacks_significance
0,All Paid Media,SECOND_MONTH,No
1,All Paid Media,THREE_PLUS_MONTHS,Yes
2,Paid Display,SECOND_MONTH,No
3,Paid Display,THREE_PLUS_MONTHS,No
4,Paid Social,SECOND_MONTH,Yes
5,Paid Social,THREE_PLUS_MONTHS,No
