In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the purchase log; the 'date' column is parsed into datetimes on read.
purchase_data = pd.read_csv(
    "./user_purchases.csv",
    parse_dates=['date'],
)

# Preview the first rows as the cell output.
purchase_data.head()

Unnamed: 0,date,uid,sku,price,reg_date,device,gender,country,age,month1
0,2017-07-10,41195147,sku_three_499,499,2017-06-26,and,M,BRA,17,499.0
1,2017-07-15,41195147,sku_three_499,499,2017-06-26,and,M,BRA,17,499.0
2,2017-11-12,41195147,sku_four_599,599,2017-06-26,and,M,BRA,17,
3,2017-09-26,91591874,sku_two_299,299,2017-01-05,and,M,TUR,17,
4,2017-12-01,91591874,sku_four_599,599,2017-01-05,and,M,TUR,17,


In [10]:
# Truncate every purchase timestamp down to its calendar day
purchase_data.date = purchase_data.date.dt.floor('d')

# Rows with a missing price contribute zero revenue
price_is_missing = np.isnan(purchase_data.price)
purchase_data.price = np.where(price_is_missing, 0, purchase_data.price)

# Total spend for each (user, day) pair
purchase_data_agg = purchase_data.groupby(by=['uid', 'date'], as_index=False)
revenue_user_day = purchase_data_agg['price'].sum()

# Average of those per-user-per-day totals
revenue_user_day = revenue_user_day['price'].mean()
print(revenue_user_day)

407.95033407572384


In [19]:
# Load the paywall view log; 'date' is parsed into datetimes on read.
paywall_views = pd.read_csv(
    "./paywall.csv",
    parse_dates=['date'],
)

In [20]:
# Load the per-user demographics; 'reg_date' is parsed into datetimes on read.
demographics_data = pd.read_csv(
    "./user_demographics_paywall.csv",
    parse_dates=['reg_date'],
)

# Preview the first rows as the cell output.
demographics_data.head()

Unnamed: 0,uid,reg_date,device,gender,country,age
0,52774929,2018-03-07 00:00:00+00:00,and,F,FRA,27
1,40143397,2016-07-02 00:00:00+00:00,and,F,DEU,56
2,89856430,2017-06-05 00:00:00+00:00,and,M,TUR,31
3,76188504,2016-09-24 00:00:00+00:00,iOS,M,BRA,54
4,76493748,2017-06-07 00:00:00+00:00,iOS,M,BRA,24


In [21]:
# Join demographics onto the paywall views (only uids present in both tables),
# then truncate the view timestamp to its calendar day.
# NOTE: this rebinds `purchase_data`, replacing the frame loaded from user_purchases.csv.
purchase_data = pd.merge(demographics_data, paywall_views, how='inner', on=['uid'])
purchase_data.date = purchase_data.date.dt.floor('d')

# Per day: sum of `purchase` and count of non-null `purchase` rows
daily_purchase_data = (
    purchase_data
    .groupby(by=['date'], as_index=False)
    .agg({'purchase': ['sum', 'count']})
)

# Average each daily figure over all days, then multiply by 1000 to scale the result
daily_purchases = daily_purchase_data.purchase['sum'].mean() * 1000
daily_paywall_views = daily_purchase_data.purchase['count'].mean() * 1000

print(daily_purchases, daily_paywall_views, sep='\n')

3181.8181818181815
91731.86409550045


In [23]:
# Relative lift to evaluate: a 10% increase over the baseline
small_sensitivity = 0.1

# Baseline conversion rate (hard-coded here)
conversion_rate = 0.03468

# Conversion rate after applying the relative lift
small_conversion_rate = conversion_rate * (1 + small_sensitivity)

# Purchasers per day implied by the lifted conversion rate
small_purchasers = daily_paywall_views * small_conversion_rate

# Lift: purchasers at the new rate minus the current daily_purchases
purchaser_lift = small_purchasers - daily_purchases

print(small_conversion_rate, small_purchasers, purchaser_lift, sep='\n')

0.03814800000000001
3499.387151515152
317.56896969697027


In [24]:
# Relative lift to evaluate: a 20% increase over the baseline
medium_sensitivity = 0.2

# Conversion rate after applying the relative lift
medium_conversion_rate = conversion_rate * (1 + medium_sensitivity)

# Purchasers per day implied by the lifted conversion rate
medium_purchasers = daily_paywall_views * medium_conversion_rate

# Lift: purchasers at the new rate minus the current daily_purchases
purchaser_lift = medium_purchasers - daily_purchases

print(medium_conversion_rate, medium_purchasers, purchaser_lift, sep='\n')

0.041616
3817.513256198347
635.6950743801654


In [25]:
# Relative lift to evaluate: a 50% increase over the baseline
large_sensitivity = 0.5

# Conversion rate after applying the relative lift
large_conversion_rate = conversion_rate * (1 + large_sensitivity)

# Purchasers per day implied by the lifted conversion rate
large_purchasers = daily_paywall_views * large_conversion_rate

# Lift: purchasers at the new rate minus the current daily_purchases
purchaser_lift = large_purchasers - daily_purchases

print(large_conversion_rate, large_purchasers, purchaser_lift, sep='\n')

0.052020000000000004
4771.8915702479335
1590.073388429752


Awesome! While it seems that a 50% increase may be too drastic and unreasonable to expect, the small and medium sensitivities both seem very reasonable.

In [26]:
# n: number of non-null `purchase` observations in the merged paywall data
n = purchase_data['purchase'].count()

# v: the p * (1 - p) variance term for the conversion rate
v = conversion_rate * (1 - conversion_rate)

# Variance of the estimate, and its square root (the standard error)
var = v / n
se = var**0.5

print(var, se, sep='\n')

3.35121502362457e-07
0.0005788967976785301


Awesome Job! Notice how closely the standard error is related to our sample size?

In [30]:
# Parameters for the power examples: baseline rate, lifted rate,
# confidence level and sample size.
p1 = 0.1
p2 = 0.12
cl = 0.95
n1 = 1000
from scipy import stats
def get_power(n, p1, p2, cl):
    """Return the power of a two-sided test comparing two proportions.

    Uses a normal approximation: the critical value is scaled by the
    variance at the midpoint of the two rates, and the statistic by the
    sum of the two individual variances.

    Parameters
    ----------
    n : sample size
    p1 : baseline proportion
    p2 : proportion with the lift included
    cl : confidence level (significance level is ``1 - cl``)

    Returns
    -------
    The probability mass in both rejection tails, as a float.
    """
    # Two-sided critical value for the chosen confidence level
    significance = 1 - cl
    qu = stats.norm.ppf(1 - significance/2)

    # Variance terms: midpoint variance, and each rate's own variance
    midpoint = (p1 + p2) / 2
    bv = midpoint * (1 - midpoint)
    v1 = p1 * (1 - p1)
    v2 = p2 * (1 - p2)

    # Shared pieces of the two tail expressions
    scaled_diff = n**0.5 * abs(p2 - p1)
    critical = qu * (2 * bv)**0.5
    spread = (v1 + v2)**0.5

    # Upper rejection region plus the (usually negligible) lower one
    upper_tail = stats.norm.cdf((scaled_diff - critical) / spread)
    lower_tail = 1 - stats.norm.cdf((scaled_diff + critical) / spread)

    return upper_tail + lower_tail

Exploring the power calculation
As discussed, power is the probability of rejecting the null hypothesis when the alternative hypothesis is true. Here you will explore some properties of the power function and see how it relates to sample size, among other parameters. The `get_power()` function has been included and takes the following arguments, in this order: `n`, the sample size; `p1`, the baseline value; `p2`, the value with lift included; and `cl`, the confidence level.

Instructions
Calculate the power using n = 1000 and n = 2000 in that order, along with the pre-loaded parameters, p1, p2, and cl.
Using the variable n1 for the sample size, find the power with a confidence level of cl = 0.8 and cl = 0.95 in that order.
Hit 'Submit Answer' to compare the ratios. Which change has the bigger impact on power: decreasing the confidence level or increasing the sample size?

In [31]:
# Power at n = 1000 versus n = 2000, holding the confidence level fixed
n_param_one = get_power(1000, p1, p2, cl)
n_param_two = get_power(2000, p1, p2, cl)

# Power at cl = 0.8 versus cl = 0.95, holding the sample size fixed at n1
alpha_param_one = get_power(n1, p1, p2, 0.8)
alpha_param_two = get_power(n1, p1, p2, 0.95)

# Ratios: larger sample over smaller, and lower confidence level over higher
print(n_param_two / n_param_one)
print(alpha_param_one / alpha_param_two)

1.7596440001351992
1.8857367092232278


Great job! With these particular values, it looks like decreasing our confidence level has a slightly larger impact on the power than increasing our sample size.

In [32]:
# Merge the demographics and purchase data to only include paywall views
purchase_data = demographics_data.merge(paywall_views, how='inner', on=['uid'])

# Conversion rate = total of `purchase` / number of non-null `purchase` rows.
# Series.sum() is vectorized (the builtin sum() iterates element by element)
# and, like count(), ignores missing values.
conversion_rate = purchase_data.purchase.sum() / purchase_data.purchase.count()

print(conversion_rate)

0.03468607351645712


In [33]:
# Merge the demographics and purchase data to only include paywall views
purchase_data = demographics_data.merge(paywall_views, how='inner', on=['uid'])

# Conversion rate = total of `purchase` / number of non-null `purchase` rows.
# Series.sum() is vectorized (the builtin sum() iterates element by element)
# and, like count(), ignores missing values.
conversion_rate = purchase_data.purchase.sum() / purchase_data.purchase.count()
            
# Desired Power: 0.8
# CL: 0.90
# Percent Lift: 0.1
def get_sample_size(power, p1, p2, cl, max_n=1000000, step=100):
    """Search for a sample size whose power reaches the requested level.

    Candidate sizes start at 1 and increase by ``step`` (1, 1 + step, ...);
    the first candidate whose ``get_power`` value is at least ``power`` is
    returned, so the result is only accurate to within ``step``.

    Parameters
    ----------
    power : desired power
    p1 : baseline proportion
    p2 : proportion with the lift included
    cl : confidence level
    max_n : largest sample size to try
    step : increment between candidate sample sizes; must be positive

    Returns
    -------
    The first candidate sample size that reaches ``power``, or the string
    "Increase Max N Value" if none up to ``max_n`` does.

    Raises
    ------
    ValueError
        If ``step`` is not positive (the search would never terminate).
    """
    if step <= 0:
        raise ValueError("step must be positive")

    n = 1
    while n <= max_n:
        if get_power(n, p1, p2, cl) >= power:
            return n
        n += step

    # Sentinel kept as a string for backward compatibility with existing callers
    return "Increase Max N Value"
# Target rate: the baseline conversion rate with a 10% relative lift
p2 = conversion_rate * (1 + 0.1)

# First candidate sample size reaching power 0.8 at a 0.90 confidence level
sample_size = get_sample_size(power=0.8, p1=conversion_rate, p2=p2, cl=0.90)
print(sample_size)

36101


In [34]:
# Merge the demographics and purchase data to only include paywall views
purchase_data = demographics_data.merge(paywall_views, how='inner', on=['uid'])

# Conversion rate = total of `purchase` / number of non-null `purchase` rows.
# Series.sum() is vectorized (the builtin sum() iterates element by element)
# and, like count(), ignores missing values.
conversion_rate = purchase_data.purchase.sum() / purchase_data.purchase.count()

# Desired Power: 0.95
# CL: 0.90
# Percent Lift: 0.1
p2 = conversion_rate * (1 + 0.1)
sample_size = get_sample_size(0.95, conversion_rate, p2, 0.90)
print(sample_size)

63201
