In [1]:
CTRL_ID = 101
EXP_ID = 102
SA_MEAN_CPC_CTRL = 1.0
SA_MEAN_CPC_EXP = 0.9
LA_MEAN_CPC_CTRL = 2
LA_MEAN_CPC_EXP = 2.2

NUM_SA = 10
NUM_LA = 12
MEAN_SA_IMPS_CTRL = 5
MEAN_LA_IMPS_CTRL = 20
MEAN_SA_IMPS_EXP = 8
MEAN_LA_IMPS_EXP = 24

In [71]:
import numpy as np
'''
Creating fake data

Let’s make two categories of advertisers: small and large (mean impression of 100, 10000) and have 100 in each category.
Each advertiser will have a different average ctr (sampled from a Beta Distribution:
say Beta(3,15) -- mean of ~0.167 for both control and experiment

The CPCs for are Normally distributed N(1, 0.2), N(2, 0.2), N(0.9, 0.2), N(2.2, 0.2)
for sa-control, la-control, sa-exp, la-exp, respectively
First, get the total number of simulated impressions per advertiser: sample poission(\lambda)
'''

def calc_sim_data(exp_id, adv_ids, ctrs, mean_imps, mean_cpc, size):
    exp_ids = exp_id*(np.ones(size))
    imps = np.random.poisson(mean_imps, size)
    zeros = 0.0*(np.ones(size))
    ones = np.ones(size)
    data = np.concatenate(([exp_ids], [adv_ids], [ones], [zeros], [ctrs]), axis=0).transpose()
    full_data = np.repeat(data,imps, axis=0)

    np.set_printoptions(suppress=True) # Supress scientific notation.
    for r in full_data:
        r[3] = np.random.normal(mean_cpc, 0.2)
        r[4] = np.random.binomial(1, r[4])
    return full_data
    
CTRL_ID = 101
EXP_ID = 102
SA_MEAN_CPC_CTRL = 1.0
SA_MEAN_CPC_EXP = 0.9
LA_MEAN_CPC_CTRL = 2
LA_MEAN_CPC_EXP = 2.2

NUM_SA = 10
NUM_LA = 12
MEAN_SA_IMPS_CTRL = 5
MEAN_LA_IMPS_CTRL = 20
MEAN_SA_IMPS_EXP = 8
MEAN_LA_IMPS_EXP = 24

sa_adv_ids = np.arange(1000,1000+NUM_SA)
la_adv_ids = np.arange(10000,10000+NUM_LA)

sa_ctr = np.random.beta(3, 15, NUM_SA)
la_ctr = np.random.beta(3, 15, NUM_LA)

sa_data_ctrl = calc_sim_data(CTRL_ID, sa_adv_ids, sa_ctr, MEAN_SA_IMPS_CTRL, SA_MEAN_CPC_CTRL, NUM_SA)
sa_data_exp = calc_sim_data(EXP_ID, sa_adv_ids, sa_ctr, MEAN_SA_IMPS_EXP, SA_MEAN_CPC_EXP, NUM_SA)
la_data_ctrl = calc_sim_data(CTRL_ID, la_adv_ids, la_ctr, MEAN_LA_IMPS_CTRL, LA_MEAN_CPC_CTRL, NUM_LA)
la_data_exp = calc_sim_data(EXP_ID, la_adv_ids, la_ctr, MEAN_LA_IMPS_EXP, LA_MEAN_CPC_EXP, NUM_LA)

all_data = np.concatenate(([sa_data_ctrl], [sa_data_exp],
                          [la_data_ctrl], [la_data_exp]), axis=1)[0] # why is it adding the extra layer???
#print(sa_data)
np.set_printoptions(suppress=True)
np.savetxt('sim_data_{0}_{1}.csv'.format(NUM_SA, NUM_LA), all_data, fmt='%i,%i,%i,%5.2f,%i',
           header="exp_id,advertiser_id,impressions,cost,clicks")

In [33]:
import os
import sys

spark_home = os.environ.get('SPARK_HOME', None)
print(spark_home)
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.8.2.1-src.zip'))

execfile(os.path.join(spark_home, 'python/pyspark/shell.py'))

/Users/davidada/apps/spark-1.4.1-bin-hadoop2.6


ValueError: Cannot run multiple SparkContexts at once; existing SparkContext(app=PySparkShell, master=local[*]) created by <module> at <ipython-input-31-dcb2159c5100>:9 

In [12]:
# Calculate the numerator:
# sum_i cost_{ctrlid,i} * click_{expid,i} * w_i
# and the denominator:
# sum_i cost_{expid,i} * click_{ctrlid,i} * w_i
# where w_i = 1/(click_{ctrlid,i} + click{expid,i})
def get_ctrl_exp(adv_data):
    # The shape is (advertiser_id, iterable(id, cost, clicks))
    r = list(adv_data[1])
    assert len(r) == 2
    if int(r[0][0]) == CTRL_ID and int(r[1][0]) == EXP_ID:
        ctrl = r[0]
        exp = r[1]
    elif int(r[1][0]) == CTRL_ID and int(r[0][0]) == EXP_ID:
        ctrl = r[1]
        exp = r[0]
    else:
        assert False
    return ctrl, exp

def calc_numerator(adv_data):
    [ctrl, exp] = get_ctrl_exp(adv_data)
    w_inverse = (ctrl[2] + exp[2])
    if w_inverse > 0:
        return (ctrl[1] * exp[2] / (ctrl[2] + exp[2]))
    else:
        return 0
            
def calc_denominator(adv_data):
    [ctrl, exp] = get_ctrl_exp(adv_data)
    w_inverse = (ctrl[2] + exp[2])
    if w_inverse > 0:
        return (exp[1] * ctrl[2] / (ctrl[2] + exp[2]))
    else:
        return 0

def convert_line(l):
    return [int(l[0]), int(l[1]), int(l[2]), float(l[3]), int(l[4])]


In [32]:
from operator import add

# We want to calculate MH(k_{a,i},n_{a,i},k_{b,i},n_{b,i}), where a and b are control and experiment
# and there k and n in our cases are cost and clicks.
input_rdd = sc.textFile('sim_data_{0}_{1}.csv'.format(NUM_SA, NUM_LA))
header = input_rdd.first() # Remove the first line.
parsed_input_rdd = input_rdd.filter(lambda x: x !=header).map(lambda x: convert_line(x.split(',')))
transformed = parsed_input_rdd.map(lambda x: ((x[0], x[1]), (x[3]*x[4], x[4])))

# For each advertiser cross exp_id, sum the cost and clicks
grouped_result = transformed.reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1]))
grouped_by_advertiser = grouped_result.map(lambda x: ((x[0][1]), (x[0][0], x[1][0], x[1][1]))).groupByKey()

numerator_sum = grouped_by_advertiser.map(lambda x: calc_numerator(x)).reduce(add)
denominator_sum = grouped_by_advertiser.map(lambda x: calc_denominator(x)).reduce(add)
print(numerator_sum, denominator_sum)


(49.92499707602339, 53.84168922305764)


In [8]:
f = open('sim_data_{0}_{1}.csv'.format(NUM_SA, NUM_LA))
print(f.readlines()[0:2])
f.close()

['# exp_id,advertiser_id,impressions,cpc,clicks\n', '101,1000,1, 0.97,0\n']


In [29]:
import google.cloud.dataflow as df

def t_sum(values):
    result = [0,0]
    for v in values:
        result[0] += v[0]
        result[1] += v[1]
    return (result[0], result[1])

# Create a pipeline executing on a direct runner (local, non-cloud).
p = df.Pipeline('DirectPipelineRunner')
parsed_input_rdd = (p
 | df.Read('load records', df.io.TextFileSource('sim_data_{0}_{1}.csv'.format(NUM_SA, NUM_LA)))
 | df.Filter('filter header', lambda x: x[0] != '#')
 | df.Map('split line', lambda x: convert_line(x.split(','))))
transformed = (parsed_input_rdd
 | df.Map((lambda x: ((x[0], x[1]), (x[3]*x[4], x[4])))))

# For each advertiser cross exp_id, sum the cost and clicks
grouped_result = (transformed
 | df.CombinePerKey('combine per adv/id', t_sum))
grouped_by_advertiser = (grouped_result
 | df.Map(lambda x: ((x[0][1]), (x[0][0], x[1][0], x[1][1])))
 | df.GroupByKey())

numerator_sum = (grouped_by_advertiser
 | df.Map(lambda x: calc_numerator(x))
 | df.CombineGlobally('num', sum))
numerator_sum | df.Write('save numerator', df.io.TextFileSink('./numerator_sum'))

denominator_sum = (grouped_by_advertiser
 | df.Map(lambda x: calc_denominator(x))
 | df.CombineGlobally('denom', sum))
denominator_sum | df.Write('save denominator', df.io.TextFileSink('./denominator_sum'))
p.run()


<google.cloud.dataflow.runners.runner.PipelineResult at 0x1072a9050>