In [62]:
import pandas as pd
import numpy as np
import scipy.stats.distributions as dist


In [26]:
df = pd.read_csv('final_dataframe.csv')
df_subset = df[['Impressions', 'Spend_Range_Min_USD']]
df_subset.head()

Unnamed: 0,Impressions,Spend_Range_Min_USD
0,≤ 10k,1000.0
1,≤ 10k,1000.0
2,≤ 10k,1000.0
3,≤ 10k,1000.0
4,≤ 10k,1000.0


In [29]:
props_over_million = {}
#iterate through min spending 
for spending in df_subset['Spend_Range_Min_USD'].unique():
    #subset by spending interval
    spend_subset = df_subset.loc[df_subset['Spend_Range_Min_USD'] == spending]
    #get number of impressions over 1 million
    num_over_million = len(spend_subset.loc[(spend_subset['Impressions'] == '1M-10M') | (spend_subset['Impressions'] == '> 10M')])
    #append proportion
    props_over_million[spending] = num_over_million/len(spend_subset)
                                            

In [25]:
props_over_million

{1000.0: 0.10324864113260018,
 50000.0: 0.7216494845360825,
 100000.0: 0.7771295215869312}

Perform a hypothesis test on whether spending has an impact on high impressions.

Null Hypothesis: Spending has no impact on impression count, i.e. proportion of impressions over one million is equal for each of the 3 spending category minimums: \\$1,000, \\$50,000, and \\$100,000

In [66]:
#compute some necessary values
df_1000 = df_subset.loc[df_subset['Spend_Range_Min_USD'] == 1000]
df_50000 = df_subset.loc[df_subset['Spend_Range_Min_USD'] == 50000]
df_100000 = df_subset.loc[df_subset['Spend_Range_Min_USD'] == 100000]

num_1000 = len(df_1000.loc[(df_1000['Impressions'] == '1M-10M') | (df_1000['Impressions'] == '> 10M')])
num_50000 = len(df_50000.loc[(df_50000['Impressions'] == '1M-10M') | (df_50000['Impressions'] == '> 10M')])
num_100000 = len(df_100000.loc[(df_100000['Impressions'] == '1M-10M') | (df_100000['Impressions'] == '> 10M')])

#### First, compare the proportions of the \\$1,000 spending minimum to the \\$50,000 one
    

In [72]:
#Calculating standard error 
subset_1000_and_50000 = spend_subset = df_subset.loc[(df_subset['Spend_Range_Min_USD'] == 1000) | (df_subset['Spend_Range_Min_USD'] == 50000)]
prop_1000_and_50000 = len(subset_1000_and_50000.loc[(subset_1000_and_50000['Impressions'] == '1M-10M') | 
                                                    (subset_1000_and_50000['Impressions'] == '> 10M')])/len(subset_1000_and_50000)

variance = prop_1000_and_50000 * (1 - prop_1000_and_50000)
standard_error = np.sqrt(variance * (1 / num_1000 + 1 / num_50000))
print("Sample Standard Error for 1,000 and 5,000: ",standard_error)

Sample Standard Error for 1,000 and 5,000:  0.011062051876451135


In [73]:
# Calculate the test statistic 
best_estimate = (num_50000/len(df_50000) - num_1000/len(df_1000))
print("The best estimate of differences between proportions is: ",best_estimate)
hypothesized_estimate = 0
test_stat = (best_estimate-hypothesized_estimate) / standard_error
print("Computed Test Statistic is",test_stat)

The best estimate of differences between proportions is:  0.6184008434034822
Computed Test Statistic is 55.90290574571723


In [74]:
# Calculate the  p-value
pvalue = 2*dist.norm.cdf(-np.abs(test_stat)) # Multiplied by two indicates a two tailed testing.
print("Computed P-value is", pvalue)

Computed P-value is 0.0


#### Then, compare the proportions of the \\$50,000 spending minimum to the \\$100,000 one
    

In [78]:
#Calculating standard error 
subset_50000_and_100000 = df_subset.loc[(df_subset['Spend_Range_Min_USD'] == 50000) | (df_subset['Spend_Range_Min_USD'] == 100000)]
prop_50000_and_100000 = len(subset_50000_and_100000.loc[(subset_50000_and_100000['Impressions'] == '1M-10M') | 
                                                    (subset_50000_and_100000['Impressions'] == '> 10M')])/len(subset_50000_and_100000)

variance = prop_50000_and_100000 * (1 - prop_50000_and_100000)
standard_error = np.sqrt(variance * (1 / num_50000 + 1 / num_100000))
print("Sample Standard Error for 1,000 and 5,000: ",standard_error)

Sample Standard Error for 1,000 and 5,000:  0.0222521556915524


In [79]:
# Calculate the test statistic 
best_estimate = (num_100000/len(df_100000) - num_50000/len(df_50000))
print("The best estimate of differences between proportions is",best_estimate)
hypothesized_estimate = 0
test_stat = (best_estimate-hypothesized_estimate) / standard_error
print("Computed Test Statistic is",test_stat)

The best estimate of differences between proportions is 0.055480037050848696
Computed Test Statistic is 2.4932432533676105


In [77]:
# Calculate the  p-value
pvalue = 2*dist.norm.cdf(-np.abs(test_stat)) # Multiplied by two indicates a two tailed testing.
print("Computed P-value is", pvalue)

Computed P-value is 0.012658209285286692


#### Finally, compare the proportions of the \\$1,000 spending minimum to the \\$100,000 one
    

In [80]:
#Calculating standard error 
subset_1000_and_100000 = df_subset.loc[(df_subset['Spend_Range_Min_USD'] == 1000) | (df_subset['Spend_Range_Min_USD'] == 100000)]
prop_1000_and_100000 = len(subset_1000_and_100000.loc[(subset_1000_and_100000['Impressions'] == '1M-10M') | 
                                                    (subset_1000_and_100000['Impressions'] == '> 10M')])/len(subset_1000_and_100000)

variance = prop_1000_and_100000 * (1 - prop_1000_and_100000)
standard_error = np.sqrt(variance * (1 / num_1000 + 1 / num_100000))
print("Sample Standard Error for 1,000 and 5,000: ",standard_error)

Sample Standard Error for 1,000 and 5,000:  0.012632437984371081


In [82]:
# Calculate the test statistic 
best_estimate = (num_100000/len(df_100000) - num_1000/len(df_1000))
print("The best estimate of differences between proportions is",best_estimate)
hypothesized_estimate = 0
test_stat = (best_estimate-hypothesized_estimate) / standard_error
print("Computed Test Statistic is",test_stat)

The best estimate of differences between proportions is 0.673880880454331
Computed Test Statistic is 53.345275178715305


In [83]:
# Calculate the  p-value
pvalue = 2*dist.norm.cdf(-np.abs(test_stat)) # Multiplied by two indicates a two tailed testing.
print("Computed P-value is", pvalue)

Computed P-value is 0.0
