# A/B Testing

## The need for experimentation

In [1]:
# Load the desktop and laptop datasets from CSV files in the working directory.

import pandas as pd

desktop, laptop = (
    pd.read_csv(csv_name) for csv_name in ('desktop.csv', 'laptop.csv')
)

In [2]:
# Run an independent two-sample t-test between the desktop and laptop datasets
# for each of the spending, age and visits columns, printing one result per column.

from scipy.stats import ttest_ind

for column in ('spending', 'age', 'visits'):
    print(ttest_ind(desktop[column], laptop[column]))


TtestResult(statistic=-2.109853741030508, pvalue=0.03919630411621095, df=58.0)
TtestResult(statistic=-0.7101437106800108, pvalue=0.4804606394128761, df=58.0)
TtestResult(statistic=0.20626752311535543, pvalue=0.8373043059847984, df=58.0)


## Running Experiments to Test New Hypotheses

In [3]:
# Split the desktop users into two groups at the median age:
# groupa holds users whose age is less than or equal to the median,
# groupb holds users who are older than the median.

median_age = desktop['age'].median()
groupa = desktop.loc[desktop['age'] <= median_age]
groupb = desktop.loc[desktop['age'] > median_age]

In [4]:
# Load the emailresults1 dataset from the working directory.

emailresults1_path = 'emailresults1.csv'
emailresults1 = pd.read_csv(emailresults1_path)

In [5]:
# Preview the first 5 rows of emailresults1 (rendered as the cell's output).

emailresults1.head(n=5)

Unnamed: 0,userid,revenue
0,1,100
1,2,0
2,3,50
3,4,550
4,5,175


In [6]:
# Join emailresults1 onto groupa and groupb using the shared userid column,
# so that each group's rows carry the columns from emailresults1 as well.

groupa_withrevenue = groupa.merge(emailresults1, on='userid')
groupb_withrevenue = groupb.merge(emailresults1, on='userid')

In [7]:
# Print the first 5 rows of each merged dataset, groupa first and then groupb.

for merged_group in (groupa_withrevenue, groupb_withrevenue):
    print(merged_group.head())

   userid  spending  age  visits  revenue
0       1      1250   31     126      100
1       2       900   27       5        0
2       3         0   30     459       50
3       4      2890   22      18      550
4       7       900   18      61       40
   userid  spending  age  visits  revenue
0       5      1460   38      20      175
1       6         0   60     100        0
2       8      1000   51     115      220
3       9       150   41     610      100
4      10      3400   48     154      150


In [8]:
# Independent two-sample t-test on revenue, to see whether groupa and groupb differ.

revenue_a = groupa_withrevenue['revenue']
revenue_b = groupb_withrevenue['revenue']
print(ttest_ind(revenue_a, revenue_b))

TtestResult(statistic=-2.186454851070545, pvalue=0.03730073920038287, df=28.0)


## Translating the Math into Practice

In [9]:
import numpy as np

# Seed NumPy's global random generator so the assignment below is repeatable.
np.random.seed(18811015)

# One uniform draw per laptop row; a draw above 0.5 is coded 1, otherwise 0.
# The result is stored on the laptop frame itself as a new column.
uniform_draws = np.random.random(len(laptop))
laptop.loc[:, "groupassignment1"] = 1 * (uniform_draws > 0.5)

# groupc takes the rows coded 0 and groupd the rows coded 1; each is an
# independent copy, so later changes to a group do not touch laptop.
groupc = laptop[laptop['groupassignment1'] == 0].copy()
groupd = laptop[laptop['groupassignment1'] == 1].copy()

In [10]:
# Load the emailresults2 dataset from the working directory.

emailresults2_path = 'emailresults2.csv'
emailresults2 = pd.read_csv(emailresults2_path)

In [11]:
# Join emailresults2 onto groupc and groupd using the shared userid column.

groupc_withrevenue = groupc.merge(emailresults2, on='userid')
groupd_withrevenue = groupd.merge(emailresults2, on='userid')

In [12]:
# Independent two-sample t-test on revenue between groupc and groupd.

revenue_c = groupc_withrevenue['revenue']
revenue_d = groupd_withrevenue['revenue']
print(ttest_ind(revenue_c, revenue_d))

TtestResult(statistic=-2.381320497676198, pvalue=0.024288828555138562, df=28.0)


## Understanding Effect Sizes

In [13]:
# Cohen's d for the first A/B test: a difference in means divided by the
# standard deviation of revenue across all rows of emailresults1.
# NOTE(review): 125 is hard-coded here; presumably it is the observed difference
# in mean revenue between the two groups -- confirm against the data.

mean_revenue_difference = 125
revenue_std = np.std(emailresults1['revenue'])
print(mean_revenue_difference / revenue_std)


0.763769235188029


## Calculating the Significance of Data

In [15]:
# Statistical power of the A/B test, computed with statsmodels' power analysis
# for a t-test on two independent samples.

from statsmodels.stats.power import TTestIndPower

# Inputs to the power calculation.
effectsize = 0.5   # passed as effect_size
nobs = 45          # passed as nobs1
alpha = 0.05       # significance level

analysis = TTestIndPower()

# power is the one argument not supplied, so solve_power solves for it.
power = analysis.solve_power(
    effect_size=effectsize,
    nobs1=nobs,
    alpha=alpha,
)
print(power)

0.6501855019775578
