In [1]:
# import our libraries
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
# We set the seed to ensure you get the same answers on quizzes as we set up
random.seed(42)

In [19]:
# Read both CSV exports. index_col=0 makes the first CSV column the row index
# instead of keeping it as a regular data column.
data = pd.read_csv(filepath_or_buffer='a_b_landing.csv', index_col=0)
data_countries = pd.read_csv(filepath_or_buffer='countries.csv', index_col=0)

## Data preprocessing

First of all, we will study our data and extract useful information.

In [140]:
# Sanity check that the file was parsed as expected: show the first five rows.
data.iloc[:5]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [21]:
# Preview the first five rows of the user_id -> country table.
data_countries.head(n=5)

Unnamed: 0,user_id,country
0,834778,UK
1,928468,US
2,822059,UK
3,711597,UK
4,710616,UK


In [7]:
data.shape
# (rows, columns) of the loaded A/B log.
# NOTE(review): the recorded output reports 6 columns, yet the file is read with
# index_col=0 and the null check lists only 5 columns - re-run this cell to confirm.

(294478, 6)

In [63]:
# Count distinct users; a value below the row count means some users appear more than once.
data.user_id.nunique()

290584

In [69]:
# Count missing values per column to see whether any row is incomplete.
data.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [141]:
# Tally how many rows carry each value of the `converted` flag.
data.converted.value_counts()

0    259241
1     35237
Name: converted, dtype: int64

In [28]:
# Overall conversion rate across the whole dataset (both groups pooled together):
# rows with converted == 1 divided by the total number of rows.
convertion_rate_total = data['converted'].value_counts().loc[1] / len(data)
print('Convertion rate : ', round(100 * convertion_rate_total, 2),'%')

Convertion rate :  11.97 %


In [29]:
# Count rows for every (landing_page, converted) combination.
overall_look = data.groupby(['landing_page', 'converted'])[['user_id']].count()
overall_look

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id
landing_page,converted,Unnamed: 2_level_1
new_page,0,129741
new_page,1,17498
old_page,0,129500
old_page,1,17739


In [61]:
# Conversion rate per landing page = converted rows / all rows for that page.
# Counts are looked up by their explicit (landing_page, converted) labels instead of
# by integer position: positional access through Series[...] on a labelled index is
# deprecated in pandas and silently depends on the sort order of the groups.
new_page_convertion = (
    overall_look['user_id'].loc[('new_page', 1)]
    / overall_look['user_id'].loc['new_page'].sum()
)
old_page_convertion = (
    overall_look['user_id'].loc[('old_page', 1)]
    / overall_look['user_id'].loc['old_page'].sum()
)
print('new_page_convertion : {new_page_convertion:.2f}%'.format(new_page_convertion=new_page_convertion*100))
print('old_page_convertion : {old_page_convertion:.2f}%'.format(old_page_convertion=old_page_convertion*100))

new_page_convertion : 11.88%
old_page_convertion : 12.05%


##### At a glance, it looks like there is no significant difference between the groups.

The number of users in the two groups is similar, which is good. \
We continue...

In [207]:
# Build the two subsets that are compared later on:
#   treatment_np - rows in the treatment group that were shown the new page
#   control_op   - rows in the control group that were shown the old page
treatment_np = data[(data['group'] == 'treatment') & (data['landing_page'] == 'new_page')]
control_op = data[(data['group'] == 'control') & (data['landing_page'] == 'old_page')]
print('total number of the rows in treatment_np :',len(treatment_np))
print('total number of the rows in control_op   :',len(control_op))

total number of the rows in treatment_np : 145311
total number of the rows in control_op   : 145274


In [208]:
# List rows whose user_id already appeared earlier in the treatment subset,
# i.e. users recorded more than once (the output shows user_id 773192).
treatment_np.loc[treatment_np.duplicated(subset='user_id')]

Unnamed: 0,user_id,timestamp,group,landing_page,converted
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [209]:
# Show every record of the repeated user to decide whether one of them can be removed.
treatment_np.query("user_id == 773192")

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [210]:
# Remove the redundant record of the repeated user, keeping its last occurrence
# (row 2893 for user_id 773192, the same row the original label-based drop kept).
# Deduplicating on user_id instead of dropping the hard-coded row label 1899 makes
# the cell safe to re-run: a second execution is a no-op rather than a KeyError.
treatment_np = treatment_np.drop_duplicates(subset='user_id', keep='last')

In [211]:
# Repeat the duplicate-user check for the control subset; an empty result means
# no control user appears twice.
control_op.loc[control_op.duplicated(subset='user_id')]

Unnamed: 0,user_id,timestamp,group,landing_page,converted


## A/B test 
The second step is the A/B test.

We will compare two groups: **treatment_np** and **control_op**. <br>
But first, let's do an A/A test.

In [170]:
# Number of conversions the treatment group would show if it converted at the
# control group's rate (rounded to a whole count).
expected = np.round(control_op['converted'].mean() * treatment_np.shape[0])
expected

17493.0

In [171]:
# Conversions actually recorded in the treatment group. Series.sum() is vectorised,
# whereas the builtin sum() walks the column one element at a time in Python.
observed = treatment_np['converted'].sum()
observed

17264

In [180]:
# Treatment rows that did not convert: subset size minus the converted count.
observed_2 = treatment_np.shape[0] - observed
observed_2

128046

In [188]:
# Number of non-conversions the treatment group would show at the control group's
# non-conversion rate (rounded to a whole count).
expected_2 = np.round((1 - control_op['converted'].mean()) * treatment_np.shape[0])
expected_2

127817.0

In [191]:
# Chi-square statistic computed by hand: sum over both outcomes of (O - E)^2 / E.
chi_sqr = (
    (observed - expected) ** 2 / expected
    + (observed_2 - expected_2) ** 2 / expected_2
)
chi_sqr

3.4081095898954477

In [192]:
from scipy.stats import chisquare

In [193]:
# Same statistic via SciPy, which also reports the p-value.
# With p-value > 0.05 we fail to reject H0 at the 5% significance level
# (a large p-value is not a statement about the power of the test).
# The expected counts passed here are derived from the control sample's conversion rate.
chisquare([observed, observed_2], f_exp=[expected, expected_2])

Power_divergenceResult(statistic=3.4081095898954477, pvalue=0.06487672828461125)

In [196]:
from scipy.stats import ttest_ind

In [212]:
# Independent two-sample t-test on the `converted` column of the two subsets.
ttest_ind(treatment_np['converted'], control_op['converted'])

Ttest_indResult(statistic=-1.3109235634981506, pvalue=0.18988462498742617)