In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm

In [3]:
# Read the A/B-test export into `raw`.
raw = pd.read_csv(filepath_or_buffer='ab_data.csv')
# Preview the first five rows to see the available columns.
raw.head(n=5)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [4]:
# Compare the number of rows with the number of distinct user ids;
# a difference means at least one user_id occurs more than once.
print('# rows:', len(raw))
print('# unique user:', raw['user_id'].nunique())

# rows: 294478
# unique user: 290584


In [5]:
# The row count and the unique user_id count disagree, so list the repeated user ids.
# keep=False flags every occurrence of a repeated id, not just the later ones.
(
    raw
    .loc[lambda frame: frame['user_id'].duplicated(keep=False)]
    .sort_values('user_id')
    .head(10)
)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
230259,630052,2017-01-17 01:16:05.208766,treatment,new_page,0
213114,630052,2017-01-07 12:25:54.089486,treatment,old_page,1
22513,630126,2017-01-14 13:35:54.778695,treatment,old_page,0
251762,630126,2017-01-19 17:16:00.280440,treatment,new_page,0
183371,630137,2017-01-20 02:08:49.893878,control,old_page,0
11792,630137,2017-01-22 14:59:22.051308,control,new_page,0
207211,630320,2017-01-07 18:02:43.626318,control,old_page,0
255753,630320,2017-01-12 05:27:37.181803,treatment,old_page,0
96929,630471,2017-01-07 02:14:17.405726,control,new_page,0
110634,630471,2017-01-23 01:42:51.501851,control,old_page,0


In [6]:
# Some rows have a group that does not agree with the page version shown in landing_page.
# A row is consistent only when "is treatment" and "saw new_page" are both true or both false,
# so the exclusive-or of the two flags marks the mismatched rows. Count them:
mismatch = raw['group'].eq('treatment') ^ raw['landing_page'].eq('new_page')
print('不匹配数:', mismatch.sum())

不匹配数: 3893


In [7]:
# Drop the mismatched rows; .copy() gives an independent frame so `raw` stays as loaded.
match_df = raw.loc[~mismatch].copy()

In [8]:
# Re-check the row count against the number of distinct user ids after filtering.
print('# rows:', len(match_df))
print('# unique user:', match_df['user_id'].nunique())

# rows: 290585
# unique user: 290584


In [9]:
# Look at the user ids that are still repeated after filtering.
(
    match_df
    .loc[lambda frame: frame['user_id'].duplicated(keep=False)]
    .sort_values('user_id')
)

Unnamed: 0,user_id,timestamp,group,landing_page,converted
1899,773192,2017-01-09 05:37:58.781806,treatment,new_page,0
2893,773192,2017-01-14 02:55:59.590927,treatment,new_page,0


In [10]:
# The remaining repeated user did not convert at either time point; keep only the most recent record.
# drop_duplicates(keep='last') keeps the last row in *row order*, not the latest in time, so order
# the rows by timestamp first. The timestamps shown in the preview are 'YYYY-MM-DD HH:MM:SS.ffffff'
# strings, which sort chronologically as plain text. A stable sort keeps the original row order
# for equal timestamps, and sort_index() restores the original row order afterwards.
match_df = (
    match_df
    .sort_values('timestamp', kind='mergesort')
    .drop_duplicates(subset=['user_id'], keep='last')
    .sort_index()
)
# Sanity check: every user now appears exactly once.
assert match_df['user_id'].is_unique

In [11]:
# Count the missing values in each column.
match_df.isna().sum()

user_id         0
timestamp       0
group           0
landing_page    0
converted       0
dtype: int64

In [12]:
# No missing values; next check the share of users who received the new page
# (rows with landing_page == 'new_page' divided by all rows).
len(match_df.loc[match_df['landing_page'].eq('new_page')]) / len(match_df)

0.5000619442226688

## 方法一： 直接公式计算

In [18]:
# Number of users in each experiment arm (count of rows whose group matches).
n_c = int((match_df['group'] == 'control').sum())
n_t = int((match_df['group'] == 'treatment').sum())
print('n_c:',n_c,'n_t:', n_t)

n_c: 145274 n_t: 145310


In [19]:
# Number of converted users (converted == 1) in each experiment arm.
convert_c = int(((match_df['group'] == 'control') & (match_df['converted'] == 1)).sum())
convert_t = int(((match_df['group'] == 'treatment') & (match_df['converted'] == 1)).sum())
print('convert_c:', convert_c,'convert_t:', convert_t)

convert_c: 17489 convert_t: 17872


In [20]:
# Observed conversion rate of each arm: converted users divided by arm size.
p_c, p_t = convert_c / n_c, convert_t / n_t
print('p_c:', p_c, 'p_t:', p_t)

p_c: 0.1203863045004612 p_t: 0.12299222352212512


In [21]:
# Pooled conversion rate: all converted users over all users, both arms combined.
p_pool = sum((convert_c, convert_t)) / sum((n_c, n_t))
print('p_pool:', p_pool)

p_pool: 0.12168942543292129


In [23]:
# Test statistic Z for the difference in conversion rates (control minus treatment),
# scaled by the standard error built from the pooled rate.
se_pooled = np.sqrt((1/n_c + 1/n_t) * p_pool * (1 - p_pool))
z = (p_c - p_t) / se_pooled
z

-2.1484056695589

In [25]:
# One-sided critical value z_alpha.
# ALPHA is the significance level of the test; norm.ppf(ALPHA) is the point of the standard
# normal distribution with probability ALPHA to its left, i.e. the lower-tail critical value
# (negative for ALPHA < 0.5).
ALPHA = 0.05
z_alpha = norm.ppf(ALPHA)
z_alpha

-1.6448536269514729

In [None]:
# Decision rule (one-sided, lower tail): reject the null hypothesis when z < z_alpha.
# With the values displayed above, z (about -2.148) is below z_alpha (about -1.645),
# so the null hypothesis is rejected at the 0.05 significance level.