# AB Teste Course - PA James

## 0.1. Imports

In [8]:
import math
import pandas as pd
import numpy as np
#import seaborn as sns
from statsmodels.stats import api as sms
from scipy.stats import chi2_contingency
#from statsmodels.stats.multicomp import pairwise_tukeyhsd
#from scipy.stats import ttest_1samp, shapiro, ttest_ind, mannwhitneyu, f_oneway


## 1.0. Load Data

In [2]:

# Read the raw A/B test records from the CSV file, relative to the notebook's working directory.
raw_data_path = "datasets/ab_data.csv"
df_raw = pd.read_csv(raw_data_path)

In [3]:
# Display the first rows of the raw data to inspect its columns.
df_raw.head()

Unnamed: 0,user_id,timestamp,group,landing_page,converted
0,851104,2017-01-21 22:11:48.556739,control,old_page,0
1,804228,2017-01-12 08:01:45.159739,control,old_page,0
2,661590,2017-01-11 16:55:06.154213,treatment,new_page,0
3,853541,2017-01-08 18:28:03.143765,treatment,new_page,0
4,864975,2017-01-21 01:52:26.210827,control,old_page,1


In [4]:
# Dimensions of the raw dataset as (rows, columns).
df_raw.shape

(294478, 5)

## 2.0. Design de Experimentos

### 2.1. Formulação das hipóteses

In [5]:
# H0: A conversão da nova página é de 13% (igual à da página atual)
# H1: A conversão da nova página é diferente de 13%

### 2.2. Parâmetros do Experimento

In [6]:
# Confidence level of the experiment and the significance level (alpha)
# derived from it.
confidence_level = 0.95
significanse_level = 1 - confidence_level

# Conversion rates of the current page (p1) and of the new page (p2).
p1, p2 = 0.13, 0.15

# Effect size between the two proportions, as computed by statsmodels.
effect_size = sms.proportion_effectsize(p1, p2)

# Desired statistical power.
power = 0.80

# Sample size: solve the power equation for the number of observations,
# then round up because a sample cannot contain a fraction of a user.
sample_n = math.ceil(
    sms.NormalIndPower().solve_power(
        effect_size=effect_size,
        alpha=significanse_level,
        power=power,
    )
)
sample_n

4720

In [10]:
# Data preparation: count the non-null `group` entries recorded for each user
# and keep only the users that appear more than once.
entries_per_user = df_raw.groupby('user_id')['group'].count().reset_index()
df_aux = entries_per_user[entries_per_user['group'] > 1]

# Drop every row that belongs to one of those repeated users.
is_repeated_user = df_raw['user_id'].isin(df_aux['user_id'])
df3 = df_raw[~is_repeated_user]

# Sampling: draw `sample_n` rows from each experiment group, using the same
# fixed seed for both draws so the samples are reproducible.
sampling_seed = 32

control_rows = df3.loc[df3['group'] == 'control']
df_control_sample = control_rows.sample(n=sample_n, random_state=sampling_seed)
print(f'Size of Control group: {len(df_control_sample)}')

treatment_rows = df3.loc[df3['group'] == 'treatment']
df_treatment_sample = treatment_rows.sample(n=sample_n, random_state=sampling_seed)
print(f'Size of Treatment group: {len(df_treatment_sample)}')

# Stack both samples into the single frame used by the hypothesis test.
df_ab = pd.concat([df_control_sample, df_treatment_sample])

# Conversion rate: number of rows flagged as converted (== 1) divided by the
# number of rows in the sample.
converted = (df_control_sample['converted'] == 1).sum()
conversion_rate_control = converted / df_control_sample.shape[0]
print(f'\nConversion Rate - Control Group: {conversion_rate_control}')

converted = (df_treatment_sample['converted'] == 1).sum()
conversion_rate_treatment = converted / df_treatment_sample.shape[0]
print(f'Conversion Rate - Treatment Group: {conversion_rate_treatment}')

# Hypothesis test: chi-square test of independence between experiment group
# and conversion outcome.
#
# Build the contingency table of observed frequencies, one row per group.
# `sum` of the 0/1 `converted` flag is the number of converters; `count` is
# the group size (converters + non-converters), NOT the number of
# non-converters, so the non-converted column has to be derived from it.
df_table = df_ab[['group', 'converted']].groupby('group').agg({'converted': ['sum', 'count']})
df_table.columns = ['converted', 'total']
df_table['non_converted'] = df_table['total'] - df_table['converted']
df_table = df_table[['converted', 'non_converted']]

chi_val, pval, dof, expected = chi2_contingency(df_table)

print(f'p-value: {pval:.2f}')

# Reject H0 only when the p-value falls below the chosen significance level.
if pval < significanse_level:
    print('Rejeita a hipóteses nula')
else:
    print('Falha em rejeitar a hipótese nula')


# Conclusão

# conversão de resultado para R$

Size of Control group: 4720
Size of Treatment group: 4720

Conversion Rate - Control Group: 0.11864406779661017
Conversion Rate - Treatment Group: 0.11970338983050847
p-value: 0.91
Falha em rejeitar a hipótese nula


## 3.0. Conversão da página em Faturamento

In [28]:
# Conversion rates used to translate the result into revenue (R$).
pagina_atual = 0.13  # current page
pagina_nova = 0.15   # new page

# buyers = average daily visitors * conversion rate of the page
# GMV (Gross Merchandise Value, i.e. revenue) = buyers * average ticket

In [14]:
# Work on a copy so the de-duplicated frame `df3` is left untouched.
df4 = df3.copy()

# Reduce each timestamp to its calendar day as a 'YYYY-MM-DD' string so rows
# can be grouped per day. The vectorised `.dt` accessor replaces a per-row
# Python lambda and tolerates missing timestamps instead of raising on them.
df4['timestamp'] = pd.to_datetime(df4['timestamp']).dt.strftime('%Y-%m-%d')

In [33]:
# Value (R$) applied to every purchase when estimating GMV.
ticket_medio = 4500

# Number of `user_id` entries recorded on each calendar day.
df5 = df4.groupby('timestamp')['user_id'].count().reset_index()

# For each scenario (current page, new page): daily purchases are the daily
# entries times the page's conversion rate, rounded up to whole purchases,
# and daily GMV is purchases times the ticket value.
for scenario, conversion_rate in (('current', pagina_atual), ('new', pagina_nova)):
    df5[f'{scenario}_purchases'] = np.ceil(df5['user_id'] * conversion_rate).astype(int)
    df5[f'{scenario}_GMV'] = df5[f'{scenario}_purchases'] * ticket_medio

# Current GMV over the whole period.
current_gmv = df5['current_GMV'].sum()
print(f'GMV On Period: {current_gmv}')

# Expected GMV over the whole period with the new page.
new_gmv = df5['new_GMV'].sum()
print(f'New GMV On Period: {new_gmv}')

# Absolute and relative (percentage) lift of the new page over the current one.
lift_abs = new_gmv - current_gmv
print(f'Abs lift: {lift_abs}')

lift = 100 * (new_gmv - current_gmv) / current_gmv
print(f'Expected Lift: {lift:.2f}%')

GMV On Period: 167760000
New GMV On Period: 193563000
Expected Lift: 15.38%
