In [1]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
import numpy as np

from statsmodels.stats.weightstats import ztest
from scipy.stats import ttest_ind


# A/B Test Challenge



---

#### What is an A/B Test? 

It is a decision making support & research methodology that allow you to measure an impact of a change in a product (e.g.: a digital product). For this challenge you will analyse the data resulting of an A/B test performed on a digital product where a new set of sponsored ads are included.


#### Measure of success

Metrics are need it to measure the success of your product. They are typically split in the following categories: 

- __Enganged based metrics:__ number of users, number of downloads, number of active users, user retention, etc.

- __Revenue and monetization metrics:__ ads and affiliate links, subscription-based, in-app purchases, etc.

- __Technical metrics:__ service level indicators (uptime of the app, downtime of the app, latency).



---

## Metrics understanding

In this part you must analyse the metrics involved in the test. We will focus in the following metrics:

- Activity level + Daily active users (DAU).

- Click-through rate (CTR)

### Activity level

In the following part you must perform every calculation you consider necessary in order to answer the following questions:

- How many activity levels you can find in the dataset (Activity level of zero means no activity).

- What is the amount of users for each activity level.

- How many activity levels do you have per day and how many records per each activity level.

At the end of this section you must provide your conclusions about the _activity level_ of the users.

__Dataset:__ `activity_pretest.csv`

In [2]:
# your-code

act_pretest = pd.read_csv('../../datasets/abtest/activity_pretest.csv')
act_pretest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1860000 entries, 0 to 1859999
Data columns (total 3 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   userid          object
 1   dt              object
 2   activity_level  int64 
dtypes: int64(1), object(2)
memory usage: 42.6+ MB


In [3]:
act_pretest.isnull().sum()

userid            0
dt                0
activity_level    0
dtype: int64

In [4]:
act_pretest

Unnamed: 0,userid,dt,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,0
...,...,...,...
1859995,200d65e6-b1ce-4a47-8c2b-946db5c5a3a0,2021-10-31,20
1859996,535dafe4-de7c-4b56-acf6-aa94f21653bc,2021-10-31,20
1859997,0428ca3c-e666-4ef4-8588-3a2af904a123,2021-10-31,20
1859998,a8cd1579-44d4-48b3-b3d6-47ae5197dbc6,2021-10-31,20


In [5]:
# activity levels
act_pretest['activity_level'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])

In [6]:
# users per activity level
act_pretest.groupby(['activity_level'])['userid'].count()

activity_level
0     909125
1      48732
2      49074
3      48659
4      48556
5      49227
6      48901
7      48339
8      48396
9      48820
10     48943
11     48832
12     48911
13     48534
14     48620
15     48599
16     48934
17     48395
18     48982
19     48901
20     24520
Name: userid, dtype: int64

In [7]:
# records per day
act_pretest_day = pd.DataFrame(act_pretest.groupby(['dt'])['userid'].count()).reset_index()
act_pretest_day

Unnamed: 0,dt,userid
0,2021-10-01,60000
1,2021-10-02,60000
2,2021-10-03,60000
3,2021-10-04,60000
4,2021-10-05,60000
5,2021-10-06,60000
6,2021-10-07,60000
7,2021-10-08,60000
8,2021-10-09,60000
9,2021-10-10,60000


In [8]:
# records per activity levels per day
act_pretest_day = pd.DataFrame(act_pretest.groupby(['dt', 'activity_level'])['userid'].count()).reset_index()
act_pretest_day

Unnamed: 0,dt,activity_level,userid
0,2021-10-01,0,29366
1,2021-10-01,1,1602
2,2021-10-01,2,1507
3,2021-10-01,3,1587
4,2021-10-01,4,1551
...,...,...,...
646,2021-10-31,16,1499
647,2021-10-31,17,1534
648,2021-10-31,18,1531
649,2021-10-31,19,1616


In [9]:
# records per activity levels
act_pretest_activity = pd.DataFrame(act_pretest.groupby(['activity_level', 'dt'])['userid'].count()).reset_index()
act_pretest_activity

Unnamed: 0,activity_level,dt,userid
0,0,2021-10-01,29366
1,0,2021-10-02,29225
2,0,2021-10-03,29215
3,0,2021-10-04,29401
4,0,2021-10-05,29412
...,...,...,...
646,20,2021-10-27,810
647,20,2021-10-28,800
648,20,2021-10-29,784
649,20,2021-10-30,780


In [10]:
act_pretest.groupby(['dt']).agg({'userid': 'count', 'activity_level': 'mean'}).reset_index()

Unnamed: 0,dt,userid,activity_level
0,2021-10-01,60000,5.241417
1,2021-10-02,60000,5.266517
2,2021-10-03,60000,5.2463
3,2021-10-04,60000,5.24695
4,2021-10-05,60000,5.233433
5,2021-10-06,60000,5.2181
6,2021-10-07,60000,5.232067
7,2021-10-08,60000,5.251633
8,2021-10-09,60000,5.287583
9,2021-10-10,60000,5.2422


### Daily active users (DAU)

![ab_test](./img/user_activity_ab_testinG.JPG)


The daily active users (DAU) refers to the amount of users that are active per day (activity level of zero means no activity). You must perform the calculation of this metric and provide your insights about it.

__Dataset:__ `activity_pretest.csv`

In [11]:
# your-code



In [12]:
# remove activity_level = 0 (no activity)
act_pretest_real = act_pretest.loc[act_pretest['activity_level'] != 0]
act_pretest_real

Unnamed: 0,userid,dt,activity_level
909125,428070b0-083e-4c0e-8444-47bf91e99fff,2021-10-01,1
909126,93370f9c-56ef-437f-99ff-cb7c092d08a7,2021-10-01,1
909127,0fb7120a-53cf-4a51-8b52-bf07b8659bd6,2021-10-01,1
909128,ce64a9d8-07d9-4dca-908d-5e1e4568003d,2021-10-01,1
909129,e08332f0-3a5c-4ed2-b957-87e464e89b97,2021-10-01,1
...,...,...,...
1859995,200d65e6-b1ce-4a47-8c2b-946db5c5a3a0,2021-10-31,20
1859996,535dafe4-de7c-4b56-acf6-aa94f21653bc,2021-10-31,20
1859997,0428ca3c-e666-4ef4-8588-3a2af904a123,2021-10-31,20
1859998,a8cd1579-44d4-48b3-b3d6-47ae5197dbc6,2021-10-31,20


In [13]:
act_pretest_real.groupby(['dt']).agg({'userid': 'count', 'activity_level': 'mean'}).reset_index()

# less users and higher mean activity_level


Unnamed: 0,dt,userid,activity_level
0,2021-10-01,30634,10.265881
1,2021-10-02,30775,10.267782
2,2021-10-03,30785,10.225045
3,2021-10-04,30599,10.288473
4,2021-10-05,30588,10.26566
5,2021-10-06,30639,10.218545
6,2021-10-07,30637,10.246565
7,2021-10-08,30600,10.29732
8,2021-10-09,30902,10.266488
9,2021-10-10,30581,10.28521


In [14]:
act_pretest_real.describe()

Unnamed: 0,activity_level
count,950875.0
mean,10.256362
std,5.635938
min,1.0
25%,5.0
50%,10.0
75%,15.0
max,20.0


In [15]:
mean_activity_level = act_pretest_real['activity_level'].mean()
mean_activity_level

10.256361772052058

### Click-through rate (CTR)

![ab_test](./img/ad_click_through_rate_ab_testing.JPG)

Click-through rate (CTR) refers to the percentage of clicks that the user perform from the total amount ads showed to that user during a certain day. You must perform the analysis of this metric (e.g.: average CTR per day) and provide your insights about it.

__Dataset:__ `ctr_pretest.csv`

In [16]:
# your-code


ctr_pretest = pd.read_csv('../../datasets/abtest/ctr_pretest.csv')
ctr_pretest.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950875 entries, 0 to 950874
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   userid  950875 non-null  object 
 1   dt      950875 non-null  object 
 2   ctr     950875 non-null  float64
dtypes: float64(1), object(2)
memory usage: 21.8+ MB


In [17]:
ctr_pretest.isnull().sum()

userid    0
dt        0
ctr       0
dtype: int64

In [18]:
ctr_pretest

Unnamed: 0,userid,dt,ctr
0,4b328144-df4b-47b1-a804-09834942dce0,2021-10-01,34.28
1,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,34.67
2,8028cccf-19c3-4c0e-b5b2-e707e15d2d83,2021-10-01,34.77
3,652b3c9c-5e29-4bf0-9373-924687b1567e,2021-10-01,35.42
4,45b57434-4666-4b57-9798-35489dc1092a,2021-10-01,35.04
...,...,...,...
950870,a09a3687-b71a-4a67-b1ef-9b05c9770c4c,2021-10-31,32.33
950871,c843a595-b94c-42e1-b2fe-ec096070681e,2021-10-31,30.09
950872,edcdf0c1-3d8f-47e8-b7dd-05505749eb69,2021-10-31,35.71
950873,76b7a9ae-98fa-4c77-869d-594a4ef7282d,2021-10-31,34.76


In [19]:
ctr_pretest_day = pd.DataFrame(ctr_pretest.groupby(['dt'])['ctr'].mean()).reset_index()
ctr_pretest_day

Unnamed: 0,dt,ctr
0,2021-10-01,32.993446
1,2021-10-02,32.991664
2,2021-10-03,32.995086
3,2021-10-04,32.992995
4,2021-10-05,33.004375
5,2021-10-06,33.018564
6,2021-10-07,32.9885
7,2021-10-08,32.998654
8,2021-10-09,33.005082
9,2021-10-10,33.007134


In [20]:
ctr_pretest.describe()

Unnamed: 0,ctr
count,950875.0
mean,33.000242
std,1.731677
min,30.0
25%,31.5
50%,33.0
75%,34.5
max,36.0


In [21]:
mean_ctr = ctr_pretest['ctr'].mean()
mean_ctr

33.00024155646148

---

## Pretest metrics 

In this section you will perform the analysis of the metrics using the dataset that includes the result for the test and control groups, but only for the pretest data (i.e.: prior to November 1st, 2021). You must provide insights about the metrics (__Activity level__, __DAU__ and __CTR__) and also perform an hyphotesis test in order to determine whether there is any statistical significant difference between the groups prior to the start of the experiment. You must try different approaches (i.e.: __z-test__ and __t-test__) and compare the results.


__Datasets:__ `activity_all.csv`, `ctr_all.csv`

In [22]:
# your-code

pre_act = pd.read_csv('../../datasets/abtest/activity_all.csv')
pre_act.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3660000 entries, 0 to 3659999
Data columns (total 4 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   userid          object
 1   dt              object
 2   groupid         int64 
 3   activity_level  int64 
dtypes: int64(2), object(2)
memory usage: 111.7+ MB


In [23]:
pre_act

Unnamed: 0,userid,dt,groupid,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,1,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,1,0
...,...,...,...,...
3659995,f0126b50-ad74-4480-9250-41b50a408932,2021-11-30,0,20
3659996,6ffe1efe-2e5d-427f-95ff-cc862c46c798,2021-11-30,1,20
3659997,f2073207-25dd-4127-a893-b70106d5ead7,2021-11-30,0,20
3659998,0416f2be-3ab8-481b-873c-3678b4705ecf,2021-11-30,1,20


In [24]:
pre_act['dt'] = pd.to_datetime(pre_act['dt'])
pre_act.info()
pre_act

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3660000 entries, 0 to 3659999
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   userid          object        
 1   dt              datetime64[ns]
 2   groupid         int64         
 3   activity_level  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 111.7+ MB


Unnamed: 0,userid,dt,groupid,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,1,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,1,0
...,...,...,...,...
3659995,f0126b50-ad74-4480-9250-41b50a408932,2021-11-30,0,20
3659996,6ffe1efe-2e5d-427f-95ff-cc862c46c798,2021-11-30,1,20
3659997,f2073207-25dd-4127-a893-b70106d5ead7,2021-11-30,0,20
3659998,0416f2be-3ab8-481b-873c-3678b4705ecf,2021-11-30,1,20


In [25]:
pre_act[pre_act['dt'] < '2021-11-01'].count()

userid            1860000
dt                1860000
groupid           1860000
activity_level    1860000
dtype: int64

In [26]:
pre_act[pre_act['dt'] >= '2021-11-01'].count()

userid            1800000
dt                1800000
groupid           1800000
activity_level    1800000
dtype: int64

In [27]:
# pre_act -> dates october
pre_act = pre_act[pre_act['dt'] < '2021-11-01']
pre_act

Unnamed: 0,userid,dt,groupid,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,1,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,1,0
...,...,...,...,...
3625439,200d65e6-b1ce-4a47-8c2b-946db5c5a3a0,2021-10-31,0,20
3625440,535dafe4-de7c-4b56-acf6-aa94f21653bc,2021-10-31,1,20
3625441,0428ca3c-e666-4ef4-8588-3a2af904a123,2021-10-31,1,20
3625442,a8cd1579-44d4-48b3-b3d6-47ae5197dbc6,2021-10-31,0,20


In [28]:
pre_act.groupby(['activity_level'])['userid'].count()

activity_level
0     909125
1      48732
2      49074
3      48659
4      48556
5      49227
6      48901
7      48339
8      48396
9      48820
10     48943
11     48832
12     48911
13     48534
14     48620
15     48599
16     48934
17     48395
18     48982
19     48901
20     24520
Name: userid, dtype: int64

In [29]:
# remove no activity (activity_level = 0)
print(len(pre_act))
pre_act = pre_act.loc[pre_act['activity_level'] != 0]
print(len(pre_act))

1860000
950875


In [30]:
pre_act.groupby(['activity_level'])['userid'].count()

activity_level
1     48732
2     49074
3     48659
4     48556
5     49227
6     48901
7     48339
8     48396
9     48820
10    48943
11    48832
12    48911
13    48534
14    48620
15    48599
16    48934
17    48395
18    48982
19    48901
20    24520
Name: userid, dtype: int64

In [31]:
pre_act.groupby(['groupid'])['userid'].count()

groupid
0    474947
1    475928
Name: userid, dtype: int64

In [32]:
# split group 0 and group 1
pre_act_0 = pre_act.loc[pre_act['groupid'] == 0]
print(len(pre_act_0))
pre_act_1 = pre_act.loc[pre_act['groupid'] == 1]
print(len(pre_act_1))

474947
475928


In [33]:
# check group 0 
print(pre_act_0['dt'].unique())
print(pre_act_0['groupid'].unique())
print(pre_act_0['activity_level'].unique())
pre_act_0.head(2)

['2021-10-01T00:00:00.000000000' '2021-10-02T00:00:00.000000000'
 '2021-10-03T00:00:00.000000000' '2021-10-04T00:00:00.000000000'
 '2021-10-05T00:00:00.000000000' '2021-10-06T00:00:00.000000000'
 '2021-10-07T00:00:00.000000000' '2021-10-08T00:00:00.000000000'
 '2021-10-09T00:00:00.000000000' '2021-10-10T00:00:00.000000000'
 '2021-10-11T00:00:00.000000000' '2021-10-12T00:00:00.000000000'
 '2021-10-13T00:00:00.000000000' '2021-10-14T00:00:00.000000000'
 '2021-10-15T00:00:00.000000000' '2021-10-16T00:00:00.000000000'
 '2021-10-17T00:00:00.000000000' '2021-10-18T00:00:00.000000000'
 '2021-10-19T00:00:00.000000000' '2021-10-20T00:00:00.000000000'
 '2021-10-21T00:00:00.000000000' '2021-10-22T00:00:00.000000000'
 '2021-10-23T00:00:00.000000000' '2021-10-24T00:00:00.000000000'
 '2021-10-25T00:00:00.000000000' '2021-10-26T00:00:00.000000000'
 '2021-10-27T00:00:00.000000000' '2021-10-28T00:00:00.000000000'
 '2021-10-29T00:00:00.000000000' '2021-10-30T00:00:00.000000000'
 '2021-10-31T00:00:00.000

Unnamed: 0,userid,dt,groupid,activity_level
1356595,ce64a9d8-07d9-4dca-908d-5e1e4568003d,2021-10-01,0,1
1356597,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,0,1


In [34]:
# check group 1 
print(pre_act_1['dt'].unique())
print(pre_act_1['groupid'].unique())
print(pre_act_1['activity_level'].unique())
pre_act_1.head(2)

['2021-10-01T00:00:00.000000000' '2021-10-02T00:00:00.000000000'
 '2021-10-03T00:00:00.000000000' '2021-10-04T00:00:00.000000000'
 '2021-10-05T00:00:00.000000000' '2021-10-06T00:00:00.000000000'
 '2021-10-07T00:00:00.000000000' '2021-10-08T00:00:00.000000000'
 '2021-10-09T00:00:00.000000000' '2021-10-10T00:00:00.000000000'
 '2021-10-11T00:00:00.000000000' '2021-10-12T00:00:00.000000000'
 '2021-10-13T00:00:00.000000000' '2021-10-14T00:00:00.000000000'
 '2021-10-15T00:00:00.000000000' '2021-10-16T00:00:00.000000000'
 '2021-10-17T00:00:00.000000000' '2021-10-18T00:00:00.000000000'
 '2021-10-19T00:00:00.000000000' '2021-10-20T00:00:00.000000000'
 '2021-10-21T00:00:00.000000000' '2021-10-22T00:00:00.000000000'
 '2021-10-23T00:00:00.000000000' '2021-10-24T00:00:00.000000000'
 '2021-10-25T00:00:00.000000000' '2021-10-26T00:00:00.000000000'
 '2021-10-27T00:00:00.000000000' '2021-10-28T00:00:00.000000000'
 '2021-10-29T00:00:00.000000000' '2021-10-30T00:00:00.000000000'
 '2021-10-31T00:00:00.000

Unnamed: 0,userid,dt,groupid,activity_level
1356592,428070b0-083e-4c0e-8444-47bf91e99fff,2021-10-01,1,1
1356593,93370f9c-56ef-437f-99ff-cb7c092d08a7,2021-10-01,1,1


In [35]:
# ACTIVITY LEVEL ----> mean per every record
# pretest activity: group 0 and group 1
print('mean group 0: ', pre_act_0['activity_level'].mean())
print('mean group 1: ', pre_act_1['activity_level'].mean())

# z-test
z_score, p_value = ztest(pre_act_0['activity_level'], pre_act_1['activity_level'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples have a similar behavior


mean group 0:  10.254769479541928
mean group 1:  10.257950782471298
z_score: -0.27521370941856227 
p-value: 0.7831520549245693
accept H0


In [36]:
pre_act_0 = pre_act_0.groupby(['dt']).agg({'userid': 'count', 'activity_level': 'mean'}).reset_index()
pre_act_0

Unnamed: 0,dt,userid,activity_level
0,2021-10-01,15337,10.236422
1,2021-10-02,15354,10.252638
2,2021-10-03,15423,10.226545
3,2021-10-04,15211,10.263494
4,2021-10-05,15126,10.252149
5,2021-10-06,15335,10.202152
6,2021-10-07,15346,10.257005
7,2021-10-08,15357,10.323631
8,2021-10-09,15371,10.177347
9,2021-10-10,15277,10.27872


In [37]:
pre_act_1 = pre_act_1.groupby(['dt']).agg({'userid': 'count', 'activity_level': 'mean'}).reset_index()
pre_act_1

Unnamed: 0,dt,userid,activity_level
0,2021-10-01,15297,10.295417
1,2021-10-02,15421,10.282861
2,2021-10-03,15362,10.223539
3,2021-10-04,15388,10.313166
4,2021-10-05,15462,10.278877
5,2021-10-06,15304,10.234971
6,2021-10-07,15291,10.236087
7,2021-10-08,15243,10.270813
8,2021-10-09,15531,10.35471
9,2021-10-10,15304,10.291688


In [38]:
# ACTIVITY LEVEL ----> mean per every day
# pretest activity: group 0 and group 1 
print('mean group 0: ', pre_act_0['activity_level'].mean())
print('mean group 1: ', pre_act_1['activity_level'].mean())

# z-test
z_score, p_value = ztest(pre_act_0['activity_level'], pre_act_1['activity_level'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples have a similar behavior


mean group 0:  10.254766009835997
mean group 1:  10.257895924766082
z_score: -0.21548460386770213 
p-value: 0.829389514461113
accept H0


In [39]:
# NUMBER USER
# pretest userid: group 0 and group 1
print('mean group 0: ', pre_act_0['userid'].mean())
print('mean group 1: ', pre_act_1['userid'].mean())

# z-test
z_score, p_value = ztest(pre_act_0['userid'], pre_act_1['userid'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples have a similar behavior


mean group 0:  15320.870967741936
mean group 1:  15352.516129032258
z_score: -1.4121065242323187 
p-value: 0.15791859802311015
accept H0


In [40]:
pre_ctr = pd.read_csv('../../datasets/abtest/ctr_all.csv')
pre_ctr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2303408 entries, 0 to 2303407
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userid   object 
 1   dt       object 
 2   groupid  int64  
 3   ctr      float64
dtypes: float64(1), int64(1), object(2)
memory usage: 70.3+ MB


In [41]:
pre_ctr

Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46
2,aa336050-934e-453f-a5b0-dd881fcd114e,2021-11-13,0,34.25
3,8df767f4-a10f-4322-a722-676b7e02b372,2021-11-13,0,34.92
4,a74762ed-4da0-42ab-91d2-40d7e808dfe9,2021-11-13,0,34.95
...,...,...,...,...
2303403,932e0348-ea2d-4b98-8782-aa84420f0796,2021-11-12,1,37.27
2303404,6775a825-6d3d-4dc3-9335-cad061736752,2021-11-12,1,39.14
2303405,a7b55365-21f1-4123-b2b5-485a8c7b98da,2021-11-12,1,40.05
2303406,a6fa937c-6f40-4f04-b15b-f1de09e179db,2021-11-12,1,38.14


In [42]:
pre_ctr['dt'] = pd.to_datetime(pre_ctr['dt'])
pre_ctr.info()
pre_ctr

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2303408 entries, 0 to 2303407
Data columns (total 4 columns):
 #   Column   Dtype         
---  ------   -----         
 0   userid   object        
 1   dt       datetime64[ns]
 2   groupid  int64         
 3   ctr      float64       
dtypes: datetime64[ns](1), float64(1), int64(1), object(1)
memory usage: 70.3+ MB


Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46
2,aa336050-934e-453f-a5b0-dd881fcd114e,2021-11-13,0,34.25
3,8df767f4-a10f-4322-a722-676b7e02b372,2021-11-13,0,34.92
4,a74762ed-4da0-42ab-91d2-40d7e808dfe9,2021-11-13,0,34.95
...,...,...,...,...
2303403,932e0348-ea2d-4b98-8782-aa84420f0796,2021-11-12,1,37.27
2303404,6775a825-6d3d-4dc3-9335-cad061736752,2021-11-12,1,39.14
2303405,a7b55365-21f1-4123-b2b5-485a8c7b98da,2021-11-12,1,40.05
2303406,a6fa937c-6f40-4f04-b15b-f1de09e179db,2021-11-12,1,38.14


In [43]:
pre_ctr[pre_ctr['dt'] < '2021-11-01'].count()

userid     950875
dt         950875
groupid    950875
ctr        950875
dtype: int64

In [44]:
pre_ctr[pre_ctr['dt'] >= '2021-11-01'].count()

userid     1352533
dt         1352533
groupid    1352533
ctr        1352533
dtype: int64

In [45]:
# pre_ctr -> dates october
pre_ctr = pre_ctr[pre_ctr['dt'] < '2021-11-01']
pre_ctr

Unnamed: 0,userid,dt,groupid,ctr
808703,4b328144-df4b-47b1-a804-09834942dce0,2021-10-01,0,34.28
808704,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,0,34.67
808705,8028cccf-19c3-4c0e-b5b2-e707e15d2d83,2021-10-01,0,34.77
808706,652b3c9c-5e29-4bf0-9373-924687b1567e,2021-10-01,0,35.42
808707,45b57434-4666-4b57-9798-35489dc1092a,2021-10-01,0,35.04
...,...,...,...,...
1759573,a09a3687-b71a-4a67-b1ef-9b05c9770c4c,2021-10-31,1,32.33
1759574,c843a595-b94c-42e1-b2fe-ec096070681e,2021-10-31,1,30.09
1759575,edcdf0c1-3d8f-47e8-b7dd-05505749eb69,2021-10-31,1,35.71
1759576,76b7a9ae-98fa-4c77-869d-594a4ef7282d,2021-10-31,1,34.76


In [46]:
pre_ctr.groupby(['groupid'])['userid'].count()

groupid
0    474947
1    475928
Name: userid, dtype: int64

In [47]:
# split group 0 and group 1
pre_ctr_0 = pre_ctr.loc[pre_ctr['groupid'] == 0]
print(len(pre_ctr_0))
pre_ctr_1 = pre_ctr.loc[pre_ctr['groupid'] == 1]
print(len(pre_ctr_1))

474947
475928


In [48]:
# check group 0
print(pre_ctr_0['dt'].unique())
print(pre_ctr_0['groupid'].unique())
pre_ctr_0.head(2)

['2021-10-01T00:00:00.000000000' '2021-10-02T00:00:00.000000000'
 '2021-10-03T00:00:00.000000000' '2021-10-04T00:00:00.000000000'
 '2021-10-05T00:00:00.000000000' '2021-10-06T00:00:00.000000000'
 '2021-10-07T00:00:00.000000000' '2021-10-08T00:00:00.000000000'
 '2021-10-09T00:00:00.000000000' '2021-10-10T00:00:00.000000000'
 '2021-10-11T00:00:00.000000000' '2021-10-12T00:00:00.000000000'
 '2021-10-13T00:00:00.000000000' '2021-10-14T00:00:00.000000000'
 '2021-10-15T00:00:00.000000000' '2021-10-16T00:00:00.000000000'
 '2021-10-17T00:00:00.000000000' '2021-10-18T00:00:00.000000000'
 '2021-10-19T00:00:00.000000000' '2021-10-20T00:00:00.000000000'
 '2021-10-21T00:00:00.000000000' '2021-10-22T00:00:00.000000000'
 '2021-10-23T00:00:00.000000000' '2021-10-24T00:00:00.000000000'
 '2021-10-25T00:00:00.000000000' '2021-10-26T00:00:00.000000000'
 '2021-10-27T00:00:00.000000000' '2021-10-28T00:00:00.000000000'
 '2021-10-29T00:00:00.000000000' '2021-10-30T00:00:00.000000000'
 '2021-10-31T00:00:00.000

Unnamed: 0,userid,dt,groupid,ctr
808703,4b328144-df4b-47b1-a804-09834942dce0,2021-10-01,0,34.28
808704,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,0,34.67


In [49]:
# check group 1
print(pre_ctr_1['dt'].unique())
print(pre_ctr_1['groupid'].unique())
pre_ctr_1.head(2)

['2021-10-01T00:00:00.000000000' '2021-10-02T00:00:00.000000000'
 '2021-10-03T00:00:00.000000000' '2021-10-04T00:00:00.000000000'
 '2021-10-05T00:00:00.000000000' '2021-10-06T00:00:00.000000000'
 '2021-10-07T00:00:00.000000000' '2021-10-08T00:00:00.000000000'
 '2021-10-09T00:00:00.000000000' '2021-10-10T00:00:00.000000000'
 '2021-10-11T00:00:00.000000000' '2021-10-12T00:00:00.000000000'
 '2021-10-13T00:00:00.000000000' '2021-10-14T00:00:00.000000000'
 '2021-10-15T00:00:00.000000000' '2021-10-16T00:00:00.000000000'
 '2021-10-17T00:00:00.000000000' '2021-10-18T00:00:00.000000000'
 '2021-10-19T00:00:00.000000000' '2021-10-20T00:00:00.000000000'
 '2021-10-21T00:00:00.000000000' '2021-10-22T00:00:00.000000000'
 '2021-10-23T00:00:00.000000000' '2021-10-24T00:00:00.000000000'
 '2021-10-25T00:00:00.000000000' '2021-10-26T00:00:00.000000000'
 '2021-10-27T00:00:00.000000000' '2021-10-28T00:00:00.000000000'
 '2021-10-29T00:00:00.000000000' '2021-10-30T00:00:00.000000000'
 '2021-10-31T00:00:00.000

Unnamed: 0,userid,dt,groupid,ctr
824040,381e40b0-5529-4bc6-a3f6-6a687c7cde66,2021-10-01,1,31.27
824041,1797453f-f558-42f6-9a2f-55b95dd37e71,2021-10-01,1,32.18


In [50]:
# CLICKS
# pretest ctr: group 0 and group 1
print('mean group 0: ', pre_ctr_0['ctr'].mean())
print('mean group 1: ', pre_ctr_1['ctr'].mean())

# z-test
z_score, p_value = ztest(pre_ctr_0['ctr'], pre_ctr_1['ctr'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples have a similar behavior


mean group 0:  33.0009127755312
mean group 1:  32.99957172093207
z_score: 0.3775817380268587 
p-value: 0.7057413330705573
accept H0


---

## Experiment metrics 

In this section you must perform the same analysis as in the previous section, but using the data generated during the experiment (i.e.: after November 1st, 2021). You must provide insights about the metrics (__Activity level__, __DAU__ and __CTR__) and also perform an hyphotesis test in order to determine whether there is any statistical significant difference between the groups during the experiment. You must try different approaches (i.e.: __z-test__ and __t-test__) and compare the results.


__Datasets:__ `activity_all.csv`, `ctr_all.csv`

In [51]:
# your-code

exp_act = pd.read_csv('../../datasets/abtest/activity_all.csv')
exp_act['dt'] = pd.to_datetime(exp_act['dt'])

# exp_act -> dates november
exp_act = exp_act[exp_act['dt'] >='2021-11-01']
exp_act

# remove no activity (activity_level = 0)
print(len(exp_act))
exp_act = exp_act.loc[exp_act['activity_level'] != 0]
print(len(exp_act))


1800000
1352533


In [52]:
exp_act.groupby(['activity_level'])['userid'].count()

activity_level
1     69917
2     69979
3     69428
4     69336
5     68957
6     69198
7     69020
8     69217
9     69652
10    68726
11    69414
12    69638
13    69327
14    69075
15    69549
16    69364
17    69425
18    69323
19    69432
20    34556
Name: userid, dtype: int64

In [53]:
exp_act.groupby(['groupid'])['userid'].count()

groupid
0    473460
1    879073
Name: userid, dtype: int64

In [54]:
# split group 0 and group 1
exp_act_0 = exp_act.loc[exp_act['groupid'] == 0]
print(len(exp_act_0))
exp_act_1 = exp_act.loc[exp_act['groupid'] == 1]
print(len(exp_act_1))

473460
879073


In [55]:
# check group 0
print(exp_act_0['dt'].unique())
print(exp_act_0['groupid'].unique())
print(exp_act_0['activity_level'].unique())
exp_act_0.head(2)


['2021-11-01T00:00:00.000000000' '2021-11-02T00:00:00.000000000'
 '2021-11-03T00:00:00.000000000' '2021-11-04T00:00:00.000000000'
 '2021-11-05T00:00:00.000000000' '2021-11-06T00:00:00.000000000'
 '2021-11-07T00:00:00.000000000' '2021-11-08T00:00:00.000000000'
 '2021-11-09T00:00:00.000000000' '2021-11-10T00:00:00.000000000'
 '2021-11-11T00:00:00.000000000' '2021-11-12T00:00:00.000000000'
 '2021-11-13T00:00:00.000000000' '2021-11-14T00:00:00.000000000'
 '2021-11-15T00:00:00.000000000' '2021-11-16T00:00:00.000000000'
 '2021-11-17T00:00:00.000000000' '2021-11-18T00:00:00.000000000'
 '2021-11-19T00:00:00.000000000' '2021-11-20T00:00:00.000000000'
 '2021-11-21T00:00:00.000000000' '2021-11-22T00:00:00.000000000'
 '2021-11-23T00:00:00.000000000' '2021-11-24T00:00:00.000000000'
 '2021-11-25T00:00:00.000000000' '2021-11-26T00:00:00.000000000'
 '2021-11-27T00:00:00.000000000' '2021-11-28T00:00:00.000000000'
 '2021-11-29T00:00:00.000000000' '2021-11-30T00:00:00.000000000']
[0]
[ 1  2  3  4  5  6  

Unnamed: 0,userid,dt,groupid,activity_level
1405325,27f9ec3c-37bf-459a-b94b-f2aff84cd96f,2021-11-01,0,1
1405327,c34e51cf-4b66-420f-94d0-2a0397b29d83,2021-11-01,0,1


In [56]:
# check group 1
print(exp_act_1['dt'].unique())
print(exp_act_1['groupid'].unique())
print(exp_act_1['activity_level'].unique())
exp_act_1.head(2)


['2021-11-01T00:00:00.000000000' '2021-11-02T00:00:00.000000000'
 '2021-11-03T00:00:00.000000000' '2021-11-04T00:00:00.000000000'
 '2021-11-05T00:00:00.000000000' '2021-11-06T00:00:00.000000000'
 '2021-11-07T00:00:00.000000000' '2021-11-08T00:00:00.000000000'
 '2021-11-09T00:00:00.000000000' '2021-11-10T00:00:00.000000000'
 '2021-11-11T00:00:00.000000000' '2021-11-12T00:00:00.000000000'
 '2021-11-13T00:00:00.000000000' '2021-11-14T00:00:00.000000000'
 '2021-11-15T00:00:00.000000000' '2021-11-16T00:00:00.000000000'
 '2021-11-17T00:00:00.000000000' '2021-11-18T00:00:00.000000000'
 '2021-11-19T00:00:00.000000000' '2021-11-20T00:00:00.000000000'
 '2021-11-21T00:00:00.000000000' '2021-11-22T00:00:00.000000000'
 '2021-11-23T00:00:00.000000000' '2021-11-24T00:00:00.000000000'
 '2021-11-25T00:00:00.000000000' '2021-11-26T00:00:00.000000000'
 '2021-11-27T00:00:00.000000000' '2021-11-28T00:00:00.000000000'
 '2021-11-29T00:00:00.000000000' '2021-11-30T00:00:00.000000000']
[1]
[ 1  2  3  4  5  6  

Unnamed: 0,userid,dt,groupid,activity_level
1405324,37e721ba-4b26-4196-abd1-2435da67d619,2021-11-01,1,1
1405326,26162641-e802-4f79-b2ec-6b79845aad89,2021-11-01,1,1


In [57]:
# ACTIVITY LEVEL ----> mean per every record
# experiment activity: group 0 and group 1
print('mean group 0: ', exp_act_0['activity_level'].mean())
print('mean group 1: ', exp_act_1['activity_level'].mean())

# z-test
z_score, p_value = ztest(exp_act_0['activity_level'], exp_act_1['activity_level'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples have a similar behavior


mean group 0:  10.252289528154437
mean group 1:  10.250989394509899
z_score: 0.12793424967290937 
p-value: 0.8982010064247459
accept H0


In [58]:
exp_act_0 = exp_act_0.groupby(['dt']).agg({'userid': 'count', 'activity_level': 'mean'}).reset_index()
exp_act_0

Unnamed: 0,dt,userid,activity_level
0,2021-11-01,15989,10.235349
1,2021-11-02,16024,10.27309
2,2021-11-03,16049,10.314786
3,2021-11-04,16040,10.299314
4,2021-11-05,16045,10.300717
5,2021-11-06,15991,10.125883
6,2021-11-07,16133,10.199529
7,2021-11-08,16119,10.252621
8,2021-11-09,15953,10.281577
9,2021-11-10,15990,10.253471


In [59]:
exp_act_1 = exp_act_1.groupby(['dt']).agg({'userid': 'count', 'activity_level': 'mean'}).reset_index()
exp_act_1

Unnamed: 0,dt,userid,activity_level
0,2021-11-01,29318,10.305171
1,2021-11-02,29289,10.229199
2,2021-11-03,29306,10.223299
3,2021-11-04,29267,10.292309
4,2021-11-05,29336,10.222048
5,2021-11-06,29306,10.252917
6,2021-11-07,29255,10.217194
7,2021-11-08,29263,10.218467
8,2021-11-09,29286,10.29608
9,2021-11-10,29340,10.232072


In [60]:
# ACTIVITY LEVEL ----> mean per every day
# experiment test activity: group 0 and group 1 
print('mean group 0: ', exp_act_0['activity_level'].mean())
print('mean group 1: ', exp_act_1['activity_level'].mean())

# z-test
z_score, p_value = ztest(exp_act_0['activity_level'], exp_act_1['activity_level'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples have a similar behavior


mean group 0:  10.252282473061717
mean group 1:  10.250992526291954
z_score: 0.13021116078508205 
p-value: 0.8963993647290683
accept H0


In [61]:
# t-test
t_score, p_value_t = ttest_ind(exp_act_0['activity_level'], exp_act_1['activity_level'], equal_var=False)

print(f'z_score: {t_score}', f'\np-value: {p_value_t}')

if 0.05 < p_value_t:
    print('accept H0')
else:
    print('rejected H0')


z_score: 0.13021116078508205 
p-value: 0.8969256877877079
accept H0


In [62]:
# NUMBER USER
# experiment test userid: group 0 and group 1
print('mean group 0: ', exp_act_0['userid'].mean())
print('mean group 1: ', exp_act_1['userid'].mean())

# z-test
z_score, p_value = ztest(exp_act_0['userid'], exp_act_1['userid'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples haven't a similar behavior, changes affect to number of users


mean group 0:  15782.0
mean group 1:  29302.433333333334
z_score: -198.89904948926164 
p-value: 0.0
rejected H0


In [63]:
z_score, p_value = ztest(exp_act_0['userid'], exp_act_1['userid'], value=0, alternative='larger')

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# with parameter alternative='larger', you can be sure control group is greater than normal group


z_score: -198.89904948926164 
p-value: 1.0
accept H0


In [64]:
# t-test
t_score, p_value_t = ttest_ind(exp_act_0['userid'], exp_act_1['userid'], equal_var=False)


print(f'z_score: {t_score}', f'\np-value: {p_value_t}')

if 0.05 < p_value_t:
    print('accept H0')
else:
    print('rejected H0')


z_score: -198.89904948926164 
p-value: 1.5078204932597328e-47
rejected H0


In [65]:
# t-test
t_score, p_value_t = ttest_ind(exp_act_0['userid'], exp_act_1['userid'], equal_var=False, alternative='greater')


print(f'z_score: {t_score}', f'\np-value: {p_value_t}')

if 0.05 < p_value_t:
    print('accept H0')
else:
    print('rejected H0')


z_score: -198.89904948926164 
p-value: 1.0
accept H0


In [66]:
exp_ctr = pd.read_csv('../../datasets/abtest/ctr_all.csv')
exp_ctr['dt'] = pd.to_datetime(exp_ctr['dt'])

# exp_ctr -> dates november
exp_ctr = exp_ctr[exp_ctr['dt'] >= '2021-11-01']
exp_ctr


Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46
2,aa336050-934e-453f-a5b0-dd881fcd114e,2021-11-13,0,34.25
3,8df767f4-a10f-4322-a722-676b7e02b372,2021-11-13,0,34.92
4,a74762ed-4da0-42ab-91d2-40d7e808dfe9,2021-11-13,0,34.95
...,...,...,...,...
2303403,932e0348-ea2d-4b98-8782-aa84420f0796,2021-11-12,1,37.27
2303404,6775a825-6d3d-4dc3-9335-cad061736752,2021-11-12,1,39.14
2303405,a7b55365-21f1-4123-b2b5-485a8c7b98da,2021-11-12,1,40.05
2303406,a6fa937c-6f40-4f04-b15b-f1de09e179db,2021-11-12,1,38.14


In [67]:
exp_ctr.groupby(['groupid'])['userid'].count()

groupid
0    473460
1    879073
Name: userid, dtype: int64

In [68]:
# split group 0 and group 1
exp_ctr_0 = exp_ctr.loc[exp_ctr['groupid'] == 0]
print(len(exp_ctr_0))
exp_ctr_1 = exp_ctr.loc[exp_ctr['groupid'] == 1]
print(len(exp_ctr_1))

473460
879073


In [69]:
# check group 0
print(exp_ctr_0['dt'].unique())
print(exp_ctr_0['groupid'].unique())
exp_ctr_0.head(2)

['2021-11-13T00:00:00.000000000' '2021-11-14T00:00:00.000000000'
 '2021-11-15T00:00:00.000000000' '2021-11-16T00:00:00.000000000'
 '2021-11-17T00:00:00.000000000' '2021-11-18T00:00:00.000000000'
 '2021-11-19T00:00:00.000000000' '2021-11-20T00:00:00.000000000'
 '2021-11-21T00:00:00.000000000' '2021-11-22T00:00:00.000000000'
 '2021-11-23T00:00:00.000000000' '2021-11-24T00:00:00.000000000'
 '2021-11-25T00:00:00.000000000' '2021-11-26T00:00:00.000000000'
 '2021-11-27T00:00:00.000000000' '2021-11-28T00:00:00.000000000'
 '2021-11-29T00:00:00.000000000' '2021-11-30T00:00:00.000000000'
 '2021-11-01T00:00:00.000000000' '2021-11-02T00:00:00.000000000'
 '2021-11-03T00:00:00.000000000' '2021-11-04T00:00:00.000000000'
 '2021-11-05T00:00:00.000000000' '2021-11-06T00:00:00.000000000'
 '2021-11-07T00:00:00.000000000' '2021-11-08T00:00:00.000000000'
 '2021-11-09T00:00:00.000000000' '2021-11-10T00:00:00.000000000'
 '2021-11-11T00:00:00.000000000' '2021-11-12T00:00:00.000000000']
[0]


Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46


In [70]:
# check group 1
print(exp_ctr_1['dt'].unique())
print(exp_ctr_1['groupid'].unique())
exp_ctr_1.head(2)

['2021-11-13T00:00:00.000000000' '2021-11-14T00:00:00.000000000'
 '2021-11-15T00:00:00.000000000' '2021-11-16T00:00:00.000000000'
 '2021-11-17T00:00:00.000000000' '2021-11-18T00:00:00.000000000'
 '2021-11-19T00:00:00.000000000' '2021-11-20T00:00:00.000000000'
 '2021-11-21T00:00:00.000000000' '2021-11-22T00:00:00.000000000'
 '2021-11-23T00:00:00.000000000' '2021-11-24T00:00:00.000000000'
 '2021-11-25T00:00:00.000000000' '2021-11-26T00:00:00.000000000'
 '2021-11-27T00:00:00.000000000' '2021-11-28T00:00:00.000000000'
 '2021-11-29T00:00:00.000000000' '2021-11-30T00:00:00.000000000'
 '2021-11-01T00:00:00.000000000' '2021-11-02T00:00:00.000000000'
 '2021-11-03T00:00:00.000000000' '2021-11-04T00:00:00.000000000'
 '2021-11-05T00:00:00.000000000' '2021-11-06T00:00:00.000000000'
 '2021-11-07T00:00:00.000000000' '2021-11-08T00:00:00.000000000'
 '2021-11-09T00:00:00.000000000' '2021-11-10T00:00:00.000000000'
 '2021-11-11T00:00:00.000000000' '2021-11-12T00:00:00.000000000']
[1]


Unnamed: 0,userid,dt,groupid,ctr
15973,cd5df711-42f7-4684-9ae8-f6a72383bb28,2021-11-13,1,40.39
15974,fe630199-265b-4542-a103-a74d66abeb22,2021-11-13,1,37.7


In [71]:
# CLICKS
# experiment test ctr: group 0 and group 1
print('mean group 0: ', exp_ctr_0['ctr'].mean())
print('mean group 1: ', exp_ctr_1['ctr'].mean())

# z-test
z_score, p_value = ztest(exp_ctr_0['ctr'], exp_ctr_1['ctr'], value=0)

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# two samples haven't a similar behavior, changes affect to clicks


mean group 0:  32.99697756938155
mean group 1:  37.99695912626155
z_score: -1600.7913068017688 
p-value: 0.0
rejected H0


In [72]:
z_score, p_value = ztest(exp_ctr_0['ctr'], exp_ctr_1['ctr'], value=0, alternative='larger')

print(f'z_score: {z_score}', f'\np-value: {p_value}')

if 0.05 < p_value:
    print('accept H0')
else:
    print('rejected H0')

# with parameter alternative='larger', you can be sure control group is greater than normal group


z_score: -1600.7913068017688 
p-value: 1.0
accept H0


In [73]:
# t-test
t_score, p_value_t = ttest_ind(exp_ctr_0['ctr'], exp_ctr_1['ctr'], equal_var=False)


print(f'z_score: {t_score}', f'\np-value: {p_value_t}')

if 0.05 < p_value_t:
    print('accept H0')
else:
    print('rejected H0')


z_score: -1600.5618238144957 
p-value: 0.0
rejected H0


In [74]:
# t-test
t_score, p_value_t = ttest_ind(exp_ctr_0['ctr'], exp_ctr_1['ctr'], equal_var=False, alternative='greater')


print(f'z_score: {t_score}', f'\np-value: {p_value_t}')

if 0.05 < p_value_t:
    print('accept H0')
else:
    print('rejected H0')


z_score: -1600.5618238144957 
p-value: 1.0
accept H0


---

## Conclusions

Please provide your conclusions after the analyses and your recommendation whether we may or may not implement the changes in the digital product.

In [None]:
# your-conclusions

'''

Pretest:

- Both groups have a similar behavior.
- Accept all H0 (activity, DAU and clicks).
- It's ok to apply the experiment to both samples. No difference is detected just before applying change.


Experiment:

- About activity, is accepted H0. Both samples have similar behavior before and after change.
- About number users and clicks, is rejected H0. Changes affect.
- Using alternative parameter in function ztest, we can be sure that behavior in group 1 is better than group 0.

After analyses results and set conclusions, it's recommended to implement the change.

'''


---