In [1]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.disable_max_rows()
from datetime import datetime
from scipy.stats import ttest_ind

## Assignments 

In [2]:
# Read the assignment log from disk and preview its first rows.
assignments_path = "Assignments.csv"
assignments_df = pd.read_csv(assignments_path)
assignments_df.head()

Unnamed: 0,userid,ts,groupid
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0


We need to process the timestamp `ts` column a bit to extract the calendar date:

In [4]:
# Sanity-check the timestamp format on the first row before converting the
# whole column. `.iloc[0]` selects by position, so the check does not depend
# on the frame's index containing the label 0.
first_ts = assignments_df['ts'].iloc[0]
print(datetime.strptime(first_ts, '%Y-%m-%dT%H:%M:%SZ').strftime("%Y-%m-%d"))

2021-11-02


In [9]:
# Derive the calendar date of each assignment as a 'YYYY-MM-DD' string.
# The whole column is parsed in one vectorized call instead of running a
# Python-level strptime/strftime pair per row.
assignments_df['dt'] = (
    pd.to_datetime(assignments_df['ts'], format='%Y-%m-%dT%H:%M:%SZ')
    .dt.strftime('%Y-%m-%d')
)
assignments_df.head()

Unnamed: 0,userid,ts,groupid,dt
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0,2021-11-02
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0,2021-11-13
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0,2021-11-20
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0,2021-11-20
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0,2021-11-04


In [11]:
# Summary statistics of the numeric columns of the assignments table.
assignments_df.describe()

Unnamed: 0,groupid
count,60000.0
mean,0.500817
std,0.500003
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [12]:
# Non-null row count of every column, per experiment group.
assignments_by_group = assignments_df.groupby(['groupid'])
assignments_by_group.count()

Unnamed: 0_level_0,userid,ts,dt
groupid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,29951,29951,29951
1,30049,30049,30049


From the last two outputs we can see that we have a total of 60000 users and that they are split almost evenly between the two groups (29951 vs. 30049).

Let's look at the data on a daily basis:

In [13]:
# Number of assignments per group and calendar day, with the group keys
# turned back into regular columns.
assignments_by_group_day = assignments_df.groupby(['groupid', 'dt'])
assignment_count = assignments_by_group_day.count().reset_index()

In [14]:
# Preview the daily per-group assignment counts.
assignment_count.head()

Unnamed: 0,groupid,dt,userid,ts
0,0,2021-11-01,1497,1497
1,0,2021-11-02,1467,1467
2,0,2021-11-03,1532,1532
3,0,2021-11-04,1509,1509
4,0,2021-11-05,1503,1503


In [16]:
# Line chart of the daily `userid` count, one line per group.
assignment_lines = alt.Chart(assignment_count).mark_line(size=3)
assignment_lines.encode(
    x=alt.X('dt'),
    y=alt.Y('userid'),
    color='groupid:O',
    tooltip=['userid'],
).properties(width=600, height=400)

It seems that the users were assigned uniformly throughout the whole period of the test.

## Pre-test metrics

### User activity

In [17]:
# Read the per-user daily activity records and preview the first rows.
activity_path = "Activity_all.csv"
activity_df = pd.read_csv(activity_path)
activity_df.head()

Unnamed: 0,userid,dt,groupid,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,1,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,1,0


Each row gives, for one user and one date (`dt`), the group the user was assigned to (`groupid`) and the user's activity level on that date.

Let's now look at the aggregated data by group and date.

In [18]:
# Distribution summary of the activity level for every (group, day) pair.
activity_by_group_day = activity_df.groupby(['groupid', 'dt'])
activity_by_group_day.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
groupid,dt,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,2021-10-01,29951.0,5.241762,6.516640,0.0,0.0,1.0,10.0,20.0
0,2021-10-02,29951.0,5.255885,6.509838,0.0,0.0,1.0,10.0,20.0
0,2021-10-03,29951.0,5.266068,6.511458,0.0,0.0,1.0,10.0,20.0
0,2021-10-04,29951.0,5.212447,6.511711,0.0,0.0,1.0,10.0,20.0
0,2021-10-05,29951.0,5.177590,6.512791,0.0,0.0,1.0,10.0,20.0
...,...,...,...,...,...,...,...,...,...
1,2021-11-26,30049.0,10.031216,5.770582,0.0,5.0,10.0,15.0,20.0
1,2021-11-27,30049.0,10.026024,5.774141,0.0,5.0,10.0,15.0,20.0
1,2021-11-28,30049.0,9.975307,5.788257,0.0,5.0,10.0,15.0,20.0
1,2021-11-29,30049.0,9.970781,5.799546,0.0,5.0,10.0,15.0,20.0


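It is already noticeable that there is a difference in the activity levels between groups: the control-group rows shown (early October) have a mean of around `5.2`, whereas the test-group rows shown (late November) have a mean of about `10`. Note that these visible rows come from different periods, so the before/after comparison further below is needed to attribute the difference to the test.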

Now let's keep only the records with some activity (`activity_level > 0`), i.e. drop the users who did not log in on a given day, and count them by date and groupid:

In [19]:
# Rows with non-zero activity, counted per day and group; show the first days.
active_rows = activity_df[activity_df['activity_level'] > 0]
active_rows.groupby(['dt', 'groupid']).count().reset_index().head()

Unnamed: 0,dt,groupid,userid,activity_level
0,2021-10-01,0,15337,15337
1,2021-10-01,1,15297,15297
2,2021-10-02,0,15354,15354
3,2021-10-02,1,15421,15421
4,2021-10-03,0,15423,15423


In [20]:
# Same per-day, per-group count of active rows; show the last days.
(
    activity_df
    .query('activity_level > 0')
    .groupby(['dt', 'groupid'])
    .count()
    .reset_index()
    .tail()
)

Unnamed: 0,dt,groupid,userid,activity_level
117,2021-11-28,1,29273,29273
118,2021-11-29,0,15356,15356
119,2021-11-29,1,29289,29289
120,2021-11-30,0,15192,15192
121,2021-11-30,1,29382,29382


In [21]:
# Daily count of rows with non-zero activity, plotted as one line per group.
active_users_by_day = (
    activity_df
    .query('activity_level > 0')
    .groupby(['dt', 'groupid'])
    .count()
    .reset_index()
)
alt.Chart(active_users_by_day).mark_line(size=3).encode(
    x=alt.X('dt'),
    y=alt.Y('userid'),
    color='groupid:O',
    tooltip=['userid'],
).properties(width=600, height=400)

Before the test started (on 2021-11-01, halfway through the observed period), the number of active users was similar in both groups. After the start, however, there is a big difference: the number of test-group users who are active at least once a day is almost double that of the control group.

In [27]:
# Daily number of active rows (activity_level > 0) from 2021-11-01 onwards,
# summarised separately for each group.
group0_daily = (
    activity_df
    .query('activity_level > 0 and groupid == 0 and dt >= "2021-11-01"')
    .groupby(['dt', 'groupid'])
    .count()
    .reset_index()
)
group0 = group0_daily[['groupid', 'activity_level']].describe()

group1_daily = (
    activity_df
    .query('activity_level > 0 and groupid == 1 and dt >= "2021-11-01"')
    .groupby(['dt', 'groupid'])
    .count()
    .reset_index()
)
group1 = group1_daily[['groupid', 'activity_level']].describe()

# Place both summaries side by side under a labelled top-level column
comparison_df = pd.concat(
    [group0, group1],
    axis=1,
    keys=['Group 0', 'Group 1'],
)

# Show the side-by-side table
comparison_df

Unnamed: 0_level_0,Group 0,Group 0,Group 1,Group 1
Unnamed: 0_level_1,groupid,activity_level,groupid,activity_level
count,30.0,30.0,30.0,30.0
mean,0.0,15782.0,1.0,29302.433333
std,0.0,371.077276,0.0,30.417422
min,0.0,15163.0,1.0,29255.0
25%,0.0,15335.0,1.0,29280.0
50%,0.0,15990.5,1.0,29300.0
75%,0.0,16045.0,1.0,29321.0
max,0.0,16147.0,1.0,29382.0


We can see here that there is a huge difference between the number of users who were active at least once a day in each group. The test group is also much more stable, since its standard deviation is more than 10 times lower than that of the control group (about 30 vs. 371).

We can take a closer look at the metrics before and after the test started:

#### Before

In [29]:
# Activity-level summary per group for the days before 2021-11-01.
activity_pre = activity_df.query('dt < "2021-11-01"')
activity_pre.groupby(['groupid']).describe()

Unnamed: 0_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,928481.0,5.245635,6.521184,0.0,0.0,1.0,10.0,20.0
1,931519.0,5.240952,6.520811,0.0,0.0,1.0,10.0,20.0


#### After

In [28]:
# Activity-level summary per group for the days from 2021-11-01 onwards.
activity_post = activity_df.query('dt >= "2021-11-01"')
activity_post.groupby(['groupid']).describe()

Unnamed: 0_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,898530.0,5.402211,6.55557,0.0,0.0,1.0,11.0,20.0
1,901470.0,9.996304,5.78868,0.0,5.0,10.0,15.0,20.0


While before the test, the numbers are very similar for both groups, after the start of the test we can see a big difference in the behaviour.

In [40]:
# Per-group, per-day count of rows with non-zero activity; used by the
# before/after split further down.
data_act_count = (
    activity_df
    .query('activity_level > 0')
    .groupby(['groupid', 'dt'])
    .count()
    .reset_index()
)

## Comparing the activity between the groups
By the activity levels

In [30]:
# Activity levels of the control group (groupid 0) as a plain NumPy array.
activity_df.loc[activity_df['groupid'] == 0, 'activity_level'].to_numpy()

array([ 0,  0,  0, ..., 20, 20, 20], dtype=int64)

I'll first show that the t-test behaves as expected by comparing group 0 (the control group) with itself. I'm using the t-test for ease of use, since it is readily available in the `scipy.stats` package, and because with a large enough sample size its results are very close to those of the z-test.

In [36]:
# Perform the t-test
# Sanity check: t-test of the control group's activity (from 2021-11-01 on)
# against the very same sample.
control_after = activity_df.query('groupid == 0 and dt >= "2021-11-01"')['activity_level'].to_numpy()
res = ttest_ind(control_after, control_after)

# Keep the two reported quantities under their own names
t_statistic = res.statistic
p_value = res.pvalue

# Fixed-point output with ten decimals
print(f"T-statistic: {t_statistic:.10f}")
print(f"P-value: {p_value:.10f}")

T-statistic: 0.0000000000
P-value: 1.0000000000


A p-value of 1 means that the test finds no difference at all between the two samples, which makes total sense since we are comparing the group with itself.

Now let's compare the two groups after the test:

In [38]:
# Perform the t-test
# Two-sample t-test on the per-row activity levels from 2021-11-01 onwards:
# control group (0) versus test group (1).
# NOTE(review): ttest_ind is called with its defaults here; SciPy documents
# the default as a pooled-variance test (equal_var=True) — confirm that is intended.
control_levels = activity_df.query('groupid == 0 and dt >= "2021-11-01"')['activity_level'].to_numpy()
test_levels = activity_df.query('groupid == 1 and dt >= "2021-11-01"')['activity_level'].to_numpy()
res = ttest_ind(control_levels, test_levels)

# Keep the two reported quantities under their own names
t_statistic = res.statistic
p_value = res.pvalue

# Fixed-point output with ten decimals
print(f"T-statistic: {t_statistic:.10f}")
print(f"P-value: {p_value:.10f}")

T-statistic: -498.4000989153
P-value: 0.0000000000


Here, it is clear that the groups are different and the result is not by chance.

In [41]:
# Daily active-row counts for the days before 2021-11-01.
is_pre_test = data_act_count['dt'] < "2021-11-01"
before = data_act_count[is_pre_test]
before.head()

Unnamed: 0,groupid,dt,userid,activity_level
0,0,2021-10-01,15337,15337
1,0,2021-10-02,15354,15354
2,0,2021-10-03,15423,15423
3,0,2021-10-04,15211,15211
4,0,2021-10-05,15126,15126


In [43]:
# Daily active-row counts for the days from 2021-11-01 onwards.
is_post_test = data_act_count['dt'] >= "2021-11-01"
after = data_act_count[is_post_test]
after.head()

Unnamed: 0,groupid,dt,userid,activity_level
31,0,2021-11-01,15989,15989
32,0,2021-11-02,16024,16024
33,0,2021-11-03,16049,16049
34,0,2021-11-04,16040,16040
35,0,2021-11-05,16045,16045


Check for the pretest bias on activity:

In [57]:
# Mean of the daily `userid` counts per group before the test, rounded to two decimals.
control_daily_users = before.query('groupid == 0')['userid'].to_numpy()
test_daily_users = before.query('groupid == 1')['userid'].to_numpy()
print(f"Average number of users before the test in the control group: {round(np.mean(control_daily_users), 2)}")
print(f"Average number of users before the test in the test group: {round(np.mean(test_daily_users), 2)}")

Average number of users before the test in the control group: 15320.87
Average number of users before the test in the test group: 15352.52


In [56]:
# Perform the t-test
# Two-sample t-test on the daily `userid` counts before the test:
# control group (0) versus test group (1).
control_counts = before.query('groupid == 0')['userid'].to_numpy()
test_counts = before.query('groupid == 1')['userid'].to_numpy()
res = ttest_ind(control_counts, test_counts)

# Keep the two reported quantities under their own names
t_statistic = res.statistic
p_value = res.pvalue

# Fixed-point output with ten decimals
print(f"T-statistic: {t_statistic:.10f}")
print(f"P-value: {p_value:.10f}")

T-statistic: -1.4121065242
P-value: 0.1630842354


Both the similar means and the high p-value (greater than the common threshold of 0.05) indicate that there is no statistically significant difference between the groups before the test took place.

In [58]:
# Mean of the daily `userid` counts per group after the test start, rounded to two decimals.
control_daily_users = after.query('groupid == 0')['userid'].to_numpy()
test_daily_users = after.query('groupid == 1')['userid'].to_numpy()
print(f"Average number of users after the test in the control group: {round(np.mean(control_daily_users), 2)}")
print(f"Average number of users after the test in the test group: {round(np.mean(test_daily_users), 2)}")

Average number of users after the test in the control group: 15782.0
Average number of users after the test in the test group: 29302.43


In [59]:
# Perform the t-test
# Two-sample t-test on the daily `userid` counts after the test start:
# control group (0) versus test group (1).
# NOTE(review): ttest_ind is called with its defaults here; SciPy documents
# the default as a pooled-variance test (equal_var=True) — confirm that is intended.
control_counts = after.query('groupid == 0')['userid'].to_numpy()
test_counts = after.query('groupid == 1')['userid'].to_numpy()
res = ttest_ind(control_counts, test_counts)

# Keep the two reported quantities under their own names
t_statistic = res.statistic
p_value = res.pvalue

# Fixed-point output with ten decimals
print(f"T-statistic: {t_statistic:.10f}")
print(f"P-value: {p_value:.10f}")

T-statistic: -198.8990494893
P-value: 0.0000000000


We can safely establish that the test affected the user activity levels.

## Click through rate (CTR)

In [60]:
# Read the per-user daily click-through-rate records.
ctr_path = "Ctr_all.csv"
ctr_df = pd.read_csv(ctr_path)

In [61]:
# Preview the first rows of the CTR table.
ctr_df.head()

Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46
2,aa336050-934e-453f-a5b0-dd881fcd114e,2021-11-13,0,34.25
3,8df767f4-a10f-4322-a722-676b7e02b372,2021-11-13,0,34.92
4,a74762ed-4da0-42ab-91d2-40d7e808dfe9,2021-11-13,0,34.95


In [63]:
# Mean CTR for every (group, day) pair, with the keys as regular columns.
ctr_by_group_day = ctr_df.groupby(['groupid', 'dt'])['ctr']
ctr_avg = ctr_by_group_day.mean().reset_index()

In [65]:
# Line chart of the daily mean CTR, one line per group.
ctr_lines = alt.Chart(ctr_avg).mark_line(size=5)
ctr_lines.encode(
    x=alt.X("dt"),
    y=alt.Y("ctr"),
    color="groupid:O",
    tooltip=['ctr'],
).properties(width=600, height=400)

We can see a big difference from the start of the test date between the click-through rate of both groups.

In [68]:
# Split the CTR rows at 2021-11-01, keeping only the group id and the CTR value.
# Note: this rebinds `before` / `after`, which earlier cells use for daily counts.
ctr_columns = ['groupid', 'ctr']
before = ctr_df.loc[ctr_df['dt'] < "2021-11-01", ctr_columns]
after = ctr_df.loc[ctr_df['dt'] >= "2021-11-01", ctr_columns]

In [69]:
# Display the pre-test CTR rows.
before

Unnamed: 0,groupid,ctr
808703,0,34.28
808704,0,34.67
808705,0,34.77
808706,0,35.42
808707,0,35.04
...,...,...
1759573,1,32.33
1759574,1,30.09
1759575,1,35.71
1759576,1,34.76


In [71]:
# Mean CTR per group, before and after the test start, rounded to two decimals.
ctr_control_before = before.loc[before['groupid'] == 0, 'ctr'].to_numpy()
ctr_test_before = before.loc[before['groupid'] == 1, 'ctr'].to_numpy()
ctr_control_after = after.loc[after['groupid'] == 0, 'ctr'].to_numpy()
ctr_test_after = after.loc[after['groupid'] == 1, 'ctr'].to_numpy()

print(f"Average ctr of the control group before the test: {round(ctr_control_before.mean(), 2)}")
print(f"Average ctr of the test group before the test: {round(ctr_test_before.mean(), 2)}")
print()
print(f"Average ctr of the control group after the test: {round(ctr_control_after.mean(), 2)}")
print(f"Average ctr of the test group after the test: {round(ctr_test_after.mean(), 2)}")

Average ctr of the control group before the test: 33.0
Average ctr of the test group before the test: 33.0

Average ctr of the control group after the test: 33.0
Average ctr of the test group after the test: 38.0


In [72]:
# Standard deviation of the CTR per group, before and after the test start,
# computed on the NumPy arrays and rounded to two decimals.
ctr_control_before = before.loc[before['groupid'] == 0, 'ctr'].to_numpy()
ctr_test_before = before.loc[before['groupid'] == 1, 'ctr'].to_numpy()
ctr_control_after = after.loc[after['groupid'] == 0, 'ctr'].to_numpy()
ctr_test_after = after.loc[after['groupid'] == 1, 'ctr'].to_numpy()

print(f"Std for the ctr of the control group before the test: {round(ctr_control_before.std(), 2)}")
print(f"Std for the ctr of the test group before the test: {round(ctr_test_before.std(), 2)}")
print()
print(f"Std for the ctr of the control group after the test: {round(ctr_control_after.std(), 2)}")
print(f"Std for the ctr of the test group after the test: {round(ctr_test_after.std(), 2)}")

Std for the ctr of the control group before the test: 1.73
Std for the ctr of the test group before the test: 1.73

Std for the ctr of the control group after the test: 1.73
Std for the ctr of the test group after the test: 1.73


In [74]:
# Perform the t-test
# Two-sample t-test on the pre-test CTR values: control group (0) versus test group (1).
ctr_control = before.query('groupid == 0')['ctr'].to_numpy()
ctr_test = before.query('groupid == 1')['ctr'].to_numpy()
res = ttest_ind(ctr_control, ctr_test)

# Keep the two reported quantities under their own names
t_statistic = res.statistic
p_value = res.pvalue

# Fixed-point output with ten decimals
print(f"T-statistic: {t_statistic:.10f}")
print(f"P-value: {p_value:.10f}")

T-statistic: 0.3775817380
P-value: 0.7057414173


In [75]:
# Perform the t-test
# Two-sample t-test on the post-start CTR values: control group (0) versus test group (1).
ctr_control = after.query('groupid == 0')['ctr'].to_numpy()
ctr_test = after.query('groupid == 1')['ctr'].to_numpy()
res = ttest_ind(ctr_control, ctr_test)

# Keep the two reported quantities under their own names
t_statistic = res.statistic
p_value = res.pvalue

# Fixed-point output with ten decimals
print(f"T-statistic: {t_statistic:.10f}")
print(f"P-value: {p_value:.10f}")

T-statistic: -1600.7913068018
P-value: 0.0000000000


We can conclude that before the test both groups showed a very similar CTR, while after the start of the test the difference between them is statistically significant, i.e. attributable to the new feature rather than to chance.