### Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import altair as alt
alt.data_transformers.disable_max_rows()
from datetime import datetime
from scipy.stats import ttest_ind

### Exploring the data

In [14]:
# The data set contains number of activities per user per day
data = pd.read_csv("./Activity_pretest.csv")
data.head()

Unnamed: 0,userid,dt,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,0


In [17]:
activity = data[data['activity_level'] > 0].groupby(['dt', 'activity_level']).count().reset_index()
activity

Unnamed: 0,dt,activity_level,userid
0,2021-10-01,1,1602
1,2021-10-01,2,1507
2,2021-10-01,3,1587
3,2021-10-01,4,1551
4,2021-10-01,5,1586
...,...,...,...
615,2021-10-31,16,1499
616,2021-10-31,17,1534
617,2021-10-31,18,1531
618,2021-10-31,19,1616


In [18]:
# Let's plot the activity level on a line chart
alt.Chart(activity).mark_line(size=1).encode(
    alt.X('dt:T', axis=alt.Axis(title = 'date')),
    alt.Y('userid:Q', axis=alt.Axis(title = 'number of users')),
    tooltip=['activity_level'], 
    color='activity_level:N'
).properties(
    width=600,
    height=400
)

### Calculating Daily Active Users

In [19]:
activity = data.query('activity_level > 0').groupby(['dt']).count().reset_index()
activity.describe()

Unnamed: 0,userid,activity_level
count,31.0,31.0
mean,30673.387097,30673.387097
std,90.968375,90.968375
min,30489.0,30489.0
25%,30608.0,30608.0
50%,30661.0,30661.0
75%,30728.5,30728.5
max,30902.0,30902.0


In [20]:
alt.Chart(activity).mark_line(size=4).encode(
    alt.X('dt:T', axis=alt.Axis(title = 'date')),
    alt.Y('userid:Q', axis=alt.Axis(title = 'number of users'))
).properties(
    width=600,
    height=400, 
    title='Daily Active Users'
)

In [21]:
# We see that the number of active users is stable in october

### Click-through rate

In [23]:
## Loading the data 
data = pd.read_csv("Ctr_pretest.csv")

In [24]:
data.head()

Unnamed: 0,userid,dt,ctr
0,4b328144-df4b-47b1-a804-09834942dce0,2021-10-01,34.28
1,34ace777-5e9d-40b3-a859-4145d0c35c8d,2021-10-01,34.67
2,8028cccf-19c3-4c0e-b5b2-e707e15d2d83,2021-10-01,34.77
3,652b3c9c-5e29-4bf0-9373-924687b1567e,2021-10-01,35.42
4,45b57434-4666-4b57-9798-35489dc1092a,2021-10-01,35.04


In [25]:
data.describe()

Unnamed: 0,ctr
count,950875.0
mean,33.000242
std,1.731677
min,30.0
25%,31.5
50%,33.0
75%,34.5
max,36.0


In [31]:
ctr = data.groupby('dt')['ctr'].mean()
ctr

dt
2021-10-01    32.993446
2021-10-02    32.991664
2021-10-03    32.995086
2021-10-04    32.992995
2021-10-05    33.004375
2021-10-06    33.018564
2021-10-07    32.988500
2021-10-08    32.998654
2021-10-09    33.005082
2021-10-10    33.007134
2021-10-11    32.990300
2021-10-12    32.996166
2021-10-13    32.984248
2021-10-14    32.999878
2021-10-15    33.008517
2021-10-16    32.991025
2021-10-17    33.001919
2021-10-18    33.007763
2021-10-19    33.001511
2021-10-20    33.004632
2021-10-21    32.997566
2021-10-22    33.006785
2021-10-23    33.012228
2021-10-24    32.984093
2021-10-25    32.990223
2021-10-26    33.014248
2021-10-27    33.007045
2021-10-28    33.005711
2021-10-29    33.004230
2021-10-30    33.016430
2021-10-31    32.987515
Name: ctr, dtype: float64

In [34]:
alt.Chart(ctr.reset_index()).mark_line(size=4).encode(
    alt.X('dt:T', axis=alt.Axis(title = 'date')),
    alt.Y('ctr:Q', axis=alt.Axis(title = 'ctr'), scale=alt.Scale(domain=[32, 34])),
    tooltip=['ctr'], 
).properties(
    width=600,
    height=400, 
    title='Average Daily CTR'
)

In [None]:
# The CTR is inline with DAU

# A/B test analysis

## Assignments

In [35]:
# First let's read the file with user assignments into test and control groups
data = pd.read_csv("./Assignments.csv")

In [36]:
data.head()

Unnamed: 0,userid,ts,groupid
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0


In [37]:
# Extracting date from the "ts"
data['dt'] = data['ts'].map(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ').strftime("%Y-%m-%d"))

In [38]:
data.head()

Unnamed: 0,userid,ts,groupid,dt
0,c5d77c89-33a3-4fe3-9e31-179dec09d49c,2021-11-02T07:31:42Z,0,2021-11-02
1,9061d751-7a94-44d3-8792-5ca5ec59aa89,2021-11-13T07:43:51Z,0,2021-11-13
2,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-11-20T19:26:07Z,0,2021-11-20
3,d2646662-269f-49de-aab1-8776afced9a3,2021-11-20T11:09:02Z,0,2021-11-20
4,2d9b23b7-4e5e-4162-9f0f-49e593fdd2b5,2021-11-04T07:42:07Z,0,2021-11-04


In [40]:
# Checking that the users are split evenly
data.groupby('groupid').count()

Unnamed: 0_level_0,userid,ts,dt
groupid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,29951,29951,29951
1,30049,30049,30049


In [41]:
data_count = data.groupby(['groupid','dt']).count().reset_index()

In [42]:
alt.Chart(data_count).mark_line(size=3).encode(
    alt.X('dt'),
    alt.Y('userid'),
    color='groupid:O',
    tooltip=['userid']
).properties(
    width=600,
    height=400
)

## Calculating pre-test metrics

### User activity

In [46]:
data_act = pd.read_csv("./Activity_all.csv")

In [47]:
data_act.head()

Unnamed: 0,userid,dt,groupid,activity_level
0,a5b70ae7-f07c-4773-9df4-ce112bc9dc48,2021-10-01,0,0
1,d2646662-269f-49de-aab1-8776afced9a3,2021-10-01,0,0
2,c4d1cfa8-283d-49ad-a894-90aedc39c798,2021-10-01,1,0
3,6889f87f-5356-4904-a35a-6ea5020011db,2021-10-01,0,0
4,dbee604c-474a-4c9d-b013-508e5a0e3059,2021-10-01,1,0


In [48]:
alt.Chart(data_act[data_act['activity_level'] > 0].groupby(['dt', 'groupid']).count().reset_index()).mark_line(size=3).encode(
    alt.X('dt'),
    alt.Y('userid'),
    color='groupid:O',
    tooltip=['userid']
).properties(
    width=600,
    height=400
)

In [61]:
# It appears that the test began @ 2021-11-01. Lets split the data into groups and calculate the activityy levels

data_act[(data_act['activity_level'] > 0) & (data_act['groupid'] == 0) & (data_act['dt'] >= "2021-11-01")].groupby(['dt','groupid']).count().reset_index()[['groupid','activity_level']].describe()



Unnamed: 0,groupid,activity_level
count,30.0,30.0
mean,0.0,15782.0
std,0.0,371.077276
min,0.0,15163.0
25%,0.0,15335.0
50%,0.0,15990.5
75%,0.0,16045.0
max,0.0,16147.0


In [62]:
# Activity in test group is significantly higher
data_act[(data_act['activity_level'] > 0) & (data_act['groupid'] == 1) & (data_act['dt'] >= "2021-11-01")].groupby(['dt','groupid']).count().reset_index()[['groupid','activity_level']].describe()

Unnamed: 0,groupid,activity_level
count,30.0,30.0
mean,1.0,29302.433333
std,0.0,30.417422
min,1.0,29255.0
25%,1.0,29280.0
50%,1.0,29300.0
75%,1.0,29321.0
max,1.0,29382.0


In [65]:
data_act[data_act['dt'] >= "2021-11-01"].groupby(['groupid']).describe()

Unnamed: 0_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,898530.0,5.402211,6.55557,0.0,0.0,1.0,11.0,20.0
1,901470.0,9.996304,5.78868,0.0,5.0,10.0,15.0,20.0


In [66]:
# For the sake of the comparission let's look at the data before the test
data_act[data_act['dt'] < "2021-11-01"].groupby(['groupid']).describe()

Unnamed: 0_level_0,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level,activity_level
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
groupid,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,928481.0,5.245635,6.521184,0.0,0.0,1.0,10.0,20.0
1,931519.0,5.240952,6.520811,0.0,0.0,1.0,10.0,20.0


### Comparing the activity between the groups

By the activity levels

In [67]:
data_act[data_act['groupid'] == 0]['activity_level'].to_numpy()

array([ 0,  0,  0, ..., 20, 20, 20])

In [69]:
res = ttest_ind(data_act.query('groupid == 0 and dt >= "2021-11-01"')['activity_level'].to_numpy(),
                data_act.query('groupid == 1 and dt >= "2021-11-01"')['activity_level'].to_numpy()).pvalue

print(res)

0.0


In [70]:
# On the basis of this resuls we reject the null hypothesis, it seems that changes in question lead to signifficant increase in user activity
"{:.100f}".format(res)

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'

### Click through rate (CTR)

In [71]:
# Loading data for CTR test
data_ctr = pd.read_csv("./Ctr_all.csv")

In [72]:
data_ctr.head()

Unnamed: 0,userid,dt,groupid,ctr
0,60389fa7-2d71-4cdf-831c-c2bb277ffa1e,2021-11-13,0,31.81
1,b59cb225-d160-4851-92d2-7cc8120a2f63,2021-11-13,0,30.46
2,aa336050-934e-453f-a5b0-dd881fcd114e,2021-11-13,0,34.25
3,8df767f4-a10f-4322-a722-676b7e02b372,2021-11-13,0,34.92
4,a74762ed-4da0-42ab-91d2-40d7e808dfe9,2021-11-13,0,34.95


In [75]:
data_ctr_avg = data_ctr.groupby(['groupid','dt'])['ctr'].mean().reset_index()
data_ctr_avg

Unnamed: 0,groupid,dt,ctr
0,0,2021-10-01,32.980627
1,0,2021-10-02,33.004056
2,0,2021-10-03,33.002006
3,0,2021-10-04,32.990363
4,0,2021-10-05,33.014167
...,...,...,...
117,1,2021-11-26,37.997834
118,1,2021-11-27,37.978912
119,1,2021-11-28,37.992709
120,1,2021-11-29,37.987909


In [76]:
alt.Chart(data_ctr_avg).mark_line(size=5).encode(
    alt.X('dt'),
    alt.Y('ctr'),
    color='groupid:O',
    tooltip=['ctr']
).properties(
    width=600,
    height=400
)

In [77]:
before = data_ctr[data_ctr['dt'] < "2021-11-01"][['groupid', 'ctr']]

In [78]:
after = data_ctr[data_ctr['dt'] >= "2021-11-01"][['groupid', 'ctr']]

In [86]:
# Let's compare the data prior to test 
before[before['groupid'] == 0]['ctr'].to_numpy().mean()

33.00091277553074

In [87]:
before[before['groupid'] == 1]['ctr'].to_numpy().mean()

32.99957172093258

In [88]:
# And after the test
after[after['groupid'] == 0]['ctr'].to_numpy().mean()

32.996977569382835

In [89]:
after[after['groupid'] == 1]['ctr'].to_numpy().mean()

37.99695912626142

In [91]:
# Comparing the groups before
res = ttest_ind(before[before['groupid'] == 0]['ctr'].to_numpy(), before[before['groupid'] == 1]['ctr']
                .to_numpy()).pvalue

print(res)

0.705741417344299


In [92]:
# Groups after
res = ttest_ind(after[after['groupid'] == 0]['ctr'].to_numpy(), after[after['groupid'] == 1]['ctr']
                .to_numpy()).pvalue

print(res)

0.0


In [94]:
# The difference is signifficant, reject the null hypothesis
"{:.100f}".format(res)

'0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000'