### Imports

In [1]:
import sqlite3
import warnings
from typing import List

import pandas as pd
import numpy as np
from pandas.core.common import SettingWithCopyWarning
import altair as alt
from scipy import stats
from statsmodels.stats import weightstats

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# This will remove unnecessary warnings that jupyter always rise
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

### Constants

In [3]:
DB_FILE = 'Data scientist exercise.db'
DB_SELECT = 'SELECT * from access_log'

# Column names
EVENT_TYPE = 'event_type'
REVENUE = 'revenue'
VARIANT = 'variant'
CITY = 'city'
USER_ID = 'user_id'

# Event type values
EVENT_TYPE_P_VIEW = 'property_view'
EVENT_TYPE_F_ADDED = 'property_favorite_added'
EVENT_TYPE_B_REQUEST = 'booking_request'

DELETED_USERS: List = []

### Connection and data read

In [4]:
conn = sqlite3.connect(DB_FILE)
data = pd.read_sql(DB_SELECT, con=conn)

In [5]:
# In order to not re-read the database once and another while working, we will use a copy of the original one
df = data.copy()

### Data Exploratory and Cleanse

Let's first check if we have users in both groups (A & B), which is suppossed to be impossible and remove them.

In [6]:
a_users = df.loc[df[VARIANT]=='A',USER_ID].unique()
b_users = df.loc[df[VARIANT]=='B',USER_ID].unique()
coincidences = set(a_users) & set(b_users)
print('Number of users present in both Variants = {}'.format(len(coincidences)))

Number of users present in both Variants = 38


In [7]:
if coincidences:
    df = df[~df[USER_ID].isin(coincidences)]

In [8]:
df_br = df[df[EVENT_TYPE]==EVENT_TYPE_B_REQUEST].reset_index(drop=True).copy()
df_br.head()

Unnamed: 0,datetime,user_id,variant,city,event_type,revenue
0,2021-08-01 00:50:26,556943737,A,rome,booking_request,211.234257
1,2021-08-01 05:02:36,630002484,B,madrid,booking_request,
2,2021-08-01 08:39:23,741334523,B,madrid,booking_request,288.934075
3,2021-08-01 16:29:02,715418235,A,madrid,booking_request,212.030443
4,2021-08-01 16:30:38,107297881,A,rome,booking_request,175.026215


Let's study our data and check that there are no inconsistencies.

In [9]:
print('Number of records with negative or zero revenue for Booking Requests: {}'.format(len(df_br[df_br[REVENUE]<=0])))

Number of records with negative or zero revenue for Booking Requests: 20


In [10]:
print('Number of records for Booking Requests group A: {}'.format(len(df_br[df_br[VARIANT]=='A'])))
print('Number of records for Booking Requests group B: {}'.format(len(df_br[df_br[VARIANT]=='A'])))

Number of records for Booking Requests group A: 215
Number of records for Booking Requests group B: 215


In [11]:
df_br_na = df_br[df_br[REVENUE].isna()]
print('Number of NA records for Booking Requests: {}'.format(len(df_br_na)))
alt.Chart(df_br_na).mark_bar().encode(
    x=alt.X('count({})'.format(VARIANT)),
    y=alt.Y(VARIANT)
)

Number of NA records for Booking Requests: 16


In [12]:
df_br_zero = df_br[df_br[REVENUE]==0]
print('Number of zero records for Booking Requests: {}'.format(len(df_br_zero)))
alt.Chart(df_br_zero).mark_bar().encode(
    x=alt.X('count({})'.format(VARIANT)),
    y=alt.Y(VARIANT)
)

Number of zero records for Booking Requests: 20


In [13]:
alt.Chart(df_br_na).mark_bar().encode(
    x=alt.X('count({})'.format(CITY)),
    y=alt.Y(CITY)
)

In [14]:
alt.Chart(df_br_zero).mark_bar().encode(
    x=alt.X('count({})'.format(CITY)),
    y=alt.Y(CITY)
)

After some data exploration we find out that there are 16 Booking Requests with *null* revenue and 20 zeros, and it does not seem to be related with the city. Therefore, since we have *12+7=19* records from group **A** and *8+9=17* records from group **B**, the solution proposed in this case is to remove all the Null and zero values from the study even though it will not mantain the samples equal.

In [15]:
DELETED_USERS.extend(df_br.loc[df_br[REVENUE].notnull(), USER_ID].unique())
DELETED_USERS.extend(df_br.loc[df_br[REVENUE]>0, USER_ID].unique())

In [16]:
df_br = df_br[df_br[REVENUE].notnull()]
df_br = df_br[df_br[REVENUE]>0]

Since we have deleted some users from the Business Requests users, we need to remove them also from the global dataset.

In [17]:
df = df[~df[USER_ID].isin(DELETED_USERS)]

Since we have performed some changes in the BR data, we remove the BR data from the original dataset and replace it by the one we have modified so that we have not changed the data.

In [18]:
df = df[df[EVENT_TYPE]!=EVENT_TYPE_B_REQUEST]
df[REVENUE] = df[REVENUE].fillna(0)
df = df.append(df_br, ignore_index=True)

Let's calculate now all the necessary values for the hipothesis tests

In [19]:
df_grouped = df.groupby([VARIANT,USER_ID]).agg({REVENUE:'sum'}, axis=1).reset_index()

df_A = df_grouped[df_grouped[VARIANT] == 'A']
df_B = df_grouped[df_grouped[VARIANT] == 'B']

df_br_A = df_br[df_br[VARIANT]=='A']
df_br_B = df_br[df_br[VARIANT]=='B']

In [20]:
A_MEAN = df_br_A[REVENUE].mean()
B_MEAN = df_br_B[REVENUE].mean()

TOTAL_USERS = len(df[USER_ID].unique())
TOTAL_USERS_A = len(df_A[USER_ID].unique())
TOTAL_USERS_B = len(df_B[USER_ID].unique())

USERS_WITH_BR = len(df_br[USER_ID].unique())
USERS_WITH_BR_A = len(df_br_A[USER_ID].unique())
USERS_WITH_BR_B = len(df_br_B[USER_ID].unique())

CVR_A = USERS_WITH_BR_A/TOTAL_USERS_A
CVR_B = USERS_WITH_BR_B/TOTAL_USERS_B
RU_A = df_br_A[REVENUE].sum()/TOTAL_USERS_A
RU_B = df_br_B[REVENUE].sum()/TOTAL_USERS_B

print('Mean Revenue group A (control group) = {}'.format(A_MEAN))
print('Mean Revenue group B (treatment group) = {}'.format(B_MEAN))
print('Total Revenue group A (control group) = {}'.format(df_br_A[REVENUE].sum()))
print('Total Revenue group B (treatment group) = {}'.format(df_br_B[REVENUE].sum()))
print('Conversion Rate group A (control group) = {}'.format(CVR_A))
print('Conversion Rate group B (treatment group) = {}'.format(CVR_B))
print('Revenue per user group A (control group) = {}'.format(RU_A))
print('Revenue per user group B (treatment group) = {}'.format(RU_B))

Mean Revenue group A (control group) = 199.23785774983205
Mean Revenue group B (treatment group) = 352.12634702952465
Total Revenue group A (control group) = 39050.62011896708
Total Revenue group B (treatment group) = 50706.19397225155
Conversion Rate group A (control group) = 0.24317617866004962
Conversion Rate group B (treatment group) = 0.17733990147783252
Revenue per user group A (control group) = 48.44990089201871
Revenue per user group B (treatment group) = 62.44605168996497


As we can see, the conversion rate per user in the **control group** is greater than in the **treatment group**.

# CONVERSION RATE

## Two sampled Z-test

In [21]:
t_statistic, pval = weightstats.ztest(df_br_A[REVENUE].values, df_br_B[REVENUE].values, alternative='two-sided')
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=0.0
t=-49.42707684094923


In [22]:
t_statistic, pval = weightstats.ztest(df_br_A[REVENUE].values, df_br_B[REVENUE].values, alternative='smaller')
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=0.0
t=-49.42707684094923


## Two sampled t-test

In [23]:
t_statistic, pval = stats.ttest_ind(df_br_A[REVENUE].values, df_br_B[REVENUE].values, alternative='two-sided', equal_var=False)
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=2.6900750765539986e-129
t=-47.47422202856776


In [24]:
t_statistic, pval = stats.ttest_ind(df_br_A[REVENUE].values, df_br_B[REVENUE].values, alternative='less', equal_var=False)
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=1.3450375382769993e-129
t=-47.47422202856776


Our null hipothesis is that A mean is equal to B mean. Since this is rejected (because its p-value is 0), we are going to check if A mean is larger than B mean. Now, since its p-value is 0, we also reject this hipothesis. Then it must be A mean larger than B mean.

# REVENUE PER USER

## Two sampled Z-test

In [25]:
t_statistic, pval = weightstats.ztest(df_A[REVENUE].values, df_B[REVENUE].values, alternative='two-sided')
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=0.013195413126049167
t=-2.4784512786612543


In [26]:
t_statistic, pval = weightstats.ztest(df_A[REVENUE].values, df_B[REVENUE].values, alternative='smaller')
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=0.006597706563024584
t=-2.4784512786612543


## Two sampled t-test

In [27]:
t_statistic, pval = stats.ttest_ind(df_A[REVENUE].values, df_B[REVENUE].values, alternative='two-sided', equal_var=False)
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=0.01317113451532276
t=-2.4823221042816095


In [28]:
t_statistic, pval = stats.ttest_ind(df_A[REVENUE].values, df_B[REVENUE].values, alternative='less', equal_var=False)
print('P-value={}'.format(pval))
print('t={}'.format(t_statistic))

P-value=0.00658556725766138
t=-2.4823221042816095


Our null hipothesis is that A mean is equal to B mean. In this case, we can reject the null hipothesis at a 95% of significance since its p-value is 0.013. If we try to reject the hipothesis that A mean is larger than B mean, we get that we can reject it with 0.0066 probability, which leads to A mean **smaller** to B mean with a 95% of significance or A mean **smaller or equal** to B mean with a 99% significance.

After all the study we could confirm that the treatment group have reported a smaller conversion rate compared to the control group. However, it has reported a higher revenue per user and in total. The recommendation would be to continue and roll out the experiment to all users.

In [29]:
alt.Chart(df_br).transform_density(
    REVENUE,
    as_=[REVENUE, 'density'],
    groupby=[VARIANT]
).mark_area().encode(
    x=alt.X(REVENUE),
    y=alt.Y('density:Q'),
).facet(
    VARIANT,
    columns=2
)

From the plots above we can see that the distribution is pretty similar, like a normal distribution. The treatment group *(B)* seems to have a bigger mean and higher standard deviation that the control group *(A)*

Assuming both are normal distributions, we can get the average revenue per booking request in each variant with a confidence interval of 95% doing the following:

In [30]:
A_lower_interval, A_higher_interval = stats.norm.interval(0.95, loc=A_MEAN, scale=df_br_A[REVENUE].std())
B_lower_interval, B_higher_interval = stats.norm.interval(0.95, loc=B_MEAN, scale=df_br_B[REVENUE].std())
print('Average revenue per booking request in control group (A):[{}, {}]'.format(A_lower_interval, A_higher_interval))
print('Average revenue per booking request in treatment group (B):[{}, {}]'.format(B_lower_interval, B_higher_interval))

Average revenue per booking request in control group (A):[150.81750993346756, 247.65820556619653]
Average revenue per booking request in treatment group (B):[288.76555722516594, 415.48713683388337]


In order to get the estimated increase, we could divide both limits and get the increase.

In [31]:
increase_higher = B_lower_interval/A_lower_interval
increase_lower = B_higher_interval/A_higher_interval
print('Percentage increase might be around [{},{}]'.format(increase_lower, increase_higher))

Percentage increase might be around [1.67766351970449,1.914668643929697]


However, this determination is not completelly reliable, since the revenues from one group could already be higher/lower than the other, making this estimation not correct. A more reliable way is if we had a grouping on the properties that could somehow group them by price for instance, or by other charactheristics that could end up in comparing properties with similar prices.