In [2]:
import pandas as pd
import numpy as np

# Locations of the two raw CSV inputs, relative to the notebook's folder.
USER_HISTORY_PATH = "../data/raw/user_history.csv"
NOTIF_PATH = "../data/raw/notification_allowed.csv"

# Load both tables in one pass over the configured paths.
user_history, notif = (
    pd.read_csv(csv_path) for csv_path in (USER_HISTORY_PATH, NOTIF_PATH)
)

In [3]:
# Structural overview of both raw tables before any transformation.
print("notifications shape:", notif.shape)
print("user history shape:", user_history.shape)

print("\nuser history columns:", len(user_history.columns))
print("notifications columns:", len(notif.columns))

print("\nunique users in user history:", user_history["login_id"].nunique())
print("unique users in notifications:", notif["login_id"].nunique())

print("\nuser history date_id range:", user_history["date_id"].min(), "->", user_history["date_id"].max())
print("notificatons date_id range:", notif["date_id"].min(), "->", notif["date_id"].max())
print("registration date range:", user_history["registration_date_id"].min(), "->", user_history["registration_date_id"].max())

# Missing-value check: each label must be computed from its own table.
print("\nuser history nan values:", user_history.isna().any().any())
print("notifications nan values:", notif.isna().any().any())

# How many notification rows each user has; the later merge on login_id
# relies on this being exactly one per user.
counter = notif.groupby("login_id").size()
print("\nnotification records per user:")
print(counter.value_counts().head(10))

display(user_history.head(3))
display(notif.head(3))

notifications shape: (3859, 4)
user history shape: (122459, 79)

user history columns: 79
notifications columns: 4

unique users in user history: 4979
unique users in notifications: 3859

user history date_id range: 0 -> 34
notificatons date_id range: 0 -> 22
registration date range: 0 -> 13

user history nan values: False
notifications nan values: False

notification records per user:
1    3859
Name: count, dtype: int64


Unnamed: 0,login_id,registration_date_id,registration_channel,registration_country,payer,dau,sessions_count,playtime,last_login_day,days_active_last_7_days,...,equipment_bonus,campaign_battles,campaign_battles_total,event_battles,event_battles_total,event_tokens,group_points,lives_spent,milestone_points,date_id
0,1,0,Organic,Canada,False,1,3,1351,0,1,...,0.0,5,5,0,0,15,0,0,0,0
1,2,0,Organic,Brazil,False,1,1,1046,0,1,...,0.0,2,2,0,0,15,0,0,0,0
2,3,0,Organic,Brazil,False,1,7,5854,0,1,...,0.0,8,8,0,0,40,0,0,0,0


Unnamed: 0,login_id,time,allowed_notifications,date_id
0,352,999,True,0
1,184,2780,True,0
2,162,2928,False,0


```user_history date_id```
- dnevni snapshot-ovi
- Zasto ide do 34, a test traje 14 dana?
    - test traje od 0 do 13, ali ako se user registruje 13og dana
    - on mora da gleda sledecih 14 dana
    - zato bi date_id trebalo da ide do 27 (13 + 14), ali zbog pravila (21 dan)
        - ide do 34 (13 + 21)
 
```registration date id```
- dan kada se user prvi put pojavio u igri

- Ukupno usera na testu ima: 4.979
- Popup odgovor imamo za 3.859
    - ***1.120 igraca nikad nije ni doslo do popupa!***

## Group assignment

In [4]:
users = user_history.copy()
# Experiment arm is derived from login_id parity: even -> group1_even, odd -> group2_odd.
users["group"] = np.where(users["login_id"] % 2 == 0, "group1_even", "group2_odd")

# Keep exactly one row per user: the earliest snapshot by date_id.
# drop_duplicates keeps that row intact, whereas GroupBy.first() returns the
# first non-null value of every column separately and could therefore pull
# values from later days if any column ever contained NaN.
user_base = (
    users
    .sort_values(["login_id", "date_id"])
    .drop_duplicates(subset="login_id", keep="first")
    .reset_index(drop=True)
)

Koristimo prvi zapis po korisniku kao rani behavioral snapshot, cime izbegavamo koriscenje buducih informacija u analizi (data leakage)

***group1_even***
- pop-up odmah

***group2_odd***
- pop-up nakon tutorijala

In [5]:
# Sanity check of the per-user table: its shape and the identifying columns.
print("user_base shape:", user_base.shape)
display(
    user_base.loc[
        :,
        [
            "login_id",
            "group",
            "registration_date_id",
            "registration_channel",
            "registration_country",
            "payer",
            "date_id",
        ],
    ].head(4)
)

user_base shape: (4979, 80)


Unnamed: 0,login_id,group,registration_date_id,registration_channel,registration_country,payer,date_id
0,1,group2_odd,0,Organic,Canada,False,0
1,2,group1_even,0,Organic,Brazil,False,0
2,3,group2_odd,0,Organic,Brazil,False,0
3,4,group1_even,0,Paid,France,False,0


## Merge

In [6]:
# Attach each user's popup answer; users without a notification record get NaN.
# Columns left by a previous run of this cell are dropped first so that
# re-running it does not collide with its own earlier output.
user_base = (
    user_base
    .drop(columns=["allowed_notifications", "date_id_notif"], errors="ignore")
    .merge(
        notif[["login_id", "allowed_notifications", "date_id"]],
        on="login_id",
        how="left",
        suffixes=("", "_notif"),
        # Both sides are expected to hold one row per login_id; raise instead
        # of silently duplicating users if that ever stops being true.
        validate="one_to_one",
    )
)

Kolona `time` iz `notification_allowed` nije ukljucena prilikom spajanja jer predstavlja trenutak odgovora na popup nakon njegovog prikazivanja.  

Ova informacija nije dostupna u trenutku donosenja odluke o prikazivanju notifikacije i direktno zavisi od korisnicke reakcije, pa bi njeno koriscenje u analizi ili modeliranju uvodilo bias i potencijalni data leakage.

`time` nam govori posle koliko vremena od samog starta testa je korisnik dobio notifikaciju

engagement, economy i progression predstavljaju
- stanje igraca
- nisu posledica same odluke da li ce user allow ili not allow

One opisuju koliko je user investiran, to je okej

In [30]:
# Per-column flag: does the merged frame contain any missing value?
user_base.isnull().any(axis=0)

login_id                 False
registration_date_id     False
registration_channel     False
registration_country     False
payer                    False
                         ...  
milestone_points         False
date_id                  False
group                    False
allowed_notifications     True
date_id_notif             True
Length: 82, dtype: bool

## Exposed notifications

In [31]:
# A user counts as exposed when a popup answer was merged in for them.
user_base["notif_exposed"] = ~user_base["allowed_notifications"].isna()

In [32]:
# Number of users flagged as exposed.
user_base.loc[:, "notif_exposed"].sum()

np.int64(3859)

## Allowed notifications

In [33]:
# Intention-to-treat (ITT) outcome for the whole cohort: a missing answer is
# counted as "not allowed" (0), a recorded True as 1.
popup_answer = user_base["allowed_notifications"].astype("boolean")
user_base["allowed"] = popup_answer.fillna(False).astype(int)

In [34]:
# Shape check plus the distribution of the exposure flag and of the ITT outcome.
print("After merge:", user_base.shape, sep="\n")

print("\nExposure rate:")
exposure_flag = user_base["notif_exposed"]
itt_outcome = user_base["allowed"]
print(exposure_flag.value_counts(), '\n')
print(exposure_flag.value_counts(normalize=True), '\n')

print(itt_outcome.value_counts(), '\n')
print(itt_outcome.value_counts(normalize=True))


display(
    user_base.loc[
        :,
        ["login_id", "group", "notif_exposed", "allowed", "date_id_notif"],
    ]
)

After merge:
(4979, 84)

Exposure rate:
notif_exposed
True     3859
False    1120
Name: count, dtype: int64 

notif_exposed
True     0.775055
False    0.224945
Name: proportion, dtype: float64 

allowed
0    2684
1    2295
Name: count, dtype: int64 

allowed
0    0.539064
1    0.460936
Name: proportion, dtype: float64


Unnamed: 0,login_id,group,notif_exposed,allowed,date_id_notif
0,1,group2_odd,False,0,
1,2,group1_even,True,1,0.0
2,3,group2_odd,True,0,0.0
3,4,group1_even,True,0,0.0
4,5,group2_odd,False,0,
...,...,...,...,...,...
4974,4975,group2_odd,True,1,13.0
4975,4976,group1_even,True,0,13.0
4976,4977,group2_odd,False,0,
4977,4978,group1_even,True,1,13.0


Skoro cetvrtina igraca nikad nije dosla do trenutka kad se popup prikazuje\
Zasto?
- odustali su pre tutorijala, verovatno narocito u group2
- sad cemo to da proverimo


In [35]:
# Per arm: number of users, how many were exposed, and the exposed share.
exposure_by_group = user_base.groupby("group").agg(
    total_users=("notif_exposed", "count"),
    exposed_users=("notif_exposed", "sum"),
    exposure_rate=("notif_exposed", "mean"),
)

display(exposure_by_group)

Unnamed: 0_level_0,total_users,exposed_users,exposure_rate
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
group1_even,2489,2487,0.999196
group2_odd,2490,1372,0.551004


Potvrdjujemo gore navedeno:
- grupa kojoj se odmah pokazuje notif: 99.9% vidi popup
- grupa kojoj se pokazuje notif nakon tutorijala: 55.1%

## SRM - Sample Ratio MisMatch

In [36]:
# Arm sizes; these feed the sample-ratio-mismatch test.
group_counts = user_base["group"].value_counts()
display(group_counts)

print("\nGroup proportions:")
print(group_counts.div(group_counts.sum()))

group
group2_odd     2490
group1_even    2489
Name: count, dtype: int64


Group proportions:
group
group2_odd     0.5001
group1_even    0.4999
Name: count, dtype: float64


### p-value

In [37]:
from scipy.stats import chisquare

# Goodness-of-fit test of the observed arm sizes against an even 50/50 split.
observed = group_counts.values
expected = [group_counts.sum() / 2] * 2

chi2, pval = chisquare(f_obs=observed, f_exp=expected)

print("SRM chi-square test:")
for label, value in (("chi2 = ", chi2), ("p-value = ", pval)):
    print(label, value)

SRM chi-square test:
chi2 =  0.0002008435428800964
p-value =  0.9886928159591263


- Nema dokaza da je split pokvaren, tj mozemo verovati rezultatima eksperimenta

```chi-square test```
- da li se rezultat znacajno razlikuje od onoga sto ocekujemo

```ITT```
- racunamo metriku nad svim userima koji su dodeljeni grupi
    - cak i ako nisu videli popup
- Zasto?
    - meri realan efekat u produkciji

In [38]:
# ITT opt-in rate per arm: every assigned user is in the denominator.
hit_rate_itt = user_base.groupby("group").agg(
    total_users=("allowed", "count"),
    allowed_users=("allowed", "sum"),
    hit_rate=("allowed", "mean"),
)

hit_rate_itt["hit_rate_percent"] = hit_rate_itt["hit_rate"].mul(100)

display(hit_rate_itt)

Unnamed: 0_level_0,total_users,allowed_users,hit_rate,hit_rate_percent
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
group1_even,2489,1434,0.576135,57.613499
group2_odd,2490,861,0.345783,34.578313


Razlika je +23 procentna poena u korist **ranog** popupa

## Statisticka znacajnost razlike

In [39]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# Two-proportion z-test on the ITT opt-in counts; positions 0 and 1 follow
# the row order of hit_rate_itt.
success = hit_rate_itt["allowed_users"].to_numpy()
nobs = hit_rate_itt["total_users"].to_numpy()

zstat, pval = proportions_ztest(count=success, nobs=nobs)

print("z-stat", zstat)
print("p-value", pval)

# Per-arm confidence intervals at alpha = 0.05 using the "normal" method.
ci_group1 = proportion_confint(count=success[0], nobs=nobs[0], alpha=0.05, method="normal")
ci_group2 = proportion_confint(count=success[1], nobs=nobs[1], alpha=0.05, method="normal")

print("\n95% CI group1:", ci_group1)
print("\n95% CI group2:", ci_group2)

itt_rates = hit_rate_itt["hit_rate"]
diff = itt_rates["group1_even"] - itt_rates["group2_odd"]
print("\nHit rate difference (group1 - group2):", diff)

z-stat 16.303930312986633
p-value 9.254491215024052e-60

95% CI group1: (0.5567211492554596, 0.5955488386915071)

95% CI group2: (0.32710165644643946, 0.36446460861380153)

Hit rate difference (group1 - group2): 0.23035186144336284


### Statisticka znacajnost

**Highly statistically significant (p < 0.001)**

---

### Zakljucak

- Prikazivanje popup poruke za dozvolu push notifikacija odmah nakon instalacije znacajno povecava stopu prihvatanja za oko **23 procentna poena** u odnosu na prikazivanje posle tutorijala
- Razlika je **veoma statisticki znacajna (p < 0.001)**

---


Na prethodnom testu analizirani su svi korisnici i rezultat je pokazao **prednost grupe 1**. \
Medjutim, kada posmatramo samo korisnike koji su zaista bili izlozeni popup poruci, obrazac se blago menja i dobijamo nesto drugaciji ishod.

In [40]:
# Restrict to users flagged as exposed; notif_exposed is already boolean,
# so it can be used directly as the row mask.
exposed = user_base[user_base["notif_exposed"]]

# Opt-in rate per arm among exposed users only.
hit_rate_exposed = (
    exposed
    .groupby("group")["allowed"]
    .agg(
        exposed_users="count",
        allowed_users="sum",
        hit_rate="mean",
    )
)

# Same column label as in the ITT table (hit_rate_percent).
hit_rate_exposed["hit_rate_percent"] = hit_rate_exposed["hit_rate"] * 100

display(hit_rate_exposed)

Unnamed: 0_level_0,exposed_users,allowed_users,hit_rate,hit_ratehit_percent
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
group1_even,2487,1434,0.576598,57.659831
group2_odd,1372,861,0.627551,62.755102


In [41]:
from statsmodels.stats.proportion import proportions_ztest, proportion_confint

# Two-proportion z-test on the exposed-only opt-in counts; positions 0 and 1
# follow the row order of hit_rate_exposed.
success = hit_rate_exposed["allowed_users"].to_numpy()
nobs = hit_rate_exposed["exposed_users"].to_numpy()

zstat, pval = proportions_ztest(count=success, nobs=nobs)

print("z-stat", zstat)
print("p-value", pval)

# Per-arm confidence intervals at alpha = 0.05 using the "normal" method.
ci_group1 = proportion_confint(count=success[0], nobs=nobs[0], alpha=0.05, method="normal")
ci_group2 = proportion_confint(count=success[1], nobs=nobs[1], alpha=0.05, method="normal")

print("\n95% CI group1:", ci_group1)
print("\n95% CI group2:", ci_group2)

exposed_rates = hit_rate_exposed["hit_rate"]
diff = exposed_rates["group1_even"] - exposed_rates["group2_odd"]
print("\nHit rate difference (group1 - group2):", diff)

z-stat -3.0860993209454675
p-value 0.0020280106814055746

95% CI group1: (0.557179476094048, 0.5960171463426227)

95% CI group2: (0.6019693470234743, 0.6531326937928522)

Hit rate difference (group1 - group2): -0.05095270918982786


Kada user stigne do **pop-up** posle tutorijala, veca je sansa da klikne Allow
- Zasto?
    - vec je dosta vremena ulozio u gledanje tutorijala
    - svidela mu se igra cim je toliko ostao

---
***Iako popup prikazan nakon tutorijala ima bolji rezultat medju korisnicima koji su mu bili izlozeni (oko 5 procentnih poena veci opt-in), prikazivanje odmah nakon instalacije dopire do skoro duplo vise igraca, sto na kraju dovodi do znatno vece ukupne stope prihvatanja.***

---

## Retention analysis for all users

In [42]:
# Every (login_id, date_id) pair on which the user was active (dau == 1).
# Example: (352, 0) and (352, 1) mean player 352 was active on test days 0 and 1.
active_rows = user_history.loc[user_history["dau"] == 1, ["login_id", "date_id"]]
active_lookup = set(zip(active_rows["login_id"], active_rows["date_id"]))


def is_active(user_id, day):
    """Return True if `user_id` has a dau == 1 record on `day`."""
    return (user_id, day) in active_lookup


# Dn retention flag: was the user active exactly n days after registration_date_id?
# Iterating only the two needed columns avoids building a full row object per
# user, which is what DataFrame.apply(axis=1) does.
for d in [1, 7, 14]:
    user_base[f"retained_d{d}"] = [
        is_active(login_id, registration_day + d)
        for login_id, registration_day in zip(
            user_base["login_id"], user_base["registration_date_id"]
        )
    ]

# Confirm each retention flag only takes boolean values, then preview a few users.
for horizon in (1, 7, 14):
    print(f"Unique retained_d{horizon}:", user_base[f"retained_d{horizon}"].unique())

display(
    user_base.loc[
        :,
        ["login_id", "group", "retained_d1", "retained_d7", "retained_d14"],
    ].head(6)
)

Unique retained_d1: [False  True]
Unique retained_d7: [False  True]
Unique retained_d14: [False  True]


Unnamed: 0,login_id,group,retained_d1,retained_d7,retained_d14
0,1,group2_odd,False,False,False
1,2,group1_even,False,False,False
2,3,group2_odd,True,True,True
3,4,group1_even,False,False,False
4,5,group2_odd,False,False,False
5,6,group1_even,True,False,False


In [43]:
# Mean of each retention flag per arm (as a fraction), plus the arm size.
retention_summary = user_base.groupby("group").agg(
    **{f"d{day}_retention": (f"retained_d{day}", "mean") for day in (1, 7, 14)},
    users=("login_id", "count"),
)

# Same table with the three rates scaled to percent for display.
rate_columns = ["d1_retention", "d7_retention", "d14_retention"]
retention_summary_percent = retention_summary.assign(
    **{col: retention_summary[col] * 100 for col in rate_columns}
)

display(retention_summary_percent)

Unnamed: 0_level_0,d1_retention,d7_retention,d14_retention,users
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
group1_even,42.507031,23.94536,17.95902,2489
group2_odd,45.863454,25.140562,18.554217,2490


## Statisticka znacajnost razlike za all users

In [44]:
from statsmodels.stats.proportion import proportions_ztest

def retention_test(metric, df=None):
    """Two-proportion z-test of a retention flag between the two experiment arms.

    Parameters
    ----------
    metric : str
        Name of a retention flag column, e.g. "retained_d7".
    df : pandas.DataFrame, optional
        Frame holding a "group" column and `metric`. When omitted, the
        notebook-level `user_base` is used (looked up at call time), so
        existing `retention_test(metric)` calls keep working.

    Returns
    -------
    tuple
        (diff, z, p): rate(group1_even) - rate(group2_odd), followed by the
        statistic and p-value returned by `proportions_ztest`.
    """
    data = user_base if df is None else df

    # Build each arm's mask once and reuse it for both counts.
    in_group1 = data["group"] == "group1_even"
    in_group2 = data["group"] == "group2_odd"

    success = [
        data.loc[in_group1, metric].sum(),
        data.loc[in_group2, metric].sum(),
    ]
    nobs = [in_group1.sum(), in_group2.sum()]

    z, p = proportions_ztest(success, nobs)
    diff = success[0] / nobs[0] - success[1] / nobs[1]

    return diff, z, p

# Report difference, z statistic and p-value for each retention horizon.
for metric in ("retained_d1", "retained_d7", "retained_d14"):
    diff, z, p = retention_test(metric)
    print(metric)
    for label, value in (
        ("Difference (group1 - group2):", diff),
        ("z:", z),
        ("p-value:", p),
    ):
        print(label, value)
    print("-" * 40)

retained_d1
Difference (group1 - group2): -0.033564228791421236
z: -2.384537979445472
p-value: 0.017100582764729054
----------------------------------------
retained_d7
Difference (group1 - group2): -0.011952026668344745
z: -0.9798701729005213
p-value: 0.32715020785273863
----------------------------------------
retained_d14
Difference (group1 - group2): -0.005951971808487472
z: -0.5435817960608814
p-value: 0.5867292894783075
----------------------------------------


popup kasnije ima:
- malo bolji retention : 1-3pp\
  
popup odmah ima:
- +23pp veci allow rate

***Samo je razlika u D1 retention-u statisticki znacajna!***\
***Prikazivanje popup poruke odmah nakon instalacije znacajno povecava opt-in stopu za oko 23 procentna poena. Iako blago smanjuje Day 1 retention (za oko 3pp), ne postoji statisticki znacajan uticaj na srednjorocni i dugorocni retention (D7, D14). U principu, rano trazenje dozvole maksimizuje domet notifikacija uz minimalne dugorocne negativne efekte.***

## Retention analysis for exposed users

In [45]:
# Restrict to users flagged as exposed, then average each retention flag per arm.
exposed = user_base[user_base["notif_exposed"].eq(True)]

retention_summary = exposed.groupby("group").agg(
    **{f"d{day}_retention": (f"retained_d{day}", "mean") for day in (1, 7, 14)},
    users=("login_id", "count"),
)

# Same table with the three rates scaled to percent for display.
rate_columns = ["d1_retention", "d7_retention", "d14_retention"]
retention_summary_percent = retention_summary.assign(
    **{col: retention_summary[col] * 100 for col in rate_columns}
)

display(retention_summary_percent)

Unnamed: 0_level_0,d1_retention,d7_retention,d14_retention,users
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
group1_even,42.541214,23.964616,17.973462,2487
group2_odd,80.466472,45.626822,33.673469,1372


In [46]:
from statsmodels.stats.proportion import proportions_ztest

def retention_test(metric, df=None):
    """Two-proportion z-test of a retention flag between the two experiment arms.

    Parameters
    ----------
    metric : str
        Name of a retention flag column, e.g. "retained_d7".
    df : pandas.DataFrame, optional
        Frame holding a "group" column and `metric`. When omitted, the
        notebook-level `exposed` frame is used (looked up at call time), so
        existing `retention_test(metric)` calls keep working.

    Returns
    -------
    tuple
        (diff, z, p): rate(group1_even) - rate(group2_odd), followed by the
        statistic and p-value returned by `proportions_ztest`.
    """
    data = exposed if df is None else df

    # Build each arm's mask once and reuse it for both counts.
    in_group1 = data["group"] == "group1_even"
    in_group2 = data["group"] == "group2_odd"

    success = [
        data.loc[in_group1, metric].sum(),
        data.loc[in_group2, metric].sum(),
    ]
    nobs = [in_group1.sum(), in_group2.sum()]

    z, p = proportions_ztest(success, nobs)
    diff = success[0] / nobs[0] - success[1] / nobs[1]

    return diff, z, p

# Report difference, z statistic and p-value for each retention horizon.
for metric in ("retained_d1", "retained_d7", "retained_d14"):
    diff, z, p = retention_test(metric)
    print(metric)
    for label, value in (
        ("Difference (group1 - group2):", diff),
        ("z:", z),
        ("p-value:", p),
    ):
        print(label, value)
    print("-" * 40)

retained_d1
Difference (group1 - group2): -0.3792525798877193
z: -22.72019818905014
p-value: 2.829285325293234e-114
----------------------------------------
retained_d7
Difference (group1 - group2): -0.21662206154217678
z: -13.847276185855643
p-value: 1.3212804698122624e-43
----------------------------------------
retained_d14
Difference (group1 - group2): -0.15700007385342554
z: -11.001685978516134
p-value: 3.7505319382090987e-28
----------------------------------------


***Kod exposed usera, situacija je nesto drugacija. D1, D7 i D14 su takodje statisticki znacajni!***\
Sta je ovde problem?
- Ovo vise nije ***random*** poredjenje!
- exposed-only uvodi ***selection bias***
- do pop-upa dolaze samo:
    - motivisaniji, uporniji i angazovaniji igraci
    - nazvacemo ih ***elitni igraci***

A sta se onda desava kod group1?
- ***Svi vide popup odmah, i motivisani i manje angazovani igraci!***

***Rani popup:***
- hvata mnogo vise korisnika
- ne narusava retention znacajno
- donosi vise notifikacija

***Kasni popup:***
- gubi skoro polovinu korisnika pre popup-a
- ali oni koji stignu su motivisaniji

## Segment AB Analysis
### Da li early popup pomaze svima isto ili samo nekim grupama?

In [47]:
# Peek at one row to see which columns are available for segmentation.
user_base.iloc[:1]

Unnamed: 0,login_id,registration_date_id,registration_channel,registration_country,payer,dau,sessions_count,playtime,last_login_day,days_active_last_7_days,...,milestone_points,date_id,group,allowed_notifications,date_id_notif,notif_exposed,allowed,retained_d1,retained_d7,retained_d14
0,1,0,Organic,Canada,False,1,3,1351,0,1,...,0,0,group2_odd,,,False,0,False,False,False


In [48]:
import pandas as pd
import numpy as np

def segment_ab_table(df, segment_col, group1="group1_even", group2="group2_odd"):
    """Compare the opt-in rate of the two experiment arms inside each segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `segment_col`, a "group" column and a numeric "allowed"
        column.
    segment_col : str
        Column whose distinct values define the segments.
    group1, group2 : str, optional
        Labels of the two arms in the "group" column; `diff_pp` is computed
        as group1 minus group2.

    Returns
    -------
    pandas.DataFrame
        One row per segment with columns segment, n_group1, n_group2,
        hit_rate_group1, hit_rate_group2 and diff_pp (percentage points),
        sorted by diff_pp descending. A segment that lacks one arm gets a
        count of 0 and NaN for that arm's rate and for diff_pp.
    """
    columns = [
        "segment",
        "n_group1",
        "n_group2",
        "hit_rate_group1",
        "hit_rate_group2",
        "diff_pp",
    ]

    out = []
    for seg, sub in df.groupby(segment_col):
        # reindex guarantees one row per arm even when an arm has no users
        # in this segment, instead of failing on a missing label.
        g = (
            sub.groupby("group")["allowed"]
            .agg(["count", "mean"])
            .reindex([group1, group2])
        )

        n1, n2 = g["count"].fillna(0).astype(int)
        p1, p2 = g["mean"]

        out.append({
            "segment": seg,
            "n_group1": n1,
            "n_group2": n2,
            "hit_rate_group1": p1,
            "hit_rate_group2": p2,
            "diff_pp": (p1 - p2) * 100,
        })

    # Passing the column list keeps the sort key present for empty input.
    return pd.DataFrame(out, columns=columns).sort_values("diff_pp", ascending=False)

# ITT segment breakdowns over all users: channel, payer status, then top countries.
for heading, segment_col in (
    ("By registration_channel:", "registration_channel"),
    ("\nBy payer:", "payer"),
):
    print(heading)
    display(segment_ab_table(user_base, segment_col))

# Limit the country view to the ten most frequent registration countries.
top_countries = user_base["registration_country"].value_counts().index[:10]
by_country = segment_ab_table(
    user_base[user_base["registration_country"].isin(top_countries)],
    "registration_country",
)
print("\nBy top countries:")
display(by_country)

By registration_channel:


Unnamed: 0,segment,n_group1,n_group2,hit_rate_group1,hit_rate_group2,diff_pp
1,Paid,1218,1221,0.609195,0.366912,24.228304
0,Organic,1271,1269,0.544453,0.325453,21.900007



By payer:


Unnamed: 0,segment,n_group1,n_group2,hit_rate_group1,hit_rate_group2,diff_pp
0,False,2461,2463,0.573751,0.34186,23.189099
1,True,28,27,0.785714,0.703704,8.201058



By top countries:


Unnamed: 0,segment,n_group1,n_group2,hit_rate_group1,hit_rate_group2,diff_pp
5,Japan,124,149,0.653226,0.355705,29.752111
0,Brazil,405,387,0.562963,0.302326,26.063738
2,France,155,133,0.619355,0.360902,25.845258
1,Canada,108,90,0.601852,0.355556,24.62963
3,Germany,522,512,0.609195,0.376953,23.224228
8,United States,669,665,0.54858,0.320301,22.827922
7,United Kingdom,256,282,0.558594,0.365248,19.334552
4,Italy,91,115,0.56044,0.382609,17.783086
6,Serbia,159,157,0.534591,0.369427,16.516444


## Zakljucak

Analiza po registracionim kanalima, payer statusu i glavnim trzistima pokazuje da **rani popup konzistentno povecava opt-in rate u svim posmatranim segmentima**.

Ovo ukazuje na to da efekat nije slucajan niti ogranicen na pojedinacna trzista ili tipove korisnika.

### Poredjenje:

- Po registracionim kanalima:
  - Paid korisnici: **+24.2pp**
  - Organic korisnici: **+21.9pp**

- Po payer statusu:
  - Non-payers: **+23.2pp** 
  - Payers: **+8.2pp**

- Po trzistima:
  - Sva glavna trzista pokazuju pozitivan uplift (16pp – 30pp)
  - Najveci efekti zabelezeni su u Japanu, Brazilu i Francuskoj

### Tumačenje:

Efekat ranog popup-a je globalan i robustan \
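Payer korisnici su vec visoko angažovani i samim tim verovatno manje osetljivi na trenutak prikazivanja dozvole; ipak, payer segment je veoma mali (28 i 27 korisnika po grupi), pa je procena od +8.2pp nepouzdana.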

Hajde da proverimo kakva je situacija sa exposed userima

In [49]:
def segment_ab_table(df, segment_col, group1="group1_even", group2="group2_odd"):
    """Compare the opt-in rate of the two experiment arms inside each segment.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `segment_col`, a "group" column and a numeric "allowed"
        column.
    segment_col : str
        Column whose distinct values define the segments.
    group1, group2 : str, optional
        Labels of the two arms in the "group" column; `diff_pp` is computed
        as group1 minus group2.

    Returns
    -------
    pandas.DataFrame
        One row per segment with columns segment, n_group1, n_group2,
        hit_rate_group1, hit_rate_group2 and diff_pp (percentage points),
        sorted by diff_pp descending. A segment that lacks one arm gets a
        count of 0 and NaN for that arm's rate and for diff_pp.
    """
    columns = [
        "segment",
        "n_group1",
        "n_group2",
        "hit_rate_group1",
        "hit_rate_group2",
        "diff_pp",
    ]

    out = []
    for seg, sub in df.groupby(segment_col):
        # reindex guarantees one row per arm even when an arm has no users
        # in this segment, instead of failing on a missing label.
        g = (
            sub.groupby("group")["allowed"]
            .agg(["count", "mean"])
            .reindex([group1, group2])
        )

        n1, n2 = g["count"].fillna(0).astype(int)
        p1, p2 = g["mean"]

        out.append({
            "segment": seg,
            "n_group1": n1,
            "n_group2": n2,
            "hit_rate_group1": p1,
            "hit_rate_group2": p2,
            "diff_pp": (p1 - p2) * 100,
        })

    # Passing the column list keeps the sort key present for empty input.
    return pd.DataFrame(out, columns=columns).sort_values("diff_pp", ascending=False)

# Segment breakdowns restricted to exposed users: channel, payer status, then top countries.
for heading, segment_col in (
    ("By registration_channel:", "registration_channel"),
    ("\nBy payer:", "payer"),
):
    print(heading)
    display(segment_ab_table(exposed, segment_col))

# Limit the country view to the ten most frequent registration countries.
top_countries = exposed["registration_country"].value_counts().index[:10]
by_country = segment_ab_table(
    exposed[exposed["registration_country"].isin(top_countries)],
    "registration_country",
)
print("\nBy top countries:")
display(by_country)

By registration_channel:


Unnamed: 0,segment,n_group1,n_group2,hit_rate_group1,hit_rate_group2,diff_pp
1,Paid,1217,710,0.609696,0.630986,-2.128994
0,Organic,1270,662,0.544882,0.623867,-7.898518



By payer:


Unnamed: 0,segment,n_group1,n_group2,hit_rate_group1,hit_rate_group2,diff_pp
1,True,28,23,0.785714,0.826087,-4.037267
0,False,2459,1349,0.574217,0.624166,-4.994889



By top countries:


Unnamed: 0,segment,n_group1,n_group2,hit_rate_group1,hit_rate_group2,diff_pp
5,Japan,124,88,0.653226,0.602273,5.095308
2,France,155,80,0.619355,0.6,1.935484
0,Brazil,404,200,0.564356,0.585,-2.064356
3,Germany,522,297,0.609195,0.649832,-4.063625
8,United States,668,355,0.549401,0.6,-5.05988
1,Canada,108,48,0.601852,0.666667,-6.481481
7,United Kingdom,256,157,0.558594,0.656051,-9.745721
4,Italy,91,64,0.56044,0.6875,-12.706044
6,Serbia,159,83,0.534591,0.698795,-16.420399


U ovom slucaju prednost ide ka grupi 2 (kasni popup), medjutim dolazi do uvodjenja ***selection biasa.***

Naime, korisnici u grupi 2 postaju exposed tek nakon sto zavrse tutorijal, sto znaci da su u proseku angazovaniji i motivisaniji od pocetne populacije. Zbog toga exposed podskup vise ne predstavlja uporedivu i nasumicno podeljenu grupu korisnika.

***Iz tog razloga, smatram da je pravednije i metodoloski ispravnije posmatrati ITT (Intent-To-Treat) rezultate, jer oni zadrzavaju randomizaciju i realno odrazavaju efekat promene na celokupnu populaciju.***

In [50]:
# Persist the per-user analysis table.
import os

USER_BASE_OUT_PATH = "../data/processed/user_base.csv"
# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(USER_BASE_OUT_PATH), exist_ok=True)
user_base.to_csv(USER_BASE_OUT_PATH, index=False)