In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf

# Load the event log and reshape it so that each row corresponds to an email.
# NOTE(review): the raw file presumably holds one row per logged event - confirm.
events_raw = pd.read_csv("../data/masterfile-study2-bc.csv")
print(events_raw.Group.unique())

# Identifier and demographic columns that together define one email record.
id_cols = ["Simulation Name", "PID", "read", "country", "age_bin",
           "MANIPULATION", "Group", "scenario"]

# Count how often each event type occurred per email.
# NOTE(review): pivot_table groups on id_cols, so rows with a missing value in
# any of them are presumably dropped (pandas default) - verify none are lost.
event_counts = (
    events_raw
    .assign(dv=1)              # constant 1 per row so "sum" counts events; raw frame stays untouched
    .pivot_table(
        index=id_cols,         # identifiers for each email + demographics
        columns="eventName",   # the different event/action types
        values="dv",
        aggfunc="sum",
    )
    .fillna(0)                 # event types that never occurred for an email
)

# The models fitted below (smf.logit) only accept outcomes in [0, 1]; an event
# logged more than once for the same email would otherwise give a count > 1.
# Capping at 1 turns every event column into an "occurred at least once" flag.
df = event_counts.clip(upper=1).reset_index()

df

['C' 'B']


eventName,Simulation Name,PID,read,country,age_bin,MANIPULATION,Group,scenario,CredSupplied,EmailLinkClicked,MessageDeleted,MessageForwarded,MessageRead,MessageReplied,OutOfOffice,ReportedEmail,SuccessfullyDeliveredEmail
0,EWeek 1 - B01 - S,13,5,US United States,25 - 34,PC * MS,C,B01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,EWeek 1 - B01 - S,14,5,US United States,18 - 24,PC * MS,C,B01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,EWeek 1 - B01 - S,15,5,US United States,25 - 34,PC * MS,C,B01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,EWeek 1 - B01 - S,16,5,US United States,35 - 44,PC * MS,C,B01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,EWeek 1 - B01 - S,17,5,US United States,18 - 24,PC * MS,C,B01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1299,NWeek 4- P03 - X,28,4,CG Democratic Republic of Congo,18 - 24,C * DS,B,P03,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1300,NWeek 4- P03 - X,29,4,BY Burundi,18 - 24,C * DS,B,P03,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1301,NWeek 4- Y02 - A,82,4,TZ Tanzania,25 - 34,PC * DS,B,Y02,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1302,NWeek 4- Y02 - A,83,4,KE Kenya,25 - 34,PC * DS,B,Y02,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [2]:
# Subsections 5.2.1 and 5.2.2: Logistic Regression - Groups B and C together
# Table 15
#
# For every outcome column, fit a logit model with MANIPULATION as the only
# (categorical) predictor, then print the model summary followed by the odds
# ratios and their 95% confidence intervals.

dvs = ['CredSupplied', 'EmailLinkClicked', 'MessageDeleted', 'MessageRead','MessageReplied', 'ReportedEmail']

# The loop only reads the data, so a single working copy is enough.
dfm = df.copy()

for dv in dvs:

    # Build the formula once and reuse it for the header line and the fit.
    formula = dv + ' ~ C(MANIPULATION)'
    print(formula)

    # Fit logistic regression model with MANIPULATION as categorical predictor
    model = smf.logit(formula, data=dfm).fit()

    # Display model summary
    print(model.summary())

    # Coefficients are on the log-odds scale; exponentiate them (and the
    # default 95% confidence bounds) to obtain odds ratios.
    odds_ratios = np.exp(model.params)
    conf = np.exp(model.conf_int())
    conf.columns = ['2.5%', '97.5%']

    results = pd.concat([odds_ratios.rename('OR'), conf], axis=1)
    print("\nOdds Ratios with 95% Confidence Intervals:")
    print(results)

CredSupplied ~ C(MANIPULATION)
Optimization terminated successfully.
         Current function value: 0.125885
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:           CredSupplied   No. Observations:                 1304
Model:                          Logit   Df Residuals:                     1300
Method:                           MLE   Df Model:                            3
Date:                Thu, 11 Sep 2025   Pseudo R-squ.:                 0.08167
Time:                        19:21:54   Log-Likelihood:                -164.15
converged:                       True   LL-Null:                       -178.75
Covariance Type:            nonrobust   LLR p-value:                 2.035e-06
                                  coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------------
Intercept                      -2.5324     

In [3]:
# Subsections 5.3.1 and 5.3.2: Binomial Logistic Regression - Groups B and C together
# Table 16
#
# For every outcome column, fit a logit model with `read` as the only
# (categorical) predictor, then print the model summary followed by the odds
# ratios and their 95% confidence intervals.

dvs = ['CredSupplied', 'EmailLinkClicked', 'MessageDeleted', 'MessageRead',
       'MessageReplied', 'ReportedEmail']

# The loop only reads the data, so a single working copy is enough.
dfm = df.copy()

for dv in dvs:
    # Print the formula first so each of the back-to-back summaries is
    # labelled, matching the Table 15 cell.
    formula = dv + ' ~ C(read)'
    print(formula)

    # Fit logistic regression model with read as categorical predictor
    model = smf.logit(formula, data=dfm).fit()

    # Display model summary
    print(model.summary())

    # Coefficients are on the log-odds scale; exponentiate them (and the
    # default 95% confidence bounds) to obtain odds ratios.
    odds_ratios = np.exp(model.params)
    conf = np.exp(model.conf_int())
    conf.columns = ['2.5%', '97.5%']

    results = pd.concat([odds_ratios.rename('OR'), conf], axis=1)
    print("\nOdds Ratios with 95% Confidence Intervals:")
    print(results)

Optimization terminated successfully.
         Current function value: 0.136841
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:           CredSupplied   No. Observations:                 1304
Model:                          Logit   Df Residuals:                     1302
Method:                           MLE   Df Model:                            1
Date:                Thu, 11 Sep 2025   Pseudo R-squ.:                0.001745
Time:                        19:21:55   Log-Likelihood:                -178.44
converged:                       True   LL-Null:                       -178.75
Covariance Type:            nonrobust   LLR p-value:                    0.4296
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
Intercept       -3.6073      0.262    -13.785      0.000      -4.120      -3.094
C(read)[T.5]     0.2595