In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy import stats

# Read the master CSV; per the original note, each row corresponds to one event.
df = pd.read_csv("../data/masterfile-study1-a.csv")

# Sanity check: print the distinct values found in the Group and country columns.
for column_name in ("Group", "country"):
    print(df[column_name].unique())

# Keep the frame as the cell's last expression so the notebook renders it.
df

['A']
['CG Democratic Republic of Congo' 'MW Malawi' 'SS South Sudan'
 'BY Burundi']


Unnamed: 0,Simulation Name,PID,eventName,eventDateTime,isPartOfCompromise,order,scenario,infoRequested,emailSubject,MANIPULATION,Group,country,age_bin,read,EventDate,EventTime,EventDay,EventTimeBin,EventHour,EventTimeBin_OneHour
0,DSPC3,97,MessageRead,2023-06-25 13:57:00,False,3,Y06,S,WSU Hiring,PC * DS,A,CG Democratic Republic of Congo,35 - 44,1,2023-06-25,13:57:00,Sunday,Afternoon,13,13:00–13:59
1,MSC3,97,MessageRead,2023-06-25 13:56:00,False,3,G06,A,FINAL NOTICE: Rent,C * MS,A,CG Democratic Republic of Congo,35 - 44,1,2023-06-25,13:56:00,Sunday,Afternoon,13,13:00–13:59
2,DSC2,97,MessageRead,2023-06-25 13:56:00,False,2,P10,X,Lost Package,C * DS,A,CG Democratic Republic of Congo,35 - 44,1,2023-06-25,13:56:00,Sunday,Afternoon,13,13:00–13:59
3,MSPC3,86,MessageDeleted,2023-06-10 00:23:00,False,3,B03,A,OVERTIME BONUS,PC * MS,A,CG Democratic Republic of Congo,25 - 34,1,2023-06-10,00:23:00,Saturday,Night,0,00:00–00:59
4,MSPC1,86,MessageDeleted,2023-06-10 00:23:00,False,1,B07,S,HIRING UPDATE,PC * MS,A,CG Democratic Republic of Congo,25 - 34,1,2023-06-10,00:23:00,Saturday,Night,0,00:00–00:59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323,MSC1,101,SuccessfullyDeliveredEmail,2023-05-27 15:12:00,False,1,G07,B,UPDATE: Your Award,C * MS,A,SS South Sudan,35 - 44,1,2023-05-27,15:12:00,Saturday,Afternoon,15,15:00–15:59
324,MSC1,102,SuccessfullyDeliveredEmail,2023-05-27 15:12:00,False,1,G07,B,UPDATE: Your Award,C * MS,A,CG Democratic Republic of Congo,35 - 44,1,2023-05-27,15:12:00,Saturday,Afternoon,15,15:00–15:59
325,MSC1,103,SuccessfullyDeliveredEmail,2023-05-27 15:12:00,False,1,G07,B,UPDATE: Your Award,C * MS,A,CG Democratic Republic of Congo,25 - 34,3,2023-05-27,15:12:00,Saturday,Afternoon,15,15:00–15:59
326,MSC1,90,SuccessfullyDeliveredEmail,2023-05-27 15:12:00,False,1,G07,B,UPDATE: Your Award,C * MS,A,BY Burundi,25 - 34,1,2023-05-27,15:12:00,Saturday,Afternoon,15,15:00–15:59


In [2]:
# Subsection 5.4.2: Chi-squared test - Group A ONLY - Compromised and Country of origin
# Based on event rows
#
# Tests independence between `country` and a binary "event is a MessageRead" flag,
# using one observation per event row of `df`.
# NOTE(review): the heading says "Compromised", but the outcome built below comes from
# eventName == 'MessageRead', not from `isPartOfCompromise` - confirm which is intended.

dfm = df.copy()
# Convert eventName to binary outcome (1 = MessageRead, 0 = otherwise).
# This overwrites the `read` column that came from the CSV, but only in the copy `dfm`;
# `df` itself keeps its original `read` values.
dfm['read'] = (dfm['eventName'] == 'MessageRead').astype(int)

# Build contingency table (rows: country, columns: read flag 0/1)
contingency = pd.crosstab(dfm["country"], dfm["read"])

print("Contingency Table:")
print(contingency)

# Run Chi-square test
chi2, p, dof, expected = chi2_contingency(contingency)

print("\nChi-square test results")
print(f"Chi2 statistic = {chi2:.4f}")
print(f"Degrees of freedom = {dof}")
print(f"P-value = {p:.6f}")
print("\nExpected counts:")
print(pd.DataFrame(expected, index=contingency.index, columns=contingency.columns))

# Validity check: the chi-square p-value relies on a large-sample approximation.
# The commonly quoted guideline (also given in the scipy documentation) is that
# expected frequencies should be at least 5 in every cell. Report when that does
# not hold so the p-value above is not read without the caveat.
n_low_expected = int((expected < 5).sum())
if n_low_expected:
    pct_low_expected = 100 * n_low_expected / expected.size
    print(
        f"\nWARNING: {n_low_expected} of {expected.size} expected counts "
        f"({pct_low_expected:.0f}%) are below 5; the chi-square approximation "
        "may be unreliable for this table."
    )

Contingency Table:
read                               0   1
country                                 
BY Burundi                        12   8
CG Democratic Republic of Congo  205  53
MW Malawi                         26  12
SS South Sudan                    12   0

Chi-square test results
Chi2 statistic = 9.4212
Degrees of freedom = 3
P-value = 0.024185

Expected counts:
read                                      0          1
country                                               
BY Burundi                        15.548780   4.451220
CG Democratic Republic of Congo  200.579268  57.420732
MW Malawi                         29.542683   8.457317
SS South Sudan                     9.329268   2.670732


In [3]:
# Subsection 5.3.1: Spearman correlation - Group A ONLY - MessageRead and English Reading Proficiency (ERP)
# Statistics below are computed on email-level rows (one row per Simulation Name x PID).

# Constant indicator so that summing it per event type yields event counts.
df["dv"] = 1

# Keys that identify one email plus the demographic columns carried along with it.
email_keys = ["Simulation Name", "PID", "read", "country", "age_bin"]

# Wide table: one column per eventName holding the summed indicator for that email.
event_counts = df.pivot_table(
    index=email_keys,
    columns="eventName",
    values="dv",
    aggfunc="sum",
)
# Event types that never occurred for an email come out as NaN; treat them as 0.
df2 = event_counts.reset_index().fillna(0)

# `read` is presumably the ERP score named in the heading - TODO confirm.
erp_score = df2["read"]
# Per-email sum of MessageRead events.
# NOTE(review): this is a sum, so it could exceed 1 if an email has several MessageRead
# rows; the point-biserial label assumes a 0/1 variable - confirm.
message_read_count = df2["MessageRead"]

rho_email, p_rho = stats.spearmanr(erp_score, message_read_count)
r_email, p_email = stats.pearsonr(erp_score, message_read_count)
# (Equivalent test to the Pearson r above)
r_pb, p_pb = stats.pointbiserialr(message_read_count, erp_score)

print(f"Spearman ρ (email-level): ρ = {rho_email:.3f}, p = {p_rho:.4g}")
print(f"Email-level Pearson r (== point-biserial): r = {r_email:.3f}, p = {p_email:.4g}")
print(f"Point-biserial (scipy): r_pb = {r_pb:.3f}, p = {p_pb:.4g}")

Spearman ρ (email-level): ρ = 0.157, p = 0.01757
Email-level Pearson r (== point-biserial): r = 0.133, p = 0.04492
Point-biserial (scipy): r_pb = 0.133, p = 0.04492
