In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency


# Read the master CSV into a DataFrame; every row is a single logged event
df = pd.read_csv("../data/masterfile-study2-bc.csv")

# List the distinct experimental groups and participant countries in the file
print(df["Group"].unique())
print(df["country"].unique())

# Last expression of the cell: show the loaded frame as the cell output
df

['C' 'B']
['US United States' 'TZ Tanzania' 'KE Kenya'
 'CG Democratic Republic of Congo' 'BY Burundi' 'RW Rwanda' 'UG Uganda'
 'SS South Sudan']


Unnamed: 0,Simulation Name,PID,eventName,eventDateTime,isPartOfCompromise,order,scenario,infoRequested,emailSubject,MANIPULATION,Group,country,age_bin,read,EventDate,EventTime,EventDay,EventTimeBin,EventHour,EventTimeBin_OneHour
0,EWeek 4 - B01 - S,1,MessageRead,2024-09-25 09:32:07,False,4,B01,S,RESUME: Update Needed,PC * MS,C,US United States,18 - 24,5,2024-09-25,09:32:07,Wednesday,Morning,9,09:00–09:59
1,EWeek 4 - B01 - S,1,MessageDeleted,2024-09-25 09:32:07,False,4,B01,S,RESUME: Update Needed,PC * MS,C,US United States,18 - 24,5,2024-09-25,09:32:07,Wednesday,Morning,9,09:00–09:59
2,EWeek 4 - B01 - S,2,SuccessfullyDeliveredEmail,2024-09-23 10:10:33,False,4,B01,S,RESUME: Update Needed,PC * MS,C,US United States,18 - 24,5,2024-09-23,10:10:33,Monday,Morning,10,10:00–10:59
3,EWeek 4 - B01 - S,1,SuccessfullyDeliveredEmail,2024-09-23 10:10:32,False,4,B01,S,RESUME: Update Needed,PC * MS,C,US United States,18 - 24,5,2024-09-23,10:10:32,Monday,Morning,10,10:00–10:59
4,EWeek 4 - B01 - S,3,SuccessfullyDeliveredEmail,2024-09-23 10:10:31,False,4,B01,S,RESUME: Update Needed,PC * MS,C,US United States,35 - 44,5,2024-09-23,10:10:31,Monday,Morning,10,10:00–10:59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2245,NWeek 2 - Y12 - B,212,SuccessfullyDeliveredEmail,2024-03-27 16:01:00,False,2,Y12,B,Update Your Information,PC * DS,B,CG Democratic Republic of Congo,18 - 24,4,2024-03-27,16:01:00,Wednesday,Afternoon,16,16:00–16:59
2246,NWeek 1 - Y12 - B,207,MessageRead,2024-03-26 19:21:00,False,1,Y12,B,Update Your Information,PC * DS,B,KE Kenya,35 - 44,4,2024-03-26,19:21:00,Tuesday,Evening,19,19:00–19:59
2247,NWeek 1 - Y12 - B,207,MessageDeleted,2024-03-26 19:21:00,False,1,Y12,B,Update Your Information,PC * DS,B,KE Kenya,35 - 44,4,2024-03-26,19:21:00,Tuesday,Evening,19,19:00–19:59
2248,NWeek 1 - Y12 - B,207,SuccessfullyDeliveredEmail,2024-03-21 21:04:00,False,1,Y12,B,Update Your Information,PC * DS,B,KE Kenya,35 - 44,4,2024-03-21,21:04:00,Thursday,Night,21,21:00–21:59


In [2]:
# Subsection 5.4.1: Chi-squared test of independence - Groups B and C pooled
# Variables: age_bin (rows) vs. whether the event row is a MessageRead (columns)
#
# NOTE(review): rows are events, not participants, so one participant can
# contribute several rows - confirm event-level counts are the intended unit.

# Copy of df whose `read` column is replaced by a 0/1 indicator:
# 1 when eventName is 'MessageRead', 0 for every other event type
dfm = df.assign(read=df['eventName'].eq('MessageRead').astype(int))

# Cross-tabulate row counts: one row per age bin, one column per indicator value
contingency = pd.crosstab(index=dfm["age_bin"], columns=dfm["read"])

print("Contingency Table:", contingency, sep="\n")

# Unpack the statistic, p-value, degrees of freedom and expected-count array
chi2, p, dof, expected = chi2_contingency(contingency)

# NOTE(review): the recorded output has an expected count below 1 for the
# '65 - 74' bin; the chi-square approximation may be unreliable with cells
# that sparse - consider merging the oldest bins. TODO confirm.
print(
    "\nChi-square test results",
    f"Chi2 statistic = {chi2:.4f}",
    f"Degrees of freedom = {dof}",
    f"P-value = {p:.6f}",
    "\nExpected counts:",
    sep="\n",
)
# Re-label the expected-count array with the same row/column labels as the table
print(pd.DataFrame(expected, index=contingency.index, columns=contingency.columns))

Contingency Table:
read       0    1
age_bin          
18 - 24  947  355
25 - 34  383  101
35 - 44  208   52
45 - 54  122   32
55 - 64   35   11
65 - 74    4    0

Chi-square test results
Chi2 statistic = 14.1460
Degrees of freedom = 5
P-value = 0.014708

Expected counts:
read              0           1
age_bin                        
18 - 24  983.154667  318.845333
25 - 34  365.473778  118.526222
35 - 44  196.328889   63.671111
45 - 54  116.287111   37.712889
55 - 64   34.735111   11.264889
65 - 74    3.020444    0.979556


In [3]:
# Subsection 5.4.1: Chi-squared test of independence - Group B ONLY
# Variables: age_bin (rows) vs. compromised indicator (columns)

# Keep every row whose Group is not 'C' and work on an independent copy
dfm = df.loc[df['Group'].ne('C')].copy()
print(dfm['Group'].unique())

# Binary outcome per event row: 1 = eventName is 'CredSupplied', 0 = otherwise
dfm['compromised'] = dfm['eventName'].eq('CredSupplied').astype(int)

# Cross-tabulate row counts: one row per age bin, one column per indicator value
contingency = pd.crosstab(index=dfm["age_bin"], columns=dfm["compromised"])

print("Contingency Table:", contingency, sep="\n")

# Unpack the statistic, p-value, degrees of freedom and expected-count array
chi2, p, dof, expected = chi2_contingency(contingency)

print(
    "\nChi-square test results",
    f"Chi2 statistic = {chi2:.4f}",
    f"Degrees of freedom = {dof}",
    f"P-value = {p:.6f}",
    "\nExpected counts:",
    sep="\n",
)
# Re-label the expected-count array with the same row/column labels as the table
print(pd.DataFrame(expected, index=contingency.index, columns=contingency.columns))

['B']
Contingency Table:
compromised    0   1
age_bin             
18 - 24      427  13
25 - 34      235   1
35 - 44       91   1
45 - 54       47   0

Chi-square test results
Chi2 statistic = 6.8152
Degrees of freedom = 3
P-value = 0.078028

Expected counts:
compromised           0         1
age_bin                          
18 - 24      431.901840  8.098160
25 - 34      231.656442  4.343558
35 - 44       90.306748  1.693252
45 - 54       46.134969  0.865031


In [4]:
# Subsection 5.4.1: Chi-squared test of independence - Groups B and C pooled
# Variables: age_bin (rows) vs. compromised indicator (columns)

# Copy of df with an extra 0/1 column:
# 1 when eventName is 'CredSupplied', 0 for every other event type
dfm = df.assign(compromised=df['eventName'].eq('CredSupplied').astype(int))

# Cross-tabulate row counts: one row per age bin, one column per indicator value
contingency = pd.crosstab(index=dfm["age_bin"], columns=dfm["compromised"])

print("Contingency Table:", contingency, sep="\n")

# Unpack the statistic, p-value, degrees of freedom and expected-count array
chi2, p, dof, expected = chi2_contingency(contingency)

print(
    "\nChi-square test results",
    f"Chi2 statistic = {chi2:.4f}",
    f"Degrees of freedom = {dof}",
    f"P-value = {p:.6f}",
    "\nExpected counts:",
    sep="\n",
)
# Re-label the expected-count array with the same row/column labels as the table
print(pd.DataFrame(expected, index=contingency.index, columns=contingency.columns))


Contingency Table:
compromised     0   1
age_bin              
18 - 24      1274  28
25 - 34       478   6
35 - 44       258   2
45 - 54       153   1
55 - 64        43   3
65 - 74         4   0

Chi-square test results
Chi2 statistic = 10.4772
Degrees of freedom = 5
P-value = 0.062790

Expected counts:
compromised            0          1
age_bin                            
18 - 24      1278.853333  23.146667
25 - 34       475.395556   8.604444
35 - 44       255.377778   4.622222
45 - 54       151.262222   2.737778
55 - 64        45.182222   0.817778
65 - 74         3.928889   0.071111


In [5]:
# Subsection 5.4.1: Chi-squared test - Group B ONLY - Compromised and Event Time Bins
# After dropping unread emails
#
# The outcome variable is the existing `isPartOfCompromise` flag of each
# MessageRead event row, cross-tabulated against `EventTimeBin`.

# Keep every row whose Group is not 'C' and work on an independent copy
dfm = df[df['Group'] != 'C'].copy()
print(dfm.Group.unique())

# Flag MessageRead rows (1) vs. all other event types (0), then keep only the
# MessageRead rows. The explicit .copy() makes the filtered frame independent
# of the frame it was sliced from.
dfm['read'] = (dfm['eventName'] == 'MessageRead').astype(int)
dfm = dfm[dfm['read'] > 0].copy()
print(dfm.read.unique())

# A `compromised` column derived from eventName == 'CredSupplied' used to be
# built here. After the filter above every row is a MessageRead event, so that
# column was always 0 and the crosstab below never used it; it was removed.

# Build contingency table: time-of-day bin (rows) x isPartOfCompromise (columns)
contingency = pd.crosstab(dfm["EventTimeBin"], dfm["isPartOfCompromise"])

print("Contingency Table:")
print(contingency)

# Run Chi-square test
# NOTE(review): the recorded output has an expected count below 1 for the
# 'Morning' bin; the chi-square approximation may be unreliable with cells
# that sparse. TODO confirm whether bins should be merged.
chi2, p, dof, expected = chi2_contingency(contingency)

print("\nChi-square test results")
print(f"Chi2 statistic = {chi2:.4f}")
print(f"Degrees of freedom = {dof}")
print(f"P-value = {p:.6f}")
print("\nExpected counts:")
# Re-label the expected-count array with the same row/column labels as the table
print(pd.DataFrame(expected, index=contingency.index, columns=contingency.columns))

['B']
[1]
Contingency Table:
isPartOfCompromise  False  True 
EventTimeBin                    
Afternoon              40      6
Evening                22      3
Morning                 4      1
Night                  64      5

Chi-square test results
Chi2 statistic = 1.6518
Degrees of freedom = 3
P-value = 0.647692

Expected counts:
isPartOfCompromise      False     True 
EventTimeBin                           
Afternoon           41.241379  4.758621
Evening             22.413793  2.586207
Morning              4.482759  0.517241
Night               61.862069  7.137931


In [6]:
# Subsection 5.4.2: Chi-squared test of independence - Groups B and C pooled
# Variables: country (rows) vs. compromised indicator (columns)

# Copy of df with an extra 0/1 column:
# 1 when eventName is 'CredSupplied', 0 for every other event type
dfm = df.assign(compromised=df['eventName'].eq('CredSupplied').astype(int))

# Cross-tabulate row counts: one row per country, one column per indicator value
contingency = pd.crosstab(index=dfm["country"], columns=dfm["compromised"])

print("Contingency Table:", contingency, sep="\n")

# Unpack the statistic, p-value, degrees of freedom and expected-count array
# NOTE(review): the recorded output has several countries with expected counts
# below 1 in the compromised column (e.g. 'SS South Sudan'); the chi-square
# approximation may be unreliable with cells that sparse. TODO confirm.
chi2, p, dof, expected = chi2_contingency(contingency)

print(
    "\nChi-square test results",
    f"Chi2 statistic = {chi2:.4f}",
    f"Degrees of freedom = {dof}",
    f"P-value = {p:.6f}",
    "\nExpected counts:",
    sep="\n",
)
# Re-label the expected-count array with the same row/column labels as the table
print(pd.DataFrame(expected, index=contingency.index, columns=contingency.columns))

Contingency Table:
compromised                         0   1
country                                  
BY Burundi                         20   1
CG Democratic Republic of Congo   151   2
KE Kenya                          448   3
RW Rwanda                          20   1
SS South Sudan                      4   0
TZ Tanzania                       135   6
UG Uganda                          22   2
US United States                 1410  25

Chi-square test results
Chi2 statistic = 16.4790
Degrees of freedom = 7
P-value = 0.021083

Expected counts:
compromised                                0          1
country                                                
BY Burundi                         20.626667   0.373333
CG Democratic Republic of Congo   150.280000   2.720000
KE Kenya                          442.982222   8.017778
RW Rwanda                          20.626667   0.373333
SS South Sudan                      3.928889   0.071111
TZ Tanzania                       138.493333   2.506667
UG 