In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from scipy.stats import mannwhitneyu

In [2]:
# Load the long-format event log and reshape it so that each row is one
# (participant, simulated email) pair and each event type becomes its own column.
# Every raw row gets dv=1, so summing dv inside a cell counts how many times
# that event was logged for that email; event types that never occurred for
# an email are filled with 0.
# NOTE(review): the printed group labels further down suggest the raw value
# 'PC * DS ' carries a trailing space — confirm and consider stripping it here.
df = (
    pd.read_csv("../data/masterfile-study2-bc.csv")
    .assign(dv=1)
    .pivot_table(
        index=['PID', 'Simulation Name', 'Group', 'MANIPULATION'],  # row identifiers
        columns="eventName",  # one output column per event type
        values="dv",          # per-row indicator that gets summed
        aggfunc="sum",
    )
    .reset_index()
    .fillna(0)
)
df

eventName,PID,Simulation Name,Group,MANIPULATION,CredSupplied,EmailLinkClicked,MessageDeleted,MessageForwarded,MessageRead,MessageReplied,OutOfOffice,ReportedEmail,SuccessfullyDeliveredEmail
0,1,EWeek 1 - G01 - A,C,C * MS,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1,1,EWeek 2 - P01 - X,C,C * DS,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1,EWeek 3 - Y01 - B,C,PC * DS,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
3,1,EWeek 4 - B01 - S,C,PC * MS,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
4,2,EWeek 1 - G01 - A,C,C * MS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1299,344,NWeek 4 - G07 - B,B,C * MS,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
1300,345,NWeek 1 - B12 - B,B,PC * MS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1301,345,NWeek 2 - P09 - X,B,C * DS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1302,345,NWeek 3 - G07 - B,B,C * MS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [3]:
# One-way ANOVA: tests whether mean 'CredSupplied' differs across the
# 'MANIPULATION' conditions (Type II sums of squares).
model = ols('CredSupplied ~ C(MANIPULATION)', data=df).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("ANOVA Results:")
print(anova_table)

# Tukey's HSD post-hoc test compares every MANIPULATION group with every other
# group; it is only run when the omnibus ANOVA is significant at 0.05.
# The p-value is looked up by its row label rather than with `[0]`: the table
# is indexed by term name, and integer keys on a label-indexed Series are a
# deprecated positional fallback in pandas.
p_value = anova_table.loc['C(MANIPULATION)', 'PR(>F)']
if p_value < 0.05:
    tukey_results = pairwise_tukeyhsd(endog=df['CredSupplied'], groups=df['MANIPULATION'], alpha=0.05)
    print("\nTukey's HSD Results:")
    print(tukey_results)

ANOVA Results:
                    sum_sq      df          F        PR(>F)
C(MANIPULATION)   0.907975     3.0  10.391013  9.289166e-07
Residual         37.865031  1300.0        NaN           NaN

Tukey's HSD Results:
  Multiple Comparison of Means - Tukey HSD, FWER=0.05   
 group1   group2  meandiff p-adj   lower   upper  reject
--------------------------------------------------------
  C * DS   C * MS  -0.0613    0.0 -0.0957  -0.027   True
  C * DS PC * DS   -0.0429 0.0074 -0.0773 -0.0086   True
  C * DS  PC * MS  -0.0675    0.0 -0.1019 -0.0331   True
  C * MS PC * DS    0.0184 0.5141  -0.016  0.0528  False
  C * MS  PC * MS  -0.0061 0.9679 -0.0405  0.0283  False
PC * DS   PC * MS  -0.0245  0.257 -0.0589  0.0098  False
--------------------------------------------------------


In [4]:
# Summary statistics of CredSupplied for every Group x MANIPULATION cell,
# flattened back into ordinary columns for printing.
summary_functions = ["count", "mean", "median", "std", "var", "min", "max"]
cred_by_cell = df.groupby(["Group", "MANIPULATION"])["CredSupplied"]
desc_stats = cred_by_cell.agg(summary_functions).reset_index()
print(desc_stats)

  Group MANIPULATION  count      mean  median       std       var  min  max
0     B       C * DS    142  0.049296     0.0  0.217251  0.047198  0.0  1.0
1     B       C * MS    142  0.021127     0.0  0.144316  0.020827  0.0  1.0
2     B     PC * DS     142  0.021127     0.0  0.144316  0.020827  0.0  1.0
3     B      PC * MS    142  0.014085     0.0  0.118257  0.013985  0.0  1.0
4     C       C * DS    184  0.092391     0.0  0.290368  0.084313  0.0  1.0
5     C       C * MS    184  0.005435     0.0  0.073721  0.005435  0.0  1.0
6     C     PC * DS     184  0.038043     0.0  0.191823  0.036796  0.0  1.0
7     C      PC * MS    184  0.000000     0.0  0.000000  0.000000  0.0  0.0


In [5]:
# Table 17: Two-Way ANOVA Results for Compromised Responses (Groups B and C).
# Models CredSupplied on MANIPULATION, Group and their interaction, then prints
# the Type II ANOVA table.
model = ols(
    'CredSupplied ~ C(MANIPULATION) + C(Group) + C(MANIPULATION):C(Group)',
    data=df,
).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print("\nTwo-Way ANOVA Results:")
print(anova_table)


# Table 14: Pairwise Comparisons from Tukey’s HSD Post-Hoc Test for Compromised Events for Groups B and C
# The post-hoc test runs when ANY term in the table (either main effect or the
# interaction) has p < 0.05; the Residual row's NaN p-value never satisfies this.
is_significant = anova_table['PR(>F)'].lt(0.05).any()
if is_significant:
    # Build one label per MANIPULATION x Group cell and store it on df so the
    # Tukey comparison is made between those cells.
    manipulation_label = df['MANIPULATION'].astype(str)
    group_label = df['Group'].astype(str)
    df['interaction_group'] = manipulation_label + " + " + group_label
    tukey_results = pairwise_tukeyhsd(
        endog=df['CredSupplied'],
        groups=df['interaction_group'],
        alpha=0.05,
    )
    print("\nTukey's HSD Results:")
    print(tukey_results)


Two-Way ANOVA Results:
                             sum_sq      df          F        PR(>F)
C(MANIPULATION)            0.907975     3.0  10.416099  8.968651e-07
C(Group)                   0.018318     1.0   0.630409  4.273505e-01
C(MANIPULATION):C(Group)   0.189104     3.0   2.169364  8.988724e-02
Residual                  37.657609  1296.0        NaN           NaN

Tukey's HSD Results:
      Multiple Comparison of Means - Tukey HSD, FWER=0.05       
   group1       group2    meandiff p-adj   lower   upper  reject
----------------------------------------------------------------
  C * DS + B   C * DS + C   0.0431 0.3149 -0.0147  0.1009  False
  C * DS + B   C * MS + B  -0.0282 0.8609 -0.0896  0.0332  False
  C * DS + B   C * MS + C  -0.0439 0.2923 -0.1017  0.0139  False
  C * DS + B PC * DS  + B  -0.0282 0.8609 -0.0896  0.0332  False
  C * DS + B PC * DS  + C  -0.0113  0.999 -0.0691  0.0466  False
  C * DS + B  PC * MS + B  -0.0352 0.6605 -0.0966  0.0262  False
  C * DS + B  PC * MS + 

In [6]:
# Table 11: Summary of Mann-Whitney U Test Results for Compromised Emails (Groups B and C)
# Each MANIPULATION label has the form "<linguistic> * <legitimacy>"; split it
# once on '*' and strip the surrounding whitespace to get the two factors.
parts = df['MANIPULATION'].str.split('*', n=1, expand=True)
df['Linguistic'] = parts[0].str.strip()  # Now contains 'C' and 'PC'
df['Legitimacy'] = parts[1].str.strip()  # Now contains 'DS' and 'MS'

print("\nValue counts for 'Legitimacy' (now DS vs MS):")
print(df['Legitimacy'].value_counts())
print("\nValue counts for 'Linguistic' (now C vs PC):")
print(df['Linguistic'].value_counts())


def _report_mannwhitney(factor_name, sample_a, sample_b, alpha=0.05):
    """Run a two-sided Mann-Whitney U test on two samples and print the outcome.

    Parameters
    ----------
    factor_name : str
        Name of the factor being compared; used only in the printed verdict.
    sample_a, sample_b : pandas.Series
        The two independent samples of the outcome variable.
    alpha : float, default 0.05
        Significance threshold applied to the p-value.

    Returns
    -------
    tuple or None
        ``(u_statistic, p_value)``, or ``None`` (printing nothing) when either
        sample is empty and the test cannot be run.
    """
    if sample_a.empty or sample_b.empty:
        return None
    # `alternative` is pinned so the p-value does not depend on the installed
    # SciPy version: releases before 1.7 defaulted to a deprecated mode that
    # reports half of the two-sided p-value.
    u_statistic, p_value = mannwhitneyu(sample_a, sample_b, alternative='two-sided')
    print(f"U-statistic: {u_statistic}, P-value: {p_value}")
    verdict = "SIGNIFICANT" if p_value < alpha else "NOT SIGNIFICANT"
    print(f"{verdict} for the {factor_name} factor.")
    return u_statistic, p_value


# Mann-Whitney U Test for Legitimacy (DS vs. MS)
print("\nMann-Whitney U Test for Legitimacy ( DS vs MS)")
group_ds = df.loc[df['Legitimacy'] == 'DS', 'CredSupplied']
group_ms = df.loc[df['Legitimacy'] == 'MS', 'CredSupplied']
legitimacy_result = _report_mannwhitney("Legitimacy", group_ds, group_ms)
if legitimacy_result is not None:
    u_statistic_legit, p_value_legit = legitimacy_result


# Mann-Whitney U Test for Linguistic Style (C vs. PC)
print("\nMann-Whitney U Test for Linguistic (C vs. PC)")
group_c = df.loc[df['Linguistic'] == 'C', 'CredSupplied']
group_pc = df.loc[df['Linguistic'] == 'PC', 'CredSupplied']
linguistic_result = _report_mannwhitney("Linguistic", group_c, group_pc)
if linguistic_result is not None:
    u_statistic_ling, p_value_ling = linguistic_result


Value counts for 'Legitimacy' (now DS vs MS):
MS    652
DS    652
Name: Legitimacy, dtype: int64

Value counts for 'Linguistic' (now C vs PC):
C     652
PC    652
Name: Linguistic, dtype: int64

Mann-Whitney U Test for Legitimacy ( DS vs MS)
U-statistic: 221680.0, P-value: 6.966061982161438e-06
SIGNIFICANT for the Legitimacy factor.

Mann-Whitney U Test for Linguistic (C vs. PC)
U-statistic: 217768.0, P-value: 0.010219644712187741
SIGNIFICANT for the Linguistic factor.


In [7]:
# OLS regression of CredSupplied on MANIPULATION, Group and their interaction
# (the same formula as the two-way ANOVA cell), printed as a full coefficient
# summary rather than an ANOVA table.
group_manipulation_formula = (
    'CredSupplied ~ C(MANIPULATION) + C(Group) + C(MANIPULATION):C(Group)'
)
model = smf.ols(group_manipulation_formula, data=df).fit()
print("\nLinear Regression Results Summary:")
print(model.summary())


Linear Regression Results Summary:
                            OLS Regression Results                            
Dep. Variable:           CredSupplied   R-squared:                       0.029
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     5.484
Date:                Thu, 11 Sep 2025   Prob (F-statistic):           3.16e-06
Time:                        19:20:57   Log-Likelihood:                 460.82
No. Observations:                1304   AIC:                            -905.6
Df Residuals:                    1296   BIC:                            -864.3
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                                                coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------

In [8]:
# OLS regression of CredSupplied on the two split-out factors (Legitimacy and
# Linguistic) and their interaction. Requires the 'Legitimacy' and 'Linguistic'
# columns that the Mann-Whitney cell adds to df.
factor_formula = (
    'CredSupplied ~ C(Legitimacy) + C(Linguistic) + C(Legitimacy):C(Linguistic)'
)
model = smf.ols(factor_formula, data=df).fit()
print("\nLinear Regression Results Summary:")
print(model.summary())


Linear Regression Results Summary:
                            OLS Regression Results                            
Dep. Variable:           CredSupplied   R-squared:                       0.023
Model:                            OLS   Adj. R-squared:                  0.021
Method:                 Least Squares   F-statistic:                     10.39
Date:                Thu, 11 Sep 2025   Prob (F-statistic):           9.29e-07
Time:                        19:20:57   Log-Likelihood:                 457.24
No. Observations:                1304   AIC:                            -906.5
Df Residuals:                    1300   BIC:                            -885.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                                              coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------