In [1]:
# import relevant libraries and read in the data
import pandas as pd
import numpy as np
import scipy
import statsmodels
from scipy import stats
from scipy.stats import chi2_contingency
from itertools import combinations
from statsmodels.sandbox.stats.multicomp import multipletests
df = pd.read_csv("data_cleaned.csv", sep=';')
df = df.iloc[1:282] # extract complete responses only

In [4]:
# count of general password reuse
print("Reuse passwords: ", df["CB3"].str.contains(r'Yes').sum())
print("Do not reuse passwords: ", df["CB3"].str.contains(r'No').sum())
print("Unsure: ", df["CB3"].str.contains(r'Unsure').sum())

Reuse passwords:  211
Do not reuse passwords:  61
Unsure:  7


In [5]:
# password reuse for third-party PMs
print("Use third-party PM and reuse password: ",((df["RB"].str.contains(r'I used a third-party password manager')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("Use third-party PM and don't reuse password: ", ((df["RB"].str.contains(r'I used a third-party password manager')) & (df["CB3"].str.contains(r'No'))).sum())
print("Use third-party PM and unsure about reuse: ", ((df["RB"].str.contains(r'I used a third-party password manager')) & (df["CB3"].str.contains(r'Unsure'))).sum())

Use third-party PM and reuse password:  25
Use third-party PM and don't reuse password:  26
Use third-party PM and unsure about reuse:  2


In [6]:
# password reuse for Remember passwords without recording them
print("Remember passwords without recording them and reuse password: ",((df["RB"].str.contains(r'I remembered my passwords without writing them down or storing them digitally')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("Remember passwords without recording them and don't reuse password: ", ((df["RB"].str.contains(r'I remembered my passwords without writing them down or storing them digitally')) & (df["CB3"].str.contains(r'No'))).sum())
print("Remember passwords without recording them and unsure about reuse: ", ((df["RB"].str.contains(r'I remembered my passwords without writing them down or storing them digitally')) & (df["CB3"].str.contains(r'Unsure'))).sum())

Remember passwords without recording them and reuse password:  158
Remember passwords without recording them and don't reuse password:  32
Remember passwords without recording them and unsure about reuse:  6


In [7]:
# password reuse for Save passwords in the browser
print("Save passwords in the browser and reuse password: ",((df["RB"].str.contains(r'I saved my passwords in the browser')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("Save passwords in the browser and don't reuse password: ", ((df["RB"].str.contains(r'I saved my passwords in the browser')) & (df["CB3"].str.contains(r'No'))).sum())
print("Save passwords in the browser and unsure about reuse: ", ((df["RB"].str.contains(r'I saved my passwords in the browser')) & (df["CB3"].str.contains(r'Unsure'))).sum())

Save passwords in the browser and reuse password:  135
Save passwords in the browser and don't reuse password:  22
Save passwords in the browser and unsure about reuse:  3


In [8]:
# password reuse for Store passwords in digital files
print("Store passwords in digital files and reuse password: ",((df["RB"].str.contains(r'I stored my passwords in a digital file or files')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("Store passwords in digital files and don't reuse password: ", ((df["RB"].str.contains(r'I stored my passwords in a digital file or files')) & (df["CB3"].str.contains(r'No'))).sum())
print("Store passwords in digital files and unsure about reuse: ", ((df["RB"].str.contains(r'I stored my passwords in a digital file or files')) & (df["CB3"].str.contains(r'Unsure'))).sum())

Store passwords in digital files and reuse password:  88
Store passwords in digital files and don't reuse password:  19
Store passwords in digital files and unsure about reuse:  2


In [9]:
# password reuse for Write passwords down
print("Write passwords down and reuse password: ",((df["RB"].str.contains(r'I wrote my passwords down on paper')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("Write passwords down and don't reuse password: ", ((df["RB"].str.contains(r'I wrote my passwords down on paper')) & (df["CB3"].str.contains(r'No'))).sum())
print("Write passwords down and unsure about reuse: ", ((df["RB"].str.contains(r'I wrote my passwords down on paper')) & (df["CB3"].str.contains(r'Unsure'))).sum())

Write passwords down and reuse password:  82
Write passwords down and don't reuse password:  23
Write passwords down and unsure about reuse:  3


In [10]:
# password reuse for Use a system-provided password manager
print("Use a system-provided password manager and reuse password: ",((df["RB"].str.contains(r'I used a system-provided password manager')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("Use a system-provided password manager and don't reuse password: ", ((df["RB"].str.contains(r'I used a system-provided password manager')) & (df["CB3"].str.contains(r'No'))).sum())
print("Use a system-provided password manager and unsure about reuse: ", ((df["RB"].str.contains(r'I used a system-provided password manager')) & (df["CB3"].str.contains(r'Unsure'))).sum())

Use a system-provided password manager and reuse password:  64
Use a system-provided password manager and don't reuse password:  17
Use a system-provided password manager and unsure about reuse:  2


In [11]:
# password reuse for Reset my password every time I log in
print("Reset my password every time I log in and reuse password: ",((df["RB"].str.contains(r'I reset my password every time I log in rather than remembering my password')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("Reset my password every time I log in and don't reuse password: ", ((df["RB"].str.contains(r'I reset my password every time I log in rather than remembering my password')) & (df["CB3"].str.contains(r'No'))).sum())
print("Reset my password every time I log in and unsure about reuse: ", ((df["RB"].str.contains(r'I reset my password every time I log in rather than remembering my password')) & (df["CB3"].str.contains(r'Unsure'))).sum())

Reset my password every time I log in and reuse password:  25
Reset my password every time I log in and don't reuse password:  2
Reset my password every time I log in and unsure about reuse:  0


In [12]:
# password reuse for None of the above
print("None of the above and reuse password: ",((df["RB"].str.contains(r'None of the above')) & (df["CB3"].str.contains(r'Yes'))).sum())
print("None of the above and don't reuse password: ", ((df["RB"].str.contains(r'None of the above')) & (df["CB3"].str.contains(r'No'))).sum())
print("None of the above and unsure about reuse: ", ((df["RB"].str.contains(r'None of the above')) & (df["CB3"].str.contains(r'Unsure'))).sum())

None of the above and reuse password:  2
None of the above and don't reuse password:  1
None of the above and unsure about reuse:  0


In [2]:
# chi-square test for password reuse across techniques
# the values below are from the values above
remember = [158,32]
browser = [136,22]
digital = [88,19]
write = [83,23]
system = [65,17]
third = [25,26]
reset = [25,2]

tech = np.array([remember, browser, digital, write, system, third, reset])
chi2_stat, p_val, dof, ex = stats.chi2_contingency(tech)

print("===Chi2 Stat===(Password reuse is less prevalent with PMs)")
print(chi2_stat)

print("===Degrees of Freedom===")
print(dof)

print("===P-Value===")
print(p_val)

print("===Contingency Table===")
print(ex)

===Chi2 Stat===(Password reuse is less prevalent with PMs)
39.22263588628574
===Degrees of Freedom===
6
===P-Value===
6.473146701881613e-07
===Contingency Table===
[[152.84327323  37.15672677]
 [127.10124827  30.89875173]
 [ 86.07489598  20.92510402]
 [ 85.2704577   20.7295423 ]
 [ 65.96393897  16.03606103]
 [ 41.02635229   9.97364771]
 [ 21.71983356   5.28016644]]


In [3]:
# post-hoc analysis of password re-use for different techniques
chi2, p, dof, ex = chi2_contingency(tech, correction=True)
print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")

df_tech = pd.DataFrame(tech)
    
def get_asterisks_for_pval(p_val):
    """Receives the p-value and returns asterisks string."""
    if p_val > 0.05:
        p_text = "ns"  # above threshold => not significant
    elif p_val < 1e-4:  
        p_text = '****'
    elif p_val < 1e-3:
        p_text = '***'
    elif p_val < 1e-2:
        p_text = '**'
    else:
        p_text = '*'
    
    return p_text

def chisq_and_posthoc_corrected(df):
    """Receives a dataframe and performs chi2 test and then post hoc.
    Prints the p-values and corrected p-values (after FDR correction)"""
    # start by running chi2 test on the matrix
    chi2, p, dof, ex = chi2_contingency(df, correction=True)
    print(f"Chi2 result of the contingency table: {chi2}, p-value: {p}")
    
    # post-hoc
    all_combinations = list(combinations(df.index, 2))  # gathering all combinations for post-hoc chi2
    p_vals = []
    print("Significance results:")
    for comb in all_combinations:
        new_df = df[(df.index == comb[0]) | (df.index == comb[1])]
        chi2, p, dof, ex = chi2_contingency(new_df, correction=True)
        p_vals.append(p)
        # print(f"For {comb}: {p}")  # uncorrected

    # checking significance
    # correction for multiple testing
    reject_list, corrected_p_vals = multipletests(p_vals, method='fdr_bh')[:2]
    for p_val, corr_p_val, reject, comb in zip(p_vals, corrected_p_vals, reject_list, all_combinations):
        print(f"{comb}: p_value: {p_val:5f}; corrected: {corr_p_val:5f} ({get_asterisks_for_pval(p_val)}) reject: {reject}")
        
        
chisq_and_posthoc_corrected(df_tech)

Chi2 result of the contingency table: 39.22263588628574, p-value: 6.473146701881613e-07
Chi2 result of the contingency table: 39.22263588628574, p-value: 6.473146701881613e-07
Significance results:
(0, 1): p_value: 0.548599; corrected: 0.678889 (ns) reject: False
(0, 2): p_value: 0.967722; corrected: 1.000000 (ns) reject: False
(0, 3): p_value: 0.382126; corrected: 0.617280 (ns) reject: False
(0, 4): p_value: 0.552456; corrected: 0.678889 (ns) reject: False
(0, 5): p_value: 0.000001; corrected: 0.000011 (****) reject: True
(0, 6): p_value: 0.327543; corrected: 0.573200 (ns) reject: False
(1, 2): p_value: 0.500651; corrected: 0.678889 (ns) reject: False
(1, 3): p_value: 0.138949; corrected: 0.407767 (ns) reject: False
(1, 4): p_value: 0.241449; corrected: 0.507044 (ns) reject: False
(1, 5): p_value: 0.000000; corrected: 0.000003 (****) reject: True
(1, 6): p_value: 0.534309; corrected: 0.678889 (ns) reject: False
(2, 3): p_value: 0.581905; corrected: 0.678889 (ns) reject: False
(2, 4): 