In [89]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
from matplotlib.pyplot import figure
#!pip install dataframe-image
import dataframe_image as dfi
import os.path
from os import path

"""
Preprocessing survey data
- shorten column headers
- filter columns with headers containing a particular string
- perform the mapping on series
- display in heatmap style

Survey Categories: demo, awareScale, train, behaviour
Table Types: webUsage, awarePrct, awareTopic
"""
# Load the raw survey export into the DataFrame used by the steps below.
surveyCsvPath = "data/se-survey-results.csv"
df = pd.read_csv(surveyCsvPath)
# Debug aid: print(df.columns) lists the original (long) question headers.

# Mapping from the full survey question text to a short column name.
# The three grid questions share one stem and differ only in the bracketed
# option, so their entries are expanded from (option, short name) pairs.
# Whitespace inside the stems and options (double spaces, trailing spaces)
# is deliberate: the keys have to equal the CSV headers exactly.
_usageStem = 'How important to you are your security practices for different purposes of your internet usage?  ['
_practiceStem = 'Please rate your familiarity with various security practices: ['
_topicStem = 'Please rate your familiarity with various security topics: ['

_usageOptions = (
    ('Entertainment', 'webUsageEntertainment'),
    ('Social', 'webUsageSocial'),
    ('Games', 'webUsageGame'),
    ('Banking', 'webUsageBank'),
    ('Shopping', 'webUsageShop'),
    ('Education ', 'webUsageEdu'),
    ('Information ', 'webUsageInfo'),
    ('Email', 'webUsageEmail'),
)
_practiceOptions = (
    ('Use a strong password', 'awarePrctStrongPassword'),
    ('Log off public computers', 'awarePrctLogOffPublicPC'),
    ('Back up important information', 'awarePrctBackupData'),
    ('Protect your personal data', 'awarePrctPersonalData'),
    ('Avoid pop-ups, unknown emails, and links', 'awarePrctAvoidPopup'),
    ('Connect to secure Wi-Fi', 'awarePrctSecureWifi'),
)
_topicOptions = (
    ('Ransomware', 'awareTopicRansomware'),
    ('Credential Stuffing', 'awareTopicCredentialSurfing'),
    ('Social engineering', 'awareTopicSE'),
    ('Phishing', 'awareTopicPhishing'),
    ('Man-in-the-middle attack', 'awareTopicManInTheMiddleAttack'),
)

nColumns = {
    # demographics
    'Timestamp': 'timestamp',
    'Age Range': 'demoAge',
    'Gender Identity': 'demoGender',
    'Education level ': 'demoEdu',
    'Employment Status': 'demoEmploy',
    # importance of security practices per purpose of internet usage
    **{_usageStem + option + ']': shortName for option, shortName in _usageOptions},
    # self-rated awareness
    'On a scale from 1 to 10, rate your awareness of security practices:': 'awareScale',
    **{_practiceStem + option + ']': shortName for option, shortName in _practiceOptions},
    **{_topicStem + option + ']': shortName for option, shortName in _topicOptions},
    # training
    'If you were offered an online free course about cybersecurity and privacy, would you be interested in joining the course?': 'trainFreeCourse',
    'If you would be interested in joining a course about cybersecurity and privacy, how much time are you willing to allocate during a day?': 'trainTimeSpent',
    # behaviour
    'Do you use the same password for most of your accounts?': 'behaviourUseSamePwd',
    'Do you share some of your passwords with your friends or family members? ': 'behaviourSharePwdSomeone',
    'If the bank calls you and asks for your password as verification, would you give it to them?': 'behaviourSharePwdBankCall',
    'If you got an email to update your log in information via a link, would you click the link?': 'behaviourClickLoginEmailLink',
    'Would you pay to retrieve your data if you are the victim of a ransomware attack? ': 'behaviourPayRansomware',
}

# Swap the long question headers for their short names; headers that are not
# listed in nColumns keep their original text.
df = df.rename(columns=nColumns)

# Short display headers for each styled table: table type -> {survey column
# name -> header shown in the rendered table}.
tableHeaders = {
    "webUsage": {
        "webUsageEntertainment": "Fun",
        "webUsageSocial": "Social",
        "webUsageGame": "Game",
        "webUsageBank": "Bank",
        "webUsageShop": "Shop",
        "webUsageEdu": "Edu",
        "webUsageInfo": "Info",
        "webUsageEmail": "Email",
    },
    "awarePrct": {
        "awarePrctStrongPassword": "StrongPwd",
        "awarePrctLogOffPublicPC": "LogOffPublicPC",
        "awarePrctBackupData": "BackupData",
        "awarePrctPersonalData": "PersonalData",
        "awarePrctAvoidPopup": "AvoidPopup",
        "awarePrctSecureWifi": "SecureWifi",
    },
    "awareTopic": {
        "awareTopicRansomware": "Ransomware",
        # The key keeps the column's existing "Surfing" spelling so the lookup
        # still matches; the displayed label names the topic asked about in
        # the survey ("Credential Stuffing").
        "awareTopicCredentialSurfing": "CredentialStuffing",
        "awareTopicSE": "SocialEng",
        "awareTopicPhishing": "Phishing",
        "awareTopicManInTheMiddleAttack": "ManInTheMiddle",
    },
}

# Answer labels used as table rows, in display order: table type -> labels.
tableRowLabels = {
    "webUsage": ["Very", "Fairly", "Important", "Slightly", "Not at all"],
    "awarePrct": ["Completely", "Mostly", "Somewhat", "A Little Bit"],
    "awareTopic": ["Completely", "Mostly", "Somewhat", "A Little Bit"],
}

def createStyleTable(df, tableType, tableHeader, tableRowLabel):
    """Build a heatmap-styled table of answer counts and save it as a PNG.

    For every survey column named in ``tableHeader`` the answers are counted.
    The counts are arranged with one row per label in ``tableRowLabel`` and
    one column per short header, coloured with a red-yellow-green gradient,
    and exported to ``figures/<tableType>Styled.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Survey responses; must contain every column named by a key of
        ``tableHeader``.
    tableType : str
        Table identifier; used to build the output file name.
    tableHeader : dict
        Maps survey column names to the short headers shown in the table.
    tableRowLabel : list
        Answer labels to show, in display order. Answers that are not listed
        (for example "No Opinion") are left out of the table.

    Raises
    ------
    KeyError
        If a key of ``tableHeader`` is not a column of ``df``.
    """
    # Step 1: keep only the mapped survey columns, under their short headers
    tableData = df[list(tableHeader)].rename(columns=tableHeader)
    shortHeaders = list(tableHeader.values())

    # Step 2 + 3: count the answers per column and line the counts up side by
    # side. `keys=` names the resulting columns explicitly because the name
    # carried by a value_counts() result differs between pandas versions.
    tableCounter = pd.concat(
        [tableData[head].value_counts() for head in shortHeaders],
        axis=1,
        keys=shortHeaders,
    )

    # Step 4: put the rows in display order. reindex() keeps only the listed
    # answers; an answer nobody picked has no count yet, so it is filled in
    # as 0, which also keeps the whole table integer-valued.
    tableCounterSorted = tableCounter.reindex(tableRowLabel).fillna(0).astype(int)

    # Step 5: show data in heatmap style
    tableStyled = tableCounterSorted.style.background_gradient(cmap='RdYlGn')\
        .set_properties(**{'font-size': '20px', 'text-align': 'center', 'padding': '12px'})

    # Step 6: save styled table (create the output folder on first use)
    os.makedirs("figures", exist_ok=True)
    dfi.export(tableStyled, "figures/"+tableType+"Styled.png")

"""
Create a styled table
Table Types: webUsage, awarePrct, awareTopic
"""
# Render each table type once. An image that is already on disk is kept as
# is, so delete the PNG to force a fresh render.
for tableType in ("webUsage", "awarePrct", "awareTopic"):
    fileName = "figures/"+tableType+"Styled.png"
    if path.exists(fileName):
        print("File exists in "+fileName)
    else:
        createStyleTable(df,tableType,tableHeaders[tableType],tableRowLabels[tableType])

"""
process security awareness training data

"""
# Short headers for the two training questions.
trainHeader = {"trainFreeCourse":"JoinFreeCourse","trainTimeSpent":"TimeSpentCourse"}

# Column masks: every column whose name contains the given text.
isTrainColumn = df.columns.str.contains("train")
isAgeColumn = df.columns.str.contains("demoAge")

# rename() returns a new frame, so df itself is left untouched.
trainData = df.loc[:, isTrainColumn].rename(columns=trainHeader)
ageData = df.loc[:, isAgeColumn].copy()

# Training answers side by side with the respondent's age range.
trainAgeData = pd.concat([trainData, ageData], axis=1)
print(trainAgeData)

File exists in figures/webUsageStyled.png
File exists in figures/awarePrctStyled.png
File exists in figures/awareTopicStyled.png
    JoinFreeCourse                      TimeSpentCourse      demoAge
0            Maybe      Between 10 and 30 minutes a day      21 - 29
1              Yes      Between 10 and 30 minutes a day      18 - 21
2              Yes      Between 10 and 30 minutes a day      21 - 29
3              Yes  Between 30 minutes and 1 hour a day      21 - 29
4            Maybe           Less than 10 minutes a day      30 - 49
..             ...                                  ...          ...
198            Yes      Between 10 and 30 minutes a day      30 - 49
199          Maybe      Between 10 and 30 minutes a day  50 or older
200          Maybe      Between 10 and 30 minutes a day  50 or older
201          Maybe      Between 10 and 30 minutes a day  50 or older
202            Yes      Between 10 and 30 minutes a day      30 - 49

[203 rows x 3 columns]
