# Notebook Overview

This notebook is used to analyze the privacy risk survey responses.

In [23]:
import json

scenarios = json.load(open('scenarios-risked.json'))
print('Read %i scenarios.' % len(scenarios))

Read 203 scenarios.


In [24]:
from collections import Counter
import nltk

apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    app_urls = Counter()
    stores = Counter()
    sent_count = 0
    for d in data:
        # count app url without parameters (Google only)
        url = d['app_url'].split('&')[0]
        app_urls[url] += 1
        
        # count app store by url prefix
        if url.startswith(apple_url):
            stores[apple_url] += 1
        elif url.startswith(googl_url):
            stores[googl_url] += 1
            
        # count sentences
        sent = nltk.sent_tokenize(d['text'])
        sent_count += len(sent)

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % stores[apple_url])
    print('Google Play: %i' % stores[googl_url])
    print('Unique Apps: %i' % len(app_urls.keys()))
    print('Sentences: %i' % sent_count)
    
summarize(scenarios)

Scenarios: 203
Apple App: 71
Google Play: 132
Unique Apps: 148
Sentences: 1592


In [25]:
from collections import Counter
import spacy, nltk

# load the spacey nlp processor
nlp = spacy.load("en_core_web_sm")

# extract sentence containing infotype span plus relative indices
def get_sentence(text, start, end):
    doc = nlp(text)
    span = doc.char_span(start, end)
    if span:
        sent = span.sent
        return sent.text, start - sent.start_char, end - sent.start_char
    else:
        return None, None, None

data = {'usage_freq': Counter(), 'false_pos': 0, 'scale_usage': Counter()}
risks = []
info_data = {'W':[], 'S':[], 'U':[]}
info_count = 0
for scenario in scenarios:
    data['usage_freq'][scenario['usage_freq']] += 1
    
    records = []
    risk = []
    for start, end, rating in scenario['risks']:        
        if rating == 'N':
            data['false_pos'] += 1
            continue
            
        start = int(start)
        end = int(end)
        rating = int(rating)
            
        info_count += 1
        data['scale_usage'][rating] += 1
        risk.append(int(rating))
        sent, rel_start, rel_end = get_sentence(scenario['text'], start, end)
        if not sent:
            continue
                
        info_record = {
            'scenario_id': scenario['scenario_id'],
            'sent_text': sent,
            'start': rel_start,
            'end': rel_end,
            'risk': rating
        }
        records.append(info_record)
         
    # update risk dist for each record and sort into bins
    for info_record in records:
        info_record['risk_dist'] = ','.join([str(i) for i in sorted(risk)])
        if info_record['risk'] < 2:
            info_data['W'].append(info_record)
        elif info_record['risk'] < 4:
            info_data['S'].append(info_record)
        else:
            info_data['U'].append(info_record)
    
    # update average risks
    if len(risk) > 0:
        risks.append(sum(risk) / len(risk))

data['avg_infotype'] = info_count / len(scenarios)
        
if len(risks) > 0:
    data['avg_risk'] = sum(risks) / len(risks)

In [26]:
print('Infotypes Scored: %i' % info_count)

for k, v in data.items():
    print('%s:\t%s' % (k, v))

info_count = sum([len(info_data['W']), len(info_data['S']), len(info_data['U'])])
print('\nUser-perceived Precision: %0.3f' % ((info_count - data['false_pos']) / info_count))
    
print('\nscale_usage_w:\t%i' % (sum([data['scale_usage'][i] for i in [0, 1]])))
print('scale_usage_s:\t%i' % (sum([data['scale_usage'][i] for i in [2, 3]])))
print('scale_usage_u:\t%i' % (sum([data['scale_usage'][i] for i in [4, 5]])))

Infotypes Scored: 2338
usage_freq:	Counter({'daily': 107, 'weekly': 78, 'monthly': 14, 'yearly': 4})
false_pos:	267
scale_usage:	Counter({2: 478, 0: 473, 1: 436, 3: 353, 5: 303, 4: 295})
avg_infotype:	11.517241379310345
avg_risk:	2.203186290951853

User-perceived Precision: 0.886

scale_usage_w:	909
scale_usage_s:	831
scale_usage_u:	598


In [27]:
from collections import Counter

# create a nominal distribution from item responses
scale_usage = {}
for scenario in scenarios:
    counter = Counter()
    for start, end, rating in scenario['risks']:
        if rating == 'N':
            continue
        counter[int(rating)] += 1
        
    scale_usage[scenario['scenario_id']] = counter

# count the number of distributions that only use one or more response items
meta_counter = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}

# count the number of distributions where respondents use tails or a mix of scale
scale_counter = Counter({'U':Counter(), 'W':Counter(), 'M':Counter()})

for counter in scale_usage.values():
    meta_counter[len(counter.keys())].append(counter)
    
    skew = sum([1 for c in counter.keys() if c < 3])
    # if all ratings are < 3
    if skew == len(counter):
        scale_counter['W'][len(counter.keys())] += 1
    # elif all ratings are >= 3
    elif skew == 0:
        scale_counter['U'][len(counter.keys())] += 1
    # else all ratings are mixed, above and below 3
    else:
        scale_counter['M'][len(counter.keys())] += 1
    
print('Scale utilization for %i survey submissions:\n' % len(scenarios))
for count in sorted(meta_counter.keys()):
    print('%s: %s' % (count, len(meta_counter[count])))

print('\nScale utilization (W)illing, (U)nwilling and (M)ixed\n')
print('\t1\t2\t3\t4\t5\t6 (# different options used)')
for label, count in scale_counter.items():
    usage = [0, 0, 0, 0, 0, 0]
    for c, f in count.items():
        usage[int(c) - 1] = f
    print('%s:\t%s' % (label, '\t'.join([str(i) for i in usage])))
    
print('\nPercent of Privacy Fundamentalists: %0.2f' % (
    100 * sum(scale_counter['U'].values()) / len(scenarios)))

print('\nSingle-label responses frequencies for %i authors:\n' % len(meta_counter[1]))
for counter in meta_counter[1]:
    print(counter)

Scale utilization for 203 survey submissions:

1: 33
2: 52
3: 60
4: 35
5: 17
6: 6

Scale utilization (W)illing, (U)nwilling and (M)ixed

	1	2	3	4	5	6 (# different options used)
U:	11	9	8	0	0	0
W:	22	27	16	0	0	0
M:	0	16	36	35	17	6

Percent of Privacy Fundamentalists: 13.79

Single-label responses frequencies for 33 authors:

Counter({5: 8})
Counter({4: 10})
Counter({0: 7})
Counter({1: 10})
Counter({2: 14})
Counter({0: 5})
Counter({0: 14})
Counter({0: 12})
Counter({4: 14})
Counter({0: 13})
Counter({0: 7})
Counter({0: 12})
Counter({4: 2})
Counter({3: 4})
Counter({5: 13})
Counter({0: 18})
Counter({0: 14})
Counter({4: 9})
Counter({3: 9})
Counter({0: 7})
Counter({0: 9})
Counter({0: 17})
Counter({0: 10})
Counter({0: 5})
Counter({1: 8})
Counter({1: 8})
Counter({0: 8})
Counter({5: 8})
Counter({0: 13})
Counter({3: 13})
Counter({0: 7})
Counter({5: 15})
Counter({2: 12})


In [28]:
app_urls = json.load(open('app-url-dictionary.json'))
print('Read %i unique app url descriptions.' % len(app_urls))

risk_by_category = {'apple': {}, 'google': {}}
cat_counter = {'apple': Counter(), 'google': Counter()}

for scenario in scenarios:
    # count statistics by platform
    platform = ''
    if scenario['app_url'].startswith('https://play.google.com'):
        platform = 'google'
    else:
        platform = 'apple'
    
    # record the category by platform
    app_category = app_urls[scenario['app_url']]['app_category']
    cat_counter[platform][app_category] += 1
    
    # record the risk ratings by platform
    if not app_category in risk_by_category[platform]:
        risk_by_category[platform][app_category] = []
    
    for start, end, rating in scenario['risks']:
        
        if rating == 'N':
            continue
        risk_by_category[platform][app_category].append(int(rating))


print('\nApple Category Distribution\n')
for app_category in sorted(cat_counter['apple'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['apple'][app_category]))
print('\nRisk by Apple Categories\n')
avg_risk1 = []
for app_category in sorted(risk_by_category['apple'].keys()):
    risk_scores = risk_by_category['apple'][app_category]
    print('%s: %0.2f' % (app_category.ljust(25), sum(risk_scores) / len(risk_scores)))
    avg_risk1.extend(risk_scores)
print('\nAverage Risk (Apple): %0.2f' % (sum(avg_risk1) / len(avg_risk1)))
    
print('\nGoogle Category Distribution\n')
for app_category in sorted(cat_counter['google'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['google'][app_category]))    
print('\nRisk by Google Categories\n')
avg_risk2 = []
for app_category in sorted(risk_by_category['google'].keys()):
    risk_scores = risk_by_category['google'][app_category]
    print('%s: %0.2f' % (app_category.ljust(25), sum(risk_scores) / len(risk_scores)))
    avg_risk2.extend(risk_scores)
print('\nAverage Risk (Google): %0.2f' % (sum(avg_risk2) / len(avg_risk2)))


Read 323 unique app url descriptions.

Apple Category Distribution

Books                    : 1
Entertainment            : 3
Family                   : 1
Finance                  : 12
Games                    : 5
Health & Fitness         : 29
Lifestyle                : 4
Music                    : 2
News                     : 2
Productivity             : 1
Shopping                 : 3
Social Networking        : 4
Sports                   : 2
Utilities                : 2

Risk by Apple Categories

Books                    : 3.95
Entertainment            : 1.46
Family                   : 3.05
Finance                  : 3.13
Games                    : 2.04
Health & Fitness         : 1.21
Lifestyle                : 2.67
Music                    : 0.50
News                     : 0.95
Productivity             : 4.75
Shopping                 : 1.61
Social Networking        : 2.97
Sports                   : 1.93
Utilities                : 0.92

Average Risk (Apple): 1.92

Google Category Dist

In [30]:
# compare two samples of risk ratings between Apple and Google for significant differences

from scipy.stats import wilcoxon
import random

print('Apple Ratings: %i, Google Ratings: %i' % (len(avg_risk1), len(avg_risk2)))

# randomly downsample the larger sample so both sample lengths are equal
if len(avg_risk1) > len(avg_risk2):
    sample1 = random.sample(avg_risk1, len(avg_risk2))
    sample2 = avg_risk2
else:
    sample1 = avg_risk1
    sample2 = random.sample(avg_risk2, len(avg_risk1))

res = wilcoxon(sample1, sample2)
print('Wilcoxon Rank Sum Test Statistic: %0.1f' % res.statistic)
print('p-value: %0.6f' % res.pvalue)
print('N = %i' % len(sample1))

Apple Ratings: 808, Google Ratings: 1530
Wilcoxon Rank Sum Test Statistic: 88539.0
p-value: 0.000003
N = 808


In [21]:
# identify scenarios where the same information type is rated at 
# different risk levels by different users to illustrate context

find_examples = ['phone numbers', 'phone number']
rating_label = ['Very Willing', 'Willing', 'Somewhat Willing', 'Somewhat Unwilling', 'Unwilling', 'Very Unwilling']

for scenario in scenarios:
    for start, end, rating in scenario['risks']:
        start = int(start)
        end = int(end)
        
        if rating == 'N':
            continue
            
        phrase = scenario['text'][start:end]
        
        if phrase in find_examples:
            text = scenario['text'][:start] + '*** ' + phrase + ' ***' + scenario['text'][end:]
            print('\n%s: %s' % (scenario['scenario_id'], text))
            print('Rating: %s' % (rating_label[int(rating)]))
            
            counter = scale_usage[scenario['scenario_id']]
            usage = ', '.join(['%s: %s' % (k, counter[k]) for k in sorted(counter.keys())])
            print('Scale usage: %s' % usage)



MAS-R-2: By using this screen edit my personal and account information. Information including full name, contact info like *** phone number *** and email ID, proof of identity, account ownership, and control settings. I can change my Facebook password and security settings like two-factor authentication, and authorized logins, get alerts about unrecognized logins, safe browsing, etc. I can set my payment preferences by using this screen. I can take a quick privacy checkup to know about my important privacy issues about security. I can set or change my preferences for the following news feed, reaction preferences like emojis and stickers, notification preferences, shortcuts preferences, language, and region, media preferences like autoplay, data saver, video quality, and themes like dark mode/light mode. I can change audience and visibility settings to control who can see my posts, stories, and profile. I can access a log of my activity and also I can access my information and download

In [16]:
print('Most Unwilling Types\n')
indexed = {}
for r in info_data['U']:
    infotype = r['sent_text'][r['start']:r['end']].lower()
    if not infotype in indexed:
        indexed[infotype] = []
    indexed[infotype].append([r['risk'], r['risk_dist']])
    
for infotype in sorted(indexed.keys()):
    print('%s: %s' % (infotype, indexed[infotype]))

import csv
with open('high-risk.csv', 'w') as f:
    writer = csv.DictWriter(f, fieldnames=list(info_data['U'][0].keys()))
    writer.writerows(info_data['U'])

Most Unwilling Types

-out: [[4, '2,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5']]
/ebooks: [[4, '2,3,3,4,4,4,4,4,4,4,4,4,4,4,4,4,5,5,5']]
account: [[4, '4,4,4,4,4,4,4,4,4,4'], [4, '0,0,0,0,0,2,3,4,4,4,4,4,4,4'], [4, '1,2,3,3,4,4,4,4,5,5,5,5,5,5'], [5, '3,3,3,3,3,3,3,3,5,5,5,5'], [4, '4,4,4,4,4,4,4,4,4,4,4,4,4,4'], [5, '4,5,5,5,5,5,5,5'], [5, '3,5,5,5,5,5,5,5,5,5,5,5,5,5,5'], [4, '2,3,4,4,4,4,5,5,5,5'], [5, '5,5,5,5,5,5,5,5'], [5, '3,3,3,4,4,4,4,4,4,4,4,4,5'], [5, '0,0,1,1,4,4,5,5,5,5,5,5']]
account history: [[5, '1,2,3,3,4,4,4,4,5,5,5,5,5,5']]
account information: [[5, '0,0,0,1,1,1,1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5'], [5, '1,2,2,4,5,5']]
account ownership: [[4, '0,0,0,1,1,1,1,1,1,1,1,1,2,2,2,2,2,3,3,3,3,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5']]
account settings: [[4, '3,3,3,3,4,4,4,4,4,4,4,4,4']]
accounts: [[4, '1,1,1,2,2,2,2,3,4,4,4,5'], [4, '4,4,4,4,4,4,4,4,4,4,4,4,4,4'], [5, '0,0,0,1,1,1,1,2,2,2,2,2,3,3,3,4,4,4,5']]
activity: [[4, '0,0,0,1,1,1,1,1,1,1,1,1,2,2,2

In [20]:
categories = ['Social']

for scenario in scenarios:
    if not scenario['app_url'] in app_urls:
        continue
    if app_urls[scenario['app_url']]['app_category'] in categories:
        print('%s: (%s / %s) %s' % (scenario['scenario_id'], app_urls[scenario['app_url']]['app_name'], app_urls[scenario['app_url']]['app_category'], scenario['text']))
        for start, end, rating in scenario['risks']:
            start = int(start)
            end = int(end)
            
            if rating != 'N':
                print('%s [%s]' % (scenario['text'][start:end], rating))
        print()
        

MAS-R-2: (Facebook / Social) By using this screen edit my personal and account information. Information including full name, contact info like phone number and email ID, proof of identity, account ownership, and control settings. I can change my Facebook password and security settings like two-factor authentication, and authorized logins, get alerts about unrecognized logins, safe browsing, etc. I can set my payment preferences by using this screen. I can take a quick privacy checkup to know about my important privacy issues about security. I can set or change my preferences for the following news feed, reaction preferences like emojis and stickers, notification preferences, shortcuts preferences, language, and region, media preferences like autoplay, data saver, video quality, and themes like dark mode/light mode. I can change audience and visibility settings to control who can see my posts, stories, and profile. I can access a log of my activity and also I can access my information a

In [19]:
app_names = ['Instagram']

for scenario in scenarios:
    if not scenario['app_url'] in app_urls:
        continue
    if app_urls[scenario['app_url']]['app_name'] in app_names:
        print('%s: (%s / %s) %s' % (scenario['scenario_id'], app_urls[scenario['app_url']]['app_name'], app_urls[scenario['app_url']]['app_category'], scenario['text']))
        for start, end, rating in scenario['risks']:
            start = int(start)
            end = int(end)
            
            if rating != 'N':
                print('%s [%s]' % (scenario['text'][start:end], rating))
        print()
        

MAS-R-14: (Instagram / Social) Well I wouldn't really use this screen for much. These settings act as the profile information to my instagram account. I usually use it whenever I want to change some personal information. For example, if I get a new email address then I would go to this page to update it. I would also change my phone number if I felt it was important for instagram to have that information. More general, I use the account history/personal information page to change aspects of my account that are about me. My name, username, privacy habits, etc are all located there. It also shows me my account history and general instagram usage information. I find this incredibly important for updating personal information and making it private if necessary. For me the less information I give the better. This goes not only for my friends on instagram but includes information given to the instagram platform itself. 
settings [3.0]
profile information [4.0]
instagram account [5.0]
email a