# Notebook Overview

This notebook is used to analyze the privacy risk survey responses.

In [1]:
import json

scenarios = json.load(open('scenarios-risked.json'))
print('Read %i scenarios.' % len(scenarios))

Read 203 scenarios.


In [2]:
from collections import Counter
import nltk

apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    app_urls = Counter()
    stores = Counter()
    sent_count = 0
    for d in data:
        # count app url without parameters (Google only)
        url = d['app_url'].split('&')[0]
        app_urls[url] += 1
        
        # count app store by url prefix
        if url.startswith(apple_url):
            stores[apple_url] += 1
        elif url.startswith(googl_url):
            stores[googl_url] += 1
            
        # count sentences
        sent = nltk.sent_tokenize(d['text'])
        sent_count += len(sent)

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % stores[apple_url])
    print('Google Play: %i' % stores[googl_url])
    print('Unique Apps: %i' % len(app_urls.keys()))
    print('Sentences: %i' % sent_count)
    
summarize(scenarios)

Scenarios: 203
Apple App: 71
Google Play: 132
Unique Apps: 148
Sentences: 1592


In [3]:
from collections import Counter

data = {'usage_freq': Counter(), 'false_pos': 0, 'scale_usage': Counter()}
risks = []
infos = [[], [], []]
info_count = 0
for scenario in scenarios:
    data['usage_freq'][scenario['usage_freq']] += 1
    
    risk = []
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        key_fp = 'part2_false_pos%i' % i
        
        if not key_info in scenario:
            info_count += i - 1
            break
        
        if key_risk in scenario:
            data['scale_usage'][scenario[key_risk]] += 1
            scenario[key_risk] = int(scenario[key_risk])
            risks.append(scenario[key_risk])
            if scenario[key_risk] < 2:
                infos[0].append(scenario[key_info])
            elif scenario[key_risk] < 4:
                infos[1].append(scenario[key_info])
            else:
                infos[2].append(scenario[key_info])
        elif key_fp in scenario:
            data['false_pos'] += 1
    if len(risk) > 0:
        risks.append(sum(risk) / len(risk))

data['avg_infotype'] = info_count / len(scenarios)
        
if len(risks) > 0:
    data['avg_risk'] = sum(risks) / len(risks)
    
for k, v in data.items():
    print('%s:\t%s' % (k, v))

info_count = sum([len(infos[0]), len(infos[1]), len(infos[2])])
print('\nUser-perceived Precision: %0.3f' % ((info_count - data['false_pos']) / info_count))
    
print('\nscale_usage_w:\t%i' % (sum([data['scale_usage'][str(i)] for i in [0, 1]])))
print('scale_usage_s:\t%i' % (sum([data['scale_usage'][str(i)] for i in [2, 3]])))
print('scale_usage_u:\t%i' % (sum([data['scale_usage'][str(i)] for i in [4, 5]])))

usage_freq:	Counter({'daily': 107, 'weekly': 78, 'monthly': 14, 'yearly': 4})
false_pos:	267
scale_usage:	Counter({'2': 478, '0': 473, '1': 436, '3': 353, '5': 303, '4': 295})
avg_infotype:	12.832512315270936
avg_risk:	2.201026518391788

User-perceived Precision: 0.886

scale_usage_w:	909
scale_usage_s:	831
scale_usage_u:	598


In [16]:
from collections import Counter

# create a nominal distribution from item responses
scale_usage = {}
for scenario in scenarios:
    counter = Counter()
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        
        if key_risk in scenario:
            counter[int(scenario[key_risk])] += 1
        
    scale_usage[scenario['scenario_id']] = counter

# count the number of distributions that only use one or more response items
meta_counter = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}

# count the number of distributions where respondents use tails or a mix of scale
scale_counter = Counter({'U':Counter(), 'W':Counter(), 'M':Counter()})

for counter in scale_usage.values():
    meta_counter[len(counter.keys())].append(counter)
    
    skew = sum([1 for c in counter.keys() if c < 3])
    # if all ratings are < 3
    if skew == len(counter):
        scale_counter['W'][len(counter.keys())] += 1
    # elif all ratings are >= 3
    elif skew == 0:
        scale_counter['U'][len(counter.keys())] += 1
    # else all ratings are mixed, above and below 3
    else:
        scale_counter['M'][len(counter.keys())] += 1
    
print('Scale utilization for %i survey submissions:\n' % len(scenarios))
for count in sorted(meta_counter.keys()):
    print('%s: %s' % (count, len(meta_counter[count])))

print('\nScale utilization (W)illing, (U)nwilling and (M)ixed\n')
print('\t1\t2\t3\t4\t5\t6 (# different options used)')
for label, count in scale_counter.items():
    usage = [0, 0, 0, 0, 0, 0]
    for c, f in count.items():
        usage[int(c) - 1] = f
    print('%s:\t%s' % (label, '\t'.join([str(i) for i in usage])))
    
print('\nPercent of Privacy Fundamentalists: %0.2f' % (
    100 * sum(scale_counter['U'].values()) / len(scenarios)))

print('\nSingle-label responses frequencies for %i authors:\n' % len(meta_counter[1]))
for counter in meta_counter[1]:
    print(counter)

Scale utilization for 203 survey submissions:

1: 33
2: 52
3: 60
4: 35
5: 17
6: 6

Scale utilization (W)illing, (U)nwilling and (M)ixed

	1	2	3	4	5	6 (# different options used)
U:	11	9	8	0	0	0
W:	22	27	16	0	0	0
M:	0	16	36	35	17	6

Percent of Privacy Fundamentalists: 13.79

Single-label responses frequencies for 33 authors:

Counter({5: 8})
Counter({4: 10})
Counter({0: 7})
Counter({1: 10})
Counter({2: 14})
Counter({0: 5})
Counter({0: 14})
Counter({0: 12})
Counter({4: 14})
Counter({0: 13})
Counter({0: 7})
Counter({0: 12})
Counter({4: 2})
Counter({3: 4})
Counter({5: 13})
Counter({0: 18})
Counter({0: 14})
Counter({4: 9})
Counter({3: 9})
Counter({0: 7})
Counter({0: 9})
Counter({0: 17})
Counter({0: 10})
Counter({0: 5})
Counter({1: 8})
Counter({1: 8})
Counter({0: 8})
Counter({5: 8})
Counter({0: 13})
Counter({3: 13})
Counter({0: 7})
Counter({5: 15})
Counter({2: 12})


In [31]:
app_urls = json.load(open('app-url-dictionary.json'))
risk_by_category = {'apple': {}, 'google': {}}
cat_counter = {'apple': Counter(), 'google': Counter()}

for scenario in scenarios:
    # count statistics by platform
    platform = ''
    if scenario['app_url'].startswith('https://play.google.com'):
        platform = 'google'
    else:
        platform = 'apple'
    
    # record the category by platform
    app_category = app_urls[scenario['app_url']]['app_category']
    cat_counter[platform][app_category] += 1
    
    # record the risk ratings by platform
    if not app_category in risk_by_category[platform]:
        risk_by_category[platform][app_category] = []
    
    for i in range(len(scenario.keys())):
        key_risk = 'part2_risk%i' % i
        
        if key_risk in scenario:
            risk_by_category[platform][app_category].append(int(scenario[key_risk]))


print('\nApple Category Distribution\n')
for app_category in sorted(cat_counter['apple'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['apple'][app_category]))
print('\nRisk by Apple Categories\n')
avg_risk = []
for app_category in sorted(risk_by_category['apple'].keys()):
    risk_scores = risk_by_category['apple'][app_category]
    print('%s: %0.2f' % (app_category.ljust(25), sum(risk_scores) / len(risk_scores)))
    avg_risk.extend(risk_scores)
print('\nAverage Risk (Apple): %0.2f' % (sum(avg_risk) / len(avg_risk)))
    
print('\nGoogle Category Distribution\n')
for app_category in sorted(cat_counter['google'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['google'][app_category]))    
print('\nRisk by Google Categories\n')
avg_risk = []
for app_category in sorted(risk_by_category['google'].keys()):
    risk_scores = risk_by_category['google'][app_category]
    print('%s: %0.2f' % (app_category.ljust(25), sum(risk_scores) / len(risk_scores)))
    avg_risk.extend(risk_scores)
print('\nAverage Risk (Google): %0.2f' % (sum(avg_risk) / len(avg_risk)))



Apple Category Distribution

Books                    : 1
Entertainment            : 3
Family                   : 1
Finance                  : 12
Games                    : 5
Health & Fitness         : 29
Lifestyle                : 4
Music                    : 2
News                     : 2
Productivity             : 1
Shopping                 : 3
Social Networking        : 4
Sports                   : 2
Utilities                : 2

Risk by Apple Categories

Books                    : 3.95
Entertainment            : 1.46
Family                   : 3.05
Finance                  : 3.13
Games                    : 2.04
Health & Fitness         : 1.21
Lifestyle                : 2.67
Music                    : 0.50
News                     : 0.95
Productivity             : 4.75
Shopping                 : 1.61
Social Networking        : 2.97
Sports                   : 1.93
Utilities                : 0.92

Average Risk (Apple): 1.92

Google Category Distribution

Books & References       : 2

In [11]:
# identify scenarios where the same information type is rated at 
# different risk levels by different users to illustrate context

find_examples = ['phone numbers', 'phone number']
rating_label = ['Very Willing', 'Willing', 'Somewhat Willing', 'Somewhat Unwilling', 'Unwilling', 'Very Unwilling']

for scenario in scenarios:
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        
        if not key_info in scenario:
            break
        
        if scenario[key_info] in find_examples:
            start, end = [int(s) for s in scenario[key_info + '_loc'].split(':')]
            text = scenario['text'][:start] + '*** ' + scenario['text'][start:end] + ' ***' + scenario['text'][end:]
            print('\n%s: %s' % (scenario['scenario_id'], text))
            print('Rating: %s' % (rating_label[int(scenario[key_risk])]))
            
            counter = scale_usage[scenario['scenario_id']]
            usage = ', '.join(['%s: %s' % (k, counter[k]) for k in sorted(counter.keys())])
            print('Scale usage: %s' % usage)



MAS-R-4: To open this screen, I have to log in to the app first. Then I have to click on the icon of the person waving on the bottom navigation bar. I use the friends screen to see which of my friends are online. When they are online, there is a green light next to their name. When they are idle, there is a half moon next to their name. When they are completely offline, there is a grey button next to their names. Then I can reach out to them using a direct message or chat with them on a server that we are both a member of. I can click on their name and see which mutual servers we are currently a member of. I can also use this screen to add new friends by searching for their usernames or *** phone number ***. I can also use this screen to remove friends. The app knows my list of friends and uses that information to populate this screen.
Rating: Unwilling
Scale usage: 4: 5, 5: 3

MAS-R-5: I can talk to the streamer and the other people watching the stream with this chat. I can use emote

In [None]:
print('Most Unwilling Types\n')
print(sorted(infos[2]))

In [None]:
print('Most Willling types\n')
print(sorted(infos[0]))