# Notebook Overview

This notebook is used to analyze the privacy risk survey responses.

In [2]:
import json

scenarios = json.load(open('scenarios-risked.json'))
print('Read %i scenarios.' % len(scenarios))

Read 77 scenarios.


In [3]:
from collections import Counter
import nltk

apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    app_urls = Counter()
    stores = Counter()
    sent_count = 0
    for d in data:
        # count app url without parameters (Google only)
        url = d['app_url'].split('&')[0]
        app_urls[url] += 1
        
        # count app store by url prefix
        if url.startswith(apple_url):
            stores[apple_url] += 1
        elif url.startswith(googl_url):
            stores[googl_url] += 1
            
        # count sentences
        sent = nltk.sent_tokenize(d['text'])
        sent_count += len(sent)

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % stores[apple_url])
    print('Google Play: %i' % stores[googl_url])
    print('Unique Apps: %i' % len(app_urls.keys()))
    print('Sentences: %i' % sent_count)
    
summarize(scenarios)

Scenarios: 77
Apple App: 28
Google Play: 49
Unique Apps: 67
Sentences: 602


In [4]:
from collections import Counter

data = {'usage_freq': Counter(), 'false_pos': 0, 'scale_usage': Counter()}
risks = []
infos = [[], [], []]
info_count = 0
for scenario in scenarios:
    data['usage_freq'][scenario['usage_freq']] += 1
    
    risk = []
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        key_fp = 'part2_false_pos%i' % i
        
        if not key_info in scenario:
            info_count += i - 1
            break
        
        if key_risk in scenario:
            data['scale_usage'][scenario[key_risk]] += 1
            scenario[key_risk] = int(scenario[key_risk])
            risks.append(scenario[key_risk])
            if scenario[key_risk] < 2:
                infos[0].append(scenario[key_info])
            elif scenario[key_risk] < 4:
                infos[1].append(scenario[key_info])
            else:
                infos[2].append(scenario[key_info])
        elif key_fp in scenario:
            data['false_pos'] += 1
    if len(risk) > 0:
        risks.append(sum(risk) / len(risk))

data['avg_infotype'] = info_count / len(scenarios)
        
if len(risks) > 0:
    data['avg_risk'] = sum(risks) / len(risks)
    
for k, v in data.items():
    print('%s:\t%s' % (k, v))

info_count = sum([len(infos[0]), len(infos[1]), len(infos[2])])
print('\nUser-perceived Precision: %0.3f' % ((info_count - data['false_pos']) / info_count))
    
print('\nscale_usage_w:\t%i' % (sum([data['scale_usage'][str(i)] for i in [0, 1]])))
print('scale_usage_s:\t%i' % (sum([data['scale_usage'][str(i)] for i in [2, 3]])))
print('scale_usage_u:\t%i' % (sum([data['scale_usage'][str(i)] for i in [4, 5]])))

usage_freq:	Counter({'daily': 46, 'weekly': 26, 'monthly': 5})
false_pos:	86
scale_usage:	Counter({'2': 201, '0': 178, '1': 161, '3': 124, '5': 121, '4': 97})
avg_infotype:	12.571428571428571
avg_risk:	2.1859410430839

User-perceived Precision: 0.902

scale_usage_w:	339
scale_usage_s:	325
scale_usage_u:	218


In [5]:
print('Most Willling types\n')
print(sorted(infos[0]))

Most Willling types

['BMI', 'Basal Metabolic Rate', 'Chipotle', 'Episodes', 'Fast chart', 'Foods', 'Like Songs', 'McDonalds Day', 'Pay in 4', 'Perks', 'Pokemon card collection', 'Pokemon card sets', 'QR code', 'Teams', "What is your employer's industry", 'account', 'account', 'account', 'account', 'activities', 'activities', 'activity', 'activity goals', 'activity goals', 'activity level', 'age', 'age', 'agreement', 'app summary', 'application', 'audio', 'band', 'bank account', 'bar codes', 'barcode', 'being', 'birthday', 'body mass index', 'bonuses', 'bonuses', 'book', 'boosters', 'burgers', 'buyers', 'caloric intake', 'calorie', 'calorie burn routine', 'calorie burning numbers', 'calorie count setting', 'calories', 'calories', 'calories', 'carbs', 'card', 'cardio', 'cards', 'cash discount', 'cellular service', 'charges', 'clothes merchandise', 'collection', 'collections', 'customer service', 'data', 'database', 'date', 'date wise', 'debit card', 'description', 'device', 'diet goals'

In [6]:
print('Most Unwilling Types\n')
print(sorted(infos[2]))

Most Unwilling Types

['Amazon', 'Amazon Prime account', 'Amazon account', 'Calls', 'Charter services', 'Cyber security', 'GPS', 'GPS', 'Group', 'I have done on a given day', 'Money', 'Services', 'TV provider', 'Username', 'account', 'account', 'account', 'account', 'account', 'account information', 'accounts', 'activity', 'address', 'address', 'address', 'address', 'address', 'ads', 'authentication messages', 'authentication process', 'badges', 'bank', 'bank account', 'bank account', 'bank usage', 'budget', 'calendar', 'calls', 'camera', 'cards', 'cards', 'charges', 'charter', 'charter trip', 'chat', 'chatbot conversations', 'checking account', 'checking account number', 'child', 'community', 'community', 'contact', 'contact list', 'contacts', 'contacts', "contacts'", 'content', 'conversation', 'conversations', 'conversations', 'credit card information', 'data', 'data usage', 'date', 'deliveries', 'delivery', 'diaries', 'doctor', 'doctors appointments', 'email address', 'email address

In [22]:
from collections import Counter

# create a nominal distribution from item responses
scale_usage = {}
for scenario in scenarios:
    counter = Counter()
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        
        if key_risk in scenario:
            counter[int(scenario[key_risk])] += 1
        
    scale_usage[scenario['scenario_id']] = counter

# count the number of distributions that only use on response item
meta_counter = {1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
scale_counter = Counter({'U':Counter(), 'W':Counter(), 'M':Counter()})
for counter in scale_usage.values():
    meta_counter[len(counter.keys())].append(counter)
    
    skew = sum([1 for c in counter.keys() if c < 3])
    # if all ratings are < 3
    if skew == len(counter):
        scale_counter['W'][len(counter.keys())] += 1
    # elif all ratings are >= 3
    elif skew == 0:
        scale_counter['U'][len(counter.keys())] += 1
    # else all ratings are mixed, above and below 3
    else:
        scale_counter['M'][len(counter.keys())] += 1
    
print('Scale utilization for %i survey submissions:\n' % len(scenarios))
for count in sorted(meta_counter.keys()):
    print('%s: %s' % (count, len(meta_counter[count])))

print('\nScale utilization (W)illing, (U)nwilling and (M)ixed\n')
for label, count in scale_counter.items():
    usage = [0, 0, 0, 0, 0, 0]
    for c, f in count.items():
        usage[int(c) - 1] = f
    print('%s: %s' % (label, usage))
    
print('\nSingle-label responses frequencies for %i authors:\n' % len(meta_counter[1]))
for counter in meta_counter[1]:
    print(counter)

Scale utilization for 77 survey submissions:

1: 17
2: 19
3: 20
4: 13
5: 6
6: 2

Scale utilization (W)illing, (U)nwilling and (M)ixed

U: [8, 4, 2, 0, 0, 0]
W: [9, 11, 5, 0, 0, 0]
M: [0, 4, 13, 13, 6, 2]

Single-label responses frequencies for 17 authors:

Counter({5: 8})
Counter({4: 10})
Counter({2: 14})
Counter({0: 5})
Counter({0: 14})
Counter({3: 4})
Counter({5: 13})
Counter({4: 9})
Counter({3: 9})
Counter({0: 7})
Counter({0: 17})
Counter({1: 8})
Counter({0: 8})
Counter({0: 13})
Counter({3: 13})
Counter({5: 15})
Counter({2: 12})


In [11]:
# identify scenarios where the same information type is rated at 
# different risk levels by different users to illustrate context

find_examples = ['phone numbers', 'phone number']
rating_label = ['Very Willing', 'Willing', 'Somewhat Willing', 'Somewhat Unwilling', 'Unwilling', 'Very Unwilling']

for scenario in scenarios:
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        
        if not key_info in scenario:
            break
        
        if scenario[key_info] in find_examples:
            start, end = [int(s) for s in scenario[key_info + '_loc'].split(':')]
            text = scenario['text'][:start] + '*** ' + scenario['text'][start:end] + ' ***' + scenario['text'][end:]
            print('\n%s: %s' % (scenario['scenario_id'], text))
            print('Rating: %s' % (rating_label[int(scenario[key_risk])]))
            
            counter = scale_usage[scenario['scenario_id']]
            usage = ', '.join(['%s: %s' % (k, counter[k]) for k in sorted(counter.keys())])
            print('Scale usage: %s' % usage)



MAS-R-4: To open this screen, I have to log in to the app first. Then I have to click on the icon of the person waving on the bottom navigation bar. I use the friends screen to see which of my friends are online. When they are online, there is a green light next to their name. When they are idle, there is a half moon next to their name. When they are completely offline, there is a grey button next to their names. Then I can reach out to them using a direct message or chat with them on a server that we are both a member of. I can click on their name and see which mutual servers we are currently a member of. I can also use this screen to add new friends by searching for their usernames or *** phone number ***. I can also use this screen to remove friends. The app knows my list of friends and uses that information to populate this screen.
Rating: Unwilling
Scale usage: 4: 5, 5: 3

MAS-R-5: I can talk to the streamer and the other people watching the stream with this chat. I can use emote