# Notebook Overview

This notebook is used to analyze the privacy risk survey responses.

In [1]:
import json

scenarios = json.load(open('scenarios-risked.json'))
print('Read %i scenarios.' % len(scenarios))

Read 57 scenarios.


In [2]:
from collections import Counter
import nltk

apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    app_urls = Counter()
    stores = Counter()
    sent_count = 0
    for d in data:
        # count app url without parameters (Google only)
        url = d['app_url'].split('&')[0]
        app_urls[url] += 1
        
        # count app store by url prefix
        if url.startswith(apple_url):
            stores[apple_url] += 1
        elif url.startswith(googl_url):
            stores[googl_url] += 1
            
        # count sentences
        sent = nltk.sent_tokenize(d['text'])
        sent_count += len(sent)

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % stores[apple_url])
    print('Google Play: %i' % stores[googl_url])
    print('Unique Apps: %i' % len(app_urls.keys()))
    print('Sentences: %i' % sent_count)
    
summarize(scenarios)

Scenarios: 57
Apple App: 20
Google Play: 37
Unique Apps: 51
Sentences: 453


In [4]:
from collections import Counter

data = {'usage_freq': Counter(), 'false_pos': 0, 'scale_usage': Counter()}
risks = []
infos = [[], [], []]
info_count = 0
for scenario in scenarios:
    data['usage_freq'][scenario['usage_freq']] += 1
    
    risk = []
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        key_fp = 'part2_false_pos%i' % i
        
        if not key_info in scenario:
            info_count += i - 1
            break
        
        if key_risk in scenario:
            data['scale_usage'][scenario[key_risk]] += 1
            scenario[key_risk] = int(scenario[key_risk])
            risks.append(scenario[key_risk])
            if scenario[key_risk] < 2:
                infos[0].append(scenario[key_info])
            elif scenario[key_risk] < 4:
                infos[1].append(scenario[key_info])
            else:
                infos[2].append(scenario[key_info])
        elif key_fp in scenario:
            data['false_pos'] += 1
    if len(risk) > 0:
        risks.append(sum(risk) / len(risk))

data['avg_infotype'] = info_count / len(scenarios)
        
if len(risks) > 0:
    data['avg_risk'] = sum(risks) / len(risks)
    
for k, v in data.items():
    print('%s:\t%s' % (k, v))

info_count = sum([len(infos[0]), len(infos[1]), len(infos[2])])
print('\nUser-perceived Precision: %0.3f' % ((info_count - data['false_pos']) / info_count))
    
print('\nscale_usage_w:\t%i' % (sum([data['scale_usage'][str(i)] for i in [0, 1]])))
print('scale_usage_s:\t%i' % (sum([data['scale_usage'][str(i)] for i in [2, 3]])))
print('scale_usage_u:\t%i' % (sum([data['scale_usage'][str(i)] for i in [4, 5]])))

usage_freq:	Counter({'daily': 35, 'weekly': 18, 'monthly': 4})
false_pos:	75
scale_usage:	Counter({'2': 153, '0': 127, '1': 104, '3': 103, '5': 99, '4': 68})
avg_infotype:	12.789473684210526
avg_risk:	2.27217125382263

User-perceived Precision: 0.885

scale_usage_w:	231
scale_usage_s:	256
scale_usage_u:	167


In [5]:
print('Most Willling types\n')
print(sorted(infos[0]))

Most Willling types

['BMI', 'Basal Metabolic Rate', 'Fast chart', 'Foods', 'Pay in 4', 'Teams', 'account', 'account', 'account', 'account', 'activities', 'activities', 'activity', 'activity goals', 'activity goals', 'age', 'age', 'agreement', 'app summary', 'application', 'audio', 'bank account', 'bar codes', 'barcode', 'body mass index', 'bonuses', 'bonuses', 'book', 'boosters', 'buyers', 'calorie burning numbers', 'calories', 'calories', 'carbs', 'card', 'cardio', 'charges', 'data', 'database', 'date', 'debit card', 'description', 'device', 'diet goals', 'diet plan', 'distance', 'download quality', 'drinks', 'eat', 'effort', 'email', 'emotes', 'energy', 'everything that I', 'excersizes', 'exercise', 'exercise', 'exercises', 'fast', 'fasting', 'fasts', 'fat', 'fat intake', 'feature', 'feature', 'fitness', 'fitness activities', 'fitness class', 'fitness goal', 'fitness goals', 'fitness level', 'fitness plans', 'food', 'food calories', 'food item', 'food items', 'food product barcodes'

In [6]:
print('Most Unwilling Types\n')
print(sorted(infos[2]))

Most Unwilling Types

['Amazon', 'Amazon Prime account', 'Amazon account', 'Charter services', 'Cyber security', 'GPS', 'Group', 'I have done on a given day', 'Services', 'TV provider', 'Username', 'account', 'account', 'account', 'account', 'account', 'account information', 'accounts', 'activity', 'address', 'address', 'address', 'ads', 'authentication messages', 'authentication process', 'bank', 'bank usage', 'budget', 'calendar', 'camera', 'charges', 'charter', 'charter trip', 'chat', 'chatbot conversations', 'checking account', 'checking account number', 'child', 'community', 'contact', 'contact list', 'contacts', "contacts'", 'content', 'conversation', 'conversations', 'conversations', 'credit card information', 'data', 'data usage', 'date', 'deliveries', 'delivery', 'diaries', 'doctor', 'doctors appointments', 'email address', 'email address', 'email address', 'email addresses', 'emails', 'file location', 'finances', 'friend', 'friend', 'friends', 'friends', 'hand sanitizer trash

In [7]:
# identify scenarios where the same information type is rated at 
# different risk levels by different users to illustrate context

find_examples = ['phone numbers', 'phone number']
rating_label = ['Very Unwilling', 'Unwilling', 'Somehwhat Unwilling', 'Somewhat Willing', 'Willing', 'Very Willing']

for scenario in scenarios:
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        
        if not key_info in scenario:
            break
        
        if scenario[key_info] in find_examples:
            start, end = [int(s) for s in scenario[key_info + '_loc'].split(':')]
            text = scenario['text'][:start] + '*** ' + scenario['text'][start:end] + ' ***' + scenario['text'][end:]
            print('\n%s: %s' % (scenario['scenario_id'], text))
            print('Rating: %s' % (rating_label[int(scenario[key_risk])]))



MAS-R-3: To open this screen, I have to log in to the app first. Then I have to click on the icon of the person waving on the bottom navigation bar. I use the friends screen to see which of my friends are online. When they are online, there is a green light next to their name. When they are idle, there is a half moon next to their name. When they are completely offline, there is a grey button next to their names. Then I can reach out to them using a direct message or chat with them on a server that we are both a member of. I can click on their name and see which mutual servers we are currently a member of. I can also use this screen to add new friends by searching for their usernames or *** phone number ***. I can also use this screen to remove friends. The app knows my list of friends and uses that information to populate this screen.
Rating: Willing

MAS-R-4: I can talk to the streamer and the other people watching the stream with this chat. I can use emotes that Twitch provide you,

In [10]:
from collections import Counter

# create a nominal distribution from item responses
scale_usage = []
for scenario in scenarios:
    counter = Counter()
    for i in range(1, len(scenario.keys())):
        key_info = 'part2_info%i' % i
        key_risk = 'part2_risk%i' % i
        
        if key_risk in scenario:
            counter[int(scenario[key_risk])] += 1
        
    scale_usage.append(counter)

# count the number of distributions that only use on response item
meta_counter = Counter()
single_label = []
for counter in scale_usage:
    meta_counter[len(counter.keys())] += 1
    if len(counter.keys()) == 1:
        single_label.append(counter)
    
print('Scale utilization for %i survey submissions:\n' % len(scenarios))
for count in sorted(meta_counter.keys()):
    print('%s: %s' % (count, meta_counter[count]))

print('\nSingle-label responses frequencies for %i authors:\n' % len(single_label))
for counter in single_label:
    print(counter)

Scale utilization for 57 survey submissions:

1: 12
2: 14
3: 14
4: 11
5: 5
6: 1

Single-label responses frequencies for 12 authors:

Counter({4: 10})
Counter({2: 14})
Counter({0: 5})
Counter({0: 14})
Counter({3: 4})
Counter({5: 13})
Counter({3: 9})
Counter({0: 17})
Counter({0: 13})
Counter({3: 13})
Counter({5: 15})
Counter({2: 12})
