# Notebook Overview

This notebook summarizes statistics about the pre- and post-labeled scenario data,
excluding author information.

In [1]:
import json

# Load the labeled scenarios; the context manager closes the file handle
# as soon as parsing finishes instead of leaving it open in the kernel.
with open('scenarios-labeled.json') as labeled_file:
    labeled = json.load(labeled_file)

In [2]:
from collections import Counter
import nltk

# URL prefixes used by summarize() to attribute an app URL to its store;
# they also serve as the keys of its per-store counter.
apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    """Print headline counts for a list of scenario records.

    Each record is indexed by 'app_url' and 'text'. The report covers the
    number of scenarios, how many app URLs start with the Apple and Google
    store prefixes, the number of distinct app URLs, and the total number
    of sentences as split by nltk.sent_tokenize.
    """
    per_app = Counter()
    per_store = Counter()
    total_sentences = 0

    for record in data:
        # keep only the part of the URL before the first '&' (Google only)
        base_url = record['app_url'].partition('&')[0]
        per_app[base_url] += 1

        # attribute the URL to the first store prefix it matches, if any
        for prefix in (apple_url, googl_url):
            if base_url.startswith(prefix):
                per_store[prefix] += 1
                break

        # accumulate the sentence count of the scenario text
        total_sentences += len(nltk.sent_tokenize(record['text']))

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % per_store[apple_url])
    print('Google Play: %i' % per_store[googl_url])
    print('Unique Apps: %i' % len(per_app))
    print('Sentences: %i' % total_sentences)

In [3]:
# Print the headline counts for the labeled scenarios.
summarize(labeled)

Scenarios: 300
Apple App: 106
Google Play: 194
Unique Apps: 193
Sentences: 2426


In [4]:
# Collect every labeled phrase, grouped by label, separately for the
# general and the privacy scenarios.
entities = {'general': {}, 'privacy': {}}

for scenario in labeled:
    # scenario ids starting with 'MAS-G' go to the general group,
    # everything else is treated as a privacy scenario
    is_general = scenario['scenario_id'].startswith('MAS-G')
    entity_map = entities['general' if is_general else 'privacy']

    # each label entry is a (start, end, label) triple; the phrase is the
    # matching slice of the scenario text
    for start, end, label in scenario['labels']:
        entity_map.setdefault(label, []).append(scenario['text'][start:end])

# despite its name, `unique` holds every phrase (duplicates included);
# it is only de-duplicated for the final count
unique = []
report_sections = (
    ('*** General Scenarios ***\n', 'general'),
    ('\n*** Privacy Scenarios ***\n', 'privacy'),
)
for heading, group in report_sections:
    print(heading)
    for k, v in entities[group].items():
        print('%s: %i' % (k, len(v)))
        unique.extend(v)

print('\nTotal: %i' % len(unique))
print('Unique: %i' % len(set(unique)))

*** General Scenarios ***

SIM: 1252
QUE: 95
COM: 123

*** Privacy Scenarios ***

SIM: 2666
QUE: 245
COM: 182

Total: 4563
Unique: 1953


In [5]:
# Load the app-url lookup table; the context manager closes the file
# handle once the JSON has been parsed.
with open('app-url-dictionary.json') as app_urls_file:
    app_urls = json.load(app_urls_file)

# Per-platform category tallies, filled in by count_app_cats().
cat_counter = {'apple': Counter(), 'google': Counter()}

def count_app_cats(scenarios):
    """Tally app categories per platform into the module-level cat_counter.

    For each scenario the platform is 'google' when its 'app_url' starts
    with the Google Play prefix and 'apple' otherwise. Scenarios whose URL
    is absent from app_urls, or whose entry has no 'app_category', are
    reported on stdout and skipped.

    Note: the tallies are added to the shared cat_counter, so calling this
    more than once without resetting it accumulates counts.

    Returns:
        The module-level cat_counter mapping.
    """
    for item in scenarios:
        url = item['app_url']

        # anything that is not a Google Play URL is counted as Apple
        platform = 'google' if url.startswith('https://play.google.com') else 'apple'

        if url not in app_urls:
            print('Missing URL: %s' % url)
            continue

        details = app_urls[url]
        if 'app_category' not in details:
            print('Missing category for URL: %s' % url)
            continue

        # record the category under the scenario's platform
        cat_counter[platform][details['app_category']] += 1

    return cat_counter

def report_app_cats(counter):
    """Print the category distribution for each platform.

    Args:
        counter: mapping with 'apple' and 'google' keys, each holding a
            Counter (or dict) of category name -> count.

    The report is built from the `counter` argument itself rather than
    from the module-level cat_counter, so any tally passed in is the one
    that gets printed.
    """
    sections = (
        ('\nApple Category Distribution\n', 'apple'),
        ('\nGoogle Category Distribution\n', 'google'),
    )
    for heading, platform in sections:
        print(heading)
        by_category = counter[platform]

        # one aligned row per category, in alphabetical order
        for app_category in sorted(by_category):
            print('%s: %i' % (app_category.ljust(25), by_category[app_category]))

        # totals: sum of all counts and number of distinct categories
        print('\nTotal %i apps, and %i categories.'
              % (sum(by_category.values()), len(by_category)))

# Tally the categories of the labeled scenarios and print the report.
report_app_cats(count_app_cats(labeled))


Apple Category Distribution

Books                    : 1
Education                : 2
Entertainment            : 2
Finance                  : 17
Food & Drink             : 4
Games                    : 3
Health & Fitness         : 31
Lifestyle                : 2
Local Forecasts & Live Maps: 1
Music                    : 6
Navigation               : 1
News                     : 7
Photo & Video            : 6
Productivity             : 3
Reference                : 1
Shopping                 : 4
Social Networking        : 4
Sports                   : 6
Travel                   : 1
Utilities                : 2
Weather                  : 2

Total 106 apps, and 21 categories.

Google Category Distribution

Books & References       : 3
Business                 : 1
Communications           : 12
Dating                   : 1
Education                : 5
Entertainment            : 10
Finance                  : 5
Food & Drink             : 13
Games                    : 4
Health & Fitness         :

In [6]:

# Combine the labeled and the risked scenarios into a single list; each
# file is opened in a context manager so its handle is closed after parsing.
with open('scenarios-labeled.json') as labeled_file:
    scenarios = json.load(labeled_file)
with open('scenarios-risked.json') as risked_file:
    scenarios.extend(json.load(risked_file))

def count_app_freq(scenarios):
    """Return a Counter of how many scenarios reference each 'app_url'."""
    return Counter(entry['app_url'] for entry in scenarios)

# most_common() with no argument returns every (url, count) pair ordered
# from the highest count down; ties keep first-seen order.
sorted_freq = count_app_freq(scenarios).most_common()
for url, count in sorted_freq:
    print('%s\t%i' % (url, count))


https://play.google.com/store/apps/details?id=com.twitter.android	13
https://play.google.com/store/apps/details?id=com.instagram.android	13
https://play.google.com/store/apps/details?id=com.myfitnesspal.android	13
https://apps.apple.com/us/app/apple-health/id1242545199	12
https://apps.apple.com/us/app/myfitnesspal-calorie-counter/id341232718	9
https://apps.apple.com/us/app/fitness/id1208224953	9
https://play.google.com/store/apps/details?id=com.instacart.client	8
https://play.google.com/store/apps/details?id=com.zhiliaoapp.musically	7
https://apps.apple.com/us/app/fitbit-health-fitness/id462638897	7
https://play.google.com/store/apps/details?id=com.amazon.mShop.android.shopping	7
https://play.google.com/store/apps/details?id=com.reddit.frontpage	7
https://play.google.com/store/apps/details?id=com.facebook.katana	7
https://play.google.com/store/apps/details?id=com.google.android.apps.maps	6
https://play.google.com/store/apps/details?id=com.duolingo	5
https://apps.apple.com/us/app/spotif