# Notebook Overview

This notebook summarized statistics about the pre- and post-labeled scenario data, 
excluding author information.

In [1]:
import json

data1 = json.load(open('scenarios-general.json'))
data2 = json.load(open('scenarios-privacy.json'))

In [2]:
from collections import Counter
import nltk

apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    app_urls = Counter()
    stores = Counter()
    sent_count = 0
    for d in data:
        # count app url without parameters (Google only)
        url = d['app_url'].split('&')[0]
        app_urls[url] += 1
        
        # count app store by url prefix
        if url.startswith(apple_url):
            stores[apple_url] += 1
        elif url.startswith(googl_url):
            stores[googl_url] += 1
            
        # count sentences
        sent = nltk.sent_tokenize(d['text'])
        sent_count += len(sent)

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % stores[apple_url])
    print('Google Play: %i' % stores[googl_url])
    print('Unique Apps: %i' % len(app_urls.keys()))
    print('Sentences: %i' % sent_count)

In [3]:
print('*** General Scenarios ***\n')
summarize(data1)

*** General Scenarios ***

Scenarios: 100
Apple App: 38
Google Play: 62
Unique Apps: 84
Sentences: 810


In [4]:
print('*** Privacy Scenarios ***\n')
summarize(data2)

*** Privacy Scenarios ***

Scenarios: 200
Apple App: 68
Google Play: 132
Unique Apps: 134
Sentences: 1616


In [7]:
import json

labeled = json.load(open('scenarios-labeled.json'))

In [8]:
entities = {'general': {}, 'privacy': {}}

for scenario_id, data in labeled.items():
    entity_map = None
    if scenario_id.startswith('MAS-G'):
        entity_map = entities['general']
    else:
        entity_map = entities['privacy']
    
    # identify all the phrases
    start = -1
    for i, code in enumerate(data['codes']):
        if code.startswith('B-'):
            start = i
        elif code == 'O' and start >= 0:
            label = data['codes'][start][2:]
            if not label in entity_map:
                entity_map[label] = []
            entity_map[label].append(' '.join(data['words'][start:i]))
            start = -1
    if start >= 0:
        label = data['codes'][start][2:]
        if not label in entity_map:
            entity_map[label] = []
        entity_map[label].append(' '.join(data['words'][start:]))

unique = []
print('*** General Scenarios ***\n')
for k, v in entities['general'].items():
    print('%s: %i' % (k, len(v)))
    unique.extend(v)
    
print('\n*** Privacy Scenarios ***\n')
for k, v in entities['privacy'].items():
    print('%s: %i' % (k, len(v)))
    unique.extend(v)
    
print('\nTotal: %i' % len(unique))
print('Unique: %i' % len(set(unique)))

*** General Scenarios ***

SIM: 1115
QUE: 94
COM: 123

*** Privacy Scenarios ***

SIM: 2409
QUE: 243
COM: 179

Total: 4163
Unique: 1881
