# Notebook Overview

This notebook summarized statistics about the pre- and post-labeled scenario data, 
excluding author information.

In [1]:
import json

data1 = json.load(open('scenarios-general.json'))
data2 = json.load(open('scenarios-privacy.json'))

In [2]:
from collections import Counter
import nltk

apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    app_urls = Counter()
    stores = Counter()
    sent_count = 0
    for d in data:
        # count app url without parameters (Google only)
        url = d['app_url'].split('&')[0]
        app_urls[url] += 1
        
        # count app store by url prefix
        if url.startswith(apple_url):
            stores[apple_url] += 1
        elif url.startswith(googl_url):
            stores[googl_url] += 1
            
        # count sentences
        sent = nltk.sent_tokenize(d['text'])
        sent_count += len(sent)

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % stores[apple_url])
    print('Google Play: %i' % stores[googl_url])
    print('Unique Apps: %i' % len(app_urls.keys()))
    print('Sentences: %i' % sent_count)

In [5]:
print('*** General Scenarios ***\n')
summarize(data1)

*** General Scenarios ***

Scenarios: 100
Apple App: 38
Google Play: 62
Unique Apps: 84
Sentences: 810


In [6]:
print('*** Privacy Scenarios ***\n')
summarize(data2)

*** Privacy Scenarios ***

Scenarios: 200
Apple App: 68
Google Play: 132
Unique Apps: 134
Sentences: 1616


In [26]:
import json

labeled = json.load(open('scenarios-labeled.json'))

In [27]:
phrases = {}

for scenario_id, data in labeled.items():
    #data['text'] = data['clean_text']
    #del data['clean_text']
    
    # identify all the phrases
    start = -1
    for i, code in enumerate(data['codes']):
        if code.startswith('B-'):
            start = i
        elif code == 'O' and start >= 0:
            label = data['codes'][start][2:]
            if not label in phrases:
                phrases[label] = []
            phrases[label].append(' '.join(data['words'][start:i]))
            start = -1
    if start >= 0:
        label = data['codes'][start][2:]
        if not label in phrases:
            phrases[label] = []
        phrases[label].append(' '.join(data['words'][start:]))

unique = []
for k, v in phrases.items():
    print('%s: %i' % (k, len(v)))
    unique.extend(v)
print('\nTotal: %i' % len(unique))
print('Unique: %i' % len(set(unique)))

Noun Phrase: 3524
Questions: 337
Complex Terms: 302

Total: 4163
Unique: 1881


In [28]:
replace = {'Noun Phrase': 'SIM', 'Questions': 'QUE', 'Complex Terms': 'COM'}

for scenario_id, data in labeled.items():
    data['text'] = data['clean_text']
    del data['clean_text']
    
    # identify all the phrases
    start = -1
    for i, code in enumerate(data['codes']):
        if code.startswith('B-'):
            data['codes'][i] = 'B-' + replace[data['codes'][i][2:]]
        elif code.startswith('I-'):
            data['codes'][i] = 'I-' + replace[data['codes'][i][2:]]

    print(data['codes'])

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SIM', 'O', 'O', 'B-SIM', 'I-SIM', 'O', 'O', 'B-SIM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SIM', 'O', 'B-SIM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SIM', 'I-SIM', 'O', 'O', 'O', 'B-SIM', 'O', 'B-SIM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SIM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-SIM', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['O', 'O', 'O', 'O', 'O', 'O', 'B-QUE', 'I-QUE', 'I-QUE'

In [29]:
json.dump(labeled, open('scenarios-relabeled.json', 'w'))