# Notebook Overview

This notebook summarizes statistics about the pre- and post-labeled scenario data,
excluding author information.

In [1]:
import json

def _load_json(path):
    """Parse the JSON file at ``path`` and return the decoded object.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of being left open until garbage collection.
    UTF-8 is requested explicitly so decoding does not depend on the platform's
    default text encoding.
    """
    with open(path, encoding='utf-8') as handle:
        return json.load(handle)


# Both datasets are read once here; later cells pass them to summarize().
data1 = _load_json('../datasets/scenarios1.json')
data2 = _load_json('../datasets/scenarios2_200.json')

In [16]:
from collections import Counter
import nltk

# Store URL prefixes. summarize() tests each scenario's app URL against these
# with str.startswith and also uses them as the keys of its per-store counter.
apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    """Print summary counts for a collection of scenario records.

    Args:
        data: A sized iterable (``len(data)`` is called) of mappings. Each
            record must provide an ``'app_url'`` string and a ``'text'``
            string.

    Prints five lines: the number of scenarios, the number of records whose
    URL starts with the Apple prefix, the number whose URL starts with the
    Google Play prefix, the number of distinct app URLs, and the total
    sentence count reported by ``nltk.sent_tokenize``. Records whose URL
    matches neither prefix still count toward scenarios, unique apps and
    sentences, but toward neither store.

    Returns:
        None.
    """
    store_prefixes = (apple_url, googl_url)  # checked in this order
    unique_urls = Counter()
    per_store = Counter()
    n_sentences = 0

    for record in data:
        # Keep only what precedes the first '&', dropping trailing parameters
        # (the original note says this only matters for Google URLs).
        base_url, _, _ = record['app_url'].partition('&')
        unique_urls[base_url] += 1

        # Attribute the record to the first store whose prefix matches, if any.
        matched = next(
            (prefix for prefix in store_prefixes if base_url.startswith(prefix)),
            None,
        )
        if matched is not None:
            per_store[matched] += 1

        # Accumulate the number of sentences found in the scenario text.
        n_sentences += len(nltk.sent_tokenize(record['text']))

    report = [
        ('Scenarios: %i', len(data)),
        ('Apple App: %i', per_store[apple_url]),
        ('Google Play: %i', per_store[googl_url]),
        ('Unique Apps: %i', len(unique_urls)),
        ('Sentences: %i', n_sentences),
    ]
    for template, value in report:
        print(template % value)

In [17]:
# Report the statistics for data1 (loaded from ../datasets/scenarios1.json).
print('*** General Scenarios ***')
summarize(data1)

*** General Scenarios ***
Scenarios: 100
Apple App: 38
Google Play: 62
Unique Apps: 84
Sentences: 810


In [18]:
# Report the statistics for data2 (loaded from ../datasets/scenarios2_200.json).
print('*** Privacy Scenarios ***')
summarize(data2)

*** Privacy Scenarios ***
Scenarios: 200
Apple App: 68
Google Play: 132
Unique Apps: 134
Sentences: 1616
