# Notebook Overview

This notebook summarized statistics about the pre- and post-labeled scenario data, 
excluding author information.

In [11]:
import json

data1 = json.load(open('scenarios-general.json'))
data2 = json.load(open('scenarios-privacy.json'))

In [2]:
from collections import Counter
import nltk

apple_url = 'https://apps.apple.com'
googl_url = 'https://play.google.com'

def summarize(data):
    app_urls = Counter()
    stores = Counter()
    sent_count = 0
    for d in data:
        # count app url without parameters (Google only)
        url = d['app_url'].split('&')[0]
        app_urls[url] += 1
        
        # count app store by url prefix
        if url.startswith(apple_url):
            stores[apple_url] += 1
        elif url.startswith(googl_url):
            stores[googl_url] += 1
            
        # count sentences
        sent = nltk.sent_tokenize(d['text'])
        sent_count += len(sent)

    print('Scenarios: %i' % len(data))
    print('Apple App: %i' % stores[apple_url])
    print('Google Play: %i' % stores[googl_url])
    print('Unique Apps: %i' % len(app_urls.keys()))
    print('Sentences: %i' % sent_count)

In [3]:
print('*** General Scenarios ***\n')
summarize(data1)

*** General Scenarios ***

Scenarios: 100
Apple App: 38
Google Play: 62
Unique Apps: 84
Sentences: 810


In [4]:
print('*** Privacy Scenarios ***\n')
summarize(data2)

*** Privacy Scenarios ***

Scenarios: 200
Apple App: 68
Google Play: 132
Unique Apps: 134
Sentences: 1616


In [5]:
import json

labeled = json.load(open('scenarios-labeled.json'))

In [6]:
entities = {'general': {}, 'privacy': {}}

for scenario_id, data in labeled.items():
    entity_map = None
    if scenario_id.startswith('MAS-G'):
        entity_map = entities['general']
    else:
        entity_map = entities['privacy']
    
    # identify all the phrases
    start = -1
    for i, code in enumerate(data['codes']):
        if code.startswith('B-'):
            start = i
        elif code == 'O' and start >= 0:
            label = data['codes'][start][2:]
            if not label in entity_map:
                entity_map[label] = []
            entity_map[label].append(' '.join(data['words'][start:i]))
            start = -1
    if start >= 0:
        label = data['codes'][start][2:]
        if not label in entity_map:
            entity_map[label] = []
        entity_map[label].append(' '.join(data['words'][start:]))

unique = []
print('*** General Scenarios ***\n')
for k, v in entities['general'].items():
    print('%s: %i' % (k, len(v)))
    unique.extend(v)
    
print('\n*** Privacy Scenarios ***\n')
for k, v in entities['privacy'].items():
    print('%s: %i' % (k, len(v)))
    unique.extend(v)
    
print('\nTotal: %i' % len(unique))
print('Unique: %i' % len(set(unique)))

*** General Scenarios ***

SIM: 1115
QUE: 94
COM: 123

*** Privacy Scenarios ***

SIM: 2409
QUE: 243
COM: 179

Total: 4163
Unique: 1881


In [9]:
app_urls = json.load(open('app-url-dictionary.json'))
cat_counter = {'apple': Counter(), 'google': Counter()}

for scenario in data1:
    # count statistics by platform
    platform = ''
    if scenario['app_url'].startswith('https://play.google.com'):
        platform = 'google'
    else:
        platform = 'apple'
    
    # record the category by platform
    app_category = app_urls[scenario['app_url']]['app_category']
    cat_counter[platform][app_category] += 1


print('\nApple Category Distribution - General\n')
for app_category in sorted(cat_counter['apple'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['apple'][app_category]))
    
print('\nGoogle Category Distribution - General\n')
for app_category in sorted(cat_counter['google'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['google'][app_category]))    


Apple Category Distribution - General

Entertainment            : 1
Finance                  : 1
Food & Drink             : 1
Games                    : 1
Health & Fitness         : 5
Lifestyle                : 1
Local Forecasts & Live Maps: 1
Music                    : 4
Navigation               : 1
News                     : 7
Photo & Video            : 4
Productivity             : 1
Shopping                 : 2
Social Networking        : 2
Sports                   : 2
Travel                   : 1
Utilities                : 1
Weather                  : 2

Google Category Distribution - General

Books & References       : 2
Communications           : 4
Education                : 2
Entertainment            : 4
Finance                  : 1
Food & Drink             : 3
Games                    : 2
Health & Fitness         : 3
Lifestyle                : 1
Maps & Navigation        : 1
Medical                  : 1
Music & Audio            : 6
News & Magazines         : 1
Photography       

In [10]:
for scenario in data2:
    # count statistics by platform
    platform = ''
    if scenario['app_url'].startswith('https://play.google.com'):
        platform = 'google'
    else:
        platform = 'apple'
    
    # record the category by platform
    app_category = app_urls[scenario['app_url']]['app_category']
    cat_counter[platform][app_category] += 1


print('\nApple Category Distribution - Privacy\n')
for app_category in sorted(cat_counter['apple'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['apple'][app_category]))
    
print('\nGoogle Category Distribution - Privacy\n')
for app_category in sorted(cat_counter['google'].keys()):
    print('%s: %i' % (app_category.ljust(25), cat_counter['google'][app_category]))    


Apple Category Distribution - Privacy

Books                    : 1
Education                : 2
Entertainment            : 2
Finance                  : 17
Food & Drink             : 4
Games                    : 3
Health & Fitness         : 31
Lifestyle                : 2
Local Forecasts & Live Maps: 1
Music                    : 6
Navigation               : 1
News                     : 7
Photo & Video            : 6
Productivity             : 3
Reference                : 1
Shopping                 : 4
Social Networking        : 4
Sports                   : 6
Travel                   : 1
Utilities                : 2
Weather                  : 2

Google Category Distribution - Privacy

Books & References       : 3
Communications           : 12
Dating                   : 1
Education                : 5
Entertainment            : 10
Finance                  : 6
Food & Drink             : 13
Games                    : 4
Health & Fitness         : 33
Lifestyle                : 5
Maps & Navig

In [18]:
import json

data3 = json.load(open('scenarios-labeled.json'))

for scenario_id, scenario in data3.items():
    for i in range(len(scenario['codes'])):
        codes = set([c[2:] for c in scenario['codes'][i:min(i + 10, len(scenario['codes']))] if len(c) >= 2])
        if len(codes) == 3:
            print(scenario)
            print()
                 
        

{'id': 'MAS-G-0022', 'text': '(1)I use this screen to look at what LINE friends have recently posted. It\'s a timeline that consists of posts from my friends and also from myself. As for the goals, there\'s really only one: to see what my friends have been up to -- you know, what\'s new in their life. \n(2)As for how I get to this screen, once you open the app, you\'re usually on the tab you were last using, so you generally have 3 others options to choose. Once I opened the LINE app, I was already on the "Chats" tab, therefore all I had to do was just a slide of my thumb to the right and press on the "Timeline" tab. \n(3)Now once at this screen, there aren\'t really any steps to take to achieve my goal. I simply scroll through the posts and hope to see some interesting content, or at least some good things happening the lives of my friends. ', 'words': ['(', '1)I', 'use', 'this', 'screen', 'to', 'look', 'at', 'what', 'LINE', 'friends', 'have', 'recently', 'posted', '.', 'It', "'s", 'a