In [56]:
# extract scenarios and save to JSONL file
import json

data = []
for i, scenario in enumerate(json.load(open('scenarios-labeled.json', 'r'))):
    data.append({
        'id': i + 1, 
        'text': scenario['text'], 
        'label':[] if not 'labels' in scenario else scenario['labels']
    })
                             
with open('scenarios-labeled-input.jsonl', 'w') as f:
    for d in data:
        f.write(json.dumps(d) + '\n')

In [71]:
# integrate JSONL annotations into scenarios file
import json

data = [json.loads(s) for s in open('scenarios-risked.jsonl', 'r').readlines()]
scenarios = json.load(open('scenarios-risked.json', 'r'))

updated = []
exclude = [45, 65, 110, 112, 121]
for d, s in zip(data, scenarios):
    if int(d['id']) in exclude:
        continue
    
    if d['text'] != s['text']:
        print('Mismatch at record %i and Scenario ID %s' % (d['id'], s['scenario_id']))
    if 'label' in d:
        s['labels'] = d['label']
    else:
        s['labels'] = []
    updated.append(s)
        
json.dump(updated, open('scenarios-risked1.json', 'w'))


In [74]:
data[175]

{'id': 2,
 'text': 'This is the screen that lets me track what games I have rented, and which ones I have on my lists. This screen needs payment and an address in order to function. My goal is to organize my game lists in such a way that I get the games I want at the right time, and that they do not send me less desirable games. To do this I have to order the games in the order I want them, but I also need to take into account the Availity of the games as I do this. The app uses both their in-store counts on what they have, as well as my name and address to know where to send the games. Other than that it also has my payment info as I need to be able to pay for the services that the company provides to me. Lastly, I also have the option to buy the games, so I need to pay for that as well. ',
 'label': [[38, 62, 'QUE'],
  [68, 97, 'QUE'],
  [117, 124, 'SIM'],
  [132, 139, 'SIM'],
  [193, 198, 'SIM'],
  [228, 233, 'SIM'],
  [304, 309, 'SIM'],
  [342, 347, 'SIM'],
  [431, 436, 'SIM'],
  [

In [54]:
last_count = 0
count = 0
for i, d in enumerate(data):
    count += len(d['text'])
    if count > 5000:
        print('%i = %i chars' % (i-1, last_count))
        break
    last_count = count
print(count)

4 = 4567 chars
5410


In [46]:
# Test grammary checking facility
import requests, uuid

key = 'FXR0LFQ6V3QI6E38XSYITD088PZZNG2V'

url = 'https://api.sapling.ai/api/v1/edits'
post_data = {
    'key': key,
    'text': data[175]['text'],
    'session_id': uuid.uuid4().hex.upper(),
}

try:
    resp = requests.post(url, json=post_data)
    resp_json = resp.json()
    if 200 <= resp.status_code < 300:
      edits = resp_json['edits']
      print('Edits: ', edits)
    else:
      print('Error: ', resp_json)
except Exception as e:
    print('Error: ', e)

Edits:  [{'end': 25, 'error_type': 'R:VERB:SVA', 'general_error_type': 'Grammar', 'id': 'bf7bbe22-e822-5fdb-ae72-ee04a74754a2', 'replacement': 'has', 'sentence': 'This app manages and have streaks for variable habits which are very good for us to maintaining regular activities as well they discipline us in order to the data we have provided to them in the app.', 'sentence_start': 102, 'start': 21}, {'end': 95, 'error_type': 'R:VERB:FORM', 'general_error_type': 'Grammar', 'id': '1c2ba86d-338b-52ab-baba-151dab5e9f47', 'replacement': 'maintain', 'sentence': 'This app manages and have streaks for variable habits which are very good for us to maintaining regular activities as well they discipline us in order to the data we have provided to them in the app.', 'sentence_start': 102, 'start': 84}, {'end': 56, 'error_type': 'M:DET:ART', 'general_error_type': 'Grammar', 'id': '3fb1d7ce-c3b3-58ee-8f03-8f966b46f332', 'replacement': 'the', 'sentence': 'They have mentioned and promised that, they wi

In [97]:
# extract information types from tagged words
import spacy

nlp = spacy.load("en_core_web_sm")

def find_start_char(doc, head_token):
    prefix_tags = ['NOUN', 'ADJ', 'ADP' 'NUM']
    exclusions = ['more']
    
    for i in reversed(range(0, head_token.i)):
        if not doc[i].pos_ in prefix_tags or doc[i].text in exclusions:
            return doc[i + 1].idx
    return head_token.idx

def find_end_char(doc, head_token):
    prefix_tags = ['NOUN']
    
    for i in range(head_token.i + 1, len(doc)):
        if not doc[i].pos_ in prefix_tags:
            return doc[i - 1].idx + len(doc[i - 1].text)
    return head_token.idx + len(head_token.text)

def get_token(doc, start_char, end_char):
    for token in doc:
        if token.idx >= start_char and token.idx < end_char:
            return token
    return None

def extract_types(scenario):
    extracted = {}
    doc = nlp(scenario['text'])
    missed = 0
    
    for label in scenario['label']:
        phrase = None
        match = []
        if label[2] == 'QUE':
            phrase = scenario['text'][label[0]:label[1]].lower()
            match = [label[0], label[1]]
            
        elif label[2] == 'SIM':
            token = get_token(doc, label[0], label[1])
            start = find_start_char(doc, token)
            end = find_end_char(doc, token)
            phrase = scenario['text'][start:end].lower()
            match = [start, end]

        if not phrase:
            missed += 1
            continue
        if not phrase in extracted:
            extracted[phrase] = []
        extracted[phrase].append([start, end])

    for phrase, match in extracted.items():
        print(phrase)

    print('\nFound %i/%i or %0.3f' % (
        len(scenario['label']) - missed,
        len(scenario['label']),
        (len(scenario['label']) - missed) / len(scenario['label'])
    ))
        
extract_types(data[2])

account information
information
full name
contact info
phone number
email id
identity
account ownership
control settings
password
security settings
factor authentication
logins
alerts
payment preferences
important privacy issues
preferences
news feed
reaction preferences
emojis
stickers
notification preferences
shortcuts preferences
language
region
media preferences
autoplay
data saver
video quality
themes
dark mode
light mode
visibility settings
who can see my posts, stories, and profile

Found 35/37 or 0.946
