In [None]:
! pip install transformers[sentencepiece] datasets tokenizers evaluate
! pip install torch
! pip install tensorflow
! pip install spacy
! pip install seqeval
! pip install ipywidgets
! pip install "ray[tune]" scipy sklearn
! pip install pyinterval

In [2]:
# Colab-only setup: mount Google Drive so the project folder is reachable from
# the runtime's filesystem.
from google.colab import drive
drive.mount('/content/drive')
# Switch the working directory to the project folder; the relative './...'
# paths used by the later cells are resolved against it.
%cd /content/drive/MyDrive/mobile_privacy/cleaned/models

Mounted at /content/drive
/content/drive/MyDrive/mobile_privacy/cleaned/models


In [5]:
import json
from interval import interval
from copy import deepcopy
from random import shuffle
from functools import cmp_to_key
import pandas as pd
import spacy
nlp = spacy.load("en_core_web_sm")
from spacy.training import offsets_to_biluo_tags, biluo_to_iob

In [6]:
def replace_linebreak(text):
  '''
  Return text with every carriage-return character removed, so that
  Windows-style (CR LF) line breaks collapse to a single line feed.
  '''
  return text.replace("\r","")

def load_jsonl_file(filename):
  '''
  Load a JSON Lines file: one JSON object per line.

  The file is read as UTF-8 and closed as soon as parsing finishes.
  Records are split on real line endings only (by iterating the file),
  because str.splitlines() would also break a record at characters such
  as U+2028 that may legally appear inside a JSON string.
  Blank or whitespace-only lines are skipped instead of crashing the parser.

  Carriage returns are stripped from each record's 'text' so the label
  offsets line up with the text.

  return a list of dict{'text':string, 'label':list of [start, end, category]}
  raises json.JSONDecodeError for a non-blank line that is not valid JSON,
  and KeyError for a record without a 'text' field.
  '''
  result = []
  with open(filename, 'r', encoding='utf-8') as jsonl_handle:
    for jline in jsonl_handle:
      if not jline.strip():
        # tolerate empty lines, e.g. a trailing newline at the end of the file
        continue
      data = json.loads(jline)
      data['text'] = replace_linebreak(data['text'])
      result.append(data)
  return result

In [51]:
def overlapped(label1, label2):
  '''
  Input: two labels with [start, end, category]
  Output: True when the two character ranges share at least one position,
  False otherwise.
  Each range is half-open, [start, end): end is exclusive, so two labels
  that merely touch (one ends where the other starts) do not overlap.

  The check is done with plain integer arithmetic on the half-open ranges.
  An empty span (start == end) covers no characters and therefore never
  overlaps anything; the previous closed-interval conversion (end - 1)
  turned such a span into reversed bounds.
  '''
  start1, end1 = label1[0], label1[1]
  start2, end2 = label2[0], label2[1]
  # The shared region is [latest start, earliest end); it is non-empty only
  # when the latest start lies strictly before the earliest end.
  return max(start1, start2) < min(end1, end2)

In [52]:
# Build the lookup scenario_id -> [app_name, app_category, scenario_id] from the
# app metadata CSV. When a scenario_id occurs on several rows, the last row wins.
url_df = pd.read_csv('./app_name_category_300.csv', encoding='latin',on_bad_lines = 'warn')
id_app = {
  row['scenario_id']: [row['app_name'], row['app_category'], row['scenario_id']]
  for _, row in url_df.iterrows()
}

In [53]:
# Parse the raw annotation export. Despite its name, jsonl_file is the list of
# parsed records returned by load_jsonl_file, not a file object.
jsonl_file = load_jsonl_file('./raw_300_scenarios.jsonl')

In [54]:
from re import I

# Maps the IOB tags built from the annotation category names to the short codes
# written to the output file. Resulting entries, in insertion order:
#   'O' -> 'O'
#   'B-Noun Phrase' -> 'B-SIM',   'I-Noun Phrase' -> 'I-SIM'
#   'B-Complex Terms' -> 'B-COM', 'I-Complex Terms' -> 'I-COM'
#   'B-Questions' -> 'B-QUE',     'I-Questions' -> 'I-QUE'
_CATEGORY_CODES = (
    ('Noun Phrase', 'SIM'),
    ('Complex Terms', 'COM'),
    ('Questions', 'QUE'),
)

label_match_dict = {'O': 'O'}
label_match_dict.update(
    (f'{prefix}-{category}', f'{prefix}-{code}')
    for category, code in _CATEGORY_CODES
    for prefix in ('B', 'I')
)

def find_overlap(label1, label_list):
  '''
  Return the index of the first entry of label_list that overlaps label1
  (as decided by overlapped), or -1 when no entry overlaps it.
  '''
  return next(
      (position for position, candidate in enumerate(label_list)
       if overlapped(label1, candidate)),
      -1,
  )

def _span_length(label):
  ''' Number of characters covered by a [start, end, category] label. '''
  return label[1] - label[0]


def _keep_non_overlapping(candidates, beats):
  '''
  Reduce candidates to a list of pairwise non-overlapping labels.

  candidates: labels ([start, end, category]) in input order.
  beats(new, old): True when new must replace the already kept label old.

  A candidate that clashes with nothing is appended. A candidate that beats
  every kept label it clashes with takes the place of the first of them and
  the remaining clashing labels are removed. Any other candidate is dropped.
  '''
  kept = []
  for label in candidates:
    clashes = [i for i, other in enumerate(kept) if overlapped(label, other)]
    if not clashes:
      kept.append(label)
    elif all(beats(label, kept[i]) for i in clashes):
      kept[clashes[0]] = label
      # Delete from the back so the remaining indices stay valid.
      for i in reversed(clashes[1:]):
        del kept[i]
  return kept


def dedup_scenario_label(jsonl_list):
  '''
  Deduplication rules:
  If noun phrase conflict with clause, keep clause only.
  If clause conflict with each other, keep the longest one.
  If noun conflict with each other, keep the shortest one.

  Input: list of scenario dicts with the keys 'id', 'text', 'app_url',
  'scenario_id' and 'label' (list of [start, end, category]).
  Every category other than 'Noun Phrase' is treated as a clause.
  Output: a new list with one dict per scenario holding 'id', 'text',
  'app_url', 'scenario_id', 'raw_label' (deep copy of the untouched labels),
  'app_name', 'app_category' and 'label' (the deduplicated labels, clauses
  first, then nouns).

  The labels kept for a scenario never overlap each other: a label that wins
  against several previously kept labels replaces all of them. Replacing only
  the first one would leave overlapping spans behind, which spaCy's
  offsets_to_biluo_tags refuses to convert. On equal length the label seen
  first is kept.

  Reads the module-level id_app lookup; raises KeyError when a scenario_id
  is not present in it.
  '''
  res = []
  for scenario in jsonl_list:
    tmp = {}
    tmp['id'] = scenario['id']
    tmp['text'] = scenario['text']
    tmp['app_url'] = scenario['app_url']
    tmp['scenario_id'] = scenario['scenario_id']
    tmp['raw_label'] = deepcopy(scenario['label'])
    tmp['app_name'] = id_app[scenario['scenario_id']][0]
    tmp['app_category'] = id_app[scenario['scenario_id']][1]

    noun = list()
    clause = list()
    for label in scenario['label']:
      _start, _end, category = label
      if category == 'Noun Phrase':
        noun.append(label)
      else:
        clause.append(label)

    # clauses: on conflict the longest one survives
    labels = _keep_non_overlapping(
        clause, lambda new, old: _span_length(new) > _span_length(old))
    # nouns: on conflict the shortest one survives
    dedup_noun = _keep_non_overlapping(
        noun, lambda new, old: _span_length(new) < _span_length(old))
    # a noun is only added when it does not touch any label kept so far
    for label in dedup_noun:
      if not any(overlapped(label, existing) for existing in labels):
        labels.append(label)

    tmp['label'] = labels
    res.append(tmp)
  return res


def convert_jsonl_to_bio(jsonl_list):
  '''
  Tokenize every scenario with spaCy and turn its character-offset labels
  into one tag per token.

  Input: list of dicts with the keys 'scenario_id', 'text', 'label'
  (list of [start, end, category]), 'app_name' and 'app_category'.
  Output: dict keyed by scenario_id; a scenario_id seen again overwrites
  the earlier entry. Each value holds:
    'id'          the scenario_id
    'text'        the scenario text
    'words'       the spaCy tokens as strings
    'codes'       one tag per token, translated through label_match_dict;
                  a tag missing from label_match_dict becomes 'O'
    'raw_labels'  a deep copy of the 'label' list that was passed in
    'app_name', 'app_category'  copied from the input record
  '''
  converted = {}
  for record in jsonl_list:
    scenario_id = record['scenario_id']
    text = record['text']
    spans = record['label']
    # snapshot taken before the spans are handed to spaCy
    spans_snapshot = deepcopy(spans)
    doc = nlp(text)
    # NOTE(review): tokens spaCy cannot align with a span presumably come back
    # with a tag outside label_match_dict and end up as 'O' below - confirm.
    iob_tags = biluo_to_iob(offsets_to_biluo_tags(doc, spans))
    converted[scenario_id] = {
        'id': scenario_id,
        'text': text,
        'words': [str(token) for token in doc],
        'codes': [label_match_dict.get(tag, 'O') for tag in iob_tags],
        'raw_labels': spans_snapshot,
        'app_name': record['app_name'],
        'app_category': record['app_category'],
    }
  return converted

# Run the pipeline on the loaded records: first resolve overlapping
# annotations, then tokenize each scenario and attach one tag per token.
dedup_jsonl_file = dedup_scenario_label(jsonl_file)
bio_jsonl = convert_jsonl_to_bio(dedup_jsonl_file)



In [55]:
# Persist the tagged scenarios (dict keyed by scenario_id) as one JSON document.
with open('./scenarios-labeled.json', 'w') as out_file:
  serialized = json.dumps(bio_jsonl)
  out_file.write(serialized)