In [2]:
import argparse
import code
import json
import random
import re
from pprint import pprint
from typing import Any, Dict, List

import numpy as np
from termcolor import colored

In [3]:
# Relative paths to the CoNaLa train and test JSON files.
train_file = '../raw-datasets/conala-corpus/conala-train.json'
test_file = '../raw-datasets/conala-corpus/conala-test.json'

In [4]:
# Substrings that get_by_keywords() looks for in each example's intent,
# rewritten intent and snippet.  Matching is a plain `in` test, so an entry
# also hits longer tokens that contain it (e.g. "time." matches "datetime."
# and "re." matches any text ending a word in "re.").
keywords = [
    "django", "os.", "urllib.", "sys.", "scipy.", "numpy.",
    "pickle.", "struct.", "subprocess.", "datetime.", "time.",
    "request.", "re.", "map(", "filter(", "reduce(",
    "print("
]

def get_unique(xs):
    """Return the distinct values of ``xs`` together with their frequencies.

    Returns:
        A ``(values, counts)`` pair of numpy arrays, both ordered from the
        most frequent value to the least frequent one.
    """
    values, counts = np.unique(xs, return_counts=True)
    # Sorting the negated counts ascending yields a descending-frequency order.
    by_frequency = np.argsort(-counts)
    return values[by_frequency], counts[by_frequency]

def query_by_key(data: List[Dict[str, Any]], key: str) -> List[Any]:
    """Collect the value stored under ``key`` from every example in ``data``.

    Args:
        data: the examples, each one a dict.
        key: field to read from each example.

    Returns:
        The values in the same order as the examples.

    Raises:
        KeyError: if an example does not contain ``key``.
    """
    # Iterate the examples directly instead of indexing through range(len()).
    return [example[key] for example in data]

def get_by_qid(data: List[Dict[str, Any]], qid: int) -> List[Dict[str, Any]]:
    """Return every example whose ``"question_id"`` equals ``qid``.

    Args:
        data: the examples, each one a dict with a ``"question_id"`` entry.
        qid: question id to look for.

    Returns:
        The matching examples in their original order (empty list if none).

    Raises:
        KeyError: if an example has no ``"question_id"`` entry.
    """
    return [example for example in data if example["question_id"] == qid]

def get_by_keywords(data: List[Dict[str, Any]], kws=None) -> Dict[str, List[Dict[str, Any]]]:
    """Group examples by the keywords that occur in their text.

    An example is filed under a keyword when the keyword is a substring of
    its ``"intent"``, ``"rewritten_intent"`` or ``"snippet"`` field.  Fields
    that are missing, ``None`` or empty are skipped.  Because this is a plain
    substring test, one example can be filed under several keywords.

    Args:
        data: the examples, each one a dict.
        kws: keywords to search for; defaults to the module-level
            ``keywords`` list when omitted.

    Returns:
        A dict mapping every keyword to the list of examples containing it
        (an empty list when nothing matched).
    """
    if kws is None:
        kws = keywords

    matches = {kw: [] for kw in kws}

    for example in data:
        # Only non-empty text fields can contain a keyword.
        texts = [
            text
            for text in (
                example.get("intent"),
                example.get("rewritten_intent"),
                example.get("snippet"),
            )
            if text
        ]
        # Iterating the dict keys visits each distinct keyword exactly once.
        for kw in matches:
            if any(kw in text for text in texts):
                matches[kw].append(example)

    return matches

In [5]:
# Load the training examples.  The context manager closes the file handle
# deterministically, and the explicit encoding keeps the read independent of
# the platform's locale default.
with open(train_file, "rt", encoding="utf-8") as train_fp:
    data = json.load(train_fp)
keys = data[0].keys()

# Distinct question ids, most frequent first, with their occurrence counts.
uids, cuids = get_unique(query_by_key(data, "question_id"))

print(f"There are {uids.size} unique ids")

There are 1710 unique ids


In [6]:
xs = get_by_keywords(data)
n_examples = len(data)
print(n_examples)

# Report every keyword with its hit count and share of the dataset,
# most frequently matched keyword first.
ranked = sorted(xs.items(), key=lambda item: len(item[1]), reverse=True)
for keyword, hits in ranked:
    share = round(100.0 * len(hits)/n_examples, 3)
    print(f"{keyword} -> {len(hits)}({share}%)")
print()

# for l, qs in xs.items():
#     print(colored(l, "green"))
#     for q in qs[:2]:
#         pprint(q)
#         print()

2379
re. -> 177(7.44%)
print( -> 163(6.852%)
os. -> 111(4.666%)
time. -> 71(2.984%)
datetime. -> 58(2.438%)
map( -> 48(2.018%)
subprocess. -> 29(1.219%)
django -> 25(1.051%)
numpy. -> 24(1.009%)
sys. -> 22(0.925%)
request. -> 22(0.925%)
urllib. -> 19(0.799%)
filter( -> 17(0.715%)
struct. -> 11(0.462%)
scipy. -> 2(0.084%)
pickle. -> 2(0.084%)
reduce( -> 2(0.084%)



In [None]:
# for i, uid in enumerate(uids, start=1):
#     print(f"{i}:")
#     for q in get_by_qid(data, uid):
#         print("> ", q["rewritten_intent"])
#         print("> ", q["snippet"])
#         print()
#     print("-"*32)

In [7]:
# Matches a token enclosed in a pair of identical quote marks: '' (two single
# quotes, tried first as a special case), a backtick, a single quote or a
# double quote.  The lazy `string` group ends at the first repetition of the
# opening quote.
QUOTED_TOKEN_RE = re.compile(r"(?P<quote>''|[`'\"])(?P<string>.*?)(?P=quote)")


def infer_slot_type(quote, value):
    """Classify a quoted token as a variable (``'var'``) or string (``'str'``).

    Only backtick-quoted tokens that are valid Python identifiers count as
    variables; everything else is treated as a string.
    """
    if quote == '`' and value.isidentifier():
        return 'var'
    return 'str'


def canonicalize_intent(intent):
    """Replace the quoted tokens of ``intent`` with numbered slot names.

    Every token matched by ``QUOTED_TOKEN_RE`` is given a slot name,
    ``var_N`` or ``str_N`` (each kind numbered independently from 0), and its
    quoted text is substituted by that name.

    Args:
        intent: the natural-language intent string.

    Returns:
        A ``(canonical_intent, slot_map)`` tuple.  ``slot_map`` maps each slot
        name to a dict with the keys ``'value'`` (the token text, stripped and
        with backslash escape sequences decoded), ``'quote'`` (the quote mark
        that enclosed it) and ``'type'`` (``'var'`` or ``'str'``).

    Raises:
        TypeError: if ``intent`` is not a string (for example ``None``).
    """
    slot_map = dict()
    next_id = {'var': 0, 'str': 0}

    # findall() yields one (quote, string) tuple per match, all taken from the
    # original text before any substitution happens.
    for quote, value in QUOTED_TOKEN_RE.findall(intent):
        slot_type = infer_slot_type(quote, value)
        slot_name = '%s_%d' % (slot_type, next_id[slot_type])
        next_id[slot_type] += 1

        # NOTE: str.replace() substitutes every occurrence of the quoted text,
        # so a token that appears twice is fully replaced the first time it is
        # seen; the later duplicate still receives its own slot_map entry even
        # though that slot name never shows up in the returned intent.
        intent = intent.replace(quote + value + quote, slot_name)

        # Decode backslash escapes such as "\n".  Going through latin-1 with
        # backslashreplace keeps non-ASCII characters intact; encoding as
        # UTF-8 first would make unicode_escape read each UTF-8 byte as a
        # separate latin-1 character and corrupt the text.
        decoded = (value.strip()
                   .encode('latin-1', 'backslashreplace')
                   .decode('unicode_escape', 'ignore'))

        slot_map[slot_name] = {'value': decoded,
                               'quote': quote,
                               'type': slot_type}

    return intent, slot_map

In [10]:
# Pick one random training example to inspect its canonicalization.
x = random.choice(data)
# Use the rewritten intent when present, otherwise fall back to the raw intent.
i = x['rewritten_intent'] if x['rewritten_intent'] is not None else x['intent']
print(i)
print("---")

# Canonicalize the text selected above.  Passing x['rewritten_intent']
# directly raises a TypeError when the sampled example's rewritten intent
# is None.
i, s = canonicalize_intent(i)

print(i)
print("---")

pprint(s)

click on the text button 'section-select-all' using selenium python
---
click on the text button str_0 using selenium python
---
{'str_0': {'quote': "'", 'type': 'str', 'value': 'section-select-all'}}
