In [1]:
import argparse
import code
import json
from pprint import pprint
from typing import Any, Dict, Iterable, Iterator, List, Optional

import numpy as np
from termcolor import colored

In [2]:
# Locations of the CoNaLa train/test JSON files. Both paths are relative, so
# they resolve against the directory the notebook kernel is started from.
train_file = '../datasets/conala-corpus/conala-train.json'
# NOTE(review): test_file is not read by any cell visible in this notebook.
test_file = '../datasets/conala-corpus/conala-test.json'

In [3]:
# Substrings that get_by_keywords() looks for in each example's intent,
# rewritten intent and snippet. Matching is a plain substring test, so the
# trailing "." / "(" only narrows the hits; e.g. "re." also matches any text
# that merely contains the characters "re." (such as "more.").
keywords = [
    "django", "os.", "urllib.", "sys.", "scipy.", "numpy.",
    "pickle.", "struct.", "subprocess.", "datetime.", "time.",
    "request.", "re.", "map(", "filter(", "reduce(",
    "print("
]

def get_unique(xs):
    """Return the distinct values of ``xs`` ordered by how often they occur.

    Args:
        xs: Array-like of values accepted by ``np.unique``.

    Returns:
        A ``(values, counts)`` pair of NumPy arrays. ``values[k]`` occurs
        ``counts[k]`` times in ``xs`` and the pair is sorted by descending
        count. Values with equal counts appear in whatever order
        ``np.argsort`` (default sort kind) places them.
    """
    values, counts = np.unique(xs, return_counts=True)
    # Negating the counts turns argsort's ascending order into a descending one.
    by_frequency = np.argsort(-counts)

    ranked_values = values[by_frequency]
    ranked_counts = counts[by_frequency]
    return ranked_values, ranked_counts

def query_by_key(data: Iterable[Dict[str, Any]], key: str) -> List[Any]:
    """Collect the value stored under ``key`` from every example in ``data``.

    Args:
        data: Examples to read, each a mapping that contains ``key``. Any
            iterable works (list, tuple, generator, ...); it is traversed
            exactly once.
        key: Name of the field to extract from each example.

    Returns:
        The extracted values, in the order the examples were iterated.

    Raises:
        KeyError: If an example has no ``key`` entry.
    """
    # Iterate the examples directly instead of indexing by position, so the
    # input does not need to support len() or integer subscripts.
    return [example[key] for example in data]

def get_by_qid(data: Iterable[Dict[str, Any]], qid: int) -> Iterator[Dict[str, Any]]:
    """Lazily yield the examples whose ``"question_id"`` equals ``qid``.

    Args:
        data: Examples to scan, each a mapping with a ``"question_id"`` entry.
        qid: Question id to match (compared with ``==``).

    Returns:
        A single-use iterator over the matching examples, in input order.
        Wrap the result in ``list(...)`` if it has to be traversed more than
        once or its length is needed.

    Raises:
        KeyError: While iterating, if an example has no ``"question_id"``.
    """
    return (example for example in data if example["question_id"] == qid)

def get_by_keywords(data: Iterable[Dict[str, Any]],
                    kws: Optional[Iterable[str]] = None
                    ) -> Dict[str, List[Dict[str, Any]]]:
    """Group examples by the keyword substrings that occur in their text.

    An example is filed under a keyword when that keyword is a substring of
    its ``"intent"``, ``"rewritten_intent"`` or ``"snippet"`` field. One
    example can therefore appear under several keywords, or under none.

    Args:
        data: Examples to scan; each must have the three fields named above.
            A field whose value is ``None`` or empty is skipped.
        kws: Keywords to search for. When omitted, the module-level
            ``keywords`` list is used. Repeated keywords are collapsed.

    Returns:
        A dict mapping every keyword (in the order given) to the list of
        matching examples, in input order. Keywords without a match map to an
        empty list.

    Raises:
        KeyError: If an example lacks one of the three text fields.
    """
    # Materialise once (kws may be a one-shot iterator) and drop repeats so a
    # duplicated keyword cannot file the same example twice.
    labels = list(dict.fromkeys(keywords if kws is None else kws))
    matches: Dict[str, List[Dict[str, Any]]] = {label: [] for label in labels}

    for example in data:
        # Keep only the populated fields; `in` on None would raise.
        texts = [
            text
            for text in (example["intent"],
                         example["rewritten_intent"],
                         example["snippet"])
            if text
        ]
        for label in labels:
            if any(label in text for text in texts):
                matches[label].append(example)

    return matches

In [4]:
# Load the training examples. The context manager closes the file handle as
# soon as parsing finishes, and the explicit encoding keeps the read
# independent of the platform's default locale.
with open(train_file, "rt", encoding="utf-8") as fh:
    data = json.load(fh)
# Field names of the first example (assumes the file holds a non-empty list).
keys = data[0].keys()

# uids: distinct question ids, most frequent first; cuids: their counts.
uids, cuids = get_unique(query_by_key(data, "question_id"))

print(f"There are {uids.size} unique ids")

There are 1710 unique ids


In [5]:
# Bucket the training examples by the module-level `keywords` substrings.
xs = get_by_keywords(data)
# Total number of examples; the denominator for the percentages below.
print(len(data))

# One summary line per keyword, most frequent first:
#   "<keyword> -> <count>(<share of all examples>%)"
# An example can match several keywords, so the shares need not sum to 100.
for l, qs in sorted(xs.items(), key=lambda k : len(k[1]), reverse=True):
    print(f"{l} -> {len(qs)}({round(100.0 * len(qs)/len(data), 3)}%)")
print()

# Show up to two sample examples per keyword under a green heading. This loop
# follows the dict's own order (the order of `keywords`), not the frequency
# order used for the summary above.
for l, qs in xs.items():
    print(colored(l, "green"))
    for q in qs[:2]:
        pprint(q)
        print()

2379
re. -> 177(7.44%)
print( -> 163(6.852%)
os. -> 111(4.666%)
time. -> 71(2.984%)
datetime. -> 58(2.438%)
map( -> 48(2.018%)
subprocess. -> 29(1.219%)
django -> 25(1.051%)
numpy. -> 24(1.009%)
sys. -> 22(0.925%)
request. -> 22(0.925%)
urllib. -> 19(0.799%)
filter( -> 17(0.715%)
struct. -> 11(0.462%)
scipy. -> 2(0.084%)
pickle. -> 2(0.084%)
reduce( -> 2(0.084%)

[32mdjango[0m
{'intent': 'django filter with list of values',
 'question_id': 9304908,
 'rewritten_intent': 'create a django query for a list of values `1, 4, 7`',
 'snippet': 'Blog.objects.filter(pk__in=[1, 4, 7])'}

{'intent': 'Is it possible to serve a static html page at the root of a django '
           'project?',
 'question_id': 30650254,
 'rewritten_intent': "serve a static html page 'your_template.html' at the "
                     'root of a django project',
 'snippet': "url('^$', "
            "TemplateView.as_view(template_name='your_template.html'))"}

[32mos.[0m
{'intent': 'get file creation & modification d

In [6]:
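# Disabled (commented-out) cell: for every question id in `uids` (most
# frequent first) it prints the rewritten intent and snippet of each matching
# example. Uncomment to inspect the whole corpus; the output is long.
# for i, uid in enumerate(uids, start=1):
#     print(f"{i}:")
#     for q in get_by_qid(data, uid):
#         print("> ", q["rewritten_intent"])
#         print("> ", q["snippet"])
#         print()
#     print("-"*32)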