In [1]:
import json
import numpy as np
from itertools import chain
import os.path

In [2]:
def read(file):
    '''
        Open the file at path `file` as UTF-8 text, parse it as JSON and
        return the decoded object.
    '''
    with open(file, mode='r', encoding='utf-8') as source:
        return json.load(source)

Names of the projects to evaluate:

In [3]:
# Projects whose lexed/prediction files are summarized by the cells below.
projectNames = [
    'elasticsearch-master',
    'cassandra-trunk',
    'xmlgraphics-batik-trunk',
    'ant-master',
]

In [4]:
def get_file_stats(lexed, predictions, n=1):
    '''
        Compute prediction statistics for a single file.

        lexed, predictions -- mappings with a 'left' entry (must be equal in both,
            otherwise the assertion fails) and a 'right' entry holding nested
            sequences. Both 'right' values are flattened one level and paired up
            element by element: a true item with its list of predictions.
        n -- number of leading predictions among which the true item has to
            appear to be counted as a correct prediction.

        Returns (trues, total, mrr). Items with an empty prediction list are
        skipped; total counts the remaining items, trues counts those found among
        the first n predictions, and mrr is the SUM (not the mean) of
        1 / (1-based position of the true item in its prediction list),
        contributing nothing when the item is absent.
    '''
    assert lexed['left'] == predictions['left'], 'Predictions to wrong file!'
    hits, counted, reciprocal_ranks = 0., 0, 0.
    pairs = zip(chain.from_iterable(lexed['right']),
                chain.from_iterable(predictions['right']))
    for actual, candidates in pairs:
        if len(candidates) == 0:
            # Nothing was predicted for this item; it is not counted at all.
            continue
        counted += 1
        if actual in candidates[:n]:
            hits += 1
        try:
            position = candidates.index(actual)
        except ValueError:
            # True item is not among the predictions: no reciprocal-rank credit.
            continue
        reciprocal_ranks += 1 / (1 + position)
    return hits, counted, reciprocal_ranks

In [5]:
def get_average_stats(lexed, predictions, top_n=1, bounds=None):
    '''
        Print the identifier count, accuracy and MRR aggregated over files.

        lexed, predictions -- parallel sequences of per-file entries; each pair
            is passed to get_file_stats.
        top_n -- number of predictions by which the true prediction is defined.
        bounds=(l,r) -- bounds in which total count of identifiers in a file has to lie
            (inclusive); files outside the bounds are left out of the aggregate.
            None disables the filter.

        Prints three lines to stdout and returns None. When no identifiers were
        counted (no files, or every file filtered out) accuracy and MRR are
        undefined and are printed as '-' instead of dividing by zero.
    '''
    trues, total, mrr = 0, 0, 0
    for lexs, preds in zip(lexed, predictions):
        tr, tot, m = get_file_stats(lexs, preds, n=top_n)
        if bounds is None or bounds[0] <= tot <= bounds[1]:
            trues += tr
            total += tot
            mrr += m
    print(f'Count of identifiers: {total}')
    if total == 0:
        # Same placeholder the per-file table uses for files without identifiers.
        print('Accuracy: -')
        print('MRR: -')
        return
    print(f'Accuracy: {trues/total:.4f}')
    print(f'MRR: {mrr/total:.4f}')

In [6]:
def get_stats_for_files(lexed, predictions, top_n=1, bounds=None, all=False):
    '''
        Print a table with one row per file: file id, identifier count,
        accuracy and MRR (both divided by the file's identifier count).

        Rows are printed in batches of 10; after each batch the user is asked
        whether to continue, unless `all` is true.

        top_n -- number of predictions by which the true prediction is defined.
        bounds=(l,r) -- inclusive bounds in which total count of identifiers in a
            file has to lie for the file to be listed; None lists every file.
        all -- when true, never prompt and print every matching file.
    '''
    print(f'File id |  Count  |Accuracy {top_n}|  MRR  |')
    print('--------|---------|----------|-------|')
    stop_answers = {'n', 'No', 'no', 'exit'}
    shown = 0
    for file_id, (file_lexed, file_preds) in enumerate(zip(lexed, predictions)):
        hits, count, rr_sum = get_file_stats(file_lexed, file_preds, n=top_n)
        if bounds is not None and not (bounds[0] <= count <= bounds[1]):
            continue
        shown += 1
        if count == 0:
            # No identifiers with predictions: ratios are undefined.
            row = f'{file_id: <8}|0        |-         |-      |'
        else:
            row = f'{file_id: <8}|{count: <9}|{(hits/count): <10.4f}|{rr_sum/count: <7.4f}|'
        print(row)
        if all or shown % 10 != 0:
            continue
        # A full batch of 10 rows has been shown: ask before printing more.
        if input('Do you want more?[y]/n') in stop_answers:
            break

In [7]:
def get_average_project_stats(projectNames, top_n=1, bounds=None):
    '''
        For every project name, read "cash/<name>/lexed.json" and
        "cash/<name>/predictions.json", print the name centered in a
        30-character line padded with underscores, then print the aggregated
        statistics via get_average_stats.

        top_n, bounds -- forwarded unchanged to get_average_stats.
    '''
    for name in projectNames:
        project_prefix = "cash/" + name
        lexed = read(project_prefix + "/lexed.json")
        predictions = read(project_prefix + "/predictions.json")
        print(f'{name:_^30}')
        get_average_stats(lexed, predictions, top_n=top_n, bounds=bounds)

In [8]:
# Aggregated top-1 statistics for every listed project, with no file-size filter.
get_average_project_stats(
    projectNames,
    top_n=1,
    bounds=None,
)

_____elasticsearch-master_____
Count of identifiers: 2830079
Accuracy: 0.4591
MRR: 0.5291
_______cassandra-trunk________
Count of identifiers: 516216
Accuracy: 0.3181
MRR: 0.3746
___xmlgraphics-batik-trunk____
Count of identifiers: 180824
Accuracy: 0.3045
MRR: 0.3515
__________ant-master__________
Count of identifiers: 145884
Accuracy: 0.3134
MRR: 0.3612


In [12]:
# Project whose files are loaded and listed per file in the following cells.
# To inspect another project, uncomment exactly one of the alternatives.
projectName = "elasticsearch-master"
# projectName = "cassandra-trunk"
# projectName = "ant-master"
# projectName = "xmlgraphics-batik-trunk"

In [9]:
# Load the JSON files of the selected project from "cash/<projectName>/".
vocabulary = read("/".join(("cash", projectName, "vocabulary.json")))
lexed = read("/".join(("cash", projectName, "lexed.json")))
# Identifiers are kept as a set (duplicates, if any, collapse).
identifiers = set(read("/".join(("cash", projectName, "identifiers.json"))))
predictions = read("/".join(("cash", projectName, "predictions.json")))

In [10]:
# Per-file table for the selected project (top-1, no size filter);
# prompts for confirmation after every 10 printed rows.
get_stats_for_files(
    lexed,
    predictions,
    top_n=1,
    bounds=None,
)

File id |  Count  |Accuracy 1|  MRR  |
--------|---------|----------|-------|
0       |112      |0.5804    |0.6197 |
1       |55       |0.5636    |0.5997 |
2       |164      |0.5366    |0.6047 |
3       |126      |0.6429    |0.7420 |
4       |45       |0.7333    |0.7630 |
5       |43       |0.7907    |0.8333 |
6       |110      |0.3636    |0.4267 |
7       |22       |0.3636    |0.4867 |
8       |7        |0.2857    |0.2857 |
9       |344      |0.1919    |0.2463 |


Do you want more?[y]/n n


In [11]:
# Number of vocabulary entries and number of distinct identifiers.
(len(vocabulary), len(identifiers))

(151038, 50721)