In [100]:
import json
import numpy as np
from itertools import chain
import os.path
import os
from collections import OrderedDict
from copy import copy
import random as rnd

In [2]:
def read(file):
    '''
        Parses the UTF-8 encoded JSON document stored at `file` and returns the decoded object.
    '''
    with open(file, 'r', encoding='utf-8') as handle:
        return json.load(handle)

In [41]:
def get_file_stats(identifiers, n=1):
    '''
        Accumulates hit and reciprocal-rank totals over the identifiers of one file.

        identifiers -- list of dicts; each holds the ground truth under 'gt' and the ranked
                       candidates under 'prediction' (the candidate name is stored under 'left').
        n -- number of predictions by which the true prediction is defined.

        Returns (number of top-n hits, number of identifiers, sum of reciprocal ranks).
    '''
    hits = 0.
    reciprocal_ranks = 0.
    for identifier in identifiers:
        truth = identifier['gt']
        candidates = [candidate['left'] for candidate in identifier['prediction']]
        if truth in candidates[:n]:
            hits += 1
        if truth in candidates:
            # Reciprocal of the 1-based position of the ground truth.
            reciprocal_ranks += 1/(1 + candidates.index(truth))
    return hits, len(identifiers), reciprocal_ranks

In [42]:
def get_average_stats(files, top_n=1, bounds=None):
    '''
        Prints accuracy and MRR over all files, both pooled over identifiers and averaged per file.

        files -- mapping whose values are dicts with an 'identifiers' list.
        top_n -- number of predictions by which the true prediction is defined.
        bounds=(l,r) -- bounds in which total count of identifiers in a file has to lie.

        Files without identifiers are always skipped. The figures printed after
        'Leave-one-out CV' are the means of the per-file ratios. When no file
        qualifies, only the counts and a notice are printed.
    '''
    trues, total, mrr, n_files, avr_acc, avr_mrr = 0, 0, 0, 0, 0, 0
    for file_data in files.values():
        tr, tot, m = get_file_stats(file_data['identifiers'], n=top_n)
        if tot > 0 and (bounds is None or bounds[0] <= tot <= bounds[1]):
            avr_acc += tr/tot
            avr_mrr += m/tot
            trues += tr
            mrr += m
            total += tot
            n_files += 1
    print(f'Count of files: {n_files}')
    print(f'Count of identifiers: {total}')
    if n_files == 0:
        # Every ratio below would divide by zero; report the situation instead of crashing.
        print('No files with identifiers match the given bounds; accuracy and MRR are undefined.')
        return
    print(f'Top-{top_n} accuracy: {trues/total:.4f}')
    print(f'MRR: {mrr/total:.4f}')
    print('Leave-one-out CV')
    print(f'Top-{top_n} accuracy: {avr_acc/n_files:.4f}')
    print(f'MRR: {avr_mrr/n_files:.4f}')

In [71]:
def get_stats_for_files(predictions, files, top_n=1,
                        bounds=None,
                        acc_bounds=None,
                        mrr_bounds=None,
                        path=None,
                        all=False):
    '''
        Shows stats for each file (10 files in a batch).

        predictions -- mapping from a file to a dict with an 'identifiers' list.
        files -- files to report; the printed file id is the position in this sequence.
        top_n -- number of predictions by which the true prediction is defined.
        bounds=(l,r) -- bounds in which total count of identifiers in a file has to lie.
        acc_bounds=(l,r) -- bounds in which the top-n accuracy of a file has to lie.
        mrr_bounds=(l,r) -- bounds in which the MRR of a file has to lie.
        path -- accepted but not used.
        all -- if true, print every matching file without asking after each batch.

        All bounds are inclusive; None disables the corresponding filter.
        Files without identifiers are reported with accuracy and MRR of 0.
    '''
    def within(limits, value):
        return limits is None or limits[0] <= value <= limits[1]

    print(f'File id |N identifiers|Accuracy {top_n}|  MRR  |')
    print('--------|-------------|----------|-------|')
    shown = 0
    for i, file in enumerate(files):
        trues, total, mrr = get_file_stats(predictions[file]['identifiers'], n=top_n)
        acc, mrr = (trues/total, mrr/total) if total > 0 else (0, 0)
        if not (within(bounds, total) and within(acc_bounds, acc) and within(mrr_bounds, mrr)):
            continue
        shown += 1
        print(f'{i: <8}|{total: <13}|{acc: <10.4f}|{mrr: <7.4f}|')
        if not all and shown % 10 == 0:
            # Normalise the reply so that 'N', 'NO' or ' n ' stop the listing as well.
            answer = input('Do you want more?[y]/n').strip().lower()
            if answer in {'n', 'no', 'exit'}:
                break

In [145]:
def get_random_files_with_stats(predictions, k,
                                top_n=1,
                                bounds=None,
                                acc_bounds=None,
                                mrr_bounds=None,
                                path=None,
                                all=False):
    '''
        Lazily yields, in random order, at most k files whose stats lie within the given bounds.

        predictions -- mapping from a file to a dict with an 'identifiers' list.
        k -- maximal number of files to yield; nothing is yielded for k <= 0,
             and None yields every matching file.
        top_n -- number of predictions by which the true prediction is defined.
        bounds=(l,r) -- bounds in which total count of identifiers in a file has to lie.
        acc_bounds=(l,r) -- bounds in which the top-n accuracy of a file has to lie.
        mrr_bounds=(l,r) -- bounds in which the MRR of a file has to lie.
        path, all -- accepted but not used.

        All bounds are inclusive; None disables the corresponding filter.
    '''
    if k is not None and k <= 0:
        # The `== k` stop condition below can never be met for such k,
        # which previously made the generator yield every matching file.
        return

    def within(limits, value):
        return limits is None or limits[0] <= value <= limits[1]

    files = list(predictions)
    rnd.shuffle(files)
    found = 0
    for file in files:
        trues, total, mrr = get_file_stats(predictions[file]['identifiers'], n=top_n)
        acc, mrr = (trues/total, mrr/total) if total > 0 else (0, 0)
        if within(bounds, total) and within(acc_bounds, acc) and within(mrr_bounds, mrr):
            yield file
            found += 1
            if found == k:
                break

In [50]:
def get_average_project_stats(trainingType, projectNames, top_n=1, bounds=None):
    '''
        Prints the average stats of every listed project for one training type.

        trainingType -- directory that holds the prediction files; also printed as the heading.
        projectNames -- projects to report; each one is read from
                        <trainingType>/predictions_<project>.json.
        top_n, bounds -- forwarded unchanged to get_average_stats.
    '''
    print(f'{trainingType: ^30}')
    for name in projectNames:
        # Load first so that a missing file fails before the project banner is printed.
        project_predictions = read(f'{trainingType}/predictions_{name}.json')
        print(f'{name:_^30}')
        get_average_stats(project_predictions, top_n, bounds)

# Experiments

Train on a project and self-test on each file in it. Self-testing means that, before inference, the model forgets all tokens it learned from the test file, and only then makes predictions on that file.

In [51]:
# Projects evaluated in the experiments below; each name selects the file
# <trainingType>/predictions_<name>.json.
projectNames = ['elasticsearch-master',
                'cassandra-trunk',
                'xmlgraphics-batik-trunk',
                'ant-master']

In [29]:
# Print pooled and per-file-averaged top-1 accuracy and MRR for each project,
# using the prediction files stored under 'selfTraining/'.
get_average_project_stats('selfTraining', projectNames, top_n=1)

         selfTraining         
_____elasticsearch-master_____
Count of files: 12528
Count of identifiers: 349602
Top-1 accuracy: 0.6119
MRR: 0.6757
Leave-one-out CV
Top-1 accuracy: 0.4911
MRR: 0.5459
_______cassandra-trunk________
Count of files: 2610
Count of identifiers: 81027
Top-1 accuracy: 0.6137
MRR: 0.6863
Leave-one-out CV
Top-1 accuracy: 0.5237
MRR: 0.5829
___xmlgraphics-batik-trunk____
Count of files: 1383
Count of identifiers: 33053
Top-1 accuracy: 0.6182
MRR: 0.6918
Leave-one-out CV
Top-1 accuracy: 0.4954
MRR: 0.5546
__________ant-master__________
Count of files: 1127
Count of identifiers: 23013
Top-1 accuracy: 0.4648
MRR: 0.5390
Leave-one-out CV
Top-1 accuracy: 0.3762
MRR: 0.4336


In [47]:
# Print pooled and per-file-averaged top-1 accuracy and MRR for each project,
# using the prediction files stored under 'selfTesting/'.
get_average_project_stats('selfTesting', projectNames, top_n=1)

         selfTesting          
_____elasticsearch-master_____
Count of files: 12528
Count of identifiers: 349602
Top-1 accuracy: 0.5171
MRR: 0.5844
Leave-one-out CV
Top-1 accuracy: 0.6231
MRR: 0.6839
_______cassandra-trunk________
Count of files: 2610
Count of identifiers: 81027
Top-1 accuracy: 0.4367
MRR: 0.5033
Leave-one-out CV
Top-1 accuracy: 0.5332
MRR: 0.5946
___xmlgraphics-batik-trunk____
Count of files: 1383
Count of identifiers: 33053
Top-1 accuracy: 0.5266
MRR: 0.5806
Leave-one-out CV
Top-1 accuracy: 0.6884
MRR: 0.7334
__________ant-master__________
Count of files: 1127
Count of identifiers: 23013
Top-1 accuracy: 0.3810
MRR: 0.4490
Leave-one-out CV
Top-1 accuracy: 0.5016
MRR: 0.5666


In [91]:
# Print pooled and per-file-averaged top-1 accuracy and MRR for each project,
# using the prediction files stored under 'selfTestingIdentifier/'.
get_average_project_stats('selfTestingIdentifier', projectNames, top_n=1)

    selfTestingIdentifier     
_____elasticsearch-master_____
Count of files: 12528
Count of identifiers: 349602
Top-1 accuracy: 0.6678
MRR: 0.7317
Leave-one-out CV
Top-1 accuracy: 0.6977
MRR: 0.7579
_______cassandra-trunk________
Count of files: 2610
Count of identifiers: 81027
Top-1 accuracy: 0.6289
MRR: 0.6979
Leave-one-out CV
Top-1 accuracy: 0.6511
MRR: 0.7149
___xmlgraphics-batik-trunk____
Count of files: 1383
Count of identifiers: 33053
Top-1 accuracy: 0.7055
MRR: 0.7639
Leave-one-out CV
Top-1 accuracy: 0.7501
MRR: 0.7994
__________ant-master__________
Count of files: 1127
Count of identifiers: 23013
Top-1 accuracy: 0.5450
MRR: 0.6193
Leave-one-out CV
Top-1 accuracy: 0.5907
MRR: 0.6602


In [73]:
# Training type and project whose per-file results are inspected in the following cells.
trainingType = 'selfTestingIdentifier'

# Keep exactly one of the following assignments uncommented.
# projectName = 'elasticsearch-master'
projectName = 'cassandra-trunk'
# projectName = 'ant-master'
# projectName = 'xmlgraphics-batik-trunk'

In [83]:
# Load the predictions of the selected training type and project; there is one key per file.
predictions = read(f'{trainingType}/predictions_{projectName}.json')
files = list(predictions)

Here you can see more detailed information about each file in the project.

In [86]:
# List per-file stats, restricted to files with 200 to 1000 identifiers (inclusive).
get_stats_for_files(predictions, files, bounds=(200, 1000))

File id |N identifiers|Accuracy 1|  MRR  |
--------|-------------|----------|-------|
48      |222          |0.7973    |0.8381 |
111     |240          |0.7792    |0.8412 |
204     |201          |0.7363    |0.7814 |
255     |203          |0.5271    |0.6485 |
495     |314          |0.5382    |0.6199 |
663     |224          |0.3125    |0.3551 |
717     |200          |0.4900    |0.6161 |
897     |282          |0.5816    |0.6714 |
962     |370          |0.5838    |0.6589 |
1042    |219          |0.9269    |0.9568 |


Do you want more?[y]/n 


1117    |329          |0.5653    |0.6525 |
1131    |337          |0.7893    |0.8462 |
1152    |229          |0.6114    |0.6727 |
1180    |298          |0.7114    |0.7626 |
1209    |449          |0.7906    |0.8499 |
1238    |932          |0.5440    |0.6176 |
1355    |292          |0.5788    |0.6657 |
1483    |266          |0.7368    |0.8057 |
1568    |307          |0.4202    |0.5248 |
1585    |202          |0.8762    |0.9089 |


Do you want more?[y]/n 


1589    |309          |0.6990    |0.7657 |
1628    |218          |0.6927    |0.7819 |
1810    |222          |0.7973    |0.8391 |
1893    |290          |0.7103    |0.7749 |
1895    |290          |0.5103    |0.5851 |
1934    |356          |0.6152    |0.7250 |
1976    |467          |0.5824    |0.6565 |
2057    |204          |0.5882    |0.6541 |
2110    |357          |0.6303    |0.7098 |
2137    |452          |0.5597    |0.6503 |


Do you want more?[y]/n 


2336    |256          |0.9766    |0.9779 |
2346    |544          |0.6011    |0.6852 |
2398    |268          |0.7500    |0.8271 |
2410    |273          |0.6300    |0.6908 |
2670    |281          |0.6690    |0.7290 |


In [85]:
# Number of files in the loaded predictions, before any bounds filtering.
len(files)

2738

# Let's make some files with predictions

In [121]:
def pair2tuple(pair):
    '''
        Turns a mapping with the keys 'left' and 'right' into the tuple (left, right).
    '''
    left, right = pair['left'], pair['right']
    return left, right

def pairs2tuples(pairs):
    '''
        Applies pair2tuple to every element of `pairs` and returns the results as a list.
    '''
    return list(map(pair2tuple, pairs))

In [103]:
# Select the experiment whose annotated files are produced and load its predictions.
trainingType = 'selfTestingIdentifier'
projectName = 'ant-master'
predictions = read(trainingType + "/predictions_" + projectName + ".json")
# The directory name has to match the 'predicted_files/' prefix that
# save_file_with_predictions writes to (it used to be 'predicted files/' here).
os.makedirs('predicted_files/' + projectName, exist_ok=True)

In [151]:
def save_file_with_predictions(path, projectName, trainingType, predictions):
    '''
        Writes an annotated copy of a source file in which every line is followed
        by one `//` comment per identifier that starts on that line.

        path -- path to the source file; its absolute form has to be a key of `predictions`.
        projectName -- sub-directory of 'predicted_files/' that receives the copy
                       (created if it does not exist yet).
        trainingType -- label written into the header of the copy.
        predictions -- mapping from an absolute file path to a dict with an 'identifiers' list.
                       Each identifier holds the ground truth ('gt'), its start line
                       ('range' -> 'begin' -> 'line', used as a 1-based line number) and
                       the ranked candidates ('prediction').

        Each annotation shows the ground truth, its 0-based position among the
        candidates ('No' if it is absent) and the candidates themselves.
        The copy is stored under the base name of `path`, so files that share a
        base name within one project overwrite each other.
    '''
    path = os.path.abspath(path)
    identifiers = predictions[path]['identifiers']
    trues, total, mrr = get_file_stats(identifiers)

    # Collect the pieces and join them once instead of repeatedly growing a string.
    chunks = [f'// Type of training: {trainingType}\n',
              f'// Path to file: {path}\n']
    if total == 0:
        chunks.append(f'// Number of identifiers: {total}\n')
    else:
        chunks.append(f'// Number of identifiers: {total}\tAccuracy: {(trues/total*100.):.2f}%\tMRR: {mrr/total*100.:.2f}%\n')
        chunks.append('// True \tRank of true in predictions : [(predicted token, probability of a token), ...]\n\n')

    # Read with the same explicit encoding that `read` uses for the predictions.
    with open(path, 'r', encoding='utf-8') as f:
        file_lines = f.readlines()

    identifiers_per_line = [[] for _ in file_lines]
    for identifier in identifiers:
        line_index = identifier['range']['begin']['line'] - 1
        identifiers_per_line[line_index].append(identifier)

    for line, line_identifiers in zip(file_lines, identifiers_per_line):
        chunks.append(line)
        if line_identifiers and not line.endswith('\n'):
            # Last line without a trailing newline: keep the annotations on their own lines.
            chunks.append('\n')
        for identifier in line_identifiers:
            prediction_with_prob = pairs2tuples(identifier['prediction'])
            # A comprehension also copes with an empty candidate list.
            prediction = [token for token, _ in prediction_with_prob]
            gt = identifier['gt']
            gt_i = prediction.index(gt) if gt in prediction else 'No'
            chunks.append(f'// {gt: <20} {gt_i}\t: {prediction_with_prob}\n')

    out_dir = 'predicted_files/' + projectName
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, os.path.basename(path)), 'w', encoding='utf-8') as f:
        f.write(''.join(chunks))

In [152]:
def make_random_predictions(projectName, trainingType='selfTestingIdentifier', n=10, **kargs):
    '''
        Saves annotated copies of up to n randomly chosen files of a project and prints their stats.

        projectName -- project to read from <trainingType>/predictions_<projectName>.json;
                       also the sub-directory of 'predicted_files/' that receives the copies.
        trainingType -- directory that holds the prediction file.
        n -- maximal number of files to process.
        kargs -- forwarded to get_random_files_with_stats (e.g. bounds, acc_bounds, mrr_bounds).

        The printed accuracy and MRR are computed with the default settings of get_file_stats.
    '''
    print(f'File name                                    |N identifiers|Accuracy  |  MRR  |')
    print('---------------------------------------------|-------------|----------|-------|')
    predictions = read(trainingType + "/predictions_" + projectName + ".json")
    os.makedirs('predicted_files/' + projectName, exist_ok=True)
    # The files are drawn by get_random_files_with_stats; a separate rnd.sample call here
    # was unused and failed for projects with fewer than n files.
    for file in get_random_files_with_stats(predictions, n, **kargs):
        trues, total, mrr = get_file_stats(predictions[file]['identifiers'])
        save_file_with_predictions(file, projectName, trainingType, predictions)
        if total == 0:
            print(f'{os.path.basename(file): <45}|0            |-         |-      |')
        else:
            print(f'{os.path.basename(file): <45}|{total: <13}|{(trues/total): <10.4f}|{mrr/total: <7.4f}|')

In [153]:
# Write annotated copies of up to 10 random files (10 to 1000 identifiers each) per project.
# NOTE: the loop rebinds the notebook-level `projectName` variable.
for projectName in projectNames:
    print(f'{projectName: ^80}')
    make_random_predictions(projectName, bounds=(10, 1000))

                              elasticsearch-master                              
File name                                    |N identifiers|Accuracy  |  MRR  |
---------------------------------------------|-------------|----------|-------|
TransportGetAutoFollowPatternAction.java     |17           |0.8235    |0.8603 |
ActionListener.java                          |59           |0.5593    |0.6416 |
SizeMappingIT.java                           |16           |0.5625    |0.6458 |
SimpleChecksAdapter.java                     |14           |0.5714    |0.5893 |
AttachmentProcessorTests.java                |54           |0.7037    |0.7575 |
TransportAddVotingConfigExclusionsAction.java|37           |0.6757    |0.7032 |
MissingAggregator.java                       |13           |0.7692    |0.8654 |
DataCounts.java                              |17           |0.8824    |0.9314 |
ValidateMappingRequestPluginIT.java          |16           |0.9375    |0.9688 |
SingleOrdinalsTests.java               