# Read Data

In [1]:
import ast
import json
import os
from collections import OrderedDict, defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
# Directory that holds the Devign dump and the train/valid/test ID lists.
ROOT = Path('devign')
ROOT.mkdir(exist_ok=True)

# Parse inside a context manager so the file handle is closed as soon as the
# JSON has been read; OrderedDict keeps each object's keys in file order.
with open(ROOT/'Devign.json') as devign_file:
    data = json.load(devign_file, object_pairs_hook=OrderedDict)

In [2]:
def _read_ids(path):
    """Return the integer IDs stored one per line in the text file at `path`.

    The file is closed before returning. Whitespace-only lines (for example a
    trailing empty line) are skipped instead of raising ValueError in int().
    """
    with open(path) as id_file:
        return [int(line) for line in id_file if line.strip()]


# Sample IDs of each split, in file order. The prediction loaders match
# line i of a results file to testIDs[i], so the order must be preserved.
testIDs = _read_ids(os.path.join(ROOT, "test.txt"))
trainIDs = _read_ids(os.path.join(ROOT, "train.txt"))
validIDs = _read_ids(os.path.join(ROOT, "valid.txt"))
len(testIDs),min(testIDs), min(trainIDs), min(validIDs)

(2732, 3, 0, 8)

In [3]:
# One whitespace-split token list per line of the transformation log.
# The transformed-prediction loaders read column 2 of row i as the ID of the
# original test sample that transformed sample i was derived from.
with open('TransformationDetails-TEST.txt', 'r') as f:
    details = [row.split() for row in f]
len(details)

17199

# Read predictions

In [4]:
import torch
def getPercentages(l):
    """Apply softmax along dim 0 to a sequence of scores.

    Args:
        l: Sequence of numbers (anything `torch.tensor` accepts), e.g. the
            pair of class scores parsed from a results file.

    Returns:
        A torch tensor with the softmax of `l` along dim 0. Despite the
        function name, the entries are probabilities that sum to 1 along
        that dim, not percentages.
    """
    scores = torch.tensor(l)
    # A list made only of Python ints (e.g. [0, 1]) becomes an integer tensor,
    # for which softmax is not implemented; promote it to the default float
    # dtype. Floating-point inputs keep their dtype unchanged.
    if not scores.is_floating_point():
        scores = scores.to(torch.get_default_dtype())
    return torch.nn.functional.softmax(scores, dim=0)

In [5]:
# VulBERTa results on the untransformed test set, keyed by test-sample ID.
origLabel_VULBERTA = dict()
origPrediction_VULBERTA = dict()
origProbability_VULBERTA = dict()

# Read through a context manager so the results file is closed afterwards.
with open("Results/vulberta-TEST.txt") as results_file:
    result_lines = results_file.readlines()

# Line i of the results file is matched to testIDs[i].
for i, line in enumerate(result_lines):
    content = line.split()
    sample_id = testIDs[i]
    # Column 0 is stored as the prediction, column 1 as the label; both stay
    # strings here and are converted with int() where they are used.
    origLabel_VULBERTA[sample_id] = content[1]
    origPrediction_VULBERTA[sample_id] = content[0]

    # Columns 2 and 3 are joined back into one list literal. It is parsed with
    # ast.literal_eval instead of eval on purpose: the text comes from a file
    # on disk and only a plain literal is expected, so nothing in it should be
    # executed as code.
    probabilities = ast.literal_eval(content[2]+content[3])
    # Keep the softmax entry at index 1 (a 0-dim tensor).
    origProbability_VULBERTA[sample_id] = getPercentages(probabilities)[1]

In [6]:
# PLBART results on the untransformed test set, keyed by test-sample ID.
origLabel_PLBART = dict()
origPrediction_PLBART = dict()
origProbability_PLBART = dict()

# Read through a context manager so the results file is closed afterwards.
with open("Results/plbart-TEST.txt") as results_file:
    result_lines = results_file.readlines()

# Line i of the results file is matched to testIDs[i].
for i, line in enumerate(result_lines):
    content = line.split()
    sample_id = testIDs[i]
    # Column 0 is stored as the prediction, column 1 as the label; both stay
    # strings here and are converted with int() where they are used.
    origLabel_PLBART[sample_id] = content[1]
    origPrediction_PLBART[sample_id] = content[0]

    # Drop the first 7 characters of column 2 and the last character of
    # column 3 -- presumably a "tensor(" ... ")" wrapper around the list;
    # TODO confirm against the file format.
    scores_text = content[2][7:]+content[3][:-1]
    # Parsed with ast.literal_eval instead of eval on purpose: the text comes
    # from a file on disk and only a plain literal is expected, so nothing in
    # it should be executed as code.
    probabilities = ast.literal_eval(scores_text)
    # Keep the softmax entry at index 1 (a 0-dim tensor).
    origProbability_PLBART[sample_id] = getPercentages(probabilities)[1]

# Read predictions on transformations

In [7]:
# VulBERTa results on the transformed test samples, grouped by the ID of the
# original sample each transformation belongs to.
ensemblePredictions_VULBERTA = defaultdict(list)
ensembleProbability_VULBERTA = defaultdict(list)

# Read through a context manager so the results file is closed afterwards.
with open("Results/vulberta-transformed-TEST.txt") as results_file:
    result_lines = results_file.readlines()

# Line i of the results file is matched to row i of `details`.
for i, line in enumerate(result_lines):
    content = line.split()
    # Column 2 of the details row is used as the original sample ID.
    getID = int(details[i][2])
    ensemblePredictions_VULBERTA[getID].append(int(content[0]))

    # Columns 2 and 3 are joined back into one list literal. It is parsed with
    # ast.literal_eval instead of eval on purpose: the text comes from a file
    # on disk and only a plain literal is expected, so nothing in it should be
    # executed as code.
    probabilities = ast.literal_eval(content[2]+content[3])
    # Keep the softmax entry at index 1 (a 0-dim tensor).
    ensembleProbability_VULBERTA[getID].append(getPercentages(probabilities)[1])

In [8]:
# PLBART results on the transformed test samples, grouped by the ID of the
# original sample each transformation belongs to.
ensemblePredictions_PLBART = defaultdict(list)
ensembleProbability_PLBART = defaultdict(list)

# Read through a context manager so the results file is closed afterwards.
with open("Results/plbart-transformed-TEST.txt") as results_file:
    f_PLBART = results_file.readlines()

# Line i of the results file is matched to row i of `details`.
for i, line in enumerate(f_PLBART):
    # Stop at the first line starting with "Accuracy" (presumably a summary
    # line that follows the per-sample rows).
    if line.startswith("Accuracy"):
        break
    content = line.split()
    # Column 2 of the details row is used as the original sample ID.
    getID = int(details[i][2])
    ensemblePredictions_PLBART[getID].append(int(content[0]))

    # Drop the first 7 characters of column 2 and the last character of
    # column 3 -- presumably a "tensor(" ... ")" wrapper around the list;
    # TODO confirm against the file format.
    scores_text = content[2][7:]+content[3][:-1]
    # Parsed with ast.literal_eval instead of eval on purpose: the text comes
    # from a file on disk and only a plain literal is expected, so nothing in
    # it should be executed as code.
    probabilities = ast.literal_eval(scores_text)
    # Keep the softmax entry at index 1 (a 0-dim tensor).
    ensembleProbability_PLBART[getID].append(getPercentages(probabilities)[1])

## Majority VULBERTA

### Majority, Ties 0

In [9]:
# Hard vote for VulBERTa: the prediction on the original sample plus the
# predictions on its transformations each count as one vote. The ensemble
# answers 1 only when strictly more than half of the votes are 1, so an exact
# tie resolves to 0.
results = []
for sample_id, votes in ensemblePredictions_VULBERTA.items():
    base_vote = int(origPrediction_VULBERTA[sample_id])
    vote_share = (base_vote + sum(votes)) / (1.0 + len(votes))
    decision = 1 if vote_share > 0.5 else 0
    results.append(int(origLabel_VULBERTA[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1405, 0.5142752562225475)

### Majority, Ties 1

In [10]:
# Hard vote for VulBERTa, same votes as the ties-0 variant, but the ensemble
# answers 1 when at least half of the votes are 1, so an exact tie resolves
# to 1.
results = []
for sample_id, votes in ensemblePredictions_VULBERTA.items():
    base_vote = int(origPrediction_VULBERTA[sample_id])
    vote_share = (base_vote + sum(votes)) / (1.0 + len(votes))
    if vote_share >= 0.5:
        decision = 1
    else:
        decision = 0
    truth = int(origLabel_VULBERTA[sample_id])
    results.append(truth == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1404, 0.513909224011713)

### Average

In [13]:
# Soft vote for VulBERTa: average the index-1 softmax score of the original
# sample with the scores of its transformations, then threshold at 0.5.
results = []
for sample_id, transformed_scores in ensembleProbability_VULBERTA.items():
    base_score = origProbability_VULBERTA[sample_id]
    mean_score = (base_score + sum(transformed_scores)) / (1.0 + len(transformed_scores))
    if mean_score == 0.5:
        # Marker printed when the average lands exactly on the threshold.
        print ("asflkghsaj")
    decision = 1 if mean_score > 0.5 else 0
    results.append(int(origLabel_VULBERTA[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1434, 0.5248901903367497)

## Majority PLBART


In [11]:
# Hard vote for PLBART: the prediction on the original sample plus the
# predictions on its transformations each count as one vote. The ensemble
# answers 1 only when strictly more than half of the votes are 1, so an exact
# tie resolves to 0.
results = []
for sample_id, votes in ensemblePredictions_PLBART.items():
    base_vote = int(origPrediction_PLBART[sample_id])
    vote_share = (base_vote + sum(votes)) / (1.0 + len(votes))
    decision = 1 if vote_share > 0.5 else 0
    results.append(int(origLabel_PLBART[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1315, 0.4813323572474378)

In [12]:
# Soft vote for PLBART: average the index-1 softmax score of the original
# sample with the scores of its transformations, then threshold at 0.5.
results = []
for sample_id, transformed_scores in ensembleProbability_PLBART.items():
    base_score = origProbability_PLBART[sample_id]
    mean_score = (base_score + sum(transformed_scores)) / (1.0 + len(transformed_scores))
    if mean_score == 0.5:
        # Marker printed when the average lands exactly on the threshold.
        print ("asflkghsaj")
    decision = 1 if mean_score > 0.5 else 0
    results.append(int(origLabel_PLBART[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1389, 0.5084187408491947)

# Combine Models

In [25]:
# Two-model hard vote on the original samples only: average the VulBERTa and
# PLBART predictions and answer 1 when the mean is strictly above 0.5.
# Only the keys of ensemblePredictions_VULBERTA are used here; they fix which
# sample IDs are scored and in what order.
results = []
for sample_id in ensemblePredictions_VULBERTA:
    vulberta_vote = int(origPrediction_VULBERTA[sample_id])
    plbart_vote = int(origPrediction_PLBART[sample_id])
    mean_vote = (vulberta_vote + plbart_vote) / 2.0
    decision = 1 if mean_vote > 0.5 else 0
    results.append(int(origLabel_VULBERTA[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1608, 0.5885797950219619)

In [20]:
# Two-model soft vote on the original samples only: average the index-1
# softmax scores of VulBERTa and PLBART and threshold the mean at 0.5.
# Only the keys of ensemblePredictions_VULBERTA are used here; they fix which
# sample IDs are scored and in what order.
results = []
for sample_id in ensemblePredictions_VULBERTA:
    vulberta_score = origProbability_VULBERTA[sample_id]
    plbart_score = origProbability_PLBART[sample_id]
    mean_score = (vulberta_score + plbart_score) / 2.0
    decision = 1 if mean_score > 0.5 else 0
    results.append(int(origLabel_VULBERTA[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1708, 0.6251830161054173)

# Combine Models and Transformations


In [22]:
# Hard vote over both models and all transformations: every prediction
# (original and transformed, VulBERTa and PLBART) counts as one vote, and the
# ensemble answers 1 when strictly more than half of the votes are 1.
results = []
for sample_id, vulberta_votes in ensemblePredictions_VULBERTA.items():
    vulberta_base = int(origPrediction_VULBERTA[sample_id])
    plbart_base = int(origPrediction_PLBART[sample_id])
    plbart_votes = ensemblePredictions_PLBART[sample_id]

    ones = vulberta_base + plbart_base + sum(vulberta_votes) + sum(plbart_votes)
    total = 2.0 + len(vulberta_votes) + len(plbart_votes)
    decision = 1 if ones / total > 0.5 else 0
    results.append(int(origLabel_VULBERTA[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1424, 0.5212298682284041)

In [23]:
# Soft vote over both models and all transformations: average every index-1
# softmax score (original and transformed, VulBERTa and PLBART) and threshold
# the mean at 0.5.
results = []
for sample_id, vulberta_scores in ensembleProbability_VULBERTA.items():
    vulberta_base = origProbability_VULBERTA[sample_id]
    plbart_base = origProbability_PLBART[sample_id]
    plbart_scores = ensembleProbability_PLBART[sample_id]

    score_sum = vulberta_base + plbart_base + sum(vulberta_scores) + sum(plbart_scores)
    score_count = 2.0 + len(vulberta_scores) + len(plbart_scores)
    mean_score = score_sum / score_count
    if mean_score == 0.5:
        # Marker printed when the average lands exactly on the threshold.
        print("asdfj")
    decision = 1 if mean_score > 0.5 else 0
    results.append(int(origLabel_VULBERTA[sample_id]) == decision)
# (number of samples, number correct, accuracy)
len(results), sum(results), sum(results) / len(results)

(2732, 1424, 0.5212298682284041)