# CodeBert Grid Experiment Evaluation

Nice to see you around! Have a seat.
Would you like a drink? Maybe a cigar?

Make sure to have all required dependencies installed - they are listed in the [environment.yml](./environment.yml). 
You create a conda environment from the yml using 

```
conda env create -f environment.yml
conda activate Lampion-Codebert-Evaluation
```

Make sure to run your Jupyter Notebook from that environment! 
Otherwise you are (still) missing the dependencies. 

**OPTIONALLY** you can update the environment in which your Jupyter notebook is already running: start a new terminal (from Jupyter) and run 

```
conda env update --prefix ./env --file environment.yml  --prune
```

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import nltk
nltk.download("punkt")
# Homebrew Imports (python-file next to this)
import bleu_evaluator as foreign_bleu

# Set Jupyter vars
# %matplotlib notebook
%matplotlib inline

## Data-Loading / Preparation

Make sure that your dataset is structured as described in the [Readme](./README.md), that is 

```
./data
    /GridExp_XY
        /configs
            /reference
                test_0.gold
                test_0.output
                bleu.txt (optional, can be created below)
            /config_0
                config.properties
                test_0.gold
                test_0.output
                bleu.txt (optional, can be created below)
            /config_1
                config.properties
                test_0.gold
                test_0.output
                bleu.txt (optional, can be created below)
    ...
```

where the configs **must** be numbered to be correctly detected. 

In [None]:
# This runs the BLEU score on the config files, creating the bleu.txt files.
# If your data package already includes the bleu.txt files, you don't need to do this.
# Existing bleu.txt files will be overwritten.

#!./metric_runner.sh ./data/PreliminaryResults/

In [None]:
data_directory = "./data/PreliminaryResults"

# Archetypes are used later to group the configurations.
# The grouping is up to you; here there is one archetype per transformation type,
# bundling configs that apply the same transformations (but a different #Transformations).
config_archetypes = {
    "config_0": "if-true",
    "config_1": "if-true",
    "config_2": "if-true",
    "config_3": "mixed names(pseudo)",
    "config_4": "mixed names(pseudo)",
    "config_5": "mixed names(pseudo)",
    "config_6": "add-neutral",
    "config_7": "add-neutral",
    "config_8": "add-neutral",
    "config_9": "mixed-names(random)",
    "config_10": "mixed-names(random)",
    "config_11": "mixed-names(random)",
    "config_12": "add-var(pseudo)",
    "config_13": "add-var(pseudo)",
    "config_14": "add-var(pseudo)",
    "config_15": "add-var(random)",
    "config_16": "add-var(random)",
    "config_17": "add-var(random)",
    "config_18": "if-true & add-neutral",
    "config_19": "if-true & add-neutral",
    "config_20": "if-true & add-neutral",
}

print(f"looking for results in {data_directory}" )

results = {}

# Every directory that contains a *.gold file is registered as one experiment run,
# keyed by the directory's base name.
for run_dir, _subdirs, filenames in os.walk(data_directory):
    for filename in filenames:
        if ".gold" not in filename:
            continue
        run_entry = {
            "result_file": os.path.join(run_dir, "test_0.output"),
            "gold_file": os.path.join(run_dir, "test_0.gold"),
            "bleu_file": os.path.join(run_dir, "bleu.txt"),
        }
        # The property file is optional; it is only recorded when it exists on disk.
        properties_path = os.path.join(run_dir, "config.properties")
        if os.path.exists(properties_path):
            run_entry["property_file"] = properties_path
        results[os.path.basename(run_dir)] = run_entry

In [None]:
def load_properties(filepath, sep='=', comment_char='#'):
    """
    Parse a Java-style properties file into a dict of strings.

    Blank lines and lines starting with `comment_char` are skipped. Each
    remaining line is split at the first `sep`: the left part (stripped) is
    the key, the rest (stripped of whitespace and surrounding double quotes)
    is the value. A line without `sep` yields an empty-string value.
    """
    parsed = {}
    with open(filepath, "rt") as handle:
        for raw_line in handle:
            stripped = raw_line.strip()
            if not stripped or stripped.startswith(comment_char):
                continue
            name, _, remainder = stripped.partition(sep)
            parsed[name.strip()] = remainder.strip().strip('"')
    return parsed

print("reading in property-files")

# Parse the config.properties of every run that has one and keep the parsed
# dict next to the file path.
for run_name, run_entry in results.items():
    if "property_file" in run_entry:
        run_entry["properties"] = load_properties(run_entry["property_file"])

print("done reading the properties")

In [None]:
print("reading in result-files")


def _read_indexed_lines(path):
    """
    Read a tab-separated `<index>\t<text>` file into a dict {int index: stripped text}.

    Only the first two tab-separated fields of each line are used.
    Blank lines (e.g. a trailing newline at the end of the file) are skipped.
    """
    entries = {}
    # `with` guarantees the handle is closed even if a line fails to parse
    with open(path) as handle:
        for line in handle:
            if not line.strip():
                continue
            fields = line.split("\t")
            entries[int(fields[0])] = fields[1].strip()
    return entries


for key in results.keys():
    # model predictions and gold (expected) texts share the same file format
    results[key]["results"] = _read_indexed_lines(results[key]["result_file"])
    results[key]["gold_results"] = _read_indexed_lines(results[key]["gold_file"])

print("done reading the result files")
# Uncomment this for inspection of results
#results

In [None]:
print("reading in the bleu-scores")

for key in results.keys():
    # The score is expected on the first line of bleu.txt; `with` closes the
    # file even when the content cannot be parsed as a float.
    with open(results[key]["bleu_file"]) as f:
        results[key]["bleu"] = float(f.readline())

print("done reading the bleu-scores")

#results["config_0"]["bleu"]

In [None]:
# Tag every run that has a property file with its archetype.
for run_name, run_entry in results.items():
    if "property_file" in run_entry:
        run_entry["archetype"] = config_archetypes[run_name]

non_reference_configs = [run_name for run_name in results if run_name != "reference"]


def archetype_info(config):
    """Return the tuple (archetype name, number of transformations as int) for a config."""
    return (
        config_archetypes[config],
        int(results[config]["properties"]["transformations"]),
    )


def print_archetype_info(config):
    """Format a config's archetype info as '<archetype>@<transformations>'."""
    archetype, transforms = archetype_info(config)
    return f"{archetype}@{transforms}"


all_archetypes = set(config_archetypes.values())

In [None]:
def jaccard_wrapper(sentenceA,sentenceB,ngram=1):
    """
    Jaccard distance between the n-gram sets of two sentences.

    Both sentences are tokenized with nltk.word_tokenize and turned into sets
    of `ngram`-grams. Returns 0.0 when neither sentence yields any n-gram
    (e.g. two empty strings): the sets are identical, and
    nltk.jaccard_distance would otherwise divide by the size of an empty union.
    """
    ngrams_a = set(nltk.ngrams(nltk.word_tokenize(sentenceA), n=ngram))
    ngrams_b = set(nltk.ngrams(nltk.word_tokenize(sentenceB), n=ngram))

    if not ngrams_a and not ngrams_b:
        return 0.0

    return nltk.jaccard_distance(ngrams_a, ngrams_b)

def bleu_wrapper(sentence_to_check,reference):
    """
    Sentence-level BLEU of `sentence_to_check` against a single `reference`.

    Both strings are tokenized with nltk.word_tokenize. Returns 0 when nltk
    fails to compute a score for the pair.
    """
    check_tokens = nltk.word_tokenize(sentence_to_check)
    ref_tokens = nltk.word_tokenize(reference)

    # From comparing the foreign_bleu and nltk, method4 seems to match.
    # The paper mentions the BLEU-4 score with a citation to Chen & Cherry
    # (I wish I could be named chen & cherry, its a very cool name).
    smooth_fn = nltk.translate.bleu_score.SmoothingFunction().method4

    try:
        return nltk.translate.bleu_score.sentence_bleu([ref_tokens],check_tokens,smoothing_function=smooth_fn)
    except Exception:
        # Best effort: a pair that cannot be scored counts as 0. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit working.
        return 0

# Smoke test: sentence BLEU of one config_2 prediction against its own gold text.
sample_index=125
bleu_wrapper(results["config_2"]["results"][sample_index],results["config_2"]["gold_results"][sample_index])

## Bleu-Scores

In the following, the BLEU-scores will be calculated using the foreign library. 
While there have been minor changes to standard-BLEU, it is the same as used in the original experiment.

The aggregated BLEU-Scores will be stored to the results.

In [None]:
# Build {archetype: {#transformations: BLEU}}; 0 transformations is the reference run.
bleu_data = {}
archetypes = {entry["archetype"] for entry in results.values() if "archetype" in entry}
for archetype in archetypes:
    bleu_data[archetype] = {0: results["reference"]["bleu"]}
    for entry in results.values():
        if entry.get("archetype") == archetype:
            bleu_data[archetype][int(entry["properties"]["transformations"])] = entry["bleu"]

# One column per archetype, rows ordered by number of transformations.
# DataFrame.round replaces the deprecated element-wise applymap(round).
bleu_data_df = pd.DataFrame.from_dict(bleu_data).sort_index().round(3)

# Make sure the export directory exists so a fresh checkout can run this cell.
os.makedirs("./exports", exist_ok=True)
with open("./exports/bleu_table.tex","w") as f:
    # I maybe want to consider column_format="{rrrlll}" etc.
    f.write(
        bleu_data_df.to_latex(
            caption="BLEU4-Scores for increasing number of metamorphic transformations \n (applied n-times per datapoint)"
            ,label="tab:bleu_scores"
            ,position="h"
        )
    )

bleu_data_df

In [None]:
# Line plot of the aggregated BLEU table: one line per archetype (column),
# x-axis is the number of transformations (index of bleu_data_df).
plt.figure(figsize=(14,7))
plt.ylabel("BLEU-Score")
plt.xlabel("# Transformations")

# For LaTeX it is nicer to set the title from LaTeX itself, so it stays disabled here.
#plt.title("BLEU4-Scores for increasing number of metamorphic transformations \n (applied n-times per datapoint)")

plot = sns.lineplot(data=bleu_data_df,markers=True,style=None,dashes=False)
# Fixed ticks at 0, 1, 5 and 10 transformations.
plt.xticks([0,1,5,10])
plt.xlim(-0.025,10.1)
plt.legend(bleu_data_df.columns)
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('images/bleu_scores.png')
plt.show()

In [None]:
def calculate_bleus(config_id):
    """
    Sentence-level BLEU for every prediction of `config_id` against the gold
    text stored under the same index; returns the scores as a list in the
    iteration order of the config's results.
    """
    predictions = results[config_id]["results"]
    golds = results[config_id]["gold_results"]
    return [bleu_wrapper(predictions[i], golds[i]) for i in predictions.keys()]


bleus_reference_data = calculate_bleus("reference")
sample_bleus_config_data = calculate_bleus("config_20")

In [None]:
def plot_bleu_histogram(config_data,reference_data,title):
    """
    Plot overlaid histograms (with KDE) of the BLEU scores of the reference
    run and of one config, save it to images/<title>_bleu_histogram.png and
    show it.

    config_data / reference_data: sequences of BLEU scores.
    title: label of the config column; also used in the plot title and file name.
    """
    histo_df=pd.DataFrame.from_dict(
        {"reference":reference_data,
            title:config_data }
    )

    # sns.displot is a figure-level function and creates its own figure, so no
    # plt.figure() is opened beforehand (it would only leave a stray empty figure).
    sns.displot(
        data=histo_df,
        kind="hist", kde=True,
        height=6, aspect=10/6
               )
    plt.title(f"Histogram of Bleu-Scores for {title}")
    plt.xlabel("Bleu-Score")
    #plt.ylabel("# of Entries")
    plt.xlim(0,1)
    plt.savefig(f'images/{title}_bleu_histogram.png')
    plt.show()
    
def plot_bleu_boxplot(config_data,reference_data,title=None):
    """
    Draw side-by-side boxplots of the BLEU scores of the reference run and of
    one config, save them to images/<title>_bleu_box.png and show them.

    config_data / reference_data: sequences of BLEU scores.
    title: label of the config column; also used in the plot title and file name.
    """
    fig = plt.figure(figsize=(6,4))
    ax = fig.add_subplot(1, 1, 1)
    box_df=pd.DataFrame.from_dict(
        {"reference":reference_data,
            title:config_data }
    )
    sns.boxplot(data=box_df, ax=ax)

    plt.title(f"Boxplot of Bleu-Scores for {title}")
    plt.ylabel("Bleu-Score")

    major_ticks = np.arange(0, 1, 0.2)
    minor_ticks = np.arange(0, 1, 0.05)

    ax.set_yticks(major_ticks)
    ax.set_yticks(minor_ticks, minor=True)

    # And a corresponding grid
    ax.grid(which='both')

    # The y-range must be fixed BEFORE saving, otherwise the exported image
    # differs from the one displayed in the notebook.
    plt.ylim(0,1)
    plt.savefig(f'images/{title}_bleu_box.png')

    plt.show()
    
def plot_bleu_violinplot(config_data,reference_data,title):
    """
    Draw violin plots of the BLEU scores of the reference run and of one
    config next to each other, save them to images/<title>_bleu_violin.png
    and show them.
    """
    plt.figure(figsize=(6,4))
    scores = pd.DataFrame.from_dict({"reference": reference_data, title: config_data})

    sns.violinplot(data=scores)

    plt.grid()
    plt.ylabel("Bleu-Score")
    plt.title(f"ViolinPlot of Bleu-Scores for {title}")

    target_file = f'images/{title}_bleu_violin.png'
    plt.savefig(target_file)
    plt.show()
    
# Quick visual check on the config_20 sample; only the boxplot is enabled,
# the other two plot types can be switched on by uncommenting them.
#plot_bleu_violinplot(sample_bleus_config_data,bleus_reference_data,"config_20")
plot_bleu_boxplot(sample_bleus_config_data,bleus_reference_data,"config_20")
#plot_bleu_histogram(sample_bleus_config_data,bleus_reference_data,"config_20")

In [None]:
%%time
# Per-sentence BLEU scores of the reference run, stored and reused as the
# comparison baseline in every plot below.
bleus_reference_data = calculate_bleus("reference")
results["reference"]["bleu_values"] = bleus_reference_data

# For every altered config: compute and store its per-sentence BLEU scores,
# then draw all three comparison plots against the reference.
for config in non_reference_configs:
    bleus_data = calculate_bleus(config)
    results[config]["bleu_values"] = bleus_data
    for plot_fn in (plot_bleu_violinplot, plot_bleu_boxplot, plot_bleu_histogram):
        plot_fn(bleus_data, bleus_reference_data, config)
    del bleus_data

## Samples

Before the samples can be inspected, the items need to be re-indexed. 
While all config_results are in the reference_results, the data might have been shuffled, so the indices do not necessarily match. 

To fix this, a reindexing is done.

In [None]:
%%time
#Reindexing Pseudocode

def lookup_index(sentence, comparison_dict):
    """
    Return the key of the first entry in `comparison_dict` whose value equals
    `sentence`, or -1 when no entry matches.
    """
    matches = (key for key, value in comparison_dict.items() if sentence == value)
    return next(matches, -1)

# For each config (that is not the reference):
#   - map every config gold sentence to the key it has in the reference gold results
#   - rebuild the config's gold results and predictions under the reference keys
#   - entries whose gold sentence does not occur in the reference are dropped

# Inverted lookup gold sentence -> reference key, built once instead of scanning
# the whole reference dict for every sentence. setdefault keeps the FIRST key of
# a duplicated sentence, which is what a front-to-back scan would return.
reference_gold_index = {}
for ref_key, ref_sentence in results["reference"]["gold_results"].items():
    reference_gold_index.setdefault(ref_sentence, ref_key)

for config in non_reference_configs:
    # config key -> reference key (-1 when the sentence is unknown to the reference)
    keyMapping = {
        config_key: reference_gold_index.get(gold_sentence, -1)
        for (config_key, gold_sentence) in results[config]["gold_results"].items()
    }
    new_gold_results={}
    new_results={}
    for (config_key,gold_key) in keyMapping.items():
        if gold_key != -1:
            new_gold_results[gold_key]=results[config]["gold_results"][config_key]
            new_results[gold_key]=results[config]["results"][config_key]
    results[config]["gold_results"]=new_gold_results
    results[config]["results"]=new_results
    #print(config,keyMapping)

In [None]:
# Spot-check one datapoint after reindexing: the gold text first, then the
# prediction of the reference run and the prediction of config_2 for the same index.
sample_index = 250
print(results["reference"]["gold_results"][sample_index] )
#print(results["config_2"]["gold_results"][sample_index])
print()
print(results["reference"]["results"][sample_index])
print(results["config_2"]["results"][sample_index])

**Hall of Shame** 

worst entries in terms of t-score (either one)

In [None]:
%%time
# Extremes over all (index, config) pairs, each tracked as value + position.
# NOTE on naming: `biggest_len_inc` tracks len(reference) - len(altered), i.e. how
# much SHORTER the altered output got; `biggest_len_dec` tracks the opposite.
biggest_len_inc, biggest_len_inc_pos = 0, ()
biggest_len_dec, biggest_len_dec_pos = 0, ()

# Jaccard extremes deliberately exclude the trivial values 1 and 0.
biggest_jaccard_dist, biggest_jaccard_dist_pos = 0, ()
smallest_jaccard_dist, smallest_jaccard_dist_pos = 1, ()

for config in non_reference_configs:
    for index in list(results[config]["results"].keys()):
        gold = results["reference"]["gold_results"][index]
        reference = results["reference"]["results"][index]
        altered = results[config]["results"][index]

        shrinkage = len(reference) - len(altered)
        if shrinkage > biggest_len_inc:
            biggest_len_inc = shrinkage
            biggest_len_inc_pos = (index,config)
        if -shrinkage > biggest_len_dec:
            biggest_len_dec = -shrinkage
            biggest_len_dec_pos = (index,config)

        jacc_dist = jaccard_wrapper(altered,reference)
        if biggest_jaccard_dist < jacc_dist < 1:
            biggest_jaccard_dist = jacc_dist
            biggest_jaccard_dist_pos = (index,config)
        if 0 < jacc_dist < smallest_jaccard_dist:
            smallest_jaccard_dist = jacc_dist
            smallest_jaccard_dist_pos = (index,config)

In [None]:
def print_config_item_with_reference(index,config):
    """
    Print the gold text of `config` at `index`, followed by the reference
    run's prediction and the config's prediction for the same index, each
    under its own heading.
    """
    sections = (
        ("Gold:", config, "gold_results"),
        ("Reference:", "reference", "results"),
        ("Altered:", config, "results"),
    )
    for heading, run_name, field in sections:
        print(heading)
        print(results[run_name][field][index])

In [None]:
# biggest_jaccard_dist_pos is an (index, config) tuple; it is still the empty
# tuple if no distance strictly between 0 and 1 was found.
print("Biggest jaccard Distance (that is not 1):\n")
print_config_item_with_reference(biggest_jaccard_dist_pos[0],biggest_jaccard_dist_pos[1])

In [None]:
# Despite its name, biggest_len_inc_pos marks the largest len(reference) - len(altered),
# i.e. the datapoint where the altered output got shorter the most.
print("Biggest decrease in length:\n")
print_config_item_with_reference(biggest_len_inc_pos[0],biggest_len_inc_pos[1])

In [None]:
# Despite its name, biggest_len_dec_pos marks the largest len(altered) - len(reference),
# i.e. the datapoint where the altered output got longer the most.
print("Biggest increase in length:\n")
print_config_item_with_reference(biggest_len_dec_pos[0],biggest_len_dec_pos[1])

In [None]:
# smallest_jaccard_dist_pos is an (index, config) tuple; it is still the empty
# tuple if no distance strictly between 0 and 1 was found.
print("Smallest Jaccard Distance (that is not 0):\n")
print_config_item_with_reference(smallest_jaccard_dist_pos[0],smallest_jaccard_dist_pos[1])

Fishy example from a kids' Java-learning book. 
The code is actually about learning switch-case statements and sets an image matching the number of fish (e.g. empty fish glass, fish glass with 2 fish etc.)

In [None]:
# Locate the gold entry of the fishy method; if several entries match, the
# last one wins. fishyKey stays -1 when nothing matches.
fishyKey = -1
for (candidate_key, gold_text) in results["reference"]["gold_results"].items():
    if "makeAFishyDecision " in gold_text:
        fishyKey = candidate_key

reference_run = results["reference"]

print("Fishy Results! \n")
print("Gold:")
print(reference_run["gold_results"][fishyKey])
print("Reference:")
print(reference_run["results"][fishyKey],"\n")
#for config in non_reference_configs:
for config in ["config_0","config_1","config_20","config_10"]:
    print(f"Altered({config},{print_archetype_info(config)}):")
    print(results[config]["results"][fishyKey])

In [None]:
def calculate_jaccard_distances(xs,ys,ngrams=1):
    """
    Pairwise Jaccard distances between the entries of `xs` and `ys`.

    Entries are looked up as xs[i] / ys[i] for i in 0..len(xs)-1, so both
    containers must be aligned: without matching indices the result does not
    make any sense.

    ngrams: n-gram size handed to jaccard_wrapper.
    Returns the list of distances in index order.
    Raises IndexError when the two containers differ in length.
    """
    if len(xs) != len(ys):
        raise IndexError(
            f"cannot compare containers of different length ({len(xs)} vs {len(ys)})"
        )
    return [jaccard_wrapper(xs[i], ys[i], ngrams) for i in range(len(xs))]

In [None]:
# Jaccard distances between reference and altered predictions, per config.
# jaccs maps config name -> list of distances and is reused by the plots below.
jaccs = {}
for config in non_reference_configs:
    # NOTE(review): both arguments are dicts keyed by datapoint index, while
    # calculate_jaccard_distances looks entries up by 0..len-1 — this presumably
    # relies on contiguous keys starting at 0; verify against the data.
    distances = calculate_jaccard_distances(results["reference"]["results"],results[config]["results"])
    
    jaccs[config]=distances
    
    # displot is figure-level: it opens a new figure for every config
    sns.displot(
        distances,
        kind="hist", kde=True,
        bins=20,
        height=6, aspect=10/6)
    plt.title(f"Histogram of JaccardDistances for {config}\n({print_archetype_info(config)})")
    plt.xlabel("JaccardDistance \n Reference to Altered")
    plt.ylabel("# of Entries")
    # identical axis ranges for all configs keep the histograms comparable
    plt.xlim(0,1)
    plt.ylim(0,10000)
    
    plt.savefig(f'images/{config}_jaccard_histogram.png')
    plt.show()

In [None]:
# One violin per config showing the distribution of its Jaccard distances,
# colored by the archetype the config belongs to.
plt.figure(figsize=(20,9))

violin_parts = plt.violinplot(
    dataset=list(jaccs.values())
    ,showextrema=False
    ,points=35
)

# plt.get_cmap replaces plt.cm.get_cmap, which is deprecated in recent
# matplotlib releases; the colormap is looked up once and sampled per archetype.
colormap = plt.get_cmap('PuBuGn')
archetype_color = {
    'add-neutral':colormap(0.1),
    'add-var(pseudo)':colormap(0.25),
    'add-var(random)':colormap(0.4),
    'if-true':colormap(0.5),
    'if-true & add-neutral':colormap(0.65),
    'mixed names(pseudo)':colormap(0.8),
    'mixed-names(random)':colormap(1.0)
}

# The violin bodies come back in the order of the datasets, i.e. in the key order of jaccs.
for pc, con in zip(violin_parts['bodies'], jaccs.keys()):
    pc.set_facecolor(archetype_color[config_archetypes[con]])
    pc.set_edgecolor('black')
    pc.set_label(config_archetypes[con])

plt.title("ViolinPlot of Jaccard_Distances")
plt.ylabel("Jaccard Distance")
plt.ylim(0,1)

# violinplot places the violins at x = 1..N
plt.xticks(
    range(1,len(jaccs.keys())+1)
    ,labels=[f"{config}\n{print_archetype_info(config)}" for config in jaccs.keys()]
)

#plt.legend()

plt.savefig('images/jaccard_distances_violinplot.png')
plt.show()

In [None]:
# Boxplots of the Jaccard distances, one box per config (column of jacc_df).
plt.figure(figsize=(30,12))
# from_dict needs equally long lists per config
jacc_df=pd.DataFrame.from_dict(jaccs)

sns.boxplot(
    data=jacc_df)

plt.grid()
plt.title(f"Boxplot of Jaccard_Distances")
plt.ylabel("Jaccard Distance")
plt.ylim(0,1)

plt.savefig(f'images/jaccard_distances_boxplot.png')
plt.show()

## Transformation Scores

Following the paper, we provide two ways of computing the transformation score: 

```
delta_tscore = bleu(gold,reference) - bleu(gold,altered)
```

and 

```
tquot = bleu(gold,altered) / bleu(gold,reference)
```

which we can perfectly write as python and see how they are doing. 


In [None]:
def delta_tscore(gold, reference, altered):
    """BLEU of the reference prediction minus BLEU of the altered prediction, both scored against `gold`."""
    return bleu_wrapper(reference,gold) - bleu_wrapper(altered,gold)


def tquot(gold, reference, altered):
    """
    BLEU of the altered prediction divided by BLEU of the reference
    prediction, both scored against `gold`.

    Raises ZeroDivisionError when the reference prediction scores 0.
    """
    return bleu_wrapper(altered,gold) / bleu_wrapper(reference,gold)

In [None]:
# delta-tscore of the sample datapoint: config_2 gold, reference prediction, config_2 prediction
delta_tscore(results["config_2"]["gold_results"][sample_index],results["reference"]["results"][sample_index],results["config_2"]["results"][sample_index])

In [None]:
# quotient t-score of the same sample datapoint (raises ZeroDivisionError if the reference BLEU is 0)
tquot(results["config_2"]["gold_results"][sample_index],results["reference"]["results"][sample_index],results["config_2"]["results"][sample_index])

In [None]:
%%time
# Keep the n lowest and the n highest delta-tscores over all configs.
# The ranking is merged config by config so only one config's scores are held at a time.
n = 10
worst_n = []
best_n = []

for config in non_reference_configs:
    scored = []
    for index in list(results[config]["results"].keys()):
        gold = results["reference"]["gold_results"][index]
        reference = results["reference"]["results"][index]
        altered = results[config]["results"][index]
        try:
            score = delta_tscore(gold,reference,altered)
        except ZeroDivisionError:
            print("Error at ",index,config)
            continue
        scored.append((score,config,index))

    # entries are (score, config, index) tuples, ranked ascending by score
    ranked = sorted(scored + worst_n + best_n, key=lambda item: item[0])
    worst_n = ranked[:n]
    best_n = ranked[-n:]
    #print("ckecked for worst in ",config)

In [None]:
# Print the extreme delta-tscores. worst_n holds the LOWEST deltas: since the delta is
# BLEU(reference) - BLEU(altered), those are the entries where the altered prediction
# scored higher than the reference; best_n holds the largest BLEU drops.
for (diff,config,index) in worst_n+best_n:
    print(
        f"delta_tscore {round(diff,4)}, {config}@{index}\n",
        f'gold: \t {results["reference"]["gold_results"][index]} \n',
        f'ref: \t {results["reference"]["results"][index]}\n',
        f'alt: \t {results[config]["results"][index]}\n',
    )

In [None]:
%%time
# Keep the m lowest and the m highest quotient t-scores over all configs.
m = 10
worst_m = []
best_m = []

for config in [key for key in results.keys() if "reference" != key]:
    scored = []
    for index in list(results[config]["results"].keys()):
        gold = results["reference"]["gold_results"][index]
        reference = results["reference"]["results"][index]
        altered = results[config]["results"][index]
        try:
            quotient = tquot(gold,reference,altered)
        except ZeroDivisionError:
            # a reference BLEU of 0 happens quite often; such entries count as 0
            #print("Error at ",index,config)
            quotient = 0
        scored.append((quotient,config,index))

    # entries are (score, config, index) tuples, ranked ascending by score
    ranked = sorted(scored + worst_m + best_m, key=lambda item: item[0])
    worst_m = ranked[:m]
    best_m = ranked[-m:]
    #print("ckecked for worst in ",config)

In [None]:
# Print the extreme quotient t-scores. These live in worst_m/best_m;
# worst_n/best_n hold the delta-tscore ranking and must not be used here.
for (quot,config,index) in worst_m+best_m:
    print(
        f"tquot {round(quot,4)}, {config}@{index}\n",
        f'gold: \t {results["reference"]["gold_results"][index]} \n',
        f'ref: \t {results["reference"]["results"][index]}\n',
        f'alt: \t {results[config]["results"][index]}\n',
    )

### Histograms of TScores

In [None]:
%%time
# Collect both t-scores for every datapoint of every non-reference config.
# delta_tscore_data / quot_tscore_data hold one list per config, in the order
# in which the configs are iterated here.
delta_tscore_data = []
quot_tscore_data = []

for config in [key for key in results.keys() if "reference" != key]:
    delta_running_agg = []
    quot_running_agg = []
    for index in list(results[config]["results"].keys()):
        gold = results["reference"]["gold_results"][index]
        reference = results["reference"]["results"][index]
        altered = results[config]["results"][index]
        try:
            a = delta_tscore(gold,reference,altered)
        except ZeroDivisionError:
            #print("Error at ",index,config)
            a = 0
        delta_running_agg.append(a)
        try:
            b = tquot(gold,reference,altered)
        except ZeroDivisionError:
            # reference BLEU of 0: count the quotient as 0
            #print("Error at ",index,config)
            b = 0
        quot_running_agg.append(b)

    delta_tscore_data.append(delta_running_agg)
    quot_tscore_data.append(quot_running_agg)
    # Store THIS config's scores only (not the growing list over all configs),
    # under the same "<kind>-t-<what>" key scheme as the robustness values.
    results[config]["quot-t-scores"]=quot_running_agg
    results[config]["delta-t-scores"]=delta_running_agg

# quotients >= 2 are treated as outliers and dropped for plotting
filtered_quot_tscore_data = [[a for a in x if a<2] for x in quot_tscore_data]
#filtered_delta_tscore_data = [[a for a in x if a != 0] for x in delta_tscore_data]
# deltas with magnitude <= 0.025 count as "no change" and are dropped for plotting
filtered_delta_tscore_data = [[a for a in x if abs(a)>0.025] for x in delta_tscore_data]

In [None]:
# Violin plot and boxplot of the filtered ("non-null") delta-tscores, one column per config.
plt.figure(figsize=(21, 7))

# The per-config lists differ in length after filtering: build one row per
# config (padded with NaN), then transpose so that every config is a column.
filtered_delta_tscore_df = pd.DataFrame(filtered_delta_tscore_data )
filtered_delta_tscore_df = filtered_delta_tscore_df.transpose()
filtered_delta_tscore_df.columns=(list(non_reference_configs))

plt.ylim(-1,1)
plt.grid()

# showmeans/showmedians are matplotlib violinplot options, not seaborn ones;
# seaborn already draws the median in its inner box, so they are not passed.
sns.violinplot(data=filtered_delta_tscore_df)
plt.title('Distribution of non-null delta-tscores')

plt.savefig('images/delta_tscore_violins_by_config.png')
plt.show()

plt.figure(figsize=(21, 7))

plt.ylim(-1,1)
plt.grid()
# this figure is grouped by config (see the file name), not by archetype
plt.title('Distribution of non-null delta-tscores \n by config ')

sns.boxplot(data=filtered_delta_tscore_df)


plt.savefig('images/delta_tscore_boxplots_by_config.png')
plt.show()

In [None]:
# Violin plot and boxplot of the filtered quotient t-scores, one column per config.
plt.figure(figsize=(21, 7))

# The per-config lists differ in length after filtering: build one row per
# config (padded with NaN), then transpose so that every config is a column.
filtered_quot_tscore_df = pd.DataFrame(filtered_quot_tscore_data )
filtered_quot_tscore_df = filtered_quot_tscore_df.transpose()
filtered_quot_tscore_df.columns=(list(non_reference_configs))

plt.grid()

# showmeans/showmedians are matplotlib violinplot options, not seaborn ones;
# seaborn already draws the median in its inner box, so they are not passed.
sns.violinplot(data=filtered_quot_tscore_df)
plt.title('Distribution of quot-tscores')

plt.savefig('images/quot_tscore_violins_by_config.png')
plt.show()

plt.figure(figsize=(21, 7))

plt.grid()
# this figure is grouped by config (see the file name), not by archetype
plt.title('Distribution of quot-tscores \n by config ')

sns.boxplot(data=filtered_quot_tscore_df)


plt.savefig('images/quot_tscore_boxplots_by_config.png')
plt.show()

### Delta Score by archetype

In [None]:
# Pool the per-config delta-tscores by archetype: {archetype: [all deltas of its configs]}.
# delta_tscore_data holds one list per non-reference config, in the same order
# as non_reference_configs, so the two can be walked in parallel.
archetype_delta_tscore_data = {}

for config, config_deltas in zip(non_reference_configs, delta_tscore_data):
    archetype_delta_tscore_data.setdefault(config_archetypes[config], []).extend(config_deltas)

#archetype_filtered_delta_tscore_data = [[a for a in x if a != 0] for x in delta_tscore_data]
# Same "no change" threshold as for the per-config plots; the list order follows
# the key order of archetype_delta_tscore_data.
archetype_filtered_delta_tscore_data = [[a for a in x if abs(a)>0.025] for x in archetype_delta_tscore_data.values()]

In [None]:
# Violin plot and boxplot of all delta-tscores, one column per archetype.
plt.figure(figsize=(16, 7))

# One row per archetype (padded with NaN), transposed so archetypes become columns.
# The labels are taken from the dict itself: `all_archetypes` is a set, whose
# iteration order does not match the order of the data and would mislabel columns.
delta_tscore_df = pd.DataFrame(
    list(archetype_delta_tscore_data.values()),
    index=list(archetype_delta_tscore_data.keys()),
)
delta_tscore_df = delta_tscore_df.transpose()
plt.ylim(-1,1)
plt.grid()
plt.title('Distribution of delta-tscores \n by archetype ')

sns.violinplot(data=delta_tscore_df)


plt.savefig('images/delta_tscore_violins_by_archetype.png')
plt.show()

plt.figure(figsize=(16, 7))

plt.ylim(-1,1)
plt.grid()
plt.title('Distribution of delta-tscores \n by archetype ')

sns.boxplot(data=delta_tscore_df)


plt.savefig('images/delta_tscore_boxplots_by_archetype.png')
plt.show()

In [None]:
# Violin plot and boxplot of the filtered ("non-null") delta-tscores, one column per archetype.
plt.figure(figsize=(16, 7))

# archetype_filtered_delta_tscore_data was derived from
# archetype_delta_tscore_data.values(), so its rows are in the dict's key order.
# Labeling with the dict keys keeps data and names aligned; `all_archetypes`
# is a set and its iteration order would mislabel the columns.
non_null_delta_df = pd.DataFrame(
    archetype_filtered_delta_tscore_data,
    index=list(archetype_delta_tscore_data.keys()),
)

non_null_delta_df = non_null_delta_df.transpose()
plt.ylim(-1,1)
plt.grid()
plt.title('Distribution of non-null delta-tscores \n by archetype ')

sns.violinplot(data=non_null_delta_df)


plt.savefig('images/nonnull_delta_tscore_violins_by_archetype.png')
plt.show()

plt.figure(figsize=(16, 7))
plt.ylim(-1,1)
plt.grid()
plt.title('Distribution of non-null delta-tscores \n by archetype ')

sns.boxplot(data=non_null_delta_df)


plt.savefig('images/nonnull_delta_tscore_boxplots_by_archetype.png')
plt.show()

### T-Robustness

the t-robustness is the average of tscores on a configuration

In [None]:

# Mean and variance of the delta-tscores per config, stored in results.
for config in non_reference_configs:
    deltas = []
    for index in list(results[config]["results"].keys()):
        gold = results["reference"]["gold_results"][index]
        reference = results["reference"]["results"][index]
        altered = results[config]["results"][index]
        try:
            deltas.append(delta_tscore(gold,reference,altered))
        except ZeroDivisionError:
            print("Error at ",index,config)

    #print(config,"delta-trobustness",np.mean(deltas))
    #print(config,"delta-t-variance",np.var(deltas))
    results[config]["delta-t-robustness"] = np.mean(deltas)
    results[config]["delta-t-variance"] = np.var(deltas)

In [None]:
# Mean and variance of the quotient t-scores per config, stored in results.
for config in non_reference_configs:
    quotients = []
    for index in list(results[config]["results"].keys()):
        gold = results["reference"]["gold_results"][index]
        reference = results["reference"]["results"][index]
        altered = results[config]["results"][index]
        try:
            quotients.append(tquot(gold,reference,altered))
        except ZeroDivisionError:
            # reference BLEU of 0: the entry counts as quotient 0.0
            quotients.append(0.0)

    #print(config,"quot-trobustness",np.mean(quotients))
    #print(config,"quot-t-variance",np.var(quotients))
    results[config]["quot-t-robustness"] = np.mean(quotients)
    results[config]["quot-t-variance"] = np.var(quotients)

In [None]:
# {archetype: {#transformations: delta-t-robustness}}
robustness_data = {}
archetypes = {entry["archetype"] for entry in results.values() if "archetype" in entry}
for archetype in archetypes:
    # The transformation count is read from the properties as a string; cast it
    # to int (as done for the BLEU table) so the x-axis is numeric and sortable.
    robustness_data[archetype] = {
        int(entry["properties"]["transformations"]): entry["delta-t-robustness"]
        for entry in results.values()
        if entry.get("archetype") == archetype
    }

# rows ordered by number of transformations, one column per archetype
robustness_data_df = pd.DataFrame.from_dict(robustness_data).sort_index()

plt.title("TRobustness given delta-tscore")
plt.ylabel("Delta-T-Robustness")
plt.xlabel("# Transformations")

plot = plt.plot(robustness_data_df,marker="o")

plt.legend(robustness_data_df.columns)
plt.show()

In [None]:
# {archetype: {#transformations: quot-t-robustness}}
robustness_data = {}
archetypes = {entry["archetype"] for entry in results.values() if "archetype" in entry}
for archetype in archetypes:
    # The transformation count is read from the properties as a string; cast it
    # to int (as done for the BLEU table) so the x-axis is numeric and sortable.
    robustness_data[archetype] = {
        int(entry["properties"]["transformations"]): entry["quot-t-robustness"]
        for entry in results.values()
        if entry.get("archetype") == archetype
    }

# rows ordered by number of transformations, one column per archetype
robustness_data_df = pd.DataFrame.from_dict(robustness_data).sort_index()

plt.title("TRobustness given quot-tscore")
plt.ylabel("Quot-T-Robustness")
plt.xlabel("# Transformations")

plot = plt.plot(robustness_data_df,marker="o")

plt.legend(robustness_data_df.columns)
plt.show()

In [None]:
# {archetype: {#transformations: delta-t-variance}}
variance_data = {}
archetypes = {entry["archetype"] for entry in results.values() if "archetype" in entry}
for archetype in archetypes:
    # The transformation count is read from the properties as a string; cast it
    # to int (as done for the BLEU table) so the x-axis is numeric and sortable.
    variance_data[archetype] = {
        int(entry["properties"]["transformations"]): entry["delta-t-variance"]
        for entry in results.values()
        if entry.get("archetype") == archetype
    }

# rows ordered by number of transformations, one column per archetype
variance_data_df = pd.DataFrame.from_dict(variance_data).sort_index()

plt.title("TVariance given delta-tscore")
plt.ylabel("Delta-T-Variance")
plt.xlabel("# Transformations")

plot = plt.plot(variance_data_df,marker="o")

plt.legend(variance_data_df.columns)
plt.show()

In [None]:
# {archetype: {#transformations: quot-t-variance}}
variance_data = {}
archetypes = {entry["archetype"] for entry in results.values() if "archetype" in entry}
for archetype in archetypes:
    # The transformation count is read from the properties as a string; cast it
    # to int (as done for the BLEU table) so the x-axis is numeric and sortable.
    variance_data[archetype] = {
        int(entry["properties"]["transformations"]): entry["quot-t-variance"]
        for entry in results.values()
        if entry.get("archetype") == archetype
    }

# rows ordered by number of transformations, one column per archetype
variance_data_df = pd.DataFrame.from_dict(variance_data).sort_index()

plt.title("TVariance given quot-tscore")
plt.ylabel("quot-T-Variance")
plt.xlabel("# Transformations")

plot = plt.plot(variance_data_df,marker="o")

plt.legend(variance_data_df.columns)
plt.show()

### Differences

Looking for Differences in results - similar to Jaccard Distance

In [None]:
# Count, per config, how many predictions differ textually from the reference prediction.
for config in [key for key in results.keys() if "reference" != key]:
    config_predictions = results[config]["results"]
    entries = len(config_predictions.keys())
    diffs = 0
    for index in list(config_predictions.keys()):
        gold = results["reference"]["gold_results"][index]
        reference = results["reference"]["results"][index]
        altered = config_predictions[index]

        if not (reference == altered):
            diffs += 1

    results[config]["diffs"] = diffs
    #print(f"{config} had {diffs} differences {round(float(diffs)/entries*100,2)}%")

In [None]:
# One row per config: (config name, number of differing predictions, archetype)
diffs_df_data = [(c,results[c]["diffs"],config_archetypes[c]) for c in non_reference_configs ]
    
diffs_df = pd.DataFrame(diffs_df_data)
diffs_df.columns=["config","diffs","archetype"]

# Sorting by archetype first places the bars of one archetype next to each other
diffs_df= diffs_df.sort_values(by=["archetype", "config"])

plt.figure(figsize=(21, 7))
plt.grid()

plt.title('Result-Differences per Configuration')


# one bar per config, colored by archetype
sns.barplot(x="config",y="diffs",data=diffs_df,hue="archetype")


plt.savefig(f'images/number_of_diffs_by_config.png')
plt.show()