# Guided-MT Code2Vec Evaluation

This notebook walks over the experiment outputs, extracts the data, and produces plots.

Expected Layout:

```
.
├── README.md
├── data
│   └── random-MRR-max
│       ├── seed-2880
│       │   ├── data
│       │   │   ├── gen0
│       │   │   │   ├── 3b2459
│       │   │   │   ├── 3b2459.json
│       │   │   │   ├── 447e22
│       │   │   │   ├── 447e22.json
│       │   │   │   ├── 4495c7
│       │   │   │   ├── 4495c7.json
│       │   │   │   ├── 52667b
│       │   │   │   ├── 52667b.json
│       │   │   │   ├── 6855ba
│       │   │   │   ├── 6855ba.json
│       │   │   │   ├── 68ec75
│       │   │   │   ├── 68ec75.json
│       │   │   │   ├── 6cc14d
│       │   │   │   ├── 6cc14d.json
│       │   │   │   ├── 6d6845
│       │   │   │   ├── 6d6845.json
│       │   │   │   ├── 7a2d67
│       │   │   │   ├── 7a2d67.json
│       │   │   │   ├── ed0dd9
│       │   │   │   └── ed0dd9.json
│       │   │   ├── gen1
│       │   │   ├── ...
│       │   │   ├── gen8
│       │   │   ├── ...
│       │   │   ├── generation_0
│       │   │   │   ├── Some.java
│       │   │   │   ├── ...
│       │   │   │   ├── Other.java
│       │   │   │   └── Different.java
│       │   │   └── initialGen
│       │   │       └── 3bf9ce
│       │   └── results.txt
│       ├── seed-5142
│       │   └── results.txt
│       └── ...
├── evaluation.ipynb
└── requirements.txt
```

## Data Loading

Most of the loading is done by the nearby `extract` script; in addition, this section derives a few high-level variables.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sbn

import extract

# Root folder holding the experiment outputs (see "Expected Layout").
# Important: give the path WITHOUT a trailing "/".
directory: str = "./data"
# When False, only the most important plots are produced; set to True to
# also render the per-experiment plots.
verbose: bool = False

# The plotting cells save into "figures/", which is not part of the expected
# layout; create it up front so plt.savefig does not fail on a fresh checkout.
Path("figures").mkdir(parents=True, exist_ok=True)

In [None]:
%%time
# Build `df` from the experiment outputs below `directory`; the parsing itself
# lives in the local `extract` module. `%%time` must stay the first line of
# the cell so IPython reports how long the load took.
df = extract.make_df(directory)

In [None]:
# Names shared by the cells below: the metric names, the transformer columns
# reported by the extract module, and the distinct experiments and seeds
# present in the loaded data.
all_metrics = [
    "F1",
    "MRR",
    "EDITDIST",
    "PMRR",
    "REC",
    "PREC",
]
all_transformers = extract.get_known_transformers()
all_experiments = set(df["experiment"])
all_seeds = set(df["seed"])

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Store the algorithm label as a pandas categorical, then show the resulting
# column dtypes as the cell output.
df["algorithm"] = df["algorithm"].astype("category")
df.dtypes



## Per Experiment Plots

In [None]:
#grouped_df = df.groupby(["experiment","generation","seed"]).mean().reset_index()
#grouped_df.head(5)

In [None]:
# Average every numeric column per (experiment, generation).
# numeric_only=True is required: pandas >= 2.0 no longer drops non-numeric
# columns (such as the categorical "algorithm") silently, it raises instead.
grouped_by_generation = (
    df.groupby(["experiment", "generation"])
    .mean(numeric_only=True)
    .reset_index()
)
# Note: the aggregation keeps only the grouping keys and the numeric columns,
# so "algorithm" has to be re-added. It is derived from the experiment name:
# names containing "random" are labelled "random", everything else "genetic".
grouped_by_generation["algorithm"] = grouped_by_generation["experiment"].apply(
    lambda name: "random" if "random" in name else "genetic"
)
grouped_by_generation.head(5)

In [None]:
# Scatter of the per-experiment mean F1 against generation, coloured by
# algorithm; the figure is written to figures/ and then displayed.
sbn.relplot(
    data=grouped_by_generation,
    x="generation",
    y="F1",
    hue="algorithm",
    facet_kws={"legend_out": False},
)
plt.ylim(0.35, 0.55)
plt.title("General Experiment Output for F1")
plt.savefig("figures/general-generations-f1")
plt.show()

In [None]:
# Scatter of the per-experiment mean MRR against generation, coloured by
# algorithm; the figure is written to figures/ and then displayed.
ax = sbn.relplot(
    data=grouped_by_generation,
    x="generation",
    y="MRR",
    hue="algorithm",
    facet_kws={"legend_out": False},
)
plt.ylim(0.2, 0.8)
plt.title("General Experiment Output for MRR")
plt.savefig("figures/general-generations-mrr")
plt.show()

In [None]:
# Average every numeric column per (experiment, TRANSFORMATIONS).
# numeric_only=True is required: pandas >= 2.0 no longer drops non-numeric
# columns (such as the categorical "algorithm") silently, it raises instead.
grouped_by_transformation = (
    df.groupby(["experiment", "TRANSFORMATIONS"])
    .mean(numeric_only=True)
    .reset_index()
)
# Note: the aggregation keeps only the grouping keys and the numeric columns,
# so "algorithm" has to be re-added. It is derived from the experiment name:
# names containing "random" are labelled "random", everything else "genetic".
grouped_by_transformation["algorithm"] = grouped_by_transformation["experiment"].apply(
    lambda name: "random" if "random" in name else "genetic"
)
grouped_by_transformation.head(5)

In [None]:
# Scatter of the mean F1 against the TRANSFORMATIONS column, coloured by
# algorithm; the figure is written to figures/ and then displayed.
ax = sbn.relplot(
    data=grouped_by_transformation,
    x="TRANSFORMATIONS",
    y="F1",
    hue="algorithm",
    facet_kws={"legend_out": False},
)
plt.ylim(0.2, 0.8)
plt.title("General Experiment Output for F1")
plt.savefig("figures/general-transformations-f1")
plt.show()

In [None]:
# Scatter of the mean MRR against the TRANSFORMATIONS column, coloured by
# algorithm; the figure is written to figures/ and then displayed.
ax = sbn.relplot(
    data=grouped_by_transformation,
    x="TRANSFORMATIONS",
    y="MRR",
    hue="algorithm",
    facet_kws={"legend_out": False},
)
plt.ylim(0.2, 0.8)
plt.title("General Experiment Output for MRR")
plt.savefig("figures/general-transformations-mrr")
plt.show()


In [None]:
# Optional per-experiment line plots; skipped unless `verbose` is enabled.
if verbose:
    # The metric is the outer loop so that all F1 plots are emitted before
    # all MRR plots.
    for metric in ("F1", "MRR"):
        # Sort the experiment names: iterating the raw set would emit the
        # plots in an arbitrary order that can differ between runs.
        for exp in sorted(all_experiments):
            experiment_rows = grouped_by_generation[
                grouped_by_generation["experiment"] == exp
            ]
            sbn.relplot(data=experiment_rows, x="generation", y=metric, kind="line")
            plt.title(f"{metric} Score for {exp}")
            plt.xlabel("Generation")
            plt.ylabel(metric)
            plt.ylim([0, 1])
            plt.show()

In [None]:
# Combi-Plots: the four "*-min" experiments drawn into shared line plots.

combinations = ["F1-min", "MRR-min", "random-F1-min", "random-MRR-min"]
mask = grouped_by_generation["experiment"].isin(combinations)
# Take an explicit copy: assigning a column on the bare boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
filtered_df = grouped_by_generation[mask].copy()
# Plain strings for the hue column.
# NOTE(review): presumably this keeps unselected experiments out of the
# legend when "experiment" is categorical - confirm.
filtered_df["experiment"] = filtered_df["experiment"].astype(str)


# MRR per generation, one line per selected experiment.
sbn.relplot(data=filtered_df, x="generation", y="MRR", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.3, 0.7])
plt.xlabel("Generation")
plt.ylabel("MRR")
plt.title("Minimizing MRR Score")
plt.savefig("figures/minimizing-mrr")
plt.show()


# F1 per generation, one line per selected experiment.
sbn.relplot(data=filtered_df, x="generation", y="F1", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.4, 0.7])
plt.xlabel("Generation")
plt.ylabel("F1")
plt.title("Minimizing F1 Score")
plt.savefig("figures/minimizing-f1")
plt.show()

# Drop the temporaries so later cells cannot pick up stale values.
del mask, filtered_df, combinations

In [None]:
# Combi-Plots: the four "*-max" experiments drawn into shared line plots.

combinations = ["F1-max", "MRR-max", "random-F1-max", "random-MRR-max"]
mask = grouped_by_generation["experiment"].isin(combinations)
# Take an explicit copy: assigning a column on the bare boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
filtered_df = grouped_by_generation[mask].copy()
# Plain strings for the hue column.
# NOTE(review): presumably this keeps unselected experiments out of the
# legend when "experiment" is categorical - confirm.
filtered_df["experiment"] = filtered_df["experiment"].astype(str)


# MRR per generation, one line per selected experiment.
# No plt.figure() beforehand: relplot is figure-level and creates its own
# figure, so a manually opened one would only be left behind empty.
sbn.relplot(data=filtered_df, x="generation", y="MRR", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.4, 0.7])
plt.xlabel("Generation")
plt.title("Maximizing MRR Score")
plt.ylabel("MRR")
plt.savefig("figures/maximizing-mrr")
plt.show()


# F1 per generation, one line per selected experiment.
sbn.relplot(data=filtered_df, x="generation", y="F1", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.4, 0.7])
plt.xlabel("Generation")
plt.ylabel("F1")
plt.title("Maximizing F1 Score")
plt.savefig("figures/maximizing-f1")
plt.show()

# Drop the temporaries so later cells cannot pick up stale values.
del mask, filtered_df, combinations

### Pareto Tradeoffs

In [None]:
# MRR plots for the four pareto "*-trans-min" experiments.
combinations = ["pareto-F1-trans-min", "pareto-MRR-trans-min", "random-pareto-F1-trans-min", "random-pareto-MRR-trans-min"]
mask = grouped_by_generation["experiment"].isin(combinations)
# Take an explicit copy: assigning a column on the bare boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
filtered_df = grouped_by_generation[mask].copy()
# Plain strings for the hue column.
# NOTE(review): presumably this keeps unselected experiments out of the
# legend when "experiment" is categorical - confirm.
filtered_df["experiment"] = filtered_df["experiment"].astype(str)

# MRR against generation.
sbn.relplot(data=filtered_df, x="generation", y="MRR", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.4, 0.8])
plt.xlabel("Generation")
plt.ylabel("MRR")
plt.title("Pareto-Minimizing MRR Score")
plt.savefig("figures/pareto-minimizing-mrr")
plt.show()

# MRR against the per-generation mean of the TRANSFORMATIONS column.
sbn.relplot(data=filtered_df, x="TRANSFORMATIONS", y="MRR", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.4, 0.8])
plt.xlabel("Transformations")
plt.ylabel("MRR")
plt.title("Pareto Minimizing MRR Score")
plt.savefig("figures/pareto-minimizing-mrr-2")
plt.show()

# Drop the temporaries so later cells cannot pick up stale values.
del mask, filtered_df, combinations


In [None]:
# F1 plots for the four pareto "*-trans-min" experiments.
combinations = ["pareto-F1-trans-min", "pareto-MRR-trans-min", "random-pareto-F1-trans-min", "random-pareto-MRR-trans-min"]
mask = grouped_by_generation["experiment"].isin(combinations)
# Take an explicit copy: assigning a column on the bare boolean-mask
# selection triggers pandas' SettingWithCopyWarning.
filtered_df = grouped_by_generation[mask].copy()
# Plain strings for the hue column.
# NOTE(review): presumably this keeps unselected experiments out of the
# legend when "experiment" is categorical - confirm.
filtered_df["experiment"] = filtered_df["experiment"].astype(str)

# F1 against generation.
sbn.relplot(data=filtered_df, x="generation", y="F1", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.35, 0.7])
plt.xlabel("Generation")
plt.ylabel("F1")
plt.title("Pareto Minimizing F1 Score")
plt.savefig("figures/pareto-minimizing-f1")
plt.show()


# F1 against the per-generation mean of the TRANSFORMATIONS column; the line
# style additionally separates the two algorithms.
sbn.relplot(data=filtered_df, x="TRANSFORMATIONS", y="F1", style="algorithm", hue="experiment", kind="line", facet_kws=dict(legend_out=False))
plt.ylim([0.35, 0.7])
# The x axis shows the TRANSFORMATIONS column, so label it the same way as
# the matching MRR plot does.
plt.xlabel("Transformations")
plt.ylabel("F1")
plt.title("Pareto Minimizing F1 Score")
plt.savefig("figures/pareto-minimizing-f1-2")
plt.show()

# Drop the temporaries so later cells cannot pick up stale values.
del mask, filtered_df, combinations


In [None]:
# Raw (ungrouped) rows: the TRANSFORMATIONS column against generation,
# coloured by algorithm.
sbn.scatterplot(
    data=df,
    x="generation",
    y="TRANSFORMATIONS",
    hue="algorithm",
)
plt.title("Distribution Generations vs. Transformations")

In [None]:
# Keep the experiment name, the TRANSFORMATIONS column and the columns named
# by `all_transformers`, then sum the remaining columns within each
# (experiment, TRANSFORMATIONS) group.
sub_df = df[["experiment", "TRANSFORMATIONS"] + all_transformers]
agg_transformations = (
    sub_df
    .groupby(["experiment", "TRANSFORMATIONS"])
    .sum()
    .reset_index()
)

In [None]:
# One bar chart per experiment over that experiment's aggregated rows.
for exp in all_experiments:
    belongs_to_exp = agg_transformations["experiment"] == exp
    sbn.barplot(data=agg_transformations[belongs_to_exp])
    plt.legend()
    plt.ylabel("Count")
    plt.title(f"Transformation Distribution for {exp}")
    plt.show()