In [None]:
from pandas import DataFrame
import pandas as pd
import numpy as np
import git
import matplotlib.pyplot as plt
import random
import os
from scipy.stats import wilcoxon
from sqlalchemy import create_engine, or_, Column, Integer, String, Float, DateTime, ForeignKey, Boolean
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
from do_analysis import Experiment, ExperimentSet

In [None]:
# Open a SQLAlchemy session on the SQLite file `results.db` (relative path).
database_url = 'sqlite:///results.db'
engine = create_engine(database_url)
session_class = sessionmaker(bind=engine)
session = session_class()

# Load data into the dataframe

In [None]:
# Build the score table: one row per dataset file, one column per experiment
# variant ("<measure> - <set description> - <commit>"), cells hold the F-score.

# Repository handle for the directory one level above the working directory.
# It is not needed to build the table below.
code_directory = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
repo = git.Repo(code_directory)

headers = []    # column labels, in order of first appearance
datasets = []   # row labels: file name without directory and extension
series = []     # one {header: score} dict per dataset, parallel to `datasets`
serie = None    # record of the dataset currently being filled
last = None     # file_name of the dataset currently being filled

# Alternative queries used during exploration:
#for experiment in session.query(Experiment).filter(or_(Experiment.set_id==i for i in [17])).order_by(Experiment.file_name):
#for experiment in session.query(Experiment).filter(Experiment.number_of_clusters == Experiment.number_of_classes).order_by(Experiment.file_name):
for experiment in session.query(Experiment).order_by(Experiment.file_name, Experiment.id):
    if last != experiment.file_name:
        # Rows are ordered by file_name, so a change means a new dataset starts.
        last = experiment.file_name
        dataset_name = experiment.file_name.rsplit('/')[-1].split(".")[0]
        datasets.append(dataset_name)
        serie = {}
        series.append(serie)

    message = experiment.set.description
    measure = experiment.method.split('.')[-1]
    if "LearningBased" in measure:
        # NOTE(review): assumes the strategy and weight are the 5th and 7th
        # space-separated words of the method string -- confirm the format.
        words = measure.split(" ")
        strategy = words[4]
        weight = words[6]
        measure = f"LearningBased strategy {strategy}, weight {weight}"
    header = f"{measure} - {message} - {experiment.set.commit}"
    if header not in headers:
        headers.append(header)

    # Scores are stored by header rather than by position, so a dataset that
    # lacks an experiment gets NaN in that column instead of shifting values.
    # If a header repeats within a dataset, the row with the highest id wins.
    if experiment.number_of_classes is None or experiment.number_of_clusters != experiment.number_of_classes:
        # Only runs whose cluster count equals the class count are scored.
        serie[header] = np.nan
    else:
        serie[header] = experiment.f_score

# An empty query result yields an empty frame instead of an error.
df = pd.DataFrame(series, index=datasets, columns=headers)
df

# Rank each value against the other options, per dataset

In [None]:
def rank_within_dataset(row):
    """Rank the scores of one dataset row; the highest score gets rank 1."""
    return row.rank(ascending=False)


# Rank the measures against each other separately for every dataset (row).
dft = df.apply(rank_within_dataset, axis=1)
dft

# Get the mean rank per measure and order them from best to worst

In [None]:
# Mean rank of every measure (column) over all datasets; lower is better.
averages = dft.mean(axis=0)
# Display them from best (lowest mean rank) to worst.
averages.sort_values(ascending=True)

# Compare the best and second best using the Wilcoxon test

In [None]:
# Select the two measures with the lowest (best) mean rank.
ranked_measures = averages.sort_values().index
best = ranked_measures[0]
second_best = ranked_measures[1]

# The Wilcoxon signed-rank test is a paired test. `df` holds NaN for runs that
# were not scored, so keep only the datasets where both measures have a value;
# otherwise the NaNs propagate into the statistic and the p-value.
paired_scores = df[[best, second_best]].dropna()
statistic, p = wilcoxon(paired_scores[best], paired_scores[second_best])
p

# Check the differences between the best and second best

In [None]:
# List every dataset on which the best and second-best measures score differently.
print(f"{best} | {second_best}")
for dataset, x, y in zip(df.index, df[best], df[second_best]):
    # NaN never compares equal to NaN, so a dataset that is missing for both
    # measures would otherwise be reported as a difference.
    if pd.isna(x) and pd.isna(y):
        continue
    if x != y:
        print(f"{dataset}: {x} - {y}")

In [None]:
def only_upper(s):
    """Return the uppercase characters of ``s``, in their original order."""
    return "".join(filter(str.isupper, s))


# Collect (name, mean rank) for every column whose label marks it as a
# "Base measures times kappa" run. The name is the first word of the label,
# reduced to its capital letters when the label mentions OccurenceFrequency.
originals = []
for column in averages.index:
    if "Base measures times kappa" not in column:
        continue
    first_word = column.split(" ")[0]
    short_name = only_upper(first_word) if "OccurenceFrequency" in column else first_word
    originals.append((short_name, averages[column]))
originals

In [None]:
# For each entry of `originals`, gather the columns whose first word equals the
# entry's name and whose mean rank is strictly lower (better) than the entry's.
better_than = {}
for column in averages.index:
    first_word = column.split(' ')[0]
    column_rank = averages[column]
    for o_name, o_value in originals:
        if o_name == first_word and o_value > column_rank:
            better_than.setdefault(o_name, []).append((column, column_rank))
better_than

In [None]:
# Box plot with one box per dataset (row of `df`), showing how the F1-score
# varies across the measures (columns).
all_colors = list(plt.cm.colors.cnames.keys())
random.seed(1000)  # fixed seed so the colours are the same on every run
# Draw exactly one colour per box, so any number of datasets is supported.
c = random.choices(all_colors, k=len(df.index))

fig = plt.figure(figsize=(16,10), dpi= 80)
ax = fig.add_subplot(111)

# Matplotlib's boxplot does not skip NaN (a box whose data contains NaN comes
# out empty), so drop the unscored runs of each dataset before plotting.
scores_per_dataset = [row.dropna().to_numpy() for _, row in df.iterrows()]
bp = ax.boxplot(scores_per_dataset, autorange=True, widths=0.65, patch_artist=True)
ax.margins(y=0.05)
ax.tick_params(axis='y', labelsize=18)

for box, color in zip(bp['boxes'], c):
    box.set_facecolor(color)
for median in bp['medians']:
    median.set(color='black')

# Annotate every box with the mean F1-score of its dataset.
# NOTE(review): this used to plot `averages` (mean rank per measure), which
# matches neither the number of boxes nor the F1 axis; the per-dataset mean
# is assumed to be the intended value -- confirm.
for position, mean_score in enumerate(df.mean(axis=1), start=1):
    if np.isnan(mean_score):
        continue  # dataset without any scored run: nothing to annotate
    ax.text(position, mean_score, "{0:.4f}".format(mean_score), horizontalalignment='center', verticalalignment='bottom', fontdict={'fontweight':500, 'size':18})

ax.set_ylabel('F1-Score', fontsize=20)
ax.set_xticklabels(df.index, rotation=60, horizontalalignment= 'right', fontdict={'fontweight':500, 'size':18})
fig.savefig("box.png", transparent=True, bbox_inches="tight")
plt.show()

In [None]:
# Show the raw scores with the columns ordered from best to worst mean rank.
columns_by_mean_rank = dft.mean().sort_values().index
df.reindex(columns=columns_by_mean_rank)