In [None]:
from pandas import DataFrame
import pandas as pd
import numpy as np
import git
import matplotlib.pyplot as plt
import random
import os
from scipy.stats import wilcoxon
from sqlalchemy import create_engine, or_, Column, Integer, String, Float, DateTime, ForeignKey, Boolean
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, sessionmaker
from do_analysis import Experiment, ExperimentSet

In [None]:
def from_db_to_pandas(query):
    """Pivot experiment rows into a datasets x measures table of F-scores.

    Parameters
    ----------
    query : iterable
        Experiment-like objects exposing ``file_name``, ``method``,
        ``set.description``, ``number_of_clusters``, ``number_of_classes``
        and ``f_score``. Rows belonging to the same ``file_name`` must be
        contiguous, because a new table row is started every time the file
        name changes.

    Returns
    -------
    pandas.DataFrame
        One row per dataset (file name without directory and extension) and
        one column per measure (last dotted component of ``method`` with
        ``"Modified"`` removed). A cell holds the experiment's ``f_score``,
        or NaN when the number of clusters is unknown or differs from the
        number of classes. An empty ``query`` yields an empty frame.

    Notes
    -----
    Values are placed positionally: column labels are collected in order of
    first appearance, so every dataset has to yield its measures in that
    same order. Experiments whose set description contains
    ``"Learning Based"`` are skipped.
    """
    headers = []
    datasets = []
    series = []
    serie = None
    last = ""

    for experiment in query:
        # `serie is None` also covers a first row whose file name equals the
        # initial value of `last`.
        if serie is None or last != experiment.file_name:
            last = experiment.file_name
            dataset_name = experiment.file_name.rsplit('/')[-1].split(".")[0]
            datasets.append(dataset_name)
            if serie is not None:
                series.append(serie)
            serie = []

        if "Learning Based" in experiment.set.description:
            continue

        header = experiment.method.split('.')[-1].replace('Modified', '')
        if header not in headers:
            headers.append(header)

        if experiment.number_of_classes is None or experiment.number_of_clusters != experiment.number_of_classes:
            # The score is only comparable when the clustering found exactly
            # as many clusters as there are classes.
            serie.append(np.nan)
        else:
            serie.append(experiment.f_score)

    # Flush the last dataset; with an empty query there is nothing to flush.
    if serie is not None:
        series.append(serie)

    return pd.DataFrame(series, index=datasets, columns=headers)


In [None]:
# Open the SQLite file results.db (path relative to the working directory)
# and start an ORM session on it.
engine = create_engine('sqlite:///results.db', echo=False)
session_class = sessionmaker(bind=engine)
session = session_class()
# Ordering by file name keeps all experiments of one dataset contiguous;
# from_db_to_pandas starts a new row whenever the file name changes.
query = session.query(Experiment).order_by(Experiment.file_name, Experiment.id)
# NOTE(review): the two commented-out alternatives below are leftovers of a
# `for` loop header and are not valid Python as written; they would restrict
# the query to set 17, or to experiments whose cluster count equals the class
# count.
#query = for experiment in session.query(Experiment).filter(or_(Experiment.set_id==i for i in [17])).order_by(Experiment.file_name):
#query = for experiment in session.query(Experiment).filter(Experiment.number_of_clusters == Experiment.number_of_classes).order_by(Experiment.file_name):
df = from_db_to_pandas(query)
df

In [None]:
def parse_options(name: str):
    """Decode the option codes embedded in a column name.

    The name is split on spaces and the first token is dropped. Of the
    remaining tokens, the ones at positions 1, 3 and 5 are read as the
    weight, strategy and multiplier codes; the tokens at positions 0, 2 and
    4 are ignored.

    Parameters
    ----------
    name : str
        Column name carrying at least six space-separated tokens after the
        first one.

    Returns
    -------
    tuple of str
        ``(weight, strategy, multiplier)`` where

        * weight is ``"Kappa"`` for code ``K`` and ``"Auc"`` otherwise,
        * strategy is ``"Base"``, ``"Discard"``, ``"Maximum"`` or
          ``"Original value"`` for codes ``B``, ``D``, ``M`` and ``L``,
        * multiplier is ``"1 - weight"`` for code ``I`` and ``"Normal"``
          otherwise.

    Raises
    ------
    IndexError
        If ``name`` has too few tokens.
    ValueError
        If the strategy code is not one of ``B``, ``D``, ``M``, ``L``.
    """
    options = name.split(" ", 1)[1].split(" ")

    weight = "Kappa" if options[1] == "K" else "Auc"

    strategies = {
        "B": "Base",
        "D": "Discard",
        "M": "Maximum",
        "L": "Original value",
    }
    # An unrecognised code used to leave `strategy` unbound and surface as an
    # UnboundLocalError at the return statement; fail with a clear message.
    if options[3] not in strategies:
        raise ValueError(
            f"Unknown strategy code {options[3]!r} in {name!r}; "
            f"expected one of {', '.join(strategies)}"
        )
    strategy = strategies[options[3]]

    multiplier = "1 - weight" if options[5] == "I" else "Normal"
    return (weight, strategy, multiplier)

def create_tag(options: str):
    """Return a readable one-line description of a column's option codes.

    ``options`` is passed unchanged to ``parse_options``; the decoded
    weight, strategy and multiplier names are then combined into a sentence.
    """
    weight, strategy, multiplier = parse_options(options)
    tag = f"{strategy} when {weight} is low and multiplying for {multiplier}"
    return tag

# Abbreviate the long measure names so the column labels line up with the short
# names in `measures` ("OccurenceFrequency" -> "OF", "Inverse" -> "I").
all_columns = [x.replace("OccurenceFrequency", "OF") for x in df.columns]
all_columns = [x.replace("Inverse", "I") for x in all_columns]
measures = ["Eskin", "Gambaryan", "Goodall", "OF", "IOF", "Lin"]

# The abbreviated labels only exist in `all_columns`; looking them up on `df`
# itself raises KeyError on a fresh kernel. Work on a relabelled copy instead
# (a no-op if the labels were already short).
df_abbreviated = df.rename(columns=dict(zip(df.columns, all_columns)))

# Threshold on the one-sided p-value below which a variant counts as better.
SIGNIFICANCE_LEVEL = 0.1

a = []     # one list of "+"/"-" marks per base measure, in `measures` order
cols = []  # variant descriptions, in order of first appearance
for measure in measures:
    # A variant column is "<measure> <options...>". Requiring the measure name
    # followed by a space (instead of `measure in x`) keeps e.g. the "IOF ..."
    # columns out of the "OF" row and excludes the bare base column itself.
    other = [x for x in all_columns if x.startswith(measure + " ")]
    b = []
    for o in other:
        col = create_tag(o)
        if col not in cols:
            cols.append(col)
        # One-sided Wilcoxon signed-rank test: is the variant's score greater
        # than the base measure's? The mirrored call
        # wilcoxon(base, variant, alternative="less") tests the same paired
        # differences with the sign flipped and gives the same p-value, so it
        # is not repeated.
        # NOTE(review): the frame holds NaN where the cluster count differs
        # from the class count; how wilcoxon treats NaN depends on the SciPy
        # version - confirm.
        _, p = wilcoxon(df_abbreviated[o], df_abbreviated[measure], alternative="greater")
        b.append("+" if p < SIGNIFICANCE_LEVEL else "-")
    a.append(b)

# Marks are placed positionally, so every measure must list its variants in
# the same order as `cols`.
comp = pd.DataFrame(a, index = measures, columns=cols)
comp

In [None]:
# Tally, per option value, how many variants beat their base measure.
# "w" counts by weight, "o" by strategy and "t" by multiplier, using the names
# returned by parse_options.
better_than = {"w" :{"Kappa": 0, "Auc":0},
               "o": {"Base": 0, "Discard": 0, "Maximum": 0, "Original value": 0,},
               "t": {"Normal": 0,"1 - weight": 0,}}

# The abbreviated labels only exist in `all_columns`; looking them up on `df`
# itself raises KeyError on a fresh kernel. Work on a relabelled copy instead
# (a no-op if the labels were already short).
df_abbreviated = df.rename(columns=dict(zip(df.columns, all_columns)))

for measure in measures:
    # A variant column is "<measure> <options...>". Requiring the measure name
    # followed by a space (instead of `measure in x`) keeps e.g. the "IOF ..."
    # columns out of the "OF" tally and excludes the bare base column itself.
    other = [x for x in all_columns if x.startswith(measure + " ")]
    for o in other:
        # One-sided Wilcoxon signed-rank test: is the variant's score greater
        # than the base measure's? The mirrored call with the arguments
        # swapped and alternative="less" gives the same p-value, so a single
        # test is enough.
        _, p = wilcoxon(df_abbreviated[o], df_abbreviated[measure], alternative="greater")
        better = p < 0.1  # variants with a one-sided p-value below 0.1 count as better

        if better:
            weight, strategy, multiplier = parse_options(o)
            better_than["w"][weight] += 1
            better_than["o"][strategy] += 1
            better_than["t"][multiplier] += 1
better_than