In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import os
import glob
import gzip
import operator
import statistics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.externals import joblib

%matplotlib inline

In [None]:
# Base directories: candidate metadata is read from the models directory,
# result tables and serialized k-NN outputs from the output directory.
MODELS_DIR = './models/'
RESULTS_DIR = './out/'

# CSV summarising the extracted candidates (read into `candidates_info`).
CANDIDATES_INFO_FILENAME = os.path.join(MODELS_DIR, 'candidates_info.csv')

# CSV tables with the timing results (read into `results` / `results_seq`).
RESULTS_FILENAME = os.path.join(RESULTS_DIR, 'results.csv')
RESULTS_SEQ_FILENAME = os.path.join(RESULTS_DIR, 'results-seq.csv')

# Serialized k-NN outputs that are loaded with joblib.
KNN_FILENAME = os.path.join(RESULTS_DIR, 'results-brute-force-1-nn-4000-eps-1.knn.gz')
KNN_SEQ_FILENAME = os.path.join(RESULTS_DIR, 'results-seq-brute-force-1-nn-4000-eps-1.knn.gz')

# $\epsilon$-transformation information

In [None]:
candidates_info = pd.read_csv(CANDIDATES_INFO_FILENAME)

In [None]:
candidates_info

In [None]:
print("Total number of epsilon-transformations extracted/indexed: {}".format(candidates_info.n_paths.sum()))

# Set global plotting configuration

In [None]:
# Global seaborn theme for the figures below: "paper" context, "darkgrid"
# style, fonts scaled by 2.  This stays in effect until another sns.set()
# call replaces it (the `plot` function further down issues one).
sns.set(context="paper", style="darkgrid", font_scale=2)

In [None]:
# Palette shared by the bar chart and the per-class histograms.  The
# histograms index it directly by class label, so it has to contain at least
# as many colours as there are labels.
# Only this assignment was ever effective: the hand-picked hex list and the
# "Paired" palette that used to precede it were overwritten before being read.
colors_palette = sns.color_palette("Set3", 10)

# Plot $\epsilon$-transformations distribution over class labels

In [None]:
def plot_candidate_distr(candidates):
    """Draw a bar chart of the share of epsilon-transformations per class.

    Parameters
    ----------
    candidates : pandas.DataFrame
        Table providing a ``class`` column (used for the x axis) and an
        ``n_paths`` column.  Each bar shows a row's ``n_paths`` expressed as
        a percentage of the ``n_paths`` column total.

    Returns
    -------
    None
        A new 12x8 figure is created as a side effect; nothing is returned.
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    # Contribution of each row to the overall number of transformations, in %.
    share_pct = (candidates.n_paths * 100) / candidates.n_paths.sum()

    # Draw on the axes created above explicitly rather than depending on
    # pyplot's implicit "current axes".
    sns.barplot(x="class",
                y=share_pct,
                data=candidates,
                palette=colors_palette[:10],
                edgecolor=".2",
                ax=ax)

    ax.set_xlabel('label', fontsize=24, labelpad=14)
    ax.set_ylabel(r'$\epsilon$-transformations (%)', fontsize=24, labelpad=14)

In [None]:
plot_candidate_distr(candidates_info)

# Best $\epsilon$-transformations

In [None]:
# KNN_FILENAME contains no glob wildcard, so this produces a one-element list
# when that file exists and an empty list when it does not.
knn_files = glob.glob(KNN_FILENAME)

In [None]:
knn_files

In [None]:
# Sanity check: at least one k-NN result file was found and every match is a
# regular file.  The explicit emptiness test matters because an "all" over an
# empty list is vacuously True, which would hide a missing input file.
bool(knn_files) and all(os.path.isfile(f) for f in knn_files)

In [None]:
def load_knn(knn_filename):
    """Deserialize a stored k-NN result with joblib.

    Parameters
    ----------
    knn_filename : str
        Path of the file to read; it is opened in binary mode.

    Returns
    -------
    object
        Whatever ``joblib.load`` reconstructs from the file.
        NOTE(review): callers treat it as a dict keyed by
        ``(instance id, label)`` tuples -- confirm against the writer.

    Notes
    -----
    NOTE(review): joblib persistence is pickle-based, so only load files from
    a trusted source.
    NOTE(review): ``joblib`` is imported from ``sklearn.externals`` in this
    notebook; recent scikit-learn releases no longer provide that module --
    confirm the pinned version.
    """
    with open(knn_filename, mode='rb') as stream:
        knn_result = joblib.load(stream)
    return knn_result

In [None]:
def freq_knn(knn_res):
    """Collect, per origin label, the best target of every instance.

    Parameters
    ----------
    knn_res : dict
        Maps ``(instance_id, label)`` keys to a dict ``{target: value}``.
        The target with the smallest value is considered the best one
        (presumably the value is a distance -- confirm against the producer).

    Returns
    -------
    dict
        ``{label: [best_target, ...]}`` with one list entry per instance that
        has at least one candidate.  When several targets share the smallest
        value, the first one in the inner dict's iteration order is kept.
        Instances whose candidate dict is empty are skipped instead of
        raising ``IndexError``.
    """
    freq = {}
    for (_iid, label), transf in knn_res.items():
        if not transf:
            # No candidate target for this instance: nothing to record.
            continue
        # A single linear scan is enough; sorting every candidate just to
        # read the first element is unnecessary.
        best_target, _value = min(transf.items(), key=operator.itemgetter(1))
        freq.setdefault(label, []).append(best_target)

    return freq

In [None]:
# Build {method key: {label: [best targets]}} from every k-NN result file.
# The method key is made of the first two dash-separated tokens that follow
# "results-" in the path (e.g. "brute-force").
all_freqs = {}
for knn_path in knn_files:
    name_after_prefix = knn_path.split('results-')[1]
    method_key = "-".join(name_after_prefix.split('-')[:2])
    print(method_key)
    all_freqs[method_key] = freq_knn(load_knn(knn_path))

In [None]:
# Report, for every origin class of the brute-force run, the target label that
# occurs most often among the best targets.
for origin_label in sorted(all_freqs['brute-force']):
    most_frequent_target = statistics.mode(all_freqs['brute-force'][origin_label])
    print("Most frequent target for original class {} = {}".format(origin_label, most_frequent_target))

In [None]:
# Flatten the brute-force frequencies into one (origin, target) row per
# recorded best target.
data = pd.DataFrame(
    [
        (origin, target)
        for origin, targets in all_freqs['brute-force'].items()
        for target in targets
    ],
    columns=['origin_label', 'target_label'],
)

In [None]:
data.head()

In [None]:
def plot_hist(data, ax, origin_label, color):
    """Plot the target-label distribution of one origin label on ``ax``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with ``origin_label`` and ``target_label`` columns.
    ax : matplotlib.axes.Axes
        Axes that receive the histogram and its density estimate.
    origin_label
        Value of ``origin_label`` selecting the rows to plot; also shown in
        the subplot title.
    color
        Colour handed to seaborn for the histogram and the density curve.

    Notes
    -----
    The binning and the x axis are fixed to the integer labels 0..9.
    """
    # Target labels of the rows that belong to the requested origin label.
    targets = data.loc[data['origin_label'] == origin_label, 'target_label']

    # Unit-wide bins centred on each integer from 0 to 9.
    bin_edges = np.arange(11) - 0.5

    sns.distplot(targets,
                 ax=ax,
                 kde=True,
                 bins=bin_edges,
                 hist_kws={'edgecolor': "k", 'linewidth': 1},
                 color=color)

    ax.set_xlim(-0.5, 9.5)
    ax.set_xticks(range(10))
    ax.set_title('origin label = {}'.format(origin_label), fontsize=18)
    ax.set_xlabel('target label', fontsize=16, labelpad=10)
    ax.set_ylabel('density', fontsize=16, labelpad=10)

In [None]:
def plot_top_candidate_distr(class_freqs, data):
    """Draw one target-label histogram per origin label on a 2x5 grid.

    Parameters
    ----------
    class_freqs : dict
        Only its keys are used: they are the origin labels, visited in sorted
        order.  Each label also indexes ``colors_palette`` to pick the
        subplot colour.
    data : pandas.DataFrame
        Table forwarded unchanged to ``plot_hist``.

    Notes
    -----
    The grid has room for exactly ten subplots, filled row by row.
    """
    fig, axes = plt.subplots(2, 5, figsize=(20, 8))
    fig.subplots_adjust(hspace=0.6, wspace=0.5)

    for position, label in enumerate(sorted(class_freqs)):
        # Row-major placement: positions 0-4 on the first row, 5-9 on the second.
        row, col = divmod(position, 5)
        plot_hist(data, axes[row][col], label, colors_palette[label])

In [None]:
plot_top_candidate_distr(all_freqs['brute-force'], data)

In [None]:
results = pd.read_csv(RESULTS_FILENAME)
results_seq = pd.read_csv(RESULTS_SEQ_FILENAME)

In [None]:
def rename_methods(df):
    """Replace method identifiers in ``df['method']`` with display names.

    The frame is modified in place and nothing is returned.  Values other
    than the three identifiers listed below are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``method`` column.
    """
    display_names = {
        'brute-force': 'brute force',
        'ball-tree': 'ball tree',
        'kd-tree': 'K-d tree',
    }
    df['method'] = df['method'].replace(display_names)

In [None]:
results.head()

In [None]:
results_seq.head()

In [None]:
rename_methods(results)
rename_methods(results_seq)

In [None]:
results.head()

In [None]:
results_seq.head()

In [None]:
# Order the timing results before plotting.
# NOTE(review): this rebinds `data`, which until here held the
# origin/target label frame passed to plot_top_candidate_distr; re-running
# that earlier cell after this one would hand it this frame instead.
data = results.sort_values(by=['method', 'k_nn', 'n_samples', 'epsilon'])

In [None]:
data.head()

In [None]:
data.shape

In [None]:
def plot(data, epsilon, k):
    """Plot elapsed time against the number of samples, one line per method.

    Rows sharing the same ``n_samples`` and ``method`` are aggregated by
    seaborn; the band around each line shows the standard deviation
    (``ci="sd"``).  The y axis is logarithmic.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``n_samples``, ``elapsed_time (secs.)`` and
        ``method``.  The hue order lists the methods ``'brute force'``,
        ``'K-d tree'`` and ``'ball tree'``.
    epsilon, k
        Currently unused: every row of ``data`` is plotted whatever these
        values are.  They are kept so existing calls remain valid.

    Returns
    -------
    None
        The figure is shown and then closed.
    """
    fig, ax = plt.subplots(1, 1, figsize=(12, 8))

    # sns.set() replaces seaborn's global theme, so it also affects figures
    # drawn after this function returns.
    sns.set(font_scale=1.6)

    # Draw on the axes created above explicitly rather than depending on
    # pyplot's implicit "current axes".
    sns.lineplot(x='n_samples',
                 y='elapsed_time (secs.)',
                 hue='method',
                 style='method',
                 markers=["s", "D", "o"],
                 data=data,
                 palette=['#C03028', '#6890F0', '#78C850'],
                 hue_order=['brute force', 'K-d tree', 'ball tree'],
                 markersize=8,
                 err_style="band",
                 ci="sd",
                 ax=ax)

    ax.set_yscale('log')
    ax.set_xlabel('n. of samples', fontsize=24, labelpad=14)
    ax.set_ylabel('Avg. Time (secs.)', fontsize=24, labelpad=14)

    ax.legend(loc='best',
              fontsize=18,
              bbox_to_anchor=(0.95, 0.3),
              fancybox=True,
              shadow=True,
              borderaxespad=0.)

    plt.show()

    # Release the figure explicitly once it has been displayed.
    plt.close(fig)

In [None]:
_ = plot(data, 1, 10) #[data['n_samples'] <= 2000]

In [None]:
# Same ordering as for `results`, applied to the `results_seq` table.
# NOTE(review): `data` is rebound once more here, so the meaning of `data`
# depends on which cell ran last.
data = results_seq.sort_values(by=['method', 'k_nn', 'n_samples', 'epsilon'])

In [None]:
_ = plot(data, 1, 10)