In [53]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
import sklearn as skl

import operator
import tabulate

from mylib import class_distributions
from mylib import data_selection
from mylib import helper_funcs

import matplotlib.pyplot as plt

import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): this blanket filter hides warnings from any library, including
# deprecation notices — consider narrowing it to specific categories/modules.
warnings.filterwarnings("ignore")

# Enable IPython's autoreload extension in mode 2 (reload all modules before
# running each cell), so edits to imported modules are picked up without a
# kernel restart.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Build the (relative) path to the Dry Bean spreadsheet; nothing is opened here.

data_folder = Path("../../../data/DryBeanDataset/")
file_to_open = data_folder.joinpath("Dry_Bean_Dataset.xlsx")

In [4]:
# Load the spreadsheet and integer-encode the "Class" column.

data = pd.read_excel(file_to_open)

# Each distinct class value gets an integer code, numbered in the order
# in which the values are returned by unique().
class_names = data["Class"].unique()
labels_dict = dict(zip(class_names, range(len(class_names))))
data["Class"] = data["Class"].map(labels_dict)

# Split into the label vector (`labels`) and the feature matrix (`X`),
# the two inputs needed for xgboost.
labels = data["Class"]
X = data.drop(columns=["Class"])

In [5]:
# Per-class proportions from the project helper (the recorded output below
# is a Series indexed by class label whose values sum to ~1).
label_proportions = class_distributions.label_proportions(labels)
print(label_proportions)

# Labels with the largest and the smallest proportion, respectively.
largest_class_label = max(
    label_proportions.items(), key=lambda label_and_share: label_and_share[1]
)[0]
smallest_class_label = min(
    label_proportions.items(), key=lambda label_and_share: label_and_share[1]
)[0]

6    0.260525
5    0.193667
0    0.148924
4    0.141650
3    0.119756
1    0.097127
2    0.038351
Name: Class, dtype: float64


In [29]:
# Collect one 'full_data_mean' entry per data-selection method.
#
# The original cell handled the 'random' baseline with a copy of the loop
# body placed after the loop; it is now simply the last entry of the list,
# so the order of `results` is unchanged (the row labels assigned later in
# the notebook rely on this order).

training_method = 'continued_training'
sort_type = 'closest'
largest_or_smallest_class = 'largest class'

data_selection_methods = [
    'split_criterion',
    'dist_to_mean',
    'nearest_neighbors',
    'entropy',
    'random',  # baseline, kept last
]

# NOTE(review): the original indexed 'full_data_mean' with a bare 10 and no
# explanation — confirm what this position selects in unpack_results' output.
FULL_DATA_MEAN_INDEX = 10

results = []
for data_selection_method in data_selection_methods:
    experiment_results = helper_funcs.unpack_results(
        training_method,
        data_selection_method,
        sort_type,
        largest_or_smallest_class,
    )
    full_data_mean = experiment_results['full_data_mean'][FULL_DATA_MEAN_INDEX]
    results.append(full_data_mean)

In [30]:
# One row per collected result; every value is multiplied by 100
# (presumably fractions -> percentages — confirm against unpack_results).
results_df = pd.DataFrame(results).mul(100)

In [50]:
# Column labels '10%', '20%', ..., '90%'; row labels in the same order in
# which the results were collected.
header = [f'{percent}%' for percent in range(10, 100, 10)]
index_names = ['split-criterion', 'dist-to-mean', 'nearest-neighbors', 'entropy', 'random']

results_df.columns = header
results_df.index = index_names

In [51]:
# Display the labelled table (values were multiplied by 100 when the frame was built).
results_df

Unnamed: 0,10%,20%,30%,40%,50%,60%,70%,80%,90%
split-criterion,78.672397,86.444787,89.613548,90.413636,90.800088,91.572258,92.012343,92.036588,92.046874
dist-to-mean,87.046507,90.608331,90.869885,90.931599,91.019763,91.193887,91.581809,91.679524,91.876423
nearest-neighbors,85.158328,87.835574,89.364485,90.340166,90.822129,91.094703,91.010947,91.080009,90.771435
entropy,89.224892,89.327015,91.202704,91.634707,91.980751,92.096099,92.26361,92.186467,92.11667
random,89.457057,90.744986,91.83675,92.124752,92.168834,92.184997,92.221732,92.235692,92.159283


In [55]:
# Export the results table as LaTeX with two decimals per value.
#
# The original call also passed formatters={"name": str.upper}, copied from
# the pandas documentation example. results_df has no "name" column (its
# columns are the percentage headers assigned earlier), so that entry never
# applied and has been removed.
tex_table = results_df.to_latex(float_format="{:.2f}".format)

# print() writes the table followed by a trailing newline, as before.
with open("tex_table.txt", "w") as text_file:
    print(tex_table, file=text_file)

In [11]:
# Pull the individual statistics out of an unpacked experiment.
#
# The original read from `results`, but on a top-to-bottom run `results` is
# the list built by the aggregation cell, so string indexing raises
# TypeError. The dict returned by helper_funcs.unpack_results is bound to
# `experiment_results`, so read from that instead.
# NOTE(review): `experiment_results` holds the last experiment unpacked above;
# confirm that is the experiment intended here. Only the 'full_data_mean' key
# is confirmed by this notebook — verify the other keys against unpack_results.
old_data_mean = experiment_results['old_data_mean']
old_data_std = experiment_results['old_data_std']
new_data_mean = experiment_results['new_data_mean']
new_data_std = experiment_results['new_data_std']
update_data_mean = experiment_results['update_data_mean']
update_data_std = experiment_results['update_data_std']
full_data_mean = experiment_results['full_data_mean']
full_data_std = experiment_results['full_data_std']

In [26]:
# Small scratch table: three rows of three consecutive integers, plus column names.
test = [[start, start + 1, start + 2] for start in range(1, 4)]
headers = list('abc')

In [17]:
df = pd.DataFrame(dict(name=['Raphael', 'Donatello'],
                           age=[26, 45],
                           height=[181.23, 177.65]))


tex_table = df.to_latex(index=False,
                      formatters={"name": str.upper},
                      float_format="{:.1f}".format
                  )

  tex_table = df.to_latex(index=False,


In [18]:
# print() shows the LaTeX source with real line breaks; a bare expression
# would display the string's escaped repr instead.
print(tex_table)

\begin{tabular}{lrr}
\toprule
     name &  age &  height \\
\midrule
  RAPHAEL &   26 &   181.2 \\
DONATELLO &   45 &   177.7 \\
\bottomrule
\end{tabular}



In [19]:
# Save the current LaTeX table to Output.txt, followed by a newline.
with open("Output.txt", "w") as text_file:
    text_file.write(f"{tex_table}\n")