Goal of this notebook:

Tune the n parameter of the IGAR attribute selection algorithm.

Average number of attributes among the projects: 169.16

Minimum number of attributes among the projects: 84


## Tuning strategy

### Collect accuracy for each n

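- Vary the n parameter from 0 to 84, where n = 0 means no attribute selection (the baseline) and 84 is the minimum number of attributes among the projects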
- For each n:
	- For each project:
		- Execute the IGAR using n
		- Execute the prediction (calculate accuracy and normalized improvement)


### Calculate the number of wins for each n:

- For each project:
	- Check which n achieved the best accuracy

IGAR reference: https://www.cs.waikato.ac.nz/~mhall/HallHolmesTKDE.pdf

In [1]:
# Imports and global display/warning configuration for the IGAR tuning experiment.
import numpy as np
import pandas as pd
# `display` belongs to the public IPython.display API; importing it from
# IPython.core.display is deprecated.
from IPython.display import display

# Show every column when rendering wide DataFrames.
pd.set_option('display.max_columns', None)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
import warnings
import classifier_utils
# Ignore all warnings raised from this point on (Warning is the base category).
warnings.filterwarnings("ignore", category=Warning)
import configs

In [2]:
# Dataset columns that are not features: chunk location metadata,
# project identifiers, file paths and commit SHAs.
non_features_columns = [
    # chunk-level metadata
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # project / file / commit identifiers
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset, then collect the distinct values of its
# 'project' column in order of first appearance.
selected_dataset = pd.read_csv(
    "../../data/SELECTED_LABELLED_DATASET.csv"
)
projects = [*selected_dataset['project'].unique()]

In [4]:
# Random forest handed to the IGAR tuning routine.
# The fixed random_state makes the forest's randomness repeatable;
# n_jobs=-1 lets scikit-learn use all available processors.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=99,
)

In [5]:
# Bounds of the n sweep.
#   min_n = 0  -> no selection at all, used as a baseline
#   max_n = 84 -> the minimum number of attributes among the projects
min_n, max_n = 0, 84

In [6]:
import importlib

# Reload classifier_utils so that edits made to the module since it was
# first imported are picked up by this kernel.
importlib.reload(classifier_utils)

# Run the IGAR tuning for every project over the configured n bounds.
# NOTE(review): the meaning of the trailing positional True is defined by
# classifier_utils.IGAR_tuning — confirm it and consider passing it by keyword.
tuning_results = classifier_utils.IGAR_tuning(
    rf,
    projects,
    non_features_columns,
    min_n,
    max_n,
    True,
)

In [7]:
# Display the raw tuning results (one row per project and n).
tuning_results

Unnamed: 0,project,n,accuracy,accuracy_selected,orig_attr
0,Ramblurr__Anki-Android,0,0.725955,0.725955,129
1,apache__directory-server,0,0.934074,0.934074,96
2,android__platform_frameworks_base,0,0.817104,0.817104,566
3,freenet__fred,0,0.671948,0.671948,134
4,alexo__wro4j,0,0.589848,0.589848,107
...,...,...,...,...,...
1695,apache__accumulo,84,0.864047,0.862141,137
1696,Unidata__thredds,84,0.919109,0.911718,99
1697,jgralab__jgralab,84,0.865157,0.867371,98
1698,sebastianbenz__Jnario,84,0.999021,0.999021,84


In [8]:
import importlib

# Reload classifier_utils so that edits made to the module since it was
# first imported are picked up by this kernel.
importlib.reload(classifier_utils)

# Aggregate the raw tuning results into one summary row per n.
summary = classifier_utils.compute_IGAR_tuning_summary(
    tuning_results,
    min_n,
    max_n,
)

In [9]:
# Display the per-n summary (average accuracy, improvement, ranking and wins).
summary

Unnamed: 0,n,average_default_accuracy,average_accuracy,improvement,average_ranking,number_wins
0,0,0.80223,0.802230,0.000000,23.025,6
1,1,0.80223,0.686854,-0.143820,71.275,1
2,2,0.80223,0.717859,-0.105171,73.375,0
3,3,0.80223,0.722035,-0.099965,73.450,0
4,4,0.80223,0.730357,-0.089592,73.925,0
...,...,...,...,...,...,...
80,80,0.80223,0.799686,-0.003172,25.225,0
81,81,0.80223,0.800445,-0.002226,19.850,0
82,82,0.80223,0.801590,-0.000798,18.950,0
83,83,0.80223,0.800384,-0.002301,22.125,1


In [10]:
# Rank the candidate n values: most wins first, then highest average accuracy,
# then lowest average ranking. A lower average_ranking is better, so that
# tie-breaker must sort ascending while the other two keys sort descending.
summary.sort_values(
    ['number_wins', 'average_accuracy', 'average_ranking'],
    ascending=[False, False, True],
)

Unnamed: 0,n,average_default_accuracy,average_accuracy,improvement,average_ranking,number_wins
0,0,0.80223,0.802230,0.000000,23.025,6
73,73,0.80223,0.800398,-0.002284,20.900,3
84,84,0.80223,0.800824,-0.001753,23.825,1
71,71,0.80223,0.800623,-0.002003,20.875,1
74,74,0.80223,0.800413,-0.002266,21.600,1
...,...,...,...,...,...,...
6,6,0.80223,0.742538,-0.074408,71.975,0
5,5,0.80223,0.730643,-0.089235,73.625,0
4,4,0.80223,0.730357,-0.089592,73.925,0
3,3,0.80223,0.722035,-0.099965,73.450,0


In [11]:
# Order the candidate n values by average ranking, lowest (best) first.
# NOTE(review): every key sorts ascending, including the tie-breakers
# number_wins and average_accuracy, so on equal rankings the n with fewer
# wins is listed first — confirm this is intended.
summary.sort_values(
    by=['average_ranking', 'number_wins', 'average_accuracy'],
)

Unnamed: 0,n,average_default_accuracy,average_accuracy,improvement,average_ranking,number_wins
82,82,0.80223,0.801590,-0.000798,18.950,0
81,81,0.80223,0.800445,-0.002226,19.850,0
76,76,0.80223,0.800799,-0.001784,19.900,0
78,78,0.80223,0.800931,-0.001620,20.200,0
79,79,0.80223,0.800455,-0.002212,20.875,0
...,...,...,...,...,...,...
6,6,0.80223,0.742538,-0.074408,71.975,0
2,2,0.80223,0.717859,-0.105171,73.375,0
3,3,0.80223,0.722035,-0.099965,73.450,0
5,5,0.80223,0.730643,-0.089235,73.625,0
