Goal of this notebook:

Tune the n parameter of the IGAR (Information Gain Attribute Ranking) attribute selection algorithm.

Average number of attributes among the projects: 169.16

Minimum number of attributes among the projects: 84


## Tuning strategy

### Collect accuracy for each n

- Vary the n parameter from 1 to 84 (the minimum number of attributes among the projects)
- For each n:
	- For each project:
		- Execute the IGAR using n
		- Execute the prediction (calculate accuracy and normalized improvement)


### Calculate the number of wins for each n

- For each project:
	- Check which n achieved the best accuracy; that n is credited with one win

IGAR reference: https://www.cs.waikato.ac.nz/~mhall/HallHolmesTKDE.pdf

In [1]:
import importlib
import warnings

import numpy as np
import pandas as pd
# NOTE(review): IPython documents `IPython.display` as the public import
# location for `display`; consider switching this import.
from IPython.core.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

import classifier_utils

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore", category=UserWarning)

# Kept after the warnings filter to preserve the original execution order.
import configs

In [2]:
# Dataset columns treated as non-features (identifiers, locations and commit
# hashes, judging by their names); handed to the IGAR tuning helper later on.
non_features_columns = [
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [3]:
# Read the labelled dataset from its location relative to this notebook.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")

# Distinct values of the 'project' column, in order of first appearance.
projects = [*selected_dataset['project'].unique()]

In [23]:
# Classifier instance passed to the IGAR tuning helper; random_state is pinned
# so repeated runs use the same seed.
rf = RandomForestClassifier(
    n_estimators=100,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

In [75]:
# Bounds of the n values explored by the tuning run (the recorded summary
# output lists n = 40 through 45).
min_n, max_n = 40, 45

In [70]:
# Re-import classifier_utils so edits to the module are picked up without
# restarting the kernel (importlib is imported in the top import cell).
importlib.reload(classifier_utils)

# Run the IGAR tuning helper on the first six projects only, for the n range
# [min_n, max_n]; the result is one row per (project, n) in the recorded output.
# NOTE(review): the trailing positional `True` is an option of IGAR_tuning whose
# meaning is not visible from this notebook — confirm and pass it by keyword.
tuning_results = classifier_utils.IGAR_tuning(rf, projects[:6], non_features_columns, min_n, max_n, True)

In [71]:
# Show the raw tuning results; the recorded output has one row per (project, n)
# with the baseline accuracy, the accuracy after selection, and the original attribute count.
tuning_results

Unnamed: 0,project,n,accuracy,accuracy_selected,orig_attr
0,Ramblurr__Anki-Android,40,0.740439,0.733842,129
1,apache__directory-server,40,0.938625,0.92331,96
2,android__platform_frameworks_base,40,0.818293,0.808943,566
3,freenet__fred,40,0.678839,0.656067,134
4,alexo__wro4j,40,0.585525,0.573819,107
5,apache__lucene-solr,40,0.649947,0.624258,124
6,Ramblurr__Anki-Android,41,0.740439,0.728579,129
7,apache__directory-server,41,0.938625,0.924825,96
8,android__platform_frameworks_base,41,0.818293,0.814228,566
9,freenet__fred,41,0.678839,0.660037,134


In [97]:
# Re-import classifier_utils so edits to the module are picked up without
# restarting the kernel (importlib is imported in the top import cell).
importlib.reload(classifier_utils)

# Aggregate the per-project results into one row per n; the recorded output
# shows average accuracies, improvement, average ranking and number of wins.
classifier_utils.compute_IGAR_tuning_summary(tuning_results, min_n, max_n)

Unnamed: 0,n,average_default_accuracy,average_accuracy,improvement,average_ranking,number_wins
0,40,0.735278,0.72004,-0.020724,1.088777,0
1,41,0.735278,0.721916,-0.018172,0.969886,0
2,42,0.735278,0.723226,-0.016391,0.772827,0
3,43,0.735278,0.725174,-0.013742,0.683278,1
4,44,0.735278,0.726596,-0.011808,0.371399,3
5,45,0.735278,0.725458,-0.013356,0.313743,2
