Goal of this notebook:

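Tune the `n` parameter (the number of attributes to select) of the IGAR (Information Gain Attribute Ranking) attribute selection algorithm. Dataset statistics that bound the search: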

Average number of attributes among the projects: 169.16

Minimum number of attributes among the projects: 84


## Tuning strategy

### Collect accuracy for each n

- Vary the n parameter from 1 to 84 (the minimum number of attributes among the projects)
- For each n:
	- For each project:
		- Execute the IGAR using n
		- Execute the prediction (calculate accuracy and normalized improvement)


### Calculate the number of wins for each n:

- For each project:
	- Check which n achieved the best accuracy

IGAR reference (Hall & Holmes, "Benchmarking Attribute Selection Techniques for Discrete Class Data Mining"): https://www.cs.waikato.ac.nz/~mhall/HallHolmesTKDE.pdf

In [1]:
# Notebook setup: every import first (stdlib, third-party, project-local),
# followed by display and warning configuration.
import warnings

import numpy as np
import pandas as pd
# `IPython.core.display.display` is a deprecated import path; the public
# location of `display` is `IPython.display`.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

import classifier_utils
import configs

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Suppress UserWarning messages emitted from this point on.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns handed to the tuning helper as "non-feature" columns.
# NOTE(review): presumably identifiers/metadata that must not be used as
# predictive attributes — confirm against classifier_utils.
non_features_columns = [
    # chunk location and labelling metadata
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # repository, file and commit identifiers
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset, then collect the distinct values of its
# `project` column in order of first appearance.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset["project"].unique().tolist()

In [4]:
# Random forest shared by every tuning run in this notebook.
rf = RandomForestClassifier(
    n_estimators=100,      # number of trees in the forest
    max_features=0.3,      # fraction of the features considered at each split
    min_samples_leaf=1,
    n_jobs=5,              # number of parallel jobs used for fit/predict
    random_state=99,       # fixed seed so repeated runs are reproducible
)

In [9]:
# Inclusive search range for the IGAR `n` parameter. The upper bound is 84,
# the minimum number of attributes among the projects, so every tested n is
# available in every project.
min_n, max_n = 1, 84

In [70]:
import importlib

# Re-import classifier_utils so that edits made to the module after the
# kernel started are picked up before tuning.
importlib.reload(classifier_utils)

# Run the tuning over every project, as the tuning strategy prescribes
# ("For each n: for each project"). The call previously passed `projects[:6]`,
# a leftover slice that restricted the run to the first six projects only.
# NOTE(review): the meaning of the trailing `True` flag is defined by
# classifier_utils.IGAR_tuning — confirm before changing it.
tuning_results = classifier_utils.IGAR_tuning(
    rf,
    projects,
    non_features_columns,
    min_n,
    max_n,
    True,
)

In [7]:
# Display the tuning results (one row per project and n).
# NOTE(review): the saved output carries an 'Unnamed: 0' column, which usually
# means the frame was read back from a CSV that stored its index — confirm
# where the displayed frame actually came from.
tuning_results

Unnamed: 0,project,n,accuracy,accuracy_selected,orig_attr
0,Ramblurr__Anki-Android,1,0.728355,0.554788,129
1,apache__directory-server,1,0.938631,0.908143,96
2,android__platform_frameworks_base,1,0.813874,0.748824,566
3,freenet__fred,1,0.678846,0.593122,134
4,alexo__wro4j,1,0.587674,0.373409,107
...,...,...,...,...,...
2095,jgralab__jgralab,84,0.867922,0.866266,98
2096,sebastianbenz__Jnario,84,0.999021,0.999021,84
2097,CCI-MIT__XCoLab,84,0.973663,0.974995,99
2098,RealVNC__android-frameworks-base-with-screensh...,84,0.811660,0.800132,373


In [34]:
import importlib

# Pick up any edits made to classifier_utils since it was last loaded.
importlib.reload(classifier_utils)

# Aggregate the per-project tuning results into one row per n over the
# tested [min_n, max_n] range.
summary = classifier_utils.compute_IGAR_tuning_summary(
    tuning_results,
    min_n,
    max_n,
)

In [35]:
# Display the per-n summary in its original order (ascending n).
summary

Unnamed: 0,n,average_default_accuracy,average_accuracy,improvement,average_ranking,number_wins
0,1,0.807345,0.701809,-0.130719,74.08,1
1,2,0.807345,0.730314,-0.095412,73.98,0
2,3,0.807345,0.736112,-0.088231,74.12,0
3,4,0.807345,0.743390,-0.079216,73.42,0
4,5,0.807345,0.744786,-0.077487,72.14,0
...,...,...,...,...,...,...
79,80,0.807345,0.802029,-0.006585,26.80,0
80,81,0.807345,0.804984,-0.002925,17.26,2
81,82,0.807345,0.803093,-0.005267,22.22,1
82,83,0.807345,0.804021,-0.004117,19.20,0


In [36]:
# Rank the n values by number of wins, then by average accuracy (both
# higher-is-better). Remaining ties are broken by average ranking, which is
# lower-is-better and therefore sorted ascending; a single `ascending=False`
# would have preferred the worse ranking.
summary.sort_values(
    ['number_wins', 'average_accuracy', 'average_ranking'],
    ascending=[False, False, True],
)

Unnamed: 0,n,average_default_accuracy,average_accuracy,improvement,average_ranking,number_wins
80,81,0.807345,0.804984,-0.002925,17.26,2
83,84,0.807345,0.803588,-0.004653,22.60,2
76,77,0.807345,0.803557,-0.004692,23.34,2
64,65,0.807345,0.803345,-0.004954,23.46,2
74,75,0.807345,0.804131,-0.003981,18.76,1
...,...,...,...,...,...,...
5,6,0.807345,0.754070,-0.065988,71.46,0
4,5,0.807345,0.744786,-0.077487,72.14,0
3,4,0.807345,0.743390,-0.079216,73.42,0
2,3,0.807345,0.736112,-0.088231,74.12,0


In [38]:
# Rank the n values by average ranking (lower-is-better, so ascending).
# Ties are broken by number of wins and then average accuracy, both
# higher-is-better and therefore sorted descending; a single `ascending=True`
# would have preferred fewer wins and lower accuracy.
summary.sort_values(
    ['average_ranking', 'number_wins', 'average_accuracy'],
    ascending=[True, False, False],
)

Unnamed: 0,n,average_default_accuracy,average_accuracy,improvement,average_ranking,number_wins
80,81,0.807345,0.804984,-0.002925,17.26,2
74,75,0.807345,0.804131,-0.003981,18.76,1
82,83,0.807345,0.804021,-0.004117,19.20,0
68,69,0.807345,0.803846,-0.004333,19.92,1
78,79,0.807345,0.803803,-0.004387,20.84,0
...,...,...,...,...,...,...
6,7,0.807345,0.755548,-0.064157,72.40,0
3,4,0.807345,0.743390,-0.079216,73.42,0
1,2,0.807345,0.730314,-0.095412,73.98,0
0,1,0.807345,0.701809,-0.130719,74.08,1
