Goal of this notebook:

Perform feature selection on our dataset.

Strategy:

Iterate over each project and execute the feature selection

In [1]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd
# `display` is imported from its public location; importing it from
# IPython.core.display is deprecated.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# Project-local helpers used by every feature selection cell below
import classifier_utils

# Render every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Silence all warnings for the rest of the session. This runs after the
# imports so that warnings raised while importing are still shown.
warnings.filterwarnings("ignore")

In [2]:
# Columns handed to the feature selection helper as "non-feature" columns.
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset from disk.
selected_dataset = pd.read_csv(
    "../../data/SELECTED_LABELLED_DATASET.csv"
)
# Distinct values of the 'project' column, as a plain Python list.
projects = [project for project in selected_dataset['project'].unique()]

In [4]:
# Random Forest estimator shared by all three feature selection methods.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=-2,
    random_state=99,  # fixed seed
)

# Tree-based feature selection

Uses the feature_importances_ attribute from the Random Forest model to select the most important features. It uses the mean of the importances of all features as a threshold.

In [5]:
# Reload the helper module before using it.
import importlib
importlib.reload(classifier_utils)

# Run the 'tree' selection over all projects. The helper returns a pair:
# the results (displayed in the next cell) and the recorded attributes.
results_tree, attributes_record_tree = classifier_utils.projects_feature_selection(
    projects,
    non_features_columns,
    rf,
    'tree',
)

In [6]:
# Display the per-project results of the tree-based selection.
results_tree

Unnamed: 0,project,N,# attr.,# attr. fs,accuracy,accuracy_fs,improvement
0,Ramblurr__Anki-Android,759.0,129.0,33.0,0.742,0.747,0.021
1,apache__directory-server,652.0,96.0,18.0,0.937,0.929,-0.008
2,android__platform_frameworks_base,2460.0,566.0,54.0,0.817,0.815,-0.003
3,freenet__fred,1012.0,134.0,47.0,0.679,0.675,-0.006
4,alexo__wro4j,1368.0,107.0,31.0,0.584,0.582,-0.004
5,apache__lucene-solr,974.0,124.0,42.0,0.646,0.64,-0.01
6,elastic__elasticsearch,,,,,,
7,getrailo__railo,572.0,90.0,31.0,0.711,0.708,-0.005
8,atlasapi__atlas,782.0,124.0,40.0,0.673,0.647,-0.038
9,hibernate__hibernate-orm,716.0,131.0,33.0,0.601,0.583,-0.03


# Recursive feature elimination:

First, the estimator is trained on the initial set of features and the importance of each feature is obtained. Then, the least important features are pruned from the current set of features. This procedure is repeated recursively on the pruned set until the desired number of features to select is eventually reached.
Currently we discard 1 feature per step and use 5-fold cross-validation to calculate the accuracy at each step.

In [7]:
# Reload the helper module before using it.
import importlib
importlib.reload(classifier_utils)

# Run the 'recursive' selection over all projects. The helper returns a pair:
# the results (displayed in the next cell) and the recorded attributes.
results_recursive, attributes_record_rec = classifier_utils.projects_feature_selection(
    projects,
    non_features_columns,
    rf,
    'recursive',
)

In [8]:
# Display the per-project results of the recursive feature elimination.
results_recursive

Unnamed: 0,project,N,# attr.,# attr. fs,accuracy,accuracy_fs,improvement
0,Ramblurr__Anki-Android,759.0,129.0,50.0,0.742,0.744,0.01
1,apache__directory-server,652.0,96.0,80.0,0.937,0.936,-0.002
2,android__platform_frameworks_base,2460.0,566.0,324.0,0.817,0.82,0.013
3,freenet__fred,1012.0,134.0,97.0,0.679,0.674,-0.007
4,alexo__wro4j,1368.0,107.0,68.0,0.584,0.582,-0.004
5,apache__lucene-solr,974.0,124.0,81.0,0.646,0.649,0.009
6,elastic__elasticsearch,,,,,,
7,getrailo__railo,572.0,90.0,68.0,0.711,0.713,0.006
8,atlasapi__atlas,782.0,124.0,123.0,0.673,0.668,-0.008
9,hibernate__hibernate-orm,716.0,131.0,91.0,0.601,0.594,-0.012


# IGAR
Selects attributes based on the ranking of their information gain.
Information gain measures the ability of a feature to separate the target classes. The greater a feature's information gain, the more important it is for classification tasks.

Information Gain = Entropy(overall) - Entropy(attribute)

The algorithm takes an input value 'n' and selects the 'n' attributes with the greatest information gain among all attributes. In this notebook we use n = 83, which was the value found in the notebook IGAR_tuning.ipynb.

In [9]:
# Reload the helper module before using it.
import importlib
importlib.reload(classifier_utils)

# Run the 'IGAR' selection over all projects. The helper returns a pair:
# the results (displayed in the next cell) and the recorded attributes.
results_IGAR, attributes_record_IGAR = classifier_utils.projects_feature_selection(
    projects,
    non_features_columns,
    rf,
    'IGAR',
)

In [10]:
# Display the per-project results of the IGAR selection.
results_IGAR

Unnamed: 0,project,N,# attr.,# attr. fs,accuracy,accuracy_fs,improvement
0,Ramblurr__Anki-Android,759.0,129.0,82.0,0.742,0.738,-0.005
1,apache__directory-server,652.0,96.0,82.0,0.937,0.937,0.0
2,android__platform_frameworks_base,2460.0,566.0,82.0,0.817,0.816,-0.001
3,freenet__fred,1012.0,134.0,82.0,0.679,0.68,0.003
4,alexo__wro4j,1368.0,107.0,82.0,0.584,0.589,0.012
5,apache__lucene-solr,974.0,124.0,82.0,0.646,0.641,-0.008
6,elastic__elasticsearch,,,,,,
7,getrailo__railo,572.0,90.0,82.0,0.711,0.71,-0.002
8,atlasapi__atlas,782.0,124.0,82.0,0.673,0.666,-0.01
9,hibernate__hibernate-orm,716.0,131.0,82.0,0.601,0.599,-0.002


## Comparison

In [11]:
# Give the IGAR columns their own suffix but keep the join key named 'project'.
df_inner_igar = results_IGAR.add_suffix("_IGAR").rename(columns={"project_IGAR": "project"})

# Inner-join the three per-method result tables on 'project' and save the result.
df_inner = (
    results_tree
    .merge(results_recursive, on='project', how='inner', suffixes=('_tree', '_rec'))
    .merge(df_inner_igar, on='project', how='inner')
)
df_inner.to_csv('feature_selection_comparison.csv', index=False)

# Keep only the project name and the accuracy columns, then add one
# normalized-improvement column per method (tree, rec, IGAR, in that order).
accuracy_inner = df_inner.filter(regex=("project|accuracy.*")).copy()
accuracy_inner = accuracy_inner.assign(**{
    f'improvement_{suffix}': accuracy_inner.apply(
        lambda row: classifier_utils.get_normalized_improvement(
            row[f'accuracy_fs_{suffix}'], row[f'accuracy_{suffix}']
        ),
        axis=1,
    )
    for suffix in ('tree', 'rec', 'IGAR')
}).round(3)
accuracy_inner

Unnamed: 0,project,accuracy_tree,accuracy_fs_tree,accuracy_rec,accuracy_fs_rec,accuracy_IGAR,accuracy_fs_IGAR,improvement_tree,improvement_rec,improvement_IGAR
0,Ramblurr__Anki-Android,0.742,0.747,0.742,0.744,0.742,0.738,0.019,0.008,-0.005
1,apache__directory-server,0.937,0.929,0.937,0.936,0.937,0.937,-0.009,-0.001,0.0
2,android__platform_frameworks_base,0.817,0.815,0.817,0.82,0.817,0.816,-0.002,0.016,-0.001
3,freenet__fred,0.679,0.675,0.679,0.674,0.679,0.68,-0.006,-0.007,0.003
4,alexo__wro4j,0.584,0.582,0.584,0.582,0.584,0.589,-0.003,-0.003,0.012
5,apache__lucene-solr,0.646,0.64,0.646,0.649,0.646,0.641,-0.009,0.008,-0.008
6,elastic__elasticsearch,,,,,,,,,
7,getrailo__railo,0.711,0.708,0.711,0.713,0.711,0.71,-0.004,0.007,-0.001
8,atlasapi__atlas,0.673,0.647,0.673,0.668,0.673,0.666,-0.039,-0.007,-0.01
9,hibernate__hibernate-orm,0.601,0.583,0.601,0.594,0.601,0.599,-0.03,-0.012,-0.003


In [12]:
# Save the attributes recorded by the recursive method to their own CSV file.
(
    pd.DataFrame(
        attributes_record_rec,
        columns=['project', 'attribute', 'information_gain', 'method'],
    )
    .to_csv('attributes_record_rec.csv', index=False)
)

In [13]:
# Concatenate the records of the three methods (tree, recursive, IGAR) in that order.
attributes_record = [
    *attributes_record_tree,
    *attributes_record_rec,
    *attributes_record_IGAR,
]

# Tabulate the combined records and save them to disk.
attributes_record_df = pd.DataFrame(
    attributes_record,
    columns=['project', 'attribute', 'information_gain', 'method'],
)
attributes_record_df.to_csv('attributes_record.csv', index=False)

In [14]:
# Read the combined records back from disk.
# NOTE: this rebinds `attributes_record` from the list built earlier to a
# DataFrame; the ranking cells below use the DataFrame.
attributes_record = pd.read_csv('attributes_record.csv')

## Ranking of features selected by tree method

Counts the number of projects in which each feature was selected by the tree method.

In [15]:
# Build the attribute ranking for the 'tree' method.
ranking_tree = classifier_utils.get_attribute_selection_ranking(attributes_record, 'tree')

# Show the top 50 rows, ordered by average information gain, then by selection count.
(
    ranking_tree
    .sort_values(by=['average_information_gain', 'count_selected'], ascending=False)
    .head(50)
)

Unnamed: 0,attribute,count_selected,average_information_gain,average_ranking
65,steve@hibernate.org,1.0,2.069798,11.0
9,keyword_remove,11.0,1.973972,9.818182
40,keyword_document,4.0,1.8329,8.0
5,keyword_bug,12.0,1.829089,11.083333
30,Changed files 2,15.0,1.751219,5.066667
25,Merge isolation time,15.0,1.744253,3.266667
10,keyword_use,14.0,1.73728,7.857143
8,keyword_add,16.0,1.731041,5.6875
26,Different devs,11.0,1.730046,8.181818
7,keyword_update,11.0,1.722313,10.363636


## Ranking of features selected by recursive method

Counts the number of projects in which each feature was selected by the recursive method.

In [16]:
# Build the attribute ranking for the 'recursive' method.
ranking_recursive = classifier_utils.get_attribute_selection_ranking(attributes_record, 'recursive')

# Show the top 50 rows, ordered by average information gain, then by selection count.
(
    ranking_recursive
    .sort_values(by=['average_information_gain', 'count_selected'], ascending=False)
    .head(50)
)

Unnamed: 0,attribute,count_selected,average_information_gain,average_ranking
377,alex.objelean@gmail.com,1.0,2.091704,6.0
480,steve@hibernate.org,1.0,2.069798,14.0
372,toad@amphibian.dyndns.org,1.0,2.007149,13.0
26,Merge isolation time,17.0,1.740536,3.470588
25,Branching time,17.0,1.739914,3.823529
30,Commits 1,17.0,1.730373,5.823529
33,Changed files 2,17.0,1.724729,6.823529
6,keyword_bug,16.0,1.685674,12.75
20,fileSize,18.0,1.677116,1.277778
11,keyword_use,17.0,1.665944,9.0


## Ranking of features selected by IGAR method

Counts the number of projects in which each feature was selected by the IGAR method.

The information gain column is an average among all projects.

In [17]:
# Reload the helper module before using it.
import importlib
importlib.reload(classifier_utils)

# Build the attribute ranking for the 'IGAR' method.
ranking_IGAR = classifier_utils.get_attribute_selection_ranking(attributes_record, 'IGAR')

# Show the top 50 rows, ordered by average information gain, then by selection count.
(
    ranking_IGAR
    .sort_values(by=['average_information_gain', 'count_selected'], ascending=False)
    .head(50)
)

Unnamed: 0,attribute,count_selected,average_information_gain,average_ranking
107,steve@hibernate.org,1.0,2.069798,14.0
106,michael@getrailo.org,1.0,1.588456,18.0
37,fileSize,20.0,1.529837,1.75
0,chunkRelSize,20.0,1.529834,1.35
3,Merge isolation time,20.0,1.521324,3.65
5,Branching time,20.0,1.520796,3.85
19,Changed files 1,20.0,1.516113,5.15
39,fileCC,20.0,1.513428,4.45
22,Commits 1,20.0,1.512869,6.1
1,Changed files 2,20.0,1.508315,6.8


In [18]:
# Look up the IGAR ranking row(s) for the 'chunk_author' attribute.
ranking_IGAR.loc[ranking_IGAR['attribute'] == 'chunk_author']

Unnamed: 0,attribute,count_selected,average_information_gain,average_ranking
161,chunk_author,1.0,0.401176,35.050633


In [19]:
# Look up the IGAR ranking row(s) for the 'content_constructor' attribute.
# NOTE(review): the recorded output shows a fractional count_selected for this
# attribute — confirm how get_attribute_selection_ranking aggregates it.
ranking_IGAR.loc[ranking_IGAR['attribute'] == 'content_constructor']

Unnamed: 0,attribute,count_selected,average_information_gain,average_ranking
162,content_constructor,17.974359,0.093986,47.082964


In [20]:
# Show the last 10 rows of the IGAR ranking.
ranking_IGAR.tail(10)

Unnamed: 0,attribute,count_selected,average_information_gain,average_ranking
153,steverab93@gmail.com,1.0,0.571784,35.0
154,jcabota@gmail.com,1.0,0.44115,39.0
155,klemens.mang@me.com,1.0,0.32525,40.0
156,tokn136@gmail.com,1.0,0.254306,41.0
157,manuel.thurner@gmail.com,1.0,0.180391,42.0
158,carlosbpf@gmail.com,1.0,0.039896,46.0
159,collab@mit.edu,1.0,0.013671,48.0
160,janusz@janusz.(none),1.0,0.007653,53.0
161,chunk_author,1.0,0.401176,35.050633
162,content_constructor,17.974359,0.093986,47.082964
