Goal of this notebook:

Investigate how removing groups of attributes related to the conflicting chunks affects the classifier's accuracy.

Attribute groups:

- Authorship: self_conflict_perc, authors involved
- Complexity: leftCC, rightCC
- Size: chunkAbsSize, chunkRelSize, chunk_left_abs_size, chunk_left_rel_size, chunk_right_abs_size, chunk_right_rel_size
- Position: chunkPosition
- Content: Programming language constructs


In [1]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd
# `display` is imported from its public location; the private
# `IPython.core.display` path has been deprecated since IPython 7.14.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier

# Project-local helpers
import classifier_utils
import configs

# Show every column when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Silence UserWarning messages for the rest of the notebook. Installed after the
# imports above, matching the original cell order.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns handed to classifier_utils.ProjectsResults as "non-feature" columns
# in the cells below (identifiers, locations and commit hashes of a chunk).
non_features_columns = [
    # chunk identification / location inside the file
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # repository, file and commit identifiers
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset (path is relative to this notebook's directory).
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")

# Distinct values of the 'project' column, in order of first appearance.
projects = [*selected_dataset["project"].unique()]

In [4]:
# Random forest classifier passed to every ProjectsResults run in this notebook.
# The seed is fixed (random_state=99) and up to 5 parallel jobs are used.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

In [5]:
# Baseline run: same classifier, project list and non-feature columns as the
# ablation runs below, but without the ablation arguments.
default_result = classifier_utils.ProjectsResults(rf, projects, non_features_columns)
# Report table for the baseline; include_overall=True is passed so that an
# aggregate row is requested too (NOTE(review): presumably the 'Overall' row
# selected later in this notebook; confirm in classifier_utils).
report_default = default_result.get_report_df(include_overall=True)

## Drop all chunk related columns

In [6]:
# Reload classifier_utils so edits made to the module since it was first
# imported are picked up by this run.
import importlib
importlib.reload(classifier_utils)

# Ablation run for the 'all' group. Per the section heading this drops every
# chunk-related column; the column selection itself lives in classifier_utils.
ablation_group = 'all'
all_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation_group=ablation_group,
    ablation=True,
)

In [7]:
# Report table for the 'all' ablation run, requested with include_overall=True.
report_all_columns = all_columns_result.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table.
report_all_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.973,0.978,0.975,0.978,0.573,0.948
1,apache__directory-server,845,652,0.922,0.923,0.922,0.923,0.512,0.843
2,Ramblurr__Anki-Android,892,759,0.681,0.706,0.691,0.706,0.439,0.477
3,apache__accumulo,4113,3148,0.788,0.797,0.792,0.797,0.635,0.444
4,jgralab__jgralab,2072,1802,0.709,0.713,0.709,0.713,0.491,0.435
5,getrailo__railo,815,572,0.621,0.633,0.625,0.633,0.378,0.41
6,apache__lucene-solr,1256,974,0.552,0.561,0.556,0.561,0.266,0.401
7,zkoss__zk,1087,881,0.75,0.759,0.754,0.759,0.602,0.396
8,freenet__fred,1268,1012,0.627,0.638,0.632,0.638,0.404,0.393
9,CloudStack-extras__CloudStack-archive,1424,1106,0.606,0.62,0.612,0.62,0.437,0.326


## Drop chunk authorship related columns

In [8]:
# Reload classifier_utils so edits made to the module since it was first
# imported are picked up by this run.
import importlib
importlib.reload(classifier_utils)

# Ablation run for the 'authorship' group. Per the section heading this drops
# the authorship-related columns; the selection itself lives in classifier_utils.
ablation_group = 'authorship'
authorship_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation_group=ablation_group,
    ablation=True,
)

In [9]:
# Report table for the 'authorship' ablation run, requested with include_overall=True.
report_authorship_columns = authorship_columns_result.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table.
report_authorship_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.973,0.976,0.573,0.943
1,apache__directory-server,845,652,0.931,0.934,0.932,0.934,0.512,0.865
2,jgralab__jgralab,2072,1802,0.859,0.861,0.859,0.861,0.491,0.727
3,CloudStack-extras__CloudStack-archive,1424,1106,0.796,0.802,0.797,0.802,0.437,0.648
4,apache__accumulo,4113,3148,0.852,0.857,0.854,0.857,0.635,0.609
5,Unidata__thredds,1154,950,0.902,0.909,0.903,0.909,0.777,0.594
6,getrailo__railo,815,572,0.696,0.712,0.702,0.712,0.378,0.537
7,Ramblurr__Anki-Android,892,759,0.714,0.739,0.72,0.739,0.439,0.535
8,apache__lucene-solr,1256,974,0.629,0.636,0.63,0.636,0.266,0.503
9,TeamDev-Ltd__OpenFaces,2979,2859,0.966,0.969,0.967,0.969,0.938,0.494


## Drop chunk complexity related columns

In [10]:
# Reload classifier_utils so edits made to the module since it was first
# imported are picked up by this run.
import importlib
importlib.reload(classifier_utils)

# Ablation run for the 'complexity' group. Per the section heading this drops
# the complexity-related columns; the selection itself lives in classifier_utils.
ablation_group = 'complexity'
complexity_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation_group=ablation_group,
    ablation=True,
)

In [11]:
# Report table for the 'complexity' ablation run, requested with include_overall=True.
report_complexity_columns = complexity_columns_result.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table.
report_complexity_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.974,0.976,0.573,0.943
1,apache__directory-server,845,652,0.936,0.94,0.938,0.94,0.512,0.877
2,jgralab__jgralab,2072,1802,0.867,0.869,0.867,0.869,0.491,0.743
3,CloudStack-extras__CloudStack-archive,1424,1106,0.799,0.804,0.8,0.804,0.437,0.652
4,Unidata__thredds,1154,950,0.916,0.92,0.916,0.92,0.777,0.642
5,apache__accumulo,4113,3148,0.855,0.861,0.857,0.861,0.635,0.618
6,Ramblurr__Anki-Android,892,759,0.719,0.74,0.723,0.74,0.439,0.538
7,getrailo__railo,815,572,0.691,0.706,0.697,0.706,0.378,0.528
8,apache__lucene-solr,1256,974,0.637,0.644,0.638,0.644,0.266,0.515
9,TeamDev-Ltd__OpenFaces,2979,2859,0.965,0.969,0.966,0.969,0.938,0.489


## Drop chunk size related columns

In [12]:
# Reload classifier_utils so edits made to the module since it was first
# imported are picked up by this run.
import importlib
importlib.reload(classifier_utils)

# Ablation run for the 'size' group. Per the section heading this drops the
# size-related columns; the selection itself lives in classifier_utils.
ablation_group = 'size'
size_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation_group=ablation_group,
    ablation=True,
)

In [13]:
# Report table for the 'size' ablation run, requested with include_overall=True.
report_size_columns = size_columns_result.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table.
report_size_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.977,0.978,0.977,0.978,0.573,0.949
1,apache__directory-server,845,652,0.945,0.946,0.945,0.946,0.512,0.89
2,jgralab__jgralab,2072,1802,0.866,0.868,0.866,0.868,0.491,0.742
3,CloudStack-extras__CloudStack-archive,1424,1106,0.796,0.801,0.797,0.801,0.437,0.647
4,apache__accumulo,4113,3148,0.848,0.858,0.853,0.858,0.635,0.611
5,Unidata__thredds,1154,950,0.893,0.902,0.897,0.902,0.777,0.561
6,Ramblurr__Anki-Android,892,759,0.712,0.735,0.72,0.735,0.439,0.528
7,apache__lucene-solr,1256,974,0.622,0.631,0.626,0.631,0.266,0.498
8,TeamDev-Ltd__OpenFaces,2979,2859,0.964,0.967,0.966,0.967,0.938,0.472
9,getrailo__railo,815,572,0.644,0.659,0.651,0.659,0.378,0.452


## Drop chunk position related columns

In [14]:
# Reload classifier_utils so edits made to the module since it was first
# imported are picked up by this run.
import importlib
importlib.reload(classifier_utils)

# Ablation run for the 'position' group. Per the section heading this drops the
# position-related columns; the selection itself lives in classifier_utils.
ablation_group = 'position'
position_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation_group=ablation_group,
    ablation=True,
)

In [15]:
# Report table for the 'position' ablation run, requested with include_overall=True.
report_position_columns = position_columns_result.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table.
report_position_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.975,0.973,0.975,0.573,0.941
1,apache__directory-server,845,652,0.937,0.94,0.939,0.94,0.512,0.877
2,jgralab__jgralab,2072,1802,0.863,0.865,0.863,0.865,0.491,0.735
3,CloudStack-extras__CloudStack-archive,1424,1106,0.8,0.806,0.8,0.806,0.437,0.655
4,Unidata__thredds,1154,950,0.913,0.919,0.914,0.919,0.777,0.637
5,apache__accumulo,4113,3148,0.857,0.864,0.86,0.864,0.635,0.627
6,getrailo__railo,815,572,0.709,0.722,0.713,0.722,0.378,0.553
7,Ramblurr__Anki-Android,892,759,0.715,0.74,0.722,0.74,0.439,0.538
8,apache__lucene-solr,1256,974,0.633,0.641,0.635,0.641,0.266,0.51
9,TeamDev-Ltd__OpenFaces,2979,2859,0.966,0.969,0.967,0.969,0.938,0.494


## Drop chunk content related columns

In [16]:
# Reload classifier_utils so edits made to the module since it was first
# imported are picked up by this run.
import importlib
importlib.reload(classifier_utils)

# Ablation run for the 'content' group. Per the section heading this drops the
# content-related columns; the selection itself lives in classifier_utils.
ablation_group = 'content'
content_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation_group=ablation_group,
    ablation=True,
)

In [17]:
# Report table for the 'content' ablation run, requested with include_overall=True.
report_content_columns = content_columns_result.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table.
report_content_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.97,0.973,0.971,0.973,0.573,0.938
1,apache__directory-server,845,652,0.92,0.923,0.921,0.923,0.512,0.843
2,jgralab__jgralab,2072,1802,0.855,0.857,0.855,0.857,0.491,0.72
3,CloudStack-extras__CloudStack-archive,1424,1106,0.802,0.807,0.802,0.807,0.437,0.658
4,apache__accumulo,4113,3148,0.85,0.858,0.853,0.858,0.635,0.612
5,Unidata__thredds,1154,950,0.901,0.907,0.902,0.907,0.777,0.585
6,Ramblurr__Anki-Android,892,759,0.718,0.743,0.725,0.743,0.439,0.542
7,getrailo__railo,815,572,0.692,0.708,0.699,0.708,0.378,0.531
8,apache__lucene-solr,1256,974,0.633,0.638,0.634,0.638,0.266,0.506
9,freenet__fred,1268,1012,0.66,0.671,0.664,0.671,0.404,0.448


In [18]:
def _tag_report(report, tag):
    """Suffix every column of `report` with ``_<tag>`` while keeping the join key named 'project'."""
    return report.add_suffix("_" + tag).rename(columns={"project_" + tag: "project"})


# One suffixed copy of each ablation report, so metric columns stay
# distinguishable after they are joined side by side.
df_inner_rf_authorship = _tag_report(report_authorship_columns, "author")
df_inner_rf_complexity = _tag_report(report_complexity_columns, "complexity")
df_inner_rf_size = _tag_report(report_size_columns, "size")
df_inner_rf_position = _tag_report(report_position_columns, "position")
df_inner_rf_content = _tag_report(report_content_columns, "content")

# Start from default vs. 'all' (suffixes applied by the merge itself), then
# inner-join each suffixed report on 'project' in the original order.
df_inner_rf = pd.merge(report_default, report_all_columns, on='project', how='inner', suffixes=('_default', '_all'))
for _tagged_report in (
    df_inner_rf_authorship,
    df_inner_rf_complexity,
    df_inner_rf_size,
    df_inner_rf_position,
    df_inner_rf_content,
):
    df_inner_rf = pd.merge(df_inner_rf, _tagged_report, on='project', how='inner')

# Keep only the project name and the accuracy columns of every run.
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_all,accuracy_author,accuracy_complexity,accuracy_size,accuracy_position,accuracy_content
0,CCI-MIT__XCoLab,0.976,0.978,0.976,0.976,0.978,0.975,0.973
1,apache__directory-server,0.942,0.923,0.934,0.94,0.946,0.94,0.923
2,jgralab__jgralab,0.87,0.713,0.861,0.869,0.868,0.865,0.857
3,CloudStack-extras__CloudStack-archive,0.804,0.62,0.802,0.804,0.801,0.806,0.807
4,Unidata__thredds,0.921,0.822,0.909,0.92,0.902,0.919,0.907
5,apache__accumulo,0.863,0.797,0.857,0.861,0.858,0.864,0.858
6,Ramblurr__Anki-Android,0.736,0.706,0.739,0.74,0.735,0.74,0.743
7,apache__lucene-solr,0.65,0.561,0.636,0.644,0.631,0.641,0.638
8,getrailo__railo,0.703,0.633,0.712,0.706,0.659,0.722,0.708
9,TeamDev-Ltd__OpenFaces,0.97,0.952,0.969,0.969,0.967,0.969,0.965


In [19]:
# Work on an independent copy of the aggregate 'Overall' row only.
_is_overall = accuracy_rf['project'] == 'Overall'
accuracy_rf_improv = accuracy_rf[_is_overall].copy()

# For every ablation group, add an 'improv._<group>' column computed by
# classifier_utils.get_normalized_improvement from that group's accuracy and
# the default run's accuracy (same argument order for every group).
for _group in ('all', 'author', 'complexity', 'size', 'position', 'content'):
    accuracy_rf_improv['improv._' + _group] = accuracy_rf_improv.apply(
        lambda row, _col='accuracy_' + _group: classifier_utils.get_normalized_improvement(
            row[_col], row['accuracy_default']
        ),
        axis=1,
    )

# Transposed so each metric is shown on its own row.
accuracy_rf_improv.T

Unnamed: 0,20
project,Overall
accuracy_default,0.80535
accuracy_all,0.7482
accuracy_author,0.80085
accuracy_complexity,0.80465
accuracy_size,0.79625
accuracy_position,0.805
accuracy_content,0.79785
improv._all,-0.070963
improv._author,-0.005588
