Goal of this notebook:

Investigate the impact on the classifier accuracy of removing groups of attributes related to the conflicting chunks.

Attribute groups:

- Authorship: self_conflict_perc, authors involved
- Complexity: leftCC, rightCC
- Size: chunkAbsSize, chunkRelSize, chunk_left_abs_size, chunk_left_rel_size, chunk_right_abs_size, chunk_right_rel_size
- Position: chunkPosition
- Content: Programming language constructs


In [1]:
import importlib
import warnings

import numpy as np
import pandas as pd
# NOTE(review): this import path is deprecated in recent IPython releases; prefer `from IPython.display import display`.
from IPython.core.display import display
from sklearn.ensemble import RandomForestClassifier

import classifier_utils
import configs

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns handed to classifier_utils.ProjectsResults as `non_features_columns`
# (identifiers, locations and commit hashes; presumably excluded from the model inputs).
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset, then collect the distinct values of its 'project' column
# in order of first appearance.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = [*selected_dataset["project"].unique()]

In [4]:
# Random forest estimator reused by the baseline run and by every ablation run below.
# random_state is fixed so that repeated executions build the same forest.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

In [5]:
# Baseline run: no ablation arguments are passed, so presumably every attribute group is used.
default_result = classifier_utils.ProjectsResults(rf, projects, non_features_columns)
# include_overall=True presumably adds the aggregated "Overall" row that a later cell filters on
# — confirm in classifier_utils.
report_default = default_result.get_report_df(include_overall=True)

## Drop all chunk related columns

In [6]:
# Re-execute classifier_utils so edits made to the module since it was imported are picked up.
# (importlib is imported once in the notebook's import cell.)
importlib.reload(classifier_utils)

# Ablation run for the 'all' group: all chunk-related attribute groups are dropped.
ablation_group = 'all'
all_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
)

In [7]:
# Metrics table for the run with all chunk-related attribute groups removed; displayed as the cell output.
report_all_columns = all_columns_result.get_report_df(include_overall=True)
report_all_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.973,0.978,0.975,0.978,0.573,0.948
1,apache__directory-server,845,652,0.922,0.923,0.922,0.923,0.512,0.843
2,Ramblurr__Anki-Android,892,759,0.681,0.706,0.691,0.706,0.439,0.477
3,apache__accumulo,4113,3148,0.788,0.797,0.792,0.797,0.635,0.444
4,jgralab__jgralab,2072,1802,0.709,0.713,0.709,0.713,0.491,0.435
5,getrailo__railo,815,572,0.621,0.633,0.625,0.633,0.378,0.41
6,apache__lucene-solr,1256,974,0.552,0.561,0.556,0.561,0.266,0.401
7,zkoss__zk,1087,881,0.75,0.759,0.754,0.759,0.602,0.396
8,freenet__fred,1268,1012,0.627,0.638,0.632,0.638,0.404,0.393
9,CloudStack-extras__CloudStack-archive,1424,1106,0.606,0.62,0.612,0.62,0.437,0.326


## Drop chunk authorship related columns

In [8]:
# Re-execute classifier_utils so edits made to the module since it was imported are picked up.
# (importlib is imported once in the notebook's import cell.)
importlib.reload(classifier_utils)

# Ablation run for the 'authorship' group.
ablation_group = 'authorship'
authorship_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
)

In [9]:
# Metrics table for the run without the authorship attribute group; displayed as the cell output.
report_authorship_columns = authorship_columns_result.get_report_df(include_overall=True)
report_authorship_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.974,0.976,0.573,0.944
1,apache__directory-server,845,652,0.929,0.933,0.931,0.933,0.512,0.862
2,jgralab__jgralab,2072,1802,0.859,0.861,0.859,0.861,0.491,0.727
3,CloudStack-extras__CloudStack-archive,1424,1106,0.792,0.798,0.794,0.798,0.437,0.642
4,apache__accumulo,4113,3148,0.852,0.857,0.853,0.857,0.635,0.607
5,Unidata__thredds,1154,950,0.897,0.905,0.899,0.905,0.777,0.575
6,getrailo__railo,815,572,0.694,0.71,0.7,0.71,0.378,0.534
7,Ramblurr__Anki-Android,892,759,0.709,0.734,0.716,0.734,0.439,0.526
8,apache__lucene-solr,1256,974,0.629,0.639,0.632,0.639,0.266,0.508
9,TeamDev-Ltd__OpenFaces,2979,2859,0.967,0.97,0.968,0.97,0.938,0.506


## Drop chunk complexity related columns

In [10]:
# Re-execute classifier_utils so edits made to the module since it was imported are picked up.
# (importlib is imported once in the notebook's import cell.)
importlib.reload(classifier_utils)

# Ablation run for the 'complexity' group.
ablation_group = 'complexity'
complexity_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
)

In [11]:
# Metrics table for the run without the complexity attribute group; displayed as the cell output.
report_complexity_columns = complexity_columns_result.get_report_df(include_overall=True)
report_complexity_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.974,0.976,0.573,0.943
1,apache__directory-server,845,652,0.935,0.939,0.936,0.939,0.512,0.874
2,jgralab__jgralab,2072,1802,0.867,0.869,0.867,0.869,0.491,0.743
3,CloudStack-extras__CloudStack-archive,1424,1106,0.798,0.805,0.799,0.805,0.437,0.653
4,Unidata__thredds,1154,950,0.913,0.919,0.914,0.919,0.777,0.637
5,apache__accumulo,4113,3148,0.857,0.862,0.858,0.862,0.635,0.622
6,getrailo__railo,815,572,0.694,0.71,0.7,0.71,0.378,0.534
7,Ramblurr__Anki-Android,892,759,0.713,0.735,0.718,0.735,0.439,0.528
8,apache__lucene-solr,1256,974,0.641,0.648,0.641,0.648,0.266,0.52
9,TeamDev-Ltd__OpenFaces,2979,2859,0.966,0.969,0.967,0.969,0.938,0.494


## Drop chunk size related columns

In [12]:
# Re-execute classifier_utils so edits made to the module since it was imported are picked up.
# (importlib is imported once in the notebook's import cell.)
importlib.reload(classifier_utils)

# Ablation run for the 'size' group.
ablation_group = 'size'
size_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
)

In [13]:
# Metrics table for the run without the size attribute group; displayed as the cell output.
report_size_columns = size_columns_result.get_report_df(include_overall=True)
report_size_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.977,0.978,0.977,0.978,0.573,0.95
1,apache__directory-server,845,652,0.943,0.945,0.944,0.945,0.512,0.887
2,jgralab__jgralab,2072,1802,0.859,0.862,0.859,0.862,0.491,0.728
3,CloudStack-extras__CloudStack-archive,1424,1106,0.791,0.797,0.793,0.797,0.437,0.639
4,apache__accumulo,4113,3148,0.849,0.859,0.853,0.859,0.635,0.614
5,Unidata__thredds,1154,950,0.889,0.899,0.893,0.899,0.777,0.547
6,Ramblurr__Anki-Android,892,759,0.718,0.742,0.726,0.742,0.439,0.54
7,apache__lucene-solr,1256,974,0.623,0.634,0.628,0.634,0.266,0.502
8,TeamDev-Ltd__OpenFaces,2979,2859,0.964,0.968,0.966,0.968,0.938,0.477
9,getrailo__railo,815,572,0.651,0.664,0.657,0.664,0.378,0.461


## Drop chunk position related columns

In [14]:
# Re-execute classifier_utils so edits made to the module since it was imported are picked up.
# (importlib is imported once in the notebook's import cell.)
importlib.reload(classifier_utils)

# Ablation run for the 'position' group.
ablation_group = 'position'
position_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
)

In [15]:
# Metrics table for the run without the position attribute group; displayed as the cell output.
report_position_columns = position_columns_result.get_report_df(include_overall=True)
report_position_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.974,0.976,0.573,0.943
1,apache__directory-server,845,652,0.933,0.937,0.935,0.937,0.512,0.871
2,jgralab__jgralab,2072,1802,0.862,0.865,0.862,0.865,0.491,0.734
3,CloudStack-extras__CloudStack-archive,1424,1106,0.799,0.805,0.8,0.805,0.437,0.653
4,Unidata__thredds,1154,950,0.917,0.922,0.918,0.922,0.777,0.651
5,apache__accumulo,4113,3148,0.857,0.862,0.859,0.862,0.635,0.623
6,Ramblurr__Anki-Android,892,759,0.725,0.75,0.731,0.75,0.439,0.554
7,getrailo__railo,815,572,0.695,0.71,0.701,0.71,0.378,0.534
8,apache__lucene-solr,1256,974,0.636,0.642,0.637,0.642,0.266,0.512
9,TeamDev-Ltd__OpenFaces,2979,2859,0.966,0.969,0.967,0.969,0.938,0.5


## Drop chunk content related columns

In [16]:
# Re-execute classifier_utils so edits made to the module since it was imported are picked up.
# (importlib is imported once in the notebook's import cell.)
importlib.reload(classifier_utils)

# Ablation run for the 'content' group.
ablation_group = 'content'
content_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
)

In [17]:
# Metrics table for the run without the content attribute group; displayed as the cell output.
report_content_columns = content_columns_result.get_report_df(include_overall=True)
report_content_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.97,0.974,0.972,0.974,0.573,0.939
1,apache__directory-server,845,652,0.918,0.922,0.92,0.922,0.512,0.84
2,jgralab__jgralab,2072,1802,0.855,0.857,0.855,0.857,0.491,0.72
3,CloudStack-extras__CloudStack-archive,1424,1106,0.806,0.811,0.807,0.811,0.437,0.665
4,apache__accumulo,4113,3148,0.849,0.858,0.853,0.858,0.635,0.612
5,Unidata__thredds,1154,950,0.902,0.908,0.904,0.908,0.777,0.59
6,Ramblurr__Anki-Android,892,759,0.722,0.747,0.729,0.747,0.439,0.549
7,getrailo__railo,815,572,0.687,0.701,0.693,0.701,0.378,0.52
8,apache__lucene-solr,1256,974,0.628,0.633,0.629,0.633,0.266,0.501
9,freenet__fred,1268,1012,0.663,0.675,0.666,0.675,0.404,0.454


In [18]:
# Give every single-group ablation report its own column suffix, restoring the shared
# 'project' key so the reports can be joined side by side.
df_inner_rf_authorship = (
    report_authorship_columns.add_suffix("_author").rename(columns={"project_author": "project"})
)
df_inner_rf_complexity = (
    report_complexity_columns.add_suffix("_complexity").rename(columns={"project_complexity": "project"})
)
df_inner_rf_size = (
    report_size_columns.add_suffix("_size").rename(columns={"project_size": "project"})
)
df_inner_rf_position = (
    report_position_columns.add_suffix("_position").rename(columns={"project_position": "project"})
)
df_inner_rf_content = (
    report_content_columns.add_suffix("_content").rename(columns={"project_content": "project"})
)

# Inner-join all reports on 'project': default vs. 'all' first (suffixed by the merge itself),
# then each pre-suffixed single-group report in turn.
df_inner_rf = (
    report_default
    .merge(report_all_columns, on='project', how='inner', suffixes=('_default', '_all'))
    .merge(df_inner_rf_authorship, on='project', how='inner')
    .merge(df_inner_rf_complexity, on='project', how='inner')
    .merge(df_inner_rf_size, on='project', how='inner')
    .merge(df_inner_rf_position, on='project', how='inner')
    .merge(df_inner_rf_content, on='project', how='inner')
)

# Keep only the project key and the accuracy columns of each run.
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_all,accuracy_author,accuracy_complexity,accuracy_size,accuracy_position,accuracy_content
0,CCI-MIT__XCoLab,0.976,0.978,0.976,0.976,0.978,0.976,0.974
1,apache__directory-server,0.937,0.923,0.933,0.939,0.945,0.937,0.922
2,jgralab__jgralab,0.866,0.713,0.861,0.869,0.862,0.865,0.857
3,CloudStack-extras__CloudStack-archive,0.806,0.62,0.798,0.805,0.797,0.805,0.811
4,apache__accumulo,0.863,0.797,0.857,0.862,0.859,0.862,0.858
5,Unidata__thredds,0.916,0.822,0.905,0.919,0.899,0.922,0.908
6,Ramblurr__Anki-Android,0.742,0.706,0.734,0.735,0.742,0.75,0.747
7,getrailo__railo,0.712,0.633,0.71,0.71,0.664,0.71,0.701
8,apache__lucene-solr,0.646,0.561,0.639,0.648,0.634,0.642,0.633
9,TeamDev-Ltd__OpenFaces,0.969,0.952,0.97,0.969,0.968,0.969,0.966


In [19]:
# Work on a copy of the "Overall" row only.
accuracy_rf_improv = accuracy_rf.loc[accuracy_rf['project'] == 'Overall'].copy()

# For each ablated group, add an 'improv._<group>' column computed by
# classifier_utils.get_normalized_improvement from that group's accuracy and the default accuracy.
for _group in ('all', 'author', 'complexity', 'size', 'position', 'content'):
    accuracy_rf_improv['improv._' + _group] = accuracy_rf_improv.apply(
        lambda row, _col='accuracy_' + _group: classifier_utils.get_normalized_improvement(
            row[_col], row['accuracy_default']
        ),
        axis=1,
    )

# Show metrics as rows for easier reading.
accuracy_rf_improv.T

Unnamed: 0,23
project,Overall
accuracy_default,0.80485
accuracy_all,0.7482
accuracy_author,0.79905
accuracy_complexity,0.80445
accuracy_size,0.7957
accuracy_position,0.80355
accuracy_content,0.79855
improv._all,-0.0703858
improv._author,-0.00720631
