Goal of this notebook:

Investigate the impact on the classifier accuracy of adding each group of attributes related to the conflicting chunks.

Attribute groups:

- Authorship: self_conflict_perc, authors involved
- Complexity: leftCC, rightCC
- Size: chunkAbsSize, chunkRelSize, chunk_left_abs_size, chunk_left_rel_size, chunk_right_abs_size, chunk_right_rel_size
- Position: chunkPosition
- Content: Programming language constructs


In [1]:
# Standard library
import warnings

# Third-party
import numpy as np
import pandas as pd
# Import display from the public IPython.display module; importing it from
# IPython.core.display is deprecated.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier

# Project-local modules
import classifier_utils
import configs

# Do not truncate columns when rendering wide report frames.
pd.set_option('display.max_columns', None)
# Hide UserWarning messages emitted while the experiments run.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Column names handed to classifier_utils.ProjectsResults as its
# non-feature columns (presumably excluded from the model inputs --
# confirm in classifier_utils).
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct values of its
# 'project' column; one experiment is run per project further down.
selected_dataset = pd.read_csv(
    filepath_or_buffer="../../data/SELECTED_LABELLED_DATASET.csv",
)
projects = [*selected_dataset['project'].unique()]

In [4]:
# Random forest estimator passed to every ProjectsResults run in this
# notebook. random_state is fixed so repeated runs use the same seed.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

In [5]:
# Reference run: no ablation arguments are passed, so ProjectsResults
# uses its defaults. Its report is the "_all" side of the later merge.
default_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
)
report_default = default_result.get_report_df(
    include_overall=True,
)

## Use no chunk-related columns

In [6]:
# Reload classifier_utils so the version currently on disk is the one used.
import importlib
importlib.reload(classifier_utils)

# Ablation run: group 'all' in 'remove' mode (the meaning of group and mode
# is defined by classifier_utils.ProjectsResults).
ablation_group = 'all'
none_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [7]:
# Build the report for the 'all'/'remove' run, with include_overall=True,
# and show it as the cell output.
report_none_columns = none_columns_result.get_report_df(
    include_overall=True,
)
report_none_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.973,0.978,0.975,0.978,0.573,0.948
1,apache__directory-server,845,652,0.922,0.923,0.922,0.923,0.512,0.843
2,Ramblurr__Anki-Android,892,759,0.681,0.706,0.691,0.706,0.439,0.477
3,apache__accumulo,4113,3148,0.788,0.797,0.792,0.797,0.635,0.444
4,jgralab__jgralab,2072,1802,0.709,0.713,0.709,0.713,0.491,0.435
5,getrailo__railo,815,572,0.621,0.633,0.625,0.633,0.378,0.41
6,apache__lucene-solr,1256,974,0.552,0.561,0.556,0.561,0.266,0.401
7,zkoss__zk,1087,881,0.75,0.759,0.754,0.759,0.602,0.396
8,freenet__fred,1268,1012,0.627,0.638,0.632,0.638,0.404,0.393
9,CloudStack-extras__CloudStack-archive,1424,1106,0.606,0.62,0.612,0.62,0.437,0.326


## Use chunk authorship-related columns

In [8]:
# Reload classifier_utils so the version currently on disk is the one used.
import importlib
importlib.reload(classifier_utils)

# Ablation run: group 'authorship' in 'add' mode (the meaning of group and
# mode is defined by classifier_utils.ProjectsResults).
ablation_group = 'authorship'
authorship_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='add',
)

In [9]:
# Build the report for the 'authorship' run, with include_overall=True,
# and show it as the cell output.
report_authorship_columns = authorship_columns_result.get_report_df(
    include_overall=True,
)
report_authorship_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.974,0.976,0.975,0.976,0.573,0.945
1,apache__directory-server,845,652,0.927,0.928,0.927,0.928,0.512,0.852
2,CloudStack-extras__CloudStack-archive,1424,1106,0.756,0.761,0.758,0.761,0.437,0.576
3,Ramblurr__Anki-Android,892,759,0.697,0.719,0.706,0.719,0.439,0.5
4,apache__accumulo,4113,3148,0.807,0.814,0.81,0.814,0.635,0.49
5,jgralab__jgralab,2072,1802,0.739,0.739,0.738,0.739,0.491,0.486
6,apache__lucene-solr,1256,974,0.589,0.595,0.592,0.595,0.266,0.449
7,getrailo__railo,815,572,0.634,0.645,0.638,0.645,0.378,0.43
8,freenet__fred,1268,1012,0.649,0.657,0.652,0.657,0.404,0.425
9,zkoss__zk,1087,881,0.759,0.77,0.764,0.77,0.602,0.422


## Use chunk complexity-related columns

In [10]:
# Reload classifier_utils so the version currently on disk is the one used.
import importlib
importlib.reload(classifier_utils)

# Ablation run: group 'complexity' in 'add' mode (the meaning of group and
# mode is defined by classifier_utils.ProjectsResults).
ablation_group = 'complexity'
complexity_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='add',
)

In [11]:
# Build the report for the 'complexity' run, with include_overall=True,
# and show it as the cell output.
report_complexity_columns = complexity_columns_result.get_report_df(
    include_overall=True,
)
report_complexity_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.971,0.975,0.973,0.975,0.573,0.941
1,apache__directory-server,845,652,0.914,0.916,0.914,0.916,0.512,0.827
2,jgralab__jgralab,2072,1802,0.798,0.802,0.799,0.802,0.491,0.612
3,Ramblurr__Anki-Android,892,759,0.686,0.708,0.695,0.708,0.439,0.479
4,apache__accumulo,4113,3148,0.796,0.805,0.8,0.805,0.635,0.466
5,getrailo__railo,815,572,0.64,0.647,0.642,0.647,0.378,0.433
6,CloudStack-extras__CloudStack-archive,1424,1106,0.671,0.677,0.674,0.677,0.437,0.427
7,apache__lucene-solr,1256,974,0.569,0.578,0.573,0.578,0.266,0.425
8,zkoss__zk,1087,881,0.75,0.765,0.757,0.765,0.602,0.41
9,freenet__fred,1268,1012,0.626,0.633,0.628,0.633,0.404,0.385


## Use chunk size-related columns

In [12]:
# Reload classifier_utils so the version currently on disk is the one used.
import importlib
importlib.reload(classifier_utils)

# Ablation run: group 'size' in 'add' mode (the meaning of group and mode
# is defined by classifier_utils.ProjectsResults).
ablation_group = 'size'
size_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='add',
)

In [13]:
# Build the report for the 'size' run, with include_overall=True,
# and show it as the cell output.
report_size_columns = size_columns_result.get_report_df(
    include_overall=True,
)
report_size_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.97,0.974,0.972,0.974,0.573,0.94
1,apache__directory-server,845,652,0.911,0.914,0.912,0.914,0.512,0.824
2,jgralab__jgralab,2072,1802,0.835,0.837,0.835,0.837,0.491,0.68
3,CloudStack-extras__CloudStack-archive,1424,1106,0.774,0.78,0.775,0.78,0.437,0.61
4,apache__accumulo,4113,3148,0.84,0.848,0.843,0.848,0.635,0.582
5,Unidata__thredds,1154,950,0.884,0.898,0.889,0.898,0.777,0.542
6,Ramblurr__Anki-Android,892,759,0.7,0.725,0.709,0.725,0.439,0.509
7,getrailo__railo,815,572,0.674,0.694,0.683,0.694,0.378,0.508
8,apache__lucene-solr,1256,974,0.611,0.622,0.616,0.622,0.266,0.485
9,freenet__fred,1268,1012,0.66,0.671,0.664,0.671,0.404,0.448


## Use chunk position-related columns

In [14]:
# Reload classifier_utils so the version currently on disk is the one used.
import importlib
importlib.reload(classifier_utils)

# Ablation run: group 'position' in 'add' mode (the meaning of group and
# mode is defined by classifier_utils.ProjectsResults).
ablation_group = 'position'
position_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='add',
)

In [15]:
# Build the report for the 'position' run, with include_overall=True,
# and show it as the cell output.
report_position_columns = position_columns_result.get_report_df(
    include_overall=True,
)
report_position_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.971,0.975,0.973,0.975,0.573,0.942
1,apache__directory-server,845,652,0.922,0.923,0.923,0.923,0.512,0.843
2,jgralab__jgralab,2072,1802,0.76,0.764,0.761,0.764,0.491,0.537
3,apache__accumulo,4113,3148,0.825,0.827,0.826,0.827,0.635,0.526
4,Ramblurr__Anki-Android,892,759,0.69,0.711,0.699,0.711,0.439,0.486
5,CloudStack-extras__CloudStack-archive,1424,1106,0.677,0.687,0.681,0.687,0.437,0.445
6,zkoss__zk,1087,881,0.761,0.773,0.766,0.773,0.602,0.43
7,apache__lucene-solr,1256,974,0.559,0.568,0.563,0.568,0.266,0.411
8,TeamDev-Ltd__OpenFaces,2979,2859,0.962,0.964,0.962,0.964,0.938,0.409
9,getrailo__railo,815,572,0.613,0.626,0.618,0.626,0.378,0.399


## Use chunk content-related columns

In [16]:
# Reload classifier_utils so the version currently on disk is the one used.
import importlib
importlib.reload(classifier_utils)

# Ablation run: group 'content' in 'add' mode (the meaning of group and
# mode is defined by classifier_utils.ProjectsResults).
ablation_group = 'content'
content_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='add',
)

In [17]:
# Build the report for the 'content' run, with include_overall=True,
# and show it as the cell output.
report_content_columns = content_columns_result.get_report_df(
    include_overall=True,
)
report_content_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.978,0.98,0.979,0.98,0.573,0.953
1,apache__directory-server,845,652,0.937,0.939,0.938,0.939,0.512,0.874
2,apache__accumulo,4113,3148,0.835,0.841,0.838,0.841,0.635,0.566
3,CloudStack-extras__CloudStack-archive,1424,1106,0.744,0.755,0.748,0.755,0.437,0.565
4,jgralab__jgralab,2072,1802,0.763,0.765,0.763,0.765,0.491,0.538
5,Ramblurr__Anki-Android,892,759,0.71,0.729,0.716,0.729,0.439,0.516
6,TeamDev-Ltd__OpenFaces,2979,2859,0.965,0.968,0.966,0.968,0.938,0.477
7,apache__lucene-solr,1256,974,0.608,0.615,0.611,0.615,0.266,0.476
8,Unidata__thredds,1154,950,0.864,0.879,0.867,0.879,0.777,0.458
9,zkoss__zk,1087,881,0.756,0.773,0.764,0.773,0.602,0.43


In [18]:
# Join all experiment reports on 'project' (inner joins, so only projects
# present in every report survive). The first join tags the overlapping
# columns of the default and 'all'/'remove' reports with '_all' / '_none'.
df_inner_rf = report_default.merge(
    report_none_columns,
    on='project',
    how='inner',
    suffixes=('_all', '_none'),
)

# Each remaining report gets its own suffix on every column; the suffixed
# 'project' column is then renamed back so it can act as the join key.
df_inner_rf_authorship = (
    report_authorship_columns
    .add_suffix("_author")
    .rename(columns={"project_author": "project"})
)
df_inner_rf = df_inner_rf.merge(df_inner_rf_authorship, on='project', how='inner')

df_inner_rf_complexity = (
    report_complexity_columns
    .add_suffix("_complexity")
    .rename(columns={"project_complexity": "project"})
)
df_inner_rf = df_inner_rf.merge(df_inner_rf_complexity, on='project', how='inner')

df_inner_rf_size = (
    report_size_columns
    .add_suffix("_size")
    .rename(columns={"project_size": "project"})
)
df_inner_rf = df_inner_rf.merge(df_inner_rf_size, on='project', how='inner')

df_inner_rf_position = (
    report_position_columns
    .add_suffix("_position")
    .rename(columns={"project_position": "project"})
)
df_inner_rf = df_inner_rf.merge(df_inner_rf_position, on='project', how='inner')

df_inner_rf_content = (
    report_content_columns
    .add_suffix("_content")
    .rename(columns={"project_content": "project"})
)
df_inner_rf = df_inner_rf.merge(df_inner_rf_content, on='project', how='inner')

# Keep only the project name and the accuracy columns of each experiment.
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
accuracy_rf

Unnamed: 0,project,accuracy_all,accuracy_none,accuracy_author,accuracy_complexity,accuracy_size,accuracy_position,accuracy_content
0,CCI-MIT__XCoLab,0.976,0.978,0.976,0.975,0.974,0.975,0.98
1,apache__directory-server,0.942,0.923,0.928,0.916,0.914,0.923,0.939
2,jgralab__jgralab,0.87,0.713,0.739,0.802,0.837,0.764,0.765
3,CloudStack-extras__CloudStack-archive,0.804,0.62,0.761,0.677,0.78,0.687,0.755
4,Unidata__thredds,0.921,0.822,0.839,0.827,0.898,0.824,0.879
5,apache__accumulo,0.863,0.797,0.814,0.805,0.848,0.827,0.841
6,Ramblurr__Anki-Android,0.736,0.706,0.719,0.708,0.725,0.711,0.729
7,apache__lucene-solr,0.65,0.561,0.595,0.578,0.622,0.568,0.615
8,getrailo__railo,0.703,0.633,0.645,0.647,0.694,0.626,0.636
9,TeamDev-Ltd__OpenFaces,0.97,0.952,0.949,0.953,0.966,0.964,0.968


In [19]:
# Work on a copy of the 'Overall' row(s) only.
accuracy_rf_improv = accuracy_rf.loc[accuracy_rf['project'] == 'Overall'].copy()

# For every experiment, add an 'improv._<group>' column obtained by passing
# that experiment's accuracy and the 'accuracy_none' value to
# classifier_utils.get_normalized_improvement.
for group in ('all', 'author', 'complexity', 'size', 'position', 'content'):
    accuracy_rf_improv['improv._' + group] = accuracy_rf_improv.apply(
        # Bind the column name as a default so each lambda keeps its own.
        lambda row, accuracy_col='accuracy_' + group: classifier_utils.get_normalized_improvement(
            row[accuracy_col], row['accuracy_none']
        ),
        axis=1,
    )

# Transposed so the metrics are listed vertically.
accuracy_rf_improv.T

Unnamed: 0,20
project,Overall
accuracy_all,0.80535
accuracy_none,0.7482
accuracy_author,0.7672
accuracy_complexity,0.75915
accuracy_size,0.7882
accuracy_position,0.75725
accuracy_content,0.77975
improv._all,0.226966
improv._author,0.075457
