Goal of this notebook:

Investigate how each group of attributes describing the conflicting chunks affects the classifier's accuracy.

Attribute groups:
- Merge attributes
- File attributes
- Chunk attributes


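Each experiment removes one attribute group and trains on the remaining two:
- Merge + File (Chunk attributes removed) -> measures the importance of the Chunk attribute group
- Merge + Chunk (File attributes removed) -> measures the importance of the File attribute group
- File + Chunk (Merge attributes removed) -> measures the importance of the Merge attribute group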

In [1]:
# Standard library
import time
import warnings

# Third-party
import numpy as np
import pandas as pd
# `display` is imported from the public `IPython.display` module: importing it
# from `IPython.core.display` is deprecated (since IPython 7.14) and may be
# unavailable in newer IPython releases.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier

# Project-local helpers
import classifier_utils
import configs

# Show every column when rendering the (wide) report DataFrames.
pd.set_option('display.max_columns', None)
# Keep the cell outputs free of UserWarning messages.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Record when the notebook was last executed so stale outputs are easy to spot.
run_timestamp = time.ctime()
print(f'Notebook last run on: {run_timestamp}')

Notebook last run on: Tue Oct  5 12:27:44 2021


In [3]:
# Columns handed to ProjectsResults as "non-feature" columns (identifiers,
# locations and provenance of each chunk rather than predictive attributes).
non_features_columns = [
    # chunk identification / location
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    # repository, file and commit provenance
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [4]:
# Load the labelled dataset used by every experiment in this notebook.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
# Distinct project identifiers, in order of first appearance in the dataset.
projects = selected_dataset['project'].drop_duplicates().tolist()

In [5]:
# Random forest passed to every ProjectsResults run below; random_state is
# fixed so that repeated runs use the same seed.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

In [6]:
# Baseline run: no ablation arguments are passed, so this is the reference
# ("accuracy_all") the ablation runs below are compared against.
default_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
)
# Report DataFrame for the baseline; an 'Overall' row is selected from the
# merged reports later in the notebook, presumably enabled by include_overall=True.
report_default = default_result.get_report_df(include_overall=True)

## Merge + File (no chunk attributes)

In [7]:
import importlib
# Re-import classifier_utils so that edits made to the module are picked up.
importlib.reload(classifier_utils)
# NOTE(review): this section is titled "no chunk attributes" and the sibling
# cells use 'file' and 'merge', yet the group removed here is 'all'. Confirm
# against classifier_utils that 'all' really selects the chunk attribute group.
ablation_group = 'all'
chunk_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [8]:
# Report DataFrame for the run stored in chunk_columns_result.
# include_overall=True presumably adds the aggregated 'Overall' row that is
# selected later in the notebook — TODO confirm in classifier_utils.
report_chunk_columns = chunk_columns_result.get_report_df(include_overall=True)
# Bare last expression: rendered as the cell's rich output.
report_chunk_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.973,0.978,0.975,0.978,0.573,0.948
1,apache__directory-server,845,652,0.922,0.923,0.922,0.923,0.512,0.843
2,Ramblurr__Anki-Android,892,759,0.681,0.706,0.691,0.706,0.439,0.477
3,apache__accumulo,4113,3148,0.788,0.797,0.792,0.797,0.635,0.444
4,jgralab__jgralab,2072,1802,0.709,0.713,0.709,0.713,0.491,0.435
5,getrailo__railo,815,572,0.621,0.633,0.625,0.633,0.378,0.41
6,apache__lucene-solr,1256,974,0.552,0.561,0.556,0.561,0.266,0.401
7,zkoss__zk,1087,881,0.75,0.759,0.754,0.759,0.602,0.396
8,freenet__fred,1268,1012,0.627,0.638,0.632,0.638,0.404,0.393
9,CloudStack-extras__CloudStack-archive,1424,1106,0.606,0.62,0.612,0.62,0.437,0.326


## Merge + Chunk (no file attributes)

In [9]:
import importlib
# Re-import classifier_utils so that edits made to the module are picked up.
importlib.reload(classifier_utils)
# Ablation run with the 'file' attribute group removed (Merge + Chunk remain).
ablation_group = 'file'
file_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [10]:
# Report DataFrame for the run stored in file_columns_result.
# include_overall=True presumably adds the aggregated 'Overall' row that is
# selected later in the notebook — TODO confirm in classifier_utils.
report_file_columns = file_columns_result.get_report_df(include_overall=True)
# Bare last expression: rendered as the cell's rich output.
report_file_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.97,0.974,0.971,0.974,0.573,0.938
1,apache__directory-server,845,652,0.923,0.926,0.925,0.926,0.512,0.849
2,jgralab__jgralab,2072,1802,0.857,0.858,0.855,0.858,0.491,0.721
3,CloudStack-extras__CloudStack-archive,1424,1106,0.796,0.801,0.797,0.801,0.437,0.647
4,Unidata__thredds,1154,950,0.914,0.92,0.915,0.92,0.777,0.642
5,apache__accumulo,4113,3148,0.859,0.864,0.861,0.864,0.635,0.628
6,getrailo__railo,815,572,0.704,0.717,0.709,0.717,0.378,0.545
7,apache__lucene-solr,1256,974,0.648,0.654,0.649,0.654,0.266,0.529
8,Ramblurr__Anki-Android,892,759,0.716,0.735,0.718,0.735,0.439,0.528
9,TeamDev-Ltd__OpenFaces,2979,2859,0.968,0.97,0.969,0.97,0.938,0.517


## File + Chunk (no merge attributes)

In [11]:
import importlib
# Re-import classifier_utils so that edits made to the module are picked up.
importlib.reload(classifier_utils)
# Ablation run with the 'merge' attribute group removed (File + Chunk remain).
ablation_group = 'merge'
merge_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [12]:
# Report DataFrame for the run stored in merge_columns_result.
# include_overall=True presumably adds the aggregated 'Overall' row that is
# selected later in the notebook — TODO confirm in classifier_utils.
report_merge_columns = merge_columns_result.get_report_df(include_overall=True)
# Bare last expression: rendered as the cell's rich output.
report_merge_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.966,0.969,0.967,0.969,0.573,0.928
1,apache__directory-server,845,652,0.915,0.919,0.916,0.919,0.512,0.833
2,jgralab__jgralab,2072,1802,0.85,0.851,0.844,0.851,0.491,0.707
3,CloudStack-extras__CloudStack-archive,1424,1106,0.795,0.802,0.792,0.802,0.437,0.648
4,Unidata__thredds,1154,950,0.903,0.909,0.903,0.909,0.777,0.594
5,apache__accumulo,4113,3148,0.787,0.805,0.786,0.805,0.635,0.465
6,Ramblurr__Anki-Android,892,759,0.672,0.7,0.678,0.7,0.439,0.465
7,apache__lucene-solr,1256,974,0.611,0.607,0.606,0.607,0.266,0.464
8,TeamDev-Ltd__OpenFaces,2979,2859,0.956,0.961,0.958,0.961,0.938,0.369
9,getrailo__railo,815,572,0.583,0.601,0.584,0.601,0.378,0.36


In [13]:
# Tag the file/merge ablation reports with their own suffix, keeping the join
# key ('project') unsuffixed so it can still be merged on.
df_inner_rf_file = (
    report_file_columns
    .add_suffix("_file")
    .rename(columns={"project_file": "project"})
)
df_inner_rf_merge = (
    report_merge_columns
    .add_suffix("_merge")
    .rename(columns={"project_merge": "project"})
)

# One row per project present in all four reports. Baseline columns get the
# '_all' suffix and chunk-ablation columns '_chunk' via the first merge.
df_inner_rf = (
    report_default
    .merge(report_chunk_columns, on='project', how='inner', suffixes=('_all', '_chunk'))
    .merge(df_inner_rf_file, on='project', how='inner')
    .merge(df_inner_rf_merge, on='project', how='inner')
)

# Keep only the project name and the four accuracy columns.
accuracy_rf = df_inner_rf.filter(regex="project|accuracy.*").copy()
accuracy_rf

Unnamed: 0,project,accuracy_all,accuracy_chunk,accuracy_file,accuracy_merge
0,CCI-MIT__XCoLab,0.976,0.978,0.974,0.969
1,apache__directory-server,0.937,0.923,0.926,0.919
2,jgralab__jgralab,0.866,0.713,0.858,0.851
3,CloudStack-extras__CloudStack-archive,0.806,0.62,0.801,0.802
4,apache__accumulo,0.863,0.797,0.864,0.805
5,Unidata__thredds,0.916,0.822,0.92,0.909
6,Ramblurr__Anki-Android,0.742,0.706,0.735,0.7
7,getrailo__railo,0.712,0.633,0.717,0.601
8,apache__lucene-solr,0.646,0.561,0.654,0.607
9,TeamDev-Ltd__OpenFaces,0.969,0.952,0.97,0.961


In [14]:
# Work on the aggregated 'Overall' row only.
accuracy_rf_improv = accuracy_rf.loc[accuracy_rf['project'] == 'Overall'].copy()

# For each ablated group, compare its accuracy against the all-attributes
# baseline using the project's normalized-improvement helper.
for _group in ('chunk', 'file', 'merge'):
    accuracy_rf_improv[f'improv._{_group}'] = accuracy_rf_improv.apply(
        # `ablated` is bound as a default argument so each lambda keeps its own column name.
        lambda row, ablated=f'accuracy_{_group}': classifier_utils.get_normalized_improvement(
            row[ablated], row['accuracy_all']
        ),
        axis=1,
    )

# Transposed so the single row reads as a vertical list of metrics.
accuracy_rf_improv.transpose()

Unnamed: 0,23
project,Overall
accuracy_all,0.80485
accuracy_chunk,0.7482
accuracy_file,0.80185
accuracy_merge,0.7641
improv._chunk,-0.0703858
improv._file,-0.0037274
improv._merge,-0.0506306
