Goal of this notebook:

Investigate how each group of attributes related to the conflicting chunks affects the classifier's accuracy.

Attribute groups:
- Merge attributes
- File attributes
- Chunk attributes


Ablation experiments (each one removes one group and trains on the remaining two):

- Merge + File -> measure Chunk attributes group importance
- Merge + Chunk -> measure File attributes group importance
- File + Chunk -> measure Merge attributes group importance

In [1]:
# Standard library
import time
import warnings

# Third-party
import numpy as np
import pandas as pd
# `display` comes from the public IPython.display module: importing it from
# IPython.core.display has been deprecated since IPython 7.14 and fails on
# recent IPython releases.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier

# Project-local helpers
import classifier_utils
import configs

# Notebook-wide configuration
pd.set_option('display.max_columns', None)  # never truncate the columns of displayed frames
warnings.filterwarnings("ignore", category=UserWarning)  # silence UserWarning noise in cell outputs

In [2]:
# Record when the notebook was last executed so stale outputs are easy to spot.
last_run_timestamp = time.ctime()
print(f'Notebook last run on: {last_run_timestamp}')

Notebook last run on: Mon Jul 25 10:38:47 2022


In [3]:
# Columns passed to classifier_utils.ProjectsResults as "non-feature" columns
# (presumably identifiers/metadata excluded from training -- confirm in classifier_utils).
non_features_columns = [
    # chunk-level identifiers and location
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # project / file / commit identifiers
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [4]:
# Load the labelled dataset, then collect the distinct values of its 'project'
# column in order of first appearance.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
unique_project_values = selected_dataset['project'].unique()
projects = list(unique_project_values)

In [5]:
# Random forest estimator shared by every experiment in this notebook
# (fixed random_state so repeated runs give the same forest).
rf = RandomForestClassifier(
    random_state=99,
    n_jobs=5,
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
)

In [6]:
# Baseline run: no ablation arguments are passed, so this is the reference the
# ablation runs below are compared against (it becomes the `_all` columns later).
default_result = classifier_utils.ProjectsResults(rf, projects, non_features_columns)
# Per-project report; include_overall=True presumably adds the 'Overall' row
# that the final cell filters on -- confirm in classifier_utils.
report_default = default_result.get_report_df(include_overall=True)

## Merge + File (no chunk attributes)

In [7]:
# Ablation run with ablation_mode='remove' for the chunk-attribute experiment.
# NOTE(review): this section is about "no chunk attributes", yet the group key
# is 'all' while the sibling cells use 'file' and 'merge'. Presumably 'all'
# selects every chunk-attribute group -- confirm against classifier_utils.
import importlib

importlib.reload(classifier_utils)  # re-execute the module so local edits to it are picked up
ablation_group = 'all'
chunk_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [8]:
# Report for the run configured with ablation_group='all' (the "no chunk
# attributes" experiment); the bare last expression displays the frame.
report_chunk_columns = chunk_columns_result.get_report_df(include_overall=True)
report_chunk_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.973,0.978,0.975,0.978,0.573,0.948
1,apache__directory-server,845,652,0.922,0.923,0.922,0.923,0.512,0.843
2,Ramblurr__Anki-Android,892,759,0.681,0.706,0.691,0.706,0.439,0.477
3,apache__accumulo,4113,3148,0.788,0.797,0.792,0.797,0.635,0.444
4,jgralab__jgralab,2072,1802,0.709,0.713,0.709,0.713,0.491,0.435
5,getrailo__railo,815,572,0.621,0.633,0.625,0.633,0.378,0.41
6,apache__lucene-solr,1256,974,0.552,0.561,0.556,0.561,0.266,0.401
7,zkoss__zk,1087,881,0.75,0.759,0.754,0.759,0.602,0.396
8,freenet__fred,1268,1012,0.627,0.638,0.632,0.638,0.404,0.393
9,CloudStack-extras__CloudStack-archive,1424,1106,0.606,0.62,0.612,0.62,0.437,0.326


## Merge + Chunk (no file attributes)

In [9]:
# Ablation run with the 'file' group and ablation_mode='remove'
# (the "no file attributes" experiment).
import importlib

importlib.reload(classifier_utils)  # re-execute the module so local edits to it are picked up
ablation_group = 'file'
file_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [10]:
# Report for the run configured with ablation_group='file';
# the bare last expression displays the frame.
report_file_columns = file_columns_result.get_report_df(include_overall=True)
report_file_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.97,0.974,0.972,0.974,0.573,0.939
1,apache__directory-server,845,652,0.928,0.931,0.929,0.931,0.512,0.858
2,jgralab__jgralab,2072,1802,0.855,0.857,0.854,0.857,0.491,0.719
3,Unidata__thredds,1154,950,0.917,0.923,0.917,0.923,0.777,0.656
4,CloudStack-extras__CloudStack-archive,1424,1106,0.794,0.8,0.795,0.8,0.437,0.645
5,apache__accumulo,4113,3148,0.858,0.864,0.86,0.864,0.635,0.628
6,getrailo__railo,815,572,0.699,0.71,0.704,0.71,0.378,0.534
7,apache__lucene-solr,1256,974,0.648,0.653,0.648,0.653,0.266,0.527
8,Ramblurr__Anki-Android,892,759,0.707,0.726,0.71,0.726,0.439,0.512
9,TeamDev-Ltd__OpenFaces,2979,2859,0.968,0.97,0.969,0.97,0.938,0.511


## File + Chunk (no merge attributes)

In [11]:
# Ablation run with the 'merge' group and ablation_mode='remove'
# (the "no merge attributes" experiment).
import importlib

importlib.reload(classifier_utils)  # re-execute the module so local edits to it are picked up
ablation_group = 'merge'
merge_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [12]:
# Report for the run configured with ablation_group='merge';
# the bare last expression displays the frame.
report_merge_columns = merge_columns_result.get_report_df(include_overall=True)
report_merge_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.967,0.97,0.968,0.97,0.573,0.931
1,apache__directory-server,845,652,0.912,0.916,0.913,0.916,0.512,0.827
2,jgralab__jgralab,2072,1802,0.848,0.848,0.841,0.848,0.491,0.701
3,CloudStack-extras__CloudStack-archive,1424,1106,0.792,0.799,0.789,0.799,0.437,0.644
4,Unidata__thredds,1154,950,0.906,0.912,0.906,0.912,0.777,0.604
5,Ramblurr__Anki-Android,892,759,0.698,0.709,0.689,0.709,0.439,0.481
6,apache__accumulo,4113,3148,0.786,0.804,0.786,0.804,0.635,0.463
7,apache__lucene-solr,1256,974,0.602,0.599,0.597,0.599,0.266,0.453
8,getrailo__railo,815,572,0.588,0.607,0.59,0.607,0.378,0.368
9,TeamDev-Ltd__OpenFaces,2979,2859,0.955,0.961,0.958,0.961,0.938,0.364


In [13]:
# Tag the file/merge ablation reports with their own suffix, restoring the
# shared 'project' key so they can be joined with the other reports.
df_inner_rf_file = report_file_columns.add_suffix("_file").rename(columns={"project_file": "project"})
df_inner_rf_merge = report_merge_columns.add_suffix("_merge").rename(columns={"project_merge": "project"})

# Inner-join the four reports on 'project': the baseline columns get '_all',
# the first ablation run gets '_chunk', the other two carry their own suffix.
df_inner_rf = (
    report_default
    .merge(report_chunk_columns, on='project', how='inner', suffixes=('_all', '_chunk'))
    .merge(df_inner_rf_file, on='project', how='inner')
    .merge(df_inner_rf_merge, on='project', how='inner')
)

# Keep only the project name and the accuracy column of each run.
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
accuracy_rf

Unnamed: 0,project,accuracy_all,accuracy_chunk,accuracy_file,accuracy_merge
0,CCI-MIT__XCoLab,0.976,0.978,0.974,0.97
1,apache__directory-server,0.942,0.923,0.931,0.916
2,jgralab__jgralab,0.87,0.713,0.857,0.848
3,CloudStack-extras__CloudStack-archive,0.804,0.62,0.8,0.799
4,Unidata__thredds,0.921,0.822,0.923,0.912
5,apache__accumulo,0.863,0.797,0.864,0.804
6,Ramblurr__Anki-Android,0.736,0.706,0.726,0.709
7,apache__lucene-solr,0.65,0.561,0.653,0.599
8,getrailo__railo,0.703,0.633,0.71,0.607
9,TeamDev-Ltd__OpenFaces,0.97,0.952,0.97,0.961


In [14]:
# Restrict to the 'Overall' row and, for each ablation run, compute
# get_normalized_improvement(ablated accuracy, full-feature accuracy).
accuracy_rf_improv = accuracy_rf.loc[accuracy_rf['project'] == 'Overall'].copy()
for group_name in ('chunk', 'file', 'merge'):
    ablated_col = f'accuracy_{group_name}'
    accuracy_rf_improv[f'improv._{group_name}'] = accuracy_rf_improv.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[ablated_col], row['accuracy_all']),
        axis=1,
    )
# Transposed so the single row reads as a vertical list of metrics.
accuracy_rf_improv.transpose()

Unnamed: 0,20
project,Overall
accuracy_all,0.80535
accuracy_chunk,0.7482
accuracy_file,0.802
accuracy_merge,0.7644
improv._chunk,-0.070963
improv._file,-0.00416
improv._merge,-0.050847
