Goal of this notebook:

Investigate whether using a different classification algorithm (decision tree instead of random forest) would result in the merge attributes having more impact on the accuracy of the classifier than the chunk attributes.

Investigate the impact on the classifier accuracy of each group of attributes related to the conflicting chunks.

Attribute groups:
- Merge attributes
- File attributes
- Chunk attributes


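Ablation experiments (train on two groups; the change in accuracy relative to using all three groups measures the importance of the group that was left out):

- Merge + File -> measure Chunk attributes group importance
- Merge + Chunk -> measure File attributes group importance
- File + Chunk -> measure Merge attributes group importance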

In [1]:
# Standard library
import time
import warnings

# Third-party
import numpy as np
import pandas as pd
# `display` is imported from the public IPython.display module: importing it
# from IPython.core.display is deprecated (since IPython 7.14) and, as far as
# we know, no longer works on IPython 9+ (verify against the release notes).
from IPython.display import display
from sklearn.tree import DecisionTreeClassifier

# Project-local helpers
import classifier_utils
import configs

# Notebook-wide configuration.
# Show every column when rendering DataFrames.
pd.set_option('display.max_columns', None)
# Silence UserWarning messages emitted from this point on.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Record when the notebook was last executed (local time, via time.ctime()).
print('Notebook last run on: ' + time.ctime())

Notebook last run on: Mon Aug  1 15:26:29 2022


In [3]:
# Columns excluded from the feature set (per the variable name): chunk
# identification/location columns first, then project/file identity and the
# commit SHAs involved in the merge.
non_features_columns = [
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [4]:
# Load the labelled dataset and collect the distinct values of its 'project'
# column; this list is what gets passed to classifier_utils.ProjectsResults.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = [*selected_dataset["project"].unique()]

In [5]:
# NOTE: despite its name, `rf` holds a decision tree, not a random forest
# (this notebook compares a decision tree against the random-forest setup).
# The name is kept because the cells below pass `rf` to
# classifier_utils.ProjectsResults. random_state is fixed to a constant seed.
rf = DecisionTreeClassifier(random_state=99)

In [6]:
# Baseline run: no ablation arguments are passed, so this is the reference
# result that the ablation runs are later compared against (it receives the
# '_all' suffix when the reports are merged).
default_result = classifier_utils.ProjectsResults(rf, projects, non_features_columns)
# include_overall=True presumably adds the aggregate 'Overall' row that the
# comparison cells filter on — TODO confirm in classifier_utils.
report_default = default_result.get_report_df(include_overall=True)

## Merge + File (no chunk attributes)

In [7]:
# Reload classifier_utils so that edits made to the module since it was first
# imported are picked up before this run.
import importlib

importlib.reload(classifier_utils)

# NOTE(review): the section heading and the result name say this run drops the
# chunk attributes, yet the group passed is 'all' (the sibling ablation cells
# pass 'file' and 'merge'). Confirm in classifier_utils that 'all' combined
# with ablation_mode='remove' really selects the chunk attribute group.
ablation_group = 'all'
chunk_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [8]:
# Build the report table for the "Merge + File" run and display it.
# include_overall=True presumably adds the aggregate 'Overall' row that the
# comparison cells filter on — TODO confirm in classifier_utils.
report_chunk_columns = chunk_columns_result.get_report_df(include_overall=True)
report_chunk_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.973,0.976,0.974,0.976,0.573,0.943
1,apache__directory-server,845,652,0.914,0.916,0.915,0.916,0.512,0.827
2,jgralab__jgralab,2072,1802,0.731,0.734,0.729,0.734,0.491,0.478
3,Ramblurr__Anki-Android,892,759,0.696,0.696,0.695,0.696,0.439,0.458
4,apache__accumulo,4113,3148,0.79,0.786,0.788,0.786,0.635,0.413
5,CloudStack-extras__CloudStack-archive,1424,1106,0.654,0.654,0.654,0.654,0.437,0.385
6,freenet__fred,1268,1012,0.625,0.631,0.628,0.631,0.404,0.381
7,apache__lucene-solr,1256,974,0.545,0.544,0.544,0.544,0.266,0.379
8,getrailo__railo,815,572,0.604,0.601,0.602,0.601,0.378,0.36
9,zkoss__zk,1087,881,0.739,0.73,0.734,0.73,0.602,0.322


## Merge + Chunk (no file attributes)

In [9]:
# Reload classifier_utils so that edits made to the module since it was first
# imported are picked up before this run.
import importlib

importlib.reload(classifier_utils)

# Per the section heading, this run leaves out the file attribute group
# (ablation_mode='remove'); the exact column selection lives in classifier_utils.
ablation_group = 'file'
file_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [10]:
# Build the report table for the "Merge + Chunk" run and display it.
# include_overall=True presumably adds the aggregate 'Overall' row that the
# comparison cells filter on — TODO confirm in classifier_utils.
report_file_columns = file_columns_result.get_report_df(include_overall=True)
report_file_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.968,0.968,0.968,0.968,0.573,0.925
1,apache__directory-server,845,652,0.912,0.91,0.911,0.91,0.512,0.814
2,jgralab__jgralab,2072,1802,0.806,0.807,0.806,0.807,0.491,0.621
3,apache__accumulo,4113,3148,0.835,0.834,0.834,0.834,0.635,0.546
4,CloudStack-extras__CloudStack-archive,1424,1106,0.743,0.742,0.742,0.742,0.437,0.543
5,Unidata__thredds,1154,950,0.894,0.893,0.893,0.893,0.777,0.519
6,zkoss__zk,1087,881,0.765,0.772,0.768,0.772,0.602,0.427
7,getrailo__railo,815,572,0.643,0.643,0.643,0.643,0.378,0.427
8,TeamDev-Ltd__OpenFaces,2979,2859,0.963,0.964,0.963,0.964,0.938,0.409
9,apache__lucene-solr,1256,974,0.568,0.565,0.566,0.565,0.266,0.407


## File + Chunk (no merge attributes)

In [11]:
# Reload classifier_utils so that edits made to the module since it was first
# imported are picked up before this run.
import importlib

importlib.reload(classifier_utils)

# Per the section heading, this run leaves out the merge attribute group
# (ablation_mode='remove'); the exact column selection lives in classifier_utils.
ablation_group = 'merge'
merge_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='remove',
)

In [12]:
# Build the report table for the "File + Chunk" run and display it.
# include_overall=True presumably adds the aggregate 'Overall' row that the
# comparison cells filter on — TODO confirm in classifier_utils.
report_merge_columns = merge_columns_result.get_report_df(include_overall=True)
report_merge_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.962,0.963,0.962,0.963,0.573,0.913
1,apache__directory-server,845,652,0.884,0.885,0.884,0.885,0.512,0.764
2,jgralab__jgralab,2072,1802,0.783,0.785,0.784,0.785,0.491,0.577
3,CloudStack-extras__CloudStack-archive,1424,1106,0.737,0.741,0.738,0.741,0.437,0.539
4,Unidata__thredds,1154,950,0.884,0.888,0.886,0.888,0.777,0.5
5,Ramblurr__Anki-Android,892,759,0.65,0.656,0.653,0.656,0.439,0.387
6,apache__lucene-solr,1256,974,0.516,0.52,0.518,0.52,0.266,0.345
7,apache__accumulo,4113,3148,0.746,0.749,0.747,0.749,0.635,0.312
8,TeamDev-Ltd__OpenFaces,2979,2859,0.956,0.955,0.955,0.955,0.938,0.261
9,getrailo__railo,815,572,0.541,0.538,0.538,0.538,0.378,0.258


In [13]:
# Suffix every column of the file/merge ablation reports, then restore the
# plain 'project' name so it can still serve as the join key.
df_inner_rf_file = report_file_columns.add_suffix("_file").rename(columns={"project_file": "project"})
df_inner_rf_merge = report_merge_columns.add_suffix("_merge").rename(columns={"project_merge": "project"})

# Inner-join the baseline report with the three "one group removed" reports on
# 'project'. Baseline columns get the '_all' suffix and the chunk-ablation
# columns get '_chunk' through the merge's own suffix handling.
df_inner_rf = (
    report_default
    .merge(report_chunk_columns, on='project', how='inner', suffixes=('_all', '_chunk'))
    .merge(df_inner_rf_file, on='project', how='inner')
    .merge(df_inner_rf_merge, on='project', how='inner')
)

# Keep only the project name and the accuracy columns for the comparison.
accuracy_rf = df_inner_rf.filter(regex="project|accuracy.*").copy()
accuracy_rf

Unnamed: 0,project,accuracy_all,accuracy_chunk,accuracy_file,accuracy_merge
0,CCI-MIT__XCoLab,0.972,0.976,0.968,0.963
1,apache__directory-server,0.926,0.916,0.91,0.885
2,jgralab__jgralab,0.834,0.734,0.807,0.785
3,CloudStack-extras__CloudStack-archive,0.754,0.654,0.742,0.741
4,Unidata__thredds,0.901,0.792,0.893,0.888
5,apache__accumulo,0.838,0.786,0.834,0.749
6,Ramblurr__Anki-Android,0.702,0.696,0.665,0.656
7,getrailo__railo,0.654,0.601,0.643,0.538
8,TeamDev-Ltd__OpenFaces,0.965,0.949,0.964,0.955
9,apache__lucene-solr,0.563,0.544,0.565,0.52


In [14]:
# Take the aggregate 'Overall' row and, for each ablation run, compute the
# normalized improvement of its accuracy relative to the all-attributes
# accuracy (the formula is defined in classifier_utils.get_normalized_improvement).
accuracy_rf_improv = accuracy_rf.loc[accuracy_rf['project'] == 'Overall'].copy()
for improv_col, accuracy_col in (
    ('improv._chunk', 'accuracy_chunk'),
    ('improv._file', 'accuracy_file'),
    ('improv._merge', 'accuracy_merge'),
):
    accuracy_rf_improv[improv_col] = accuracy_rf_improv.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[accuracy_col], row['accuracy_all']),
        axis=1,
    )
# Transposed so the single row reads as a vertical list of metrics.
accuracy_rf_improv.T

Unnamed: 0,20
project,Overall
accuracy_all,0.7635
accuracy_chunk,0.7347
accuracy_file,0.7568
accuracy_merge,0.71205
improv._chunk,-0.037721
improv._file,-0.008775
improv._merge,-0.067387


Investigate the impact of removing two attribute groups at a time (i.e., training on a single attribute group)

## Remove Chunk + Merge (use only file attributes)

In [15]:
# NOTE(review): `random` is not used in this cell; the import is kept in case
# other cells rely on it.
import random
import importlib

# Reload classifier_utils so that edits made to the module since it was first
# imported are picked up before this run.
importlib.reload(classifier_utils)

# Per the section heading, this run uses only the file attributes
# (ablation_mode='file_only').
# NOTE(review): the other "*_only" runs pass ablation_group=''; here 'file' is
# passed. Presumably the group is ignored in this mode — confirm in classifier_utils.
ablation_group = 'file'
file_only_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group=ablation_group,
    ablation_mode='file_only',
)

In [16]:
# Build the report table for the file-attributes-only run and display it.
# include_overall=True presumably adds the aggregate 'Overall' row that the
# comparison cells filter on — TODO confirm in classifier_utils.
report_file_only_columns = file_only_columns_result.get_report_df(include_overall=True)
report_file_only_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,apache__directory-server,845,652,0.874,0.876,0.875,0.876,0.512,0.745
1,jgralab__jgralab,2072,1802,0.725,0.729,0.722,0.729,0.491,0.467
2,Ramblurr__Anki-Android,892,759,0.68,0.681,0.68,0.681,0.439,0.432
3,CCI-MIT__XCoLab,5512,3757,0.763,0.75,0.729,0.75,0.573,0.413
4,CloudStack-extras__CloudStack-archive,1424,1106,0.647,0.646,0.646,0.646,0.437,0.371
5,Unidata__thredds,1154,950,0.837,0.852,0.843,0.852,0.777,0.335
6,apache__lucene-solr,1256,974,0.476,0.478,0.477,0.478,0.266,0.29
7,freenet__fred,1268,1012,0.566,0.571,0.568,0.571,0.404,0.28
8,alexo__wro4j,1663,1368,0.443,0.443,0.442,0.443,0.307,0.196
9,zkoss__zk,1087,881,0.677,0.679,0.677,0.679,0.602,0.194


## Remove Chunk + File (use only merge attributes)

In [17]:
# NOTE(review): `random` is not used in this cell; the import is kept in case
# other cells rely on it.
import random
import importlib

# Reload classifier_utils so that edits made to the module since it was first
# imported are picked up before this run.
importlib.reload(classifier_utils)

# Per the section heading, this run uses only the merge attributes
# (ablation_mode='merge_only'); no ablation group is named for this mode.
merge_only_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group='',
    ablation_mode='merge_only',
)

In [18]:
# Build the report table for the merge-attributes-only run and display it.
# include_overall=True presumably adds the aggregate 'Overall' row that the
# comparison cells filter on — TODO confirm in classifier_utils.
report_merge_only_columns = merge_only_columns_result.get_report_df(include_overall=True)
report_merge_only_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.965,0.971,0.968,0.971,0.573,0.932
1,apache__directory-server,845,652,0.894,0.906,0.9,0.906,0.512,0.808
2,Ramblurr__Anki-Android,892,759,0.701,0.715,0.707,0.715,0.439,0.493
3,apache__accumulo,4113,3148,0.773,0.788,0.778,0.788,0.635,0.419
4,getrailo__railo,815,572,0.618,0.626,0.62,0.626,0.378,0.399
5,apache__lucene-solr,1256,974,0.528,0.543,0.534,0.543,0.266,0.378
6,freenet__fred,1268,1012,0.604,0.613,0.606,0.613,0.404,0.35
7,zkoss__zk,1087,881,0.713,0.725,0.717,0.725,0.602,0.311
8,alkacon__opencms-core,923,840,0.945,0.961,0.952,0.961,0.944,0.298
9,android__platform_frameworks_base,3557,2460,0.784,0.784,0.784,0.784,0.708,0.261


## Remove Merge + File (use only chunk attributes)

In [19]:
# NOTE(review): `random` is not used in this cell; the import is kept in case
# other cells rely on it.
import random
import importlib

# Reload classifier_utils so that edits made to the module since it was first
# imported are picked up before this run.
importlib.reload(classifier_utils)

# Per the section heading, this run uses only the chunk attributes
# (ablation_mode='chunk_only'); no ablation group is named for this mode.
chunk_only_columns_result = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    ablation=True,
    ablation_group='',
    ablation_mode='chunk_only',
)

In [20]:
# Build the report table for the chunk-attributes-only run and display it.
# include_overall=True presumably adds the aggregate 'Overall' row that the
# comparison cells filter on — TODO confirm in classifier_utils.
report_chunk_only_columns = chunk_only_columns_result.get_report_df(include_overall=True)
report_chunk_only_columns

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.96,0.96,0.96,0.96,0.573,0.906
1,apache__directory-server,845,652,0.848,0.848,0.847,0.848,0.512,0.689
2,jgralab__jgralab,2072,1802,0.775,0.775,0.775,0.775,0.491,0.557
3,CloudStack-extras__CloudStack-archive,1424,1106,0.723,0.726,0.724,0.726,0.437,0.514
4,Unidata__thredds,1154,950,0.885,0.886,0.885,0.886,0.777,0.491
5,apache__lucene-solr,1256,974,0.513,0.511,0.512,0.511,0.266,0.334
6,TeamDev-Ltd__OpenFaces,2979,2859,0.957,0.957,0.957,0.957,0.938,0.295
7,apache__accumulo,4113,3148,0.738,0.74,0.739,0.74,0.635,0.287
8,freenet__fred,1268,1012,0.548,0.554,0.551,0.554,0.404,0.252
9,Ramblurr__Anki-Android,892,759,0.576,0.57,0.573,0.57,0.439,0.235


In [21]:
# Suffix every column of the file-only/merge-only reports, then restore the
# plain 'project' name so it can still serve as the join key.
df_inner_rf_file_only = report_file_only_columns.add_suffix("_file").rename(columns={"project_file": "project"})
df_inner_rf_merge_only = report_merge_only_columns.add_suffix("_merge").rename(columns={"project_merge": "project"})

# Inner-join the baseline report with the three "single group" reports on
# 'project'. Baseline columns get the '_all' suffix and the chunk-only columns
# get '_chunk' through the merge's own suffix handling.
# NOTE: this rebinds df_inner_rf and accuracy_rf, replacing the values computed
# for the "one group removed" comparison earlier in the notebook.
df_inner_rf = (
    report_default
    .merge(report_chunk_only_columns, on='project', how='inner', suffixes=('_all', '_chunk'))
    .merge(df_inner_rf_file_only, on='project', how='inner')
    .merge(df_inner_rf_merge_only, on='project', how='inner')
)

# Keep only the project name and the accuracy columns for the comparison.
accuracy_rf = df_inner_rf.filter(regex="project|accuracy.*").copy()
accuracy_rf

Unnamed: 0,project,accuracy_all,accuracy_chunk,accuracy_file,accuracy_merge
0,CCI-MIT__XCoLab,0.972,0.96,0.75,0.971
1,apache__directory-server,0.926,0.848,0.876,0.906
2,jgralab__jgralab,0.834,0.775,0.729,0.594
3,CloudStack-extras__CloudStack-archive,0.754,0.726,0.646,0.501
4,Unidata__thredds,0.901,0.886,0.852,0.786
5,apache__accumulo,0.838,0.74,0.694,0.788
6,Ramblurr__Anki-Android,0.702,0.57,0.681,0.715
7,getrailo__railo,0.654,0.521,0.479,0.626
8,TeamDev-Ltd__OpenFaces,0.965,0.957,0.918,0.95
9,apache__lucene-solr,0.563,0.511,0.478,0.543


In [22]:
# Take the aggregate 'Overall' row and, for each single-group run, compute the
# normalized improvement of its accuracy relative to the all-attributes
# accuracy (the formula is defined in classifier_utils.get_normalized_improvement).
accuracy_rf_improv = accuracy_rf.loc[accuracy_rf['project'] == 'Overall'].copy()
for improv_col, accuracy_col in (
    ('improv._chunk', 'accuracy_chunk'),
    ('improv._file', 'accuracy_file'),
    ('improv._merge', 'accuracy_merge'),
):
    accuracy_rf_improv[improv_col] = accuracy_rf_improv.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[accuracy_col], row['accuracy_all']),
        axis=1,
    )
# Transposed so the single row reads as a vertical list of metrics.
accuracy_rf_improv.T

Unnamed: 0,20
project,Overall
accuracy_all,0.7635
accuracy_chunk,0.70215
accuracy_file,0.6842
accuracy_merge,0.7206
improv._chunk,-0.080354
improv._file,-0.103864
improv._merge,-0.056189
