Goal of this notebook:

Investigate how replacing NA values in the dataset with a placeholder value affects the classification accuracy of the decision tree and random forest classifiers. (XGBoost Random Forest works with NA values natively, so it does not need this replacement.)

In [1]:
import importlib
import warnings

import numpy as np
import pandas as pd
# NOTE(review): `IPython.display` is the documented import location for
# `display`; the `IPython.core.display` path is deprecated — consider switching.
from IPython.core.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

import classifier_utils

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Column names handed to classifier_utils.ProjectsResults as the
# "non-feature" columns (chunk/location metadata, then repository, file and
# commit identifiers). Order is kept exactly as in the original definition.
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct project names,
# in order of first appearance in the 'project' column.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset['project'].unique().tolist()

# Decision Tree

In [4]:
# Decision tree with default hyper-parameters; random_state is fixed so that
# repeated runs of the notebook build the same tree.
dt = DecisionTreeClassifier(random_state=99)

In [5]:
# Decision-tree results for every project with the default NA handling
# (replace_na is not passed). In the report displayed for this run,
# "observations (wt NaN)" is smaller than "observations".
# NOTE(review): the per-project evaluation itself lives in classifier_utils
# and is not visible in this notebook.
result_dt = classifier_utils.ProjectsResults(dt, projects, non_features_columns)

In [6]:
# Per-project report for the decision tree without NA replacement
# (precision, recall, f1-score, accuracy, majority baseline, improvement).
# include_overall=True presumably adds an aggregate row — confirm in classifier_utils.
report_dt = result_dt.get_report_df(include_overall=True)
# Bare expression: rendered as the cell output.
report_dt

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.972,0.972,0.972,0.573,0.935
1,apache__directory-server,845,652,0.929,0.931,0.93,0.931,0.512,0.858
2,jgralab__jgralab,2072,1802,0.835,0.836,0.835,0.836,0.491,0.677
3,Unidata__thredds,1154,950,0.907,0.909,0.908,0.909,0.777,0.594
4,CloudStack-extras__CloudStack-archive,1424,1106,0.759,0.76,0.759,0.76,0.437,0.575
5,apache__accumulo,4113,3148,0.834,0.834,0.834,0.834,0.635,0.546
6,Ramblurr__Anki-Android,892,759,0.69,0.694,0.692,0.694,0.439,0.455
7,getrailo__railo,815,572,0.648,0.643,0.646,0.643,0.378,0.427
8,TeamDev-Ltd__OpenFaces,2979,2859,0.965,0.965,0.965,0.965,0.938,0.426
9,apache__lucene-solr,1256,974,0.557,0.559,0.558,0.559,0.266,0.399


In [7]:
# Reload classifier_utils so the kernel uses the current on-disk version of the
# module (e.g. after it was edited during the session) before building the
# NA-replaced results. On a fresh kernel this simply re-executes the module.
importlib.reload(classifier_utils)

# Same decision tree, project list and non-feature columns as the first run,
# but with replace_na=True: per the notebook's goal, NA values are replaced
# with a placeholder (the replacement itself is done inside classifier_utils,
# which is not visible here).
result_dt2 = classifier_utils.ProjectsResults(dt, projects, non_features_columns, replace_na=True)

In [8]:
# Per-project report for the decision tree with NA replacement enabled.
# In the displayed output, "observations (wt NaN)" equals "observations"
# for every listed project.
report_dt2 = result_dt2.get_report_df(include_overall=True)
# Bare expression: rendered as the cell output.
report_dt2

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,5512,0.972,0.971,0.971,0.971,0.454,0.947
1,apache__directory-server,845,845,0.906,0.907,0.906,0.907,0.515,0.807
2,jgralab__jgralab,2072,2072,0.819,0.819,0.819,0.819,0.48,0.653
3,CloudStack-extras__CloudStack-archive,1424,1424,0.781,0.787,0.784,0.787,0.453,0.611
4,Unidata__thredds,1154,1154,0.896,0.896,0.896,0.896,0.775,0.538
5,apache__accumulo,4113,4113,0.848,0.848,0.848,0.848,0.687,0.514
6,getrailo__railo,815,815,0.727,0.73,0.728,0.73,0.459,0.501
7,Ramblurr__Anki-Android,892,892,0.695,0.68,0.687,0.68,0.436,0.433
8,TeamDev-Ltd__OpenFaces,2979,2979,0.954,0.955,0.955,0.955,0.924,0.4
9,apache__lucene-solr,1256,1256,0.541,0.54,0.54,0.54,0.289,0.353


In [9]:
# Line up both decision-tree reports by project; the inner join keeps only the
# projects present in both runs. Columns from the run without NA replacement
# get the '_withoutNA' suffix, those from the run with replacement '_withNA'.
df_inner_dt = report_dt.merge(
    report_dt2,
    on='project',
    how='inner',
    suffixes=('_withoutNA', '_withNA'),
)


def _dt_accuracy_improvement(row):
    """Normalized improvement of the NA-replaced accuracy over the original one."""
    return classifier_utils.get_normalized_improvement(
        row['accuracy_withNA'],
        row['accuracy_withoutNA'],
    )


# Keep only the project name and the two accuracy columns, then add the
# row-wise normalized improvement.
accuracy_dt = df_inner_dt.filter(regex="project|accuracy.*").copy()
accuracy_dt['improvement'] = accuracy_dt.apply(_dt_accuracy_improvement, axis=1)
accuracy_dt

Unnamed: 0,project,accuracy_withoutNA,accuracy_withNA,improvement
0,CCI-MIT__XCoLab,0.972,0.971,-0.001029
1,apache__directory-server,0.931,0.907,-0.025779
2,jgralab__jgralab,0.836,0.819,-0.020335
3,Unidata__thredds,0.909,0.896,-0.014301
4,CloudStack-extras__CloudStack-archive,0.76,0.787,0.1125
5,apache__accumulo,0.834,0.848,0.084337
6,Ramblurr__Anki-Android,0.694,0.68,-0.020173
7,getrailo__railo,0.643,0.73,0.243697
8,TeamDev-Ltd__OpenFaces,0.965,0.955,-0.010363
9,apache__lucene-solr,0.559,0.54,-0.033989


The overall average accuracy with the decision tree decreased from 0.76904 to 0.758103. It decreased in 15 of the 25 projects and increased in the other 10.
On the other hand, imputing a constant in place of the NA values increased the number of projects included in the results from 25 to 29.

# Random forest

In [10]:
# Random forest: 400 trees, each split considering 30% of the features
# (max_features=0.3), fitted with 5 parallel jobs; random_state is fixed
# for reproducibility.
model_3 = RandomForestClassifier(random_state=99, n_jobs=5, n_estimators=400, max_features=0.3, min_samples_leaf=1)

In [11]:
# Random-forest results without NA replacement (replace_na=False).
# NOTE: despite the "dt" in the variable names, these hold random-forest results.
result_dt3 = classifier_utils.ProjectsResults(model_3, projects, non_features_columns, replace_na=False)
# Per-project report; include_overall=True presumably adds an aggregate row — confirm in classifier_utils.
report_dt3 = result_dt3.get_report_df(include_overall=True)

In [12]:
# Display the random-forest report for the run without NA replacement.
report_dt3

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.971,0.976,0.973,0.976,0.573,0.943
1,apache__directory-server,845,652,0.935,0.939,0.936,0.939,0.512,0.874
2,jgralab__jgralab,2072,1802,0.867,0.869,0.867,0.869,0.491,0.743
3,CloudStack-extras__CloudStack-archive,1424,1106,0.798,0.803,0.798,0.803,0.437,0.65
4,Unidata__thredds,1154,950,0.913,0.919,0.914,0.919,0.777,0.637
5,apache__accumulo,4113,3148,0.86,0.864,0.86,0.864,0.635,0.627
6,Ramblurr__Anki-Android,892,759,0.718,0.74,0.722,0.74,0.439,0.538
7,getrailo__railo,815,572,0.687,0.701,0.693,0.701,0.378,0.52
8,apache__lucene-solr,1256,974,0.636,0.644,0.638,0.644,0.266,0.515
9,TeamDev-Ltd__OpenFaces,2979,2859,0.967,0.969,0.968,0.969,0.938,0.5


In [13]:
# Random-forest results with NA replacement enabled (replace_na=True),
# reusing the same model_3 estimator object as the previous run.
# NOTE: despite the "dt" in the variable names, these hold random-forest results.
result_dt4 = classifier_utils.ProjectsResults(model_3, projects, non_features_columns, replace_na=True)
# Per-project report; include_overall=True presumably adds an aggregate row — confirm in classifier_utils.
report_dt4 = result_dt4.get_report_df(include_overall=True)

In [14]:
# Display the random-forest report for the run with NA replacement.
report_dt4

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,5512,0.973,0.977,0.974,0.977,0.454,0.957
1,apache__directory-server,845,845,0.93,0.933,0.931,0.933,0.515,0.861
2,jgralab__jgralab,2072,2072,0.865,0.867,0.864,0.867,0.48,0.745
3,CloudStack-extras__CloudStack-archive,1424,1424,0.814,0.82,0.815,0.82,0.453,0.67
4,Unidata__thredds,1154,1154,0.907,0.913,0.908,0.913,0.775,0.615
5,apache__accumulo,4113,4113,0.872,0.879,0.874,0.879,0.687,0.612
6,getrailo__railo,815,815,0.756,0.774,0.763,0.774,0.459,0.583
7,Ramblurr__Anki-Android,892,892,0.741,0.763,0.745,0.763,0.436,0.581
8,apache__lucene-solr,1256,1256,0.637,0.642,0.633,0.642,0.289,0.496
9,zkoss__zk,1087,1087,0.786,0.804,0.792,0.804,0.622,0.482


In [15]:
# Line up both random-forest reports by project; the inner join keeps only the
# projects present in both runs. Columns from the run without NA replacement
# get the '_withoutNA' suffix, those from the run with replacement '_withNA'.
df_inner_rf = report_dt3.merge(
    report_dt4,
    on='project',
    how='inner',
    suffixes=('_withoutNA', '_withNA'),
)


def _rf_accuracy_improvement(row):
    """Normalized improvement of the NA-replaced accuracy over the original one."""
    return classifier_utils.get_normalized_improvement(
        row['accuracy_withNA'],
        row['accuracy_withoutNA'],
    )


# Keep only the project name and the two accuracy columns, then add the
# row-wise normalized improvement.
accuracy_rf = df_inner_rf.filter(regex="project|accuracy.*").copy()
accuracy_rf['improvement'] = accuracy_rf.apply(_rf_accuracy_improvement, axis=1)
accuracy_rf

Unnamed: 0,project,accuracy_withoutNA,accuracy_withNA,improvement
0,CCI-MIT__XCoLab,0.976,0.977,0.041667
1,apache__directory-server,0.939,0.933,-0.00639
2,jgralab__jgralab,0.869,0.867,-0.002301
3,CloudStack-extras__CloudStack-archive,0.803,0.82,0.086294
4,Unidata__thredds,0.919,0.913,-0.006529
5,apache__accumulo,0.864,0.879,0.110294
6,Ramblurr__Anki-Android,0.74,0.763,0.088462
7,getrailo__railo,0.701,0.774,0.244147
8,apache__lucene-solr,0.644,0.642,-0.003106
9,TeamDev-Ltd__OpenFaces,0.969,0.961,-0.008256


The overall average accuracy with the random forest decreased from 0.80804 to 0.80731. It decreased in 8 of the 25 projects, increased in 16 and remained constant in 1.
On the other hand, imputing a constant in place of the NA values increased the number of projects included in the results from 25 to 29.