Goal of this notebook:

Investigate the impact of replacing NA values in the dataset with a placeholder on the classification accuracy of the decision tree and random forest classifiers. (XGBoost Random Forest works with NA values natively.)

In [1]:
import warnings

import numpy as np
import pandas as pd
# `display` is part of the public IPython.display API; importing it from
# IPython.core.display is deprecated (since IPython 7.14).
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# Project-local helpers (ProjectsResults, get_normalized_improvement).
# Imported before the warnings filter below, as in the original cell order.
import classifier_utils

# Show every column when a DataFrame is rendered instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Hide UserWarning messages for the remainder of the notebook.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Column names handed to classifier_utils.ProjectsResults as the non-feature
# columns (presumably excluded from the model inputs - confirm in classifier_utils).
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Read the labelled dataset from disk (path is relative to the notebook's location).
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")

# Distinct values of the 'project' column, in order of first appearance.
project_values = selected_dataset['project'].unique()
projects = [*project_values]

# Decision Tree

In [4]:
# Decision tree classifier with a fixed random_state, used for both decision-tree runs below.
dt = DecisionTreeClassifier(random_state=99)

In [5]:
# Evaluate the decision tree per project; replace_na is not passed, so the helper's default applies.
# NOTE(review): judging from the report below ("observations (wt NaN)" < "observations"),
# rows containing NaN appear to be left out in this mode - confirm in classifier_utils.
result_dt = classifier_utils.ProjectsResults(dt, projects, non_features_columns)

In [6]:
# Per-project report for the decision tree run without NA replacement.
# include_overall=True presumably adds an aggregated "Overall" row - confirm in classifier_utils.
report_dt = result_dt.get_report_df(include_overall=True)
report_dt

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.972,0.972,0.972,0.573,0.933
1,apache__directory-server,845,652,0.926,0.928,0.927,0.928,0.512,0.852
2,jgralab__jgralab,2072,1802,0.839,0.839,0.839,0.839,0.491,0.683
3,Unidata__thredds,1154,950,0.907,0.909,0.908,0.909,0.777,0.594
4,apache__accumulo,4113,3148,0.838,0.838,0.838,0.838,0.635,0.557
5,CloudStack-extras__CloudStack-archive,1424,1106,0.748,0.75,0.749,0.75,0.437,0.555
6,getrailo__railo,815,572,0.656,0.65,0.652,0.65,0.378,0.438
7,Ramblurr__Anki-Android,892,759,0.672,0.682,0.677,0.682,0.439,0.434
8,TeamDev-Ltd__OpenFaces,2979,2859,0.963,0.964,0.964,0.964,0.938,0.415
9,apache__lucene-solr,1256,974,0.558,0.555,0.556,0.555,0.266,0.394


In [7]:
# NOTE(review): a mid-notebook import plus module reload looks like a development leftover;
# consider moving the import to the first cell (or using %autoreload) so a fresh
# "Restart & Run All" does not depend on it.
import importlib
# Re-execute classifier_utils so that edits made to the module after the first import are picked up.
importlib.reload(classifier_utils)
# Same decision tree evaluation, this time asking the helper to replace NA values (replace_na=True).
result_dt2 = classifier_utils.ProjectsResults(dt, projects, non_features_columns, replace_na=True)

In [8]:
# Per-project report for the decision tree run with NA values replaced.
report_dt2 = result_dt2.get_report_df(include_overall=True)
report_dt2

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,5512,0.971,0.971,0.971,0.971,0.454,0.946
1,apache__directory-server,845,845,0.902,0.902,0.902,0.902,0.515,0.798
2,jgralab__jgralab,2072,2072,0.823,0.823,0.823,0.823,0.48,0.66
3,CloudStack-extras__CloudStack-archive,1424,1424,0.778,0.78,0.779,0.78,0.453,0.598
4,Unidata__thredds,1154,1154,0.903,0.904,0.903,0.904,0.775,0.573
5,apache__accumulo,4113,4113,0.852,0.854,0.853,0.854,0.687,0.535
6,getrailo__railo,815,815,0.725,0.721,0.723,0.721,0.459,0.485
7,Ramblurr__Anki-Android,892,892,0.684,0.674,0.679,0.674,0.436,0.421
8,TeamDev-Ltd__OpenFaces,2979,2979,0.952,0.952,0.952,0.952,0.924,0.369
9,apache__lucene-solr,1256,1256,0.537,0.538,0.538,0.538,0.289,0.351


In [9]:
def _dt_accuracy_change(row):
    """Normalized accuracy change for one project row.

    Passes the accuracy of the replace_na=True run first and the accuracy of the
    default run second to classifier_utils.get_normalized_improvement.
    """
    return classifier_utils.get_normalized_improvement(row['accuracy_withNA'], row['accuracy_withoutNA'])


# Pair the two decision-tree reports project by project; only projects present in
# both reports survive the inner join. Columns from report_dt get the
# '_withoutNA' suffix and columns from report_dt2 get '_withNA'.
df_inner_dt = report_dt.merge(
    report_dt2,
    how='inner',
    on='project',
    suffixes=('_withoutNA', '_withNA'),
)

# Keep the project name and the two accuracy columns, then add the per-project change.
accuracy_dt = df_inner_dt.filter(regex="project|accuracy.*").copy()
accuracy_dt['improvement'] = accuracy_dt.apply(_dt_accuracy_change, axis=1)
accuracy_dt

Unnamed: 0,project,accuracy_withoutNA,accuracy_withNA,improvement
0,CCI-MIT__XCoLab,0.972,0.971,-0.001029
1,apache__directory-server,0.928,0.902,-0.028017
2,jgralab__jgralab,0.839,0.823,-0.01907
3,Unidata__thredds,0.909,0.904,-0.005501
4,apache__accumulo,0.838,0.854,0.098765
5,CloudStack-extras__CloudStack-archive,0.75,0.78,0.12
6,getrailo__railo,0.65,0.721,0.202857
7,Ramblurr__Anki-Android,0.682,0.674,-0.01173
8,TeamDev-Ltd__OpenFaces,0.964,0.952,-0.012448
9,apache__lucene-solr,0.555,0.538,-0.030631


In [10]:
# Show the decision-tree accuracy comparison table again (same frame as built in the previous cell).
accuracy_dt

Unnamed: 0,project,accuracy_withoutNA,accuracy_withNA,improvement
0,CCI-MIT__XCoLab,0.972,0.971,-0.001029
1,apache__directory-server,0.928,0.902,-0.028017
2,jgralab__jgralab,0.839,0.823,-0.01907
3,Unidata__thredds,0.909,0.904,-0.005501
4,apache__accumulo,0.838,0.854,0.098765
5,CloudStack-extras__CloudStack-archive,0.75,0.78,0.12
6,getrailo__railo,0.65,0.721,0.202857
7,Ramblurr__Anki-Android,0.682,0.674,-0.01173
8,TeamDev-Ltd__OpenFaces,0.964,0.952,-0.012448
9,apache__lucene-solr,0.555,0.538,-0.030631


In [19]:
# Count the projects whose decision-tree accuracy went up or down after replacing
# NA values; a row labelled 'Overall' is not a project and is left out.
dt_project_rows = accuracy_dt['project'] != 'Overall'
dt_change = accuracy_dt['improvement']

increased_n = int((dt_project_rows & (dt_change > 0)).sum())
decreased_n = int((dt_project_rows & (dt_change < 0)).sum())
print(f'Increased in {increased_n} and decreased in {decreased_n} projects.')

Increased in 8 and decreased in 12 projects.


The overall average accuracy with the decision tree decreased from 0.76205 to 0.745957 when the NA values were replaced. It decreased in 12 of the 20 projects and increased in 8.
On the other hand, imputing a constant in place of the NA values allowed the number of projects covered to go from 20 to 23.

# Random forest

In [11]:
# Random forest classifier: 400 trees, 30% of the features considered per split,
# up to 5 parallel jobs, and a fixed random_state.
model_3 = RandomForestClassifier(random_state=99, n_jobs=5, n_estimators=400, max_features=0.3, min_samples_leaf=1)

In [12]:
# Random forest evaluated per project without NA replacement (replace_na=False).
# NOTE(review): the "dt" in these names is a carry-over from the decision-tree section;
# the model here is the random forest (model_3).
result_dt3 = classifier_utils.ProjectsResults(model_3, projects, non_features_columns, replace_na=False)
report_dt3 = result_dt3.get_report_df(include_overall=True)

In [13]:
# Display the random forest report for the run without NA replacement.
report_dt3

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.973,0.976,0.573,0.943
1,apache__directory-server,845,652,0.934,0.937,0.936,0.937,0.512,0.871
2,jgralab__jgralab,2072,1802,0.864,0.866,0.864,0.866,0.491,0.737
3,CloudStack-extras__CloudStack-archive,1424,1106,0.799,0.806,0.8,0.806,0.437,0.655
4,apache__accumulo,4113,3148,0.858,0.863,0.859,0.863,0.635,0.625
5,Unidata__thredds,1154,950,0.908,0.916,0.911,0.916,0.777,0.623
6,Ramblurr__Anki-Android,892,759,0.722,0.742,0.724,0.742,0.439,0.54
7,getrailo__railo,815,572,0.699,0.712,0.703,0.712,0.378,0.537
8,apache__lucene-solr,1256,974,0.639,0.646,0.64,0.646,0.266,0.517
9,TeamDev-Ltd__OpenFaces,2979,2859,0.966,0.969,0.967,0.969,0.938,0.494


In [14]:
# Random forest evaluated per project with NA values replaced (replace_na=True).
# NOTE(review): despite the "dt" in the names, these hold random forest results.
result_dt4 = classifier_utils.ProjectsResults(model_3, projects, non_features_columns, replace_na=True)
report_dt4 = result_dt4.get_report_df(include_overall=True)

In [15]:
# Display the random forest report for the run with NA values replaced.
report_dt4

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,5512,0.973,0.977,0.974,0.977,0.454,0.957
1,apache__directory-server,845,845,0.929,0.931,0.93,0.931,0.515,0.859
2,jgralab__jgralab,2072,2072,0.865,0.868,0.865,0.868,0.48,0.746
3,CloudStack-extras__CloudStack-archive,1424,1424,0.816,0.822,0.816,0.822,0.453,0.674
4,Unidata__thredds,1154,1154,0.907,0.913,0.908,0.913,0.775,0.615
5,apache__accumulo,4113,4113,0.871,0.879,0.874,0.879,0.687,0.613
6,getrailo__railo,815,815,0.762,0.779,0.769,0.779,0.459,0.592
7,Ramblurr__Anki-Android,892,892,0.741,0.763,0.744,0.763,0.436,0.581
8,TeamDev-Ltd__OpenFaces,2979,2979,0.959,0.962,0.96,0.962,0.924,0.502
9,apache__lucene-solr,1256,1256,0.635,0.64,0.632,0.64,0.289,0.494


In [16]:
def _rf_accuracy_change(row):
    """Normalized accuracy change for one project row.

    Passes the accuracy of the replace_na=True run first and the accuracy of the
    replace_na=False run second to classifier_utils.get_normalized_improvement.
    """
    return classifier_utils.get_normalized_improvement(row['accuracy_withNA'], row['accuracy_withoutNA'])


# Pair the two random-forest reports project by project; only projects present in
# both reports survive the inner join. Columns from report_dt3 get the
# '_withoutNA' suffix and columns from report_dt4 get '_withNA'.
df_inner_rf = report_dt3.merge(
    report_dt4,
    how='inner',
    on='project',
    suffixes=('_withoutNA', '_withNA'),
)

# Keep the project name and the two accuracy columns, then add the per-project change.
accuracy_rf = df_inner_rf.filter(regex="project|accuracy.*").copy()
accuracy_rf['improvement'] = accuracy_rf.apply(_rf_accuracy_change, axis=1)
accuracy_rf

Unnamed: 0,project,accuracy_withoutNA,accuracy_withNA,improvement
0,CCI-MIT__XCoLab,0.976,0.977,0.041667
1,apache__directory-server,0.937,0.931,-0.006403
2,jgralab__jgralab,0.866,0.868,0.014925
3,CloudStack-extras__CloudStack-archive,0.806,0.822,0.082474
4,apache__accumulo,0.863,0.879,0.116788
5,Unidata__thredds,0.916,0.913,-0.003275
6,Ramblurr__Anki-Android,0.742,0.763,0.081395
7,getrailo__railo,0.712,0.779,0.232639
8,apache__lucene-solr,0.646,0.64,-0.009288
9,TeamDev-Ltd__OpenFaces,0.969,0.962,-0.007224


In [21]:
# Count the projects whose random-forest accuracy went up, went down, or stayed the
# same after replacing NA values; a row labelled 'Overall' is not a project and is left out.
rf_project_rows = accuracy_rf['project'] != 'Overall'
rf_change = accuracy_rf['improvement']

increased_n = int((rf_project_rows & (rf_change > 0)).sum())
decreased_n = int((rf_project_rows & (rf_change < 0)).sum())
constant_n = int((rf_project_rows & (rf_change == 0)).sum())
print(f'Increased in {increased_n} and decreased in {decreased_n} projects. Constant in {constant_n}')

Increased in 13 and decreased in 6 projects. Constant in 1


The overall average accuracy with the random forest decreased from 0.80485 to 0.798696 when the NA values were replaced. It decreased in 6 of the 20 projects, increased in 13, and remained constant in 1.
On the other hand, imputing a constant in place of the NA values allowed the number of projects covered to go from 20 to 23.