Goal of this notebook:

Investigate the impact of using discretized datasets (log2, log10, and mdlp) on the classification accuracy of decision tree, random forest, and XGBoost random forest classifiers.

In [1]:
import importlib
import warnings

import numpy as np
import pandas as pd
from IPython.core.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRFClassifier

import classifier_utils
import configs

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns treated as non-features; this list is handed to
# classifier_utils.ProjectsResults in the cells below.
non_features_columns = [
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct values of its
# 'project' column as a plain Python list.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset['project'].unique()
projects = list(projects)

# Decision Tree

In [4]:
# Decision tree with default hyperparameters; random_state is pinned so that
# repeated runs use the same seed.
dt = DecisionTreeClassifier(random_state=99)

In [5]:
# Baseline run: decision tree on the original (non-discretized) data.
# NOTE(review): ProjectsResults presumably evaluates the classifier once per
# project — confirm in classifier_utils.
result_dt_default = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
)

In [6]:
# Report DataFrame for the baseline decision tree run. include_overall=True
# presumably adds an aggregate entry across projects — confirm in classifier_utils.
report_dt_default = result_dt_default.get_report_df(include_overall=True)
# report_dt_default

In [7]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Decision tree on the log10-discretized variant of the data.
# NOTE(review): the path is presumably where ProjectsResults reads the
# per-project discretized files from — confirm in classifier_utils.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_dt_log10 = classifier_utils.ProjectsResults(dt, projects, non_features_columns, discretized_path)
report_dt_log10 = result_dt_log10.get_report_df(include_overall=True)
# report_dt_log10

In [8]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Decision tree on the log2-discretized variant of the data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_dt_log2 = classifier_utils.ProjectsResults(dt, projects, non_features_columns, discretized_path)
report_dt_log2 = result_dt_log2.get_report_df(include_overall=True)
# report_dt_log2

In [9]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Decision tree on the mdlp-discretized variant of the data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_dt_mdlp = classifier_utils.ProjectsResults(dt, projects, non_features_columns, discretized_path)
report_dt_mdlp = result_dt_mdlp.get_report_df(include_overall=True)

In [10]:
# Suffix every column of the log2 / mdlp reports, then restore the join key name.
df_inner_dt_log2 = report_dt_log2.add_suffix("_log2").rename(columns={"project_log2": "project"})
df_inner_dt_mdlp = report_dt_mdlp.add_suffix("_mdlp").rename(columns={"project_mdlp": "project"})

# One row per project present in all four reports (inner joins on 'project').
df_inner_dt = (
    report_dt_default
    .merge(report_dt_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_dt_log2, on='project', how='inner')
    .merge(df_inner_dt_mdlp, on='project', how='inner')
)

# Keep only the project and accuracy columns, then add one improvement column
# per discretization, each computed against the default accuracy.
accuracy_dt = df_inner_dt.filter(regex=("project|accuracy.*")).copy()
for variant in ('log10', 'log2', 'mdlp'):
    accuracy_dt[f'improv._{variant}'] = accuracy_dt.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[f'accuracy_{variant}'], row['accuracy_default']),
        axis=1,
    )
accuracy_dt

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.972,0.968,0.971,0.97,-0.004115,-0.001029,-0.002058
1,apache__directory-server,0.931,0.919,0.919,0.882,-0.012889,-0.012889,-0.052632
2,jgralab__jgralab,0.836,0.756,0.827,0.797,-0.095694,-0.010766,-0.046651
3,Unidata__thredds,0.909,0.899,0.9,0.856,-0.011001,-0.009901,-0.058306
4,CloudStack-extras__CloudStack-archive,0.76,0.756,0.748,0.742,-0.005263,-0.015789,-0.023684
5,apache__accumulo,0.834,0.834,0.832,0.831,0.0,-0.002398,-0.003597
6,Ramblurr__Anki-Android,0.694,0.656,0.675,0.68,-0.054755,-0.027378,-0.020173
7,getrailo__railo,0.643,0.636,0.664,0.65,-0.010886,0.058824,0.019608
8,TeamDev-Ltd__OpenFaces,0.965,0.963,0.964,0.964,-0.002073,-0.001036,-0.001036
9,apache__lucene-solr,0.559,0.57,0.56,0.544,0.024943,0.002268,-0.026834


## Including NA values

In [11]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Baseline decision tree run with replace_na=True.
# NOTE(review): presumably NA values are replaced instead of the affected rows
# being discarded — confirm in classifier_utils.
result_dt_default_wna = classifier_utils.ProjectsResults(dt, projects, non_features_columns, replace_na=True)

In [12]:
# Report DataFrame for the baseline decision tree run with replace_na=True.
report_dt_default_wna = result_dt_default_wna.get_report_df(include_overall=True)
# report_dt_default_wna

In [13]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Decision tree on the log2-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_dt_log2_wna = classifier_utils.ProjectsResults(dt, projects, non_features_columns, discretized_path, replace_na=True)
report_dt_log2_wna = result_dt_log2_wna.get_report_df(include_overall=True)
# report_dt_log2_wna

In [14]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Decision tree on the log10-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_dt_log10_wna = classifier_utils.ProjectsResults(dt, projects, non_features_columns, discretized_path, replace_na=True)
report_dt_log10_wna = result_dt_log10_wna.get_report_df(include_overall=True)
# report_dt_log10_wna

In [15]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Decision tree on the mdlp-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_dt_mdlp_wna = classifier_utils.ProjectsResults(dt, projects, non_features_columns, discretized_path, replace_na=True)
report_dt_mdlp_wna = result_dt_mdlp_wna.get_report_df(include_overall=True)
# report_dt_mdlp_wna

In [16]:
# Suffix every column of the log2 / mdlp reports, then restore the join key name.
df_inner_dt_log2 = report_dt_log2_wna.add_suffix("_log2").rename(columns={"project_log2": "project"})
df_inner_dt_mdlp = report_dt_mdlp_wna.add_suffix("_mdlp").rename(columns={"project_mdlp": "project"})

# One row per project present in all four replace_na=True reports.
df_inner_dt = (
    report_dt_default_wna
    .merge(report_dt_log10_wna, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_dt_log2, on='project', how='inner')
    .merge(df_inner_dt_mdlp, on='project', how='inner')
)

# Alternative filter that also keeps the observations_default column:
#   regex=("project|observations_default|accuracy.*")
accuracy_dt = df_inner_dt.filter(regex=("project|accuracy.*")).copy()
# One improvement column per discretization, each against the default accuracy.
for variant in ('log10', 'log2', 'mdlp'):
    accuracy_dt[f'improv._{variant}'] = accuracy_dt.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[f'accuracy_{variant}'], row['accuracy_default']),
        axis=1,
    )
accuracy_dt

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.971,0.971,0.971,0.969,0.0,0.0,-0.00206
1,apache__directory-server,0.907,0.89,0.879,0.882,-0.018743,-0.030871,-0.027563
2,jgralab__jgralab,0.819,0.75,0.818,0.797,-0.084249,-0.001221,-0.026862
3,CloudStack-extras__CloudStack-archive,0.787,0.758,0.773,0.744,-0.036849,-0.017789,-0.054638
4,Unidata__thredds,0.896,0.893,0.886,0.856,-0.003348,-0.011161,-0.044643
5,apache__accumulo,0.848,0.848,0.85,0.849,0.0,0.013158,0.006579
6,getrailo__railo,0.73,0.715,0.728,0.718,-0.020548,-0.00274,-0.016438
7,Ramblurr__Anki-Android,0.68,0.676,0.667,0.688,-0.005882,-0.019118,0.025
8,TeamDev-Ltd__OpenFaces,0.955,0.958,0.954,0.957,0.066667,-0.001047,0.044444
9,apache__lucene-solr,0.54,0.537,0.523,0.534,-0.005556,-0.031481,-0.011111


The overall average accuracy did not increase when using the discretized datasets (log2, log10, and mdlp). This holds both with and without NA values.

# Random Forest

In [17]:
# Random forest with fixed hyperparameters; random_state is pinned so that
# repeated runs use the same seed.
rf = RandomForestClassifier(
    random_state=99,
    n_jobs=5,
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
)

In [18]:
# Baseline run: random forest on the original (non-discretized) data.
result_rf_default = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
)

In [19]:
# Report DataFrame for the baseline random forest run. include_overall=True
# presumably adds an aggregate entry across projects — confirm in classifier_utils.
report_rf_default = result_rf_default.get_report_df(include_overall=True)
# report_rf_default

In [20]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Random forest on the log10-discretized variant of the data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_rf_log10 = classifier_utils.ProjectsResults(rf, projects, non_features_columns, discretized_path)
report_rf_log10 = result_rf_log10.get_report_df(include_overall=True)
# report_rf_log10

In [21]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Random forest on the log2-discretized variant of the data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_rf_log2 = classifier_utils.ProjectsResults(rf, projects, non_features_columns, discretized_path)
report_rf_log2 = result_rf_log2.get_report_df(include_overall=True)
# report_rf_log2

In [22]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Random forest on the mdlp-discretized variant of the data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_rf_mdlp = classifier_utils.ProjectsResults(rf, projects, non_features_columns, discretized_path)
report_rf_mdlp = result_rf_mdlp.get_report_df(include_overall=True)
# report_rf_mdlp

In [30]:
# Suffix every column of the log2 / mdlp reports, then restore the join key name.
df_inner_rf_log2 = report_rf_log2.add_suffix("_log2").rename(columns={"project_log2": "project"})
df_inner_rf_mdlp = report_rf_mdlp.add_suffix("_mdlp").rename(columns={"project_mdlp": "project"})

# One row per project present in all four reports (inner joins on 'project').
df_inner_rf = (
    report_rf_default
    .merge(report_rf_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_rf_log2, on='project', how='inner')
    .merge(df_inner_rf_mdlp, on='project', how='inner')
)

# Keep only the project and accuracy columns, then add one improvement column
# per discretization, each computed against the default accuracy.
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
for variant in ('log10', 'log2', 'mdlp'):
    accuracy_rf[f'improv._{variant}'] = accuracy_rf.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[f'accuracy_{variant}'], row['accuracy_default']),
        axis=1,
    )
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.976,0.973,0.975,0.973,-0.003074,-0.001025,-0.003074
1,apache__directory-server,0.939,0.922,0.931,0.91,-0.018104,-0.00852,-0.030884
2,jgralab__jgralab,0.869,0.812,0.867,0.834,-0.065593,-0.002301,-0.040276
3,CloudStack-extras__CloudStack-archive,0.803,0.807,0.807,0.774,0.020305,0.020305,-0.036115
4,Unidata__thredds,0.919,0.924,0.924,0.866,0.061728,0.061728,-0.057671
5,apache__accumulo,0.864,0.861,0.863,0.858,-0.003472,-0.001157,-0.006944
6,Ramblurr__Anki-Android,0.74,0.74,0.734,0.713,0.0,-0.008108,-0.036486
7,getrailo__railo,0.701,0.697,0.706,0.703,-0.005706,0.016722,0.006689
8,apache__lucene-solr,0.644,0.634,0.653,0.583,-0.015528,0.025281,-0.09472
9,TeamDev-Ltd__OpenFaces,0.969,0.969,0.969,0.972,0.0,0.0,0.096774


## Including NA values

In [24]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Baseline random forest run with replace_na=True.
# NOTE(review): presumably NA values are replaced instead of the affected rows
# being discarded — confirm in classifier_utils.
result_rf_default_wna = classifier_utils.ProjectsResults(rf, projects, non_features_columns, replace_na=True)

In [25]:
# Report DataFrame for the baseline random forest run with replace_na=True.
report_rf_default_wna = result_rf_default_wna.get_report_df(include_overall=True)
# report_rf_default_wna

In [26]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Random forest on the log2-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_rf_log2_wna = classifier_utils.ProjectsResults(rf, projects, non_features_columns, discretized_path, replace_na=True)
report_rf_log2_wna = result_rf_log2_wna.get_report_df(include_overall=True)
# report_rf_log2_wna

In [27]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Random forest on the log10-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_rf_log10_wna = classifier_utils.ProjectsResults(rf, projects, non_features_columns, discretized_path, replace_na=True)
report_rf_log10_wna = result_rf_log10_wna.get_report_df(include_overall=True)
# report_rf_log10_wna

In [28]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# Random forest on the mdlp-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_rf_mdlp_wna = classifier_utils.ProjectsResults(rf, projects, non_features_columns, discretized_path, replace_na=True)
report_rf_mdlp_wna = result_rf_mdlp_wna.get_report_df(include_overall=True)
# report_rf_mdlp_wna

In [31]:
# Suffix every column of the log2 / mdlp reports, then restore the join key name.
df_inner_rf_log2 = report_rf_log2_wna.add_suffix("_log2").rename(columns={"project_log2": "project"})
df_inner_rf_mdlp = report_rf_mdlp_wna.add_suffix("_mdlp").rename(columns={"project_mdlp": "project"})

# One row per project present in all four replace_na=True reports.
df_inner_rf = (
    report_rf_default_wna
    .merge(report_rf_log10_wna, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_rf_log2, on='project', how='inner')
    .merge(df_inner_rf_mdlp, on='project', how='inner')
)

# Alternative filter that also keeps the observations_default column:
#   regex=("project|observations_default|accuracy.*")
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
# One improvement column per discretization, each against the default accuracy.
for variant in ('log10', 'log2', 'mdlp'):
    accuracy_rf[f'improv._{variant}'] = accuracy_rf.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[f'accuracy_{variant}'], row['accuracy_default']),
        axis=1,
    )
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.977,0.975,0.977,0.975,-0.002047,0.0,-0.002047
1,apache__directory-server,0.933,0.923,0.925,0.91,-0.010718,-0.008574,-0.024652
2,jgralab__jgralab,0.867,0.813,0.863,0.834,-0.062284,-0.004614,-0.038062
3,CloudStack-extras__CloudStack-archive,0.82,0.829,0.824,0.79,0.05,0.022222,-0.036585
4,Unidata__thredds,0.913,0.919,0.915,0.877,0.068966,0.022989,-0.03943
5,apache__accumulo,0.879,0.876,0.877,0.873,-0.003413,-0.002275,-0.006826
6,getrailo__railo,0.774,0.761,0.777,0.767,-0.016796,0.013274,-0.009044
7,Ramblurr__Anki-Android,0.763,0.746,0.748,0.723,-0.02228,-0.019659,-0.052425
8,apache__lucene-solr,0.642,0.626,0.646,0.576,-0.024922,0.011173,-0.102804
9,zkoss__zk,0.804,0.791,0.804,0.772,-0.016169,0.0,-0.039801


The overall average accuracy did not increase when using the discretized datasets (log2, log10, and mdlp). This holds both with and without NA values.

# XGBoost Random Forest

In [33]:
# XGBoost random forest with fixed hyperparameters; random_state is pinned so
# that repeated runs use the same seed.
xg = XGBRFClassifier(
    random_state=99,
    subsample=0.9,
    eval_metric='mlogloss',
    n_estimators=400,
    colsample_bynode=0.4,
)

In [34]:
# Baseline run: XGBoost random forest on the original (non-discretized) data.
# NOTE(review): drop_na=False presumably keeps rows containing NA values —
# confirm in classifier_utils.
result_xg_default = classifier_utils.ProjectsResults(
    xg,
    projects,
    non_features_columns,
    drop_na=False,
)

In [35]:
# Report DataFrame for the baseline XGBoost run. include_overall=True
# presumably adds an aggregate entry across projects — confirm in classifier_utils.
report_xg_default = result_xg_default.get_report_df(include_overall=True)
# report_xg_default

In [36]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# XGBoost random forest on the log10-discretized data, with drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_xg_log10 = classifier_utils.ProjectsResults(xg, projects, non_features_columns, discretized_path, drop_na=False)
report_xg_log10 = result_xg_log10.get_report_df(include_overall=True)
# report_xg_log10

In [37]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# XGBoost random forest on the log2-discretized data, with drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_xg_log2 = classifier_utils.ProjectsResults(xg, projects, non_features_columns, discretized_path, drop_na=False)
report_xg_log2 = result_xg_log2.get_report_df(include_overall=True)
# report_xg_log2

In [38]:
# Reload classifier_utils so edits made to the module after the kernel started
# take effect (importlib is imported once, in the notebook's import cell).
importlib.reload(classifier_utils)
# XGBoost random forest on the mdlp-discretized data, with drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_xg_mdlp = classifier_utils.ProjectsResults(xg, projects, non_features_columns, discretized_path, drop_na=False)
report_xg_mdlp = result_xg_mdlp.get_report_df(include_overall=True)
# report_xg_mdlp

In [39]:
# Suffix every column of the log2 / mdlp reports, then restore the join key name.
df_inner_xg_log2 = report_xg_log2.add_suffix("_log2").rename(columns={"project_log2": "project"})
df_inner_xg_mdlp = report_xg_mdlp.add_suffix("_mdlp").rename(columns={"project_mdlp": "project"})

# One row per project present in all four reports (inner joins on 'project').
df_inner_xg = (
    report_xg_default
    .merge(report_xg_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_xg_log2, on='project', how='inner')
    .merge(df_inner_xg_mdlp, on='project', how='inner')
)

# Keep only the project and accuracy columns, then add one improvement column
# per discretization, each computed against the default accuracy.
accuracy_xg = df_inner_xg.filter(regex=("project|accuracy.*")).copy()
for variant in ('log10', 'log2', 'mdlp'):
    accuracy_xg[f'improv._{variant}'] = accuracy_xg.apply(
        lambda row: classifier_utils.get_normalized_improvement(row[f'accuracy_{variant}'], row['accuracy_default']),
        axis=1,
    )
accuracy_xg

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.974,0.972,0.972,0.974,-0.002053,-0.002053,0.0
1,apache__directory-server,0.914,0.908,0.909,0.896,-0.006565,-0.00547,-0.019694
2,jgralab__jgralab,0.816,0.719,0.798,0.782,-0.118873,-0.022059,-0.041667
3,Unidata__thredds,0.904,0.899,0.902,0.846,-0.005531,-0.002212,-0.064159
4,getrailo__railo,0.764,0.746,0.764,0.74,-0.02356,0.0,-0.031414
5,apache__accumulo,0.854,0.84,0.848,0.852,-0.016393,-0.007026,-0.002342
6,TeamDev-Ltd__OpenFaces,0.962,0.957,0.959,0.959,-0.005198,-0.003119,-0.003119
7,Ramblurr__Anki-Android,0.715,0.711,0.704,0.715,-0.005594,-0.015385,0.0
8,CloudStack-extras__CloudStack-archive,0.708,0.704,0.698,0.677,-0.00565,-0.014124,-0.043785
9,cgjones__android-frameworks-base,0.871,0.86,0.862,0.864,-0.012629,-0.010333,-0.008037


The overall average accuracy did not increase when using the discretized datasets (log2, log10, and mdlp). Unlike the previous sections, only one configuration (`drop_na=False`) was run for XGBoost, so this section has no separate comparison with and without NA values.