Goal of this notebook:

Investigate the impact of using discretized datasets (log2, log10, and mdlp) on the classification accuracy of decision tree, random forest, and XGBoost classifiers.

In [1]:
# Standard library
import importlib  # used by the importlib.reload(classifier_utils) calls further down
import warnings

# Third-party
import numpy as np
import pandas as pd
# `display` is imported from the public IPython.display module; importing it
# from IPython.core.display is deprecated.
from IPython.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRFClassifier

# Project-local modules
import classifier_utils
import configs

# Do not truncate columns when a DataFrame is rendered.
pd.set_option('display.max_columns', None)
# Hide UserWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns listed here are handed to ProjectsResults as "non-feature" columns.
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Read the labelled dataset and collect the distinct values of its `project`
# column (pandas returns them in order of first appearance).
selected_dataset = pd.read_csv(
    "../../data/SELECTED_LABELLED_DATASET.csv",
)
projects = [*selected_dataset['project'].unique()]

# Decision Tree

In [4]:
# Decision tree estimator reused by every run in this section; the seed is
# fixed so repeated runs use the same random state.
dt = DecisionTreeClassifier(
    random_state=99,
)

In [5]:
# Baseline decision tree run: no discretized-data path is passed.
result_dt_default = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
)

In [6]:
# Report DataFrame for the baseline decision tree run.
report_dt_default = result_dt_default.get_report_df(
    include_overall=True,
)
# Evaluate `report_dt_default` in a cell of its own to inspect the full report.

In [7]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Decision tree on the log10-discretized data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_dt_log10 = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
    discretized_path,
)
report_dt_log10 = result_dt_log10.get_report_df(
    include_overall=True,
)
# Evaluate `report_dt_log10` to inspect the full report.

In [8]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Decision tree on the log2-discretized data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_dt_log2 = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
    discretized_path,
)
report_dt_log2 = result_dt_log2.get_report_df(
    include_overall=True,
)
# Evaluate `report_dt_log2` to inspect the full report.

In [9]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Decision tree on the mdlp-discretized data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_dt_mdlp = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
    discretized_path,
)
report_dt_mdlp = result_dt_mdlp.get_report_df(
    include_overall=True,
)

In [10]:
# Suffix every column of the log2 / mdlp reports, then restore the plain
# `project` name so it can serve as the join key.
df_inner_dt_log2 = (
    report_dt_log2
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})
)
df_inner_dt_mdlp = (
    report_dt_mdlp
    .add_suffix("_mdlp")
    .rename(columns={"project_mdlp": "project"})
)

# Inner-join the four decision tree reports on `project`.
df_inner_dt = (
    report_dt_default
    .merge(report_dt_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_dt_log2, on='project', how='inner')
    .merge(df_inner_dt_mdlp, on='project', how='inner')
)

# Keep the columns matched by the regex, then add one normalized-improvement
# column per discretization, each computed against `accuracy_default`.
accuracy_dt = df_inner_dt.filter(regex=("project|accuracy.*")).copy()
accuracy_dt = accuracy_dt.assign(**{
    improvement_column: accuracy_dt.apply(
        lambda row, source=accuracy_column: classifier_utils.get_normalized_improvement(
            row[source], row['accuracy_default']
        ),
        axis=1,
    )
    for improvement_column, accuracy_column in (
        ('improv._log10', 'accuracy_log10'),
        ('improv._log2', 'accuracy_log2'),
        ('improv._mdlp', 'accuracy_mdlp'),
    )
})
accuracy_dt

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.972,0.967,0.972,0.969,-0.005144,0.0,-0.003086
1,apache__directory-server,0.928,0.92,0.923,0.885,-0.008621,-0.005388,-0.046336
2,jgralab__jgralab,0.839,0.756,0.833,0.795,-0.098927,-0.007151,-0.052443
3,Unidata__thredds,0.909,0.899,0.903,0.849,-0.011001,-0.006601,-0.066007
4,apache__accumulo,0.838,0.831,0.83,0.829,-0.008353,-0.009547,-0.01074
5,CloudStack-extras__CloudStack-archive,0.75,0.755,0.756,0.739,0.02,0.024,-0.014667
6,getrailo__railo,0.65,0.631,0.645,0.645,-0.029231,-0.007692,-0.007692
7,Ramblurr__Anki-Android,0.682,0.642,0.659,0.682,-0.058651,-0.033724,0.0
8,TeamDev-Ltd__OpenFaces,0.964,0.958,0.965,0.965,-0.006224,0.027778,0.027778
9,apache__lucene-solr,0.555,0.577,0.575,0.533,0.049438,0.044944,-0.03964


## Including NA values

In [11]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Baseline decision tree run with replace_na=True (no discretized-data path).
result_dt_default_wna = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
    replace_na=True,
)

In [12]:
# Report DataFrame for the baseline decision tree run with replace_na=True.
report_dt_default_wna = result_dt_default_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_dt_default_wna` to inspect the full report.

In [13]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Decision tree on the log2-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_dt_log2_wna = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
    discretized_path,
    replace_na=True,
)
report_dt_log2_wna = result_dt_log2_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_dt_log2_wna` to inspect the full report.

In [14]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Decision tree on the log10-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_dt_log10_wna = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
    discretized_path,
    replace_na=True,
)
report_dt_log10_wna = result_dt_log10_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_dt_log10_wna` to inspect the full report.

In [15]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Decision tree on the mdlp-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_dt_mdlp_wna = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
    discretized_path,
    replace_na=True,
)
report_dt_mdlp_wna = result_dt_mdlp_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_dt_mdlp_wna` to inspect the full report.

In [16]:
# Suffix every column of the log2 / mdlp reports (replace_na=True runs), then
# restore the plain `project` name so it can serve as the join key.
df_inner_dt_log2 = (
    report_dt_log2_wna
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})
)
df_inner_dt_mdlp = (
    report_dt_mdlp_wna
    .add_suffix("_mdlp")
    .rename(columns={"project_mdlp": "project"})
)

# Inner-join the four decision tree reports on `project`.
# NOTE: this rebinds `df_inner_dt` and `accuracy_dt` from the earlier section.
df_inner_dt = (
    report_dt_default_wna
    .merge(report_dt_log10_wna, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_dt_log2, on='project', how='inner')
    .merge(df_inner_dt_mdlp, on='project', how='inner')
)

# Keep the columns matched by the regex, then add one normalized-improvement
# column per discretization, each computed against `accuracy_default`.
# (Variant kept from the original as a note: the regex
# "project|observations_default|accuracy.*" also keeps `observations_default`.)
accuracy_dt = df_inner_dt.filter(regex=("project|accuracy.*")).copy()
accuracy_dt = accuracy_dt.assign(**{
    improvement_column: accuracy_dt.apply(
        lambda row, source=accuracy_column: classifier_utils.get_normalized_improvement(
            row[source], row['accuracy_default']
        ),
        axis=1,
    )
    for improvement_column, accuracy_column in (
        ('improv._log10', 'accuracy_log10'),
        ('improv._log2', 'accuracy_log2'),
        ('improv._mdlp', 'accuracy_mdlp'),
    )
})
accuracy_dt

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.971,0.971,0.971,0.969,0.0,0.0,-0.00206
1,apache__directory-server,0.902,0.89,0.877,0.885,-0.013304,-0.027716,-0.018847
2,jgralab__jgralab,0.823,0.751,0.818,0.795,-0.087485,-0.006075,-0.034022
3,CloudStack-extras__CloudStack-archive,0.78,0.763,0.772,0.747,-0.021795,-0.010256,-0.042308
4,Unidata__thredds,0.904,0.891,0.893,0.852,-0.014381,-0.012168,-0.057522
5,apache__accumulo,0.854,0.847,0.849,0.848,-0.008197,-0.005855,-0.007026
6,getrailo__railo,0.721,0.715,0.724,0.709,-0.008322,0.010753,-0.016644
7,Ramblurr__Anki-Android,0.674,0.675,0.666,0.688,0.003067,-0.011869,0.042945
8,TeamDev-Ltd__OpenFaces,0.952,0.958,0.953,0.955,0.125,0.020833,0.0625
9,apache__lucene-solr,0.538,0.538,0.537,0.543,0.0,-0.001859,0.010823


The overall average accuracy did not increase when using the discretized datasets (log2, log10, and mdlp). This was observed both for the dataset without NA values and for the dataset including NA values.

# Random Forest

In [17]:
# Random forest estimator reused by every run in this section.
rf = RandomForestClassifier(
    random_state=99,
    n_jobs=5,
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
)

In [18]:
# Baseline random forest run: no discretized-data path is passed.
result_rf_default = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
)

In [19]:
# Report DataFrame for the baseline random forest run.
report_rf_default = result_rf_default.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_default` in a cell of its own to inspect the full report.

In [20]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Random forest on the log10-discretized data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_rf_log10 = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    discretized_path,
)
report_rf_log10 = result_rf_log10.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_log10` to inspect the full report.

In [21]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Random forest on the log2-discretized data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_rf_log2 = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    discretized_path,
)
report_rf_log2 = result_rf_log2.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_log2` to inspect the full report.

In [22]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Random forest on the mdlp-discretized data.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_rf_mdlp = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    discretized_path,
)
report_rf_mdlp = result_rf_mdlp.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_mdlp` to inspect the full report.

In [23]:
# Suffix every column of the log2 / mdlp reports, then restore the plain
# `project` name so it can serve as the join key.
df_inner_rf_log2 = (
    report_rf_log2
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})
)
df_inner_rf_mdlp = (
    report_rf_mdlp
    .add_suffix("_mdlp")
    .rename(columns={"project_mdlp": "project"})
)

# Inner-join the four random forest reports on `project`.
df_inner_rf = (
    report_rf_default
    .merge(report_rf_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_rf_log2, on='project', how='inner')
    .merge(df_inner_rf_mdlp, on='project', how='inner')
)

# Keep the columns matched by the regex, then add one normalized-improvement
# column per discretization, each computed against `accuracy_default`.
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
accuracy_rf = accuracy_rf.assign(**{
    improvement_column: accuracy_rf.apply(
        lambda row, source=accuracy_column: classifier_utils.get_normalized_improvement(
            row[source], row['accuracy_default']
        ),
        axis=1,
    )
    for improvement_column, accuracy_column in (
        ('improv._log10', 'accuracy_log10'),
        ('improv._log2', 'accuracy_log2'),
        ('improv._mdlp', 'accuracy_mdlp'),
    )
})
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.976,0.974,0.975,0.973,-0.002049,-0.001025,-0.003074
1,apache__directory-server,0.937,0.923,0.933,0.914,-0.014941,-0.004269,-0.024546
2,jgralab__jgralab,0.866,0.814,0.867,0.833,-0.060046,0.007463,-0.038106
3,CloudStack-extras__CloudStack-archive,0.806,0.803,0.805,0.771,-0.003722,-0.001241,-0.043424
4,apache__accumulo,0.863,0.865,0.866,0.855,0.014599,0.021898,-0.00927
5,Unidata__thredds,0.916,0.926,0.921,0.862,0.119048,0.059524,-0.058952
6,Ramblurr__Anki-Android,0.742,0.734,0.742,0.715,-0.010782,0.0,-0.036388
7,getrailo__railo,0.712,0.692,0.711,0.703,-0.02809,-0.001404,-0.01264
8,apache__lucene-solr,0.646,0.642,0.652,0.586,-0.006192,0.016949,-0.092879
9,TeamDev-Ltd__OpenFaces,0.969,0.968,0.969,0.971,-0.001032,0.0,0.064516


## Including NA values

In [24]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Baseline random forest run with replace_na=True (no discretized-data path).
result_rf_default_wna = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    replace_na=True,
)

In [25]:
# Report DataFrame for the baseline random forest run with replace_na=True.
report_rf_default_wna = result_rf_default_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_default_wna` to inspect the full report.

In [26]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Random forest on the log2-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_rf_log2_wna = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    discretized_path,
    replace_na=True,
)
report_rf_log2_wna = result_rf_log2_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_log2_wna` to inspect the full report.

In [27]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Random forest on the log10-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_rf_log10_wna = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    discretized_path,
    replace_na=True,
)
report_rf_log10_wna = result_rf_log10_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_log10_wna` to inspect the full report.

In [28]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# Random forest on the mdlp-discretized data, with replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_rf_mdlp_wna = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
    discretized_path,
    replace_na=True,
)
report_rf_mdlp_wna = result_rf_mdlp_wna.get_report_df(
    include_overall=True,
)
# Evaluate `report_rf_mdlp_wna` to inspect the full report.

In [29]:
# Suffix every column of the log2 / mdlp reports (replace_na=True runs), then
# restore the plain `project` name so it can serve as the join key.
df_inner_rf_log2 = (
    report_rf_log2_wna
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})
)
df_inner_rf_mdlp = (
    report_rf_mdlp_wna
    .add_suffix("_mdlp")
    .rename(columns={"project_mdlp": "project"})
)

# Inner-join the four random forest reports on `project`.
# NOTE: this rebinds `df_inner_rf` and `accuracy_rf` from the earlier section.
df_inner_rf = (
    report_rf_default_wna
    .merge(report_rf_log10_wna, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_rf_log2, on='project', how='inner')
    .merge(df_inner_rf_mdlp, on='project', how='inner')
)

# Keep the columns matched by the regex, then add one normalized-improvement
# column per discretization, each computed against `accuracy_default`.
# (Variant kept from the original as a note: the regex
# "project|observations_default|accuracy.*" also keeps `observations_default`.)
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
accuracy_rf = accuracy_rf.assign(**{
    improvement_column: accuracy_rf.apply(
        lambda row, source=accuracy_column: classifier_utils.get_normalized_improvement(
            row[source], row['accuracy_default']
        ),
        axis=1,
    )
    for improvement_column, accuracy_column in (
        ('improv._log10', 'accuracy_log10'),
        ('improv._log2', 'accuracy_log2'),
        ('improv._mdlp', 'accuracy_mdlp'),
    )
})
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.977,0.975,0.976,0.975,-0.002047,-0.001024,-0.002047
1,apache__directory-server,0.931,0.924,0.927,0.914,-0.007519,-0.004296,-0.01826
2,jgralab__jgralab,0.868,0.816,0.861,0.833,-0.059908,-0.008065,-0.040323
3,CloudStack-extras__CloudStack-archive,0.822,0.824,0.82,0.786,0.011236,-0.002433,-0.043796
4,Unidata__thredds,0.913,0.923,0.918,0.878,0.114943,0.057471,-0.038335
5,apache__accumulo,0.879,0.875,0.88,0.872,-0.004551,0.008264,-0.007964
6,getrailo__railo,0.779,0.766,0.777,0.766,-0.016688,-0.002567,-0.016688
7,Ramblurr__Anki-Android,0.763,0.748,0.748,0.726,-0.019659,-0.019659,-0.048493
8,TeamDev-Ltd__OpenFaces,0.962,0.961,0.959,0.961,-0.00104,-0.003119,-0.00104
9,apache__lucene-solr,0.64,0.623,0.648,0.584,-0.026563,0.022222,-0.0875


The overall average accuracy did not increase when using the discretized datasets (log2, log10, and mdlp). This was observed both for the dataset without NA values and for the dataset including NA values.

# XGBoost Random Forest

In [30]:
# XGBoost random forest estimator reused by every run in this section.
xg = XGBRFClassifier(
    random_state=99,
    subsample=0.9,
    eval_metric='mlogloss',
    n_estimators=400,
    colsample_bynode=0.4,
)

In [31]:
# Baseline XGBoost run: no discretized-data path is passed.
# drop_na=False is passed here (presumably rows with NA values are kept;
# confirm against classifier_utils.ProjectsResults).
result_xg_default = classifier_utils.ProjectsResults(
    xg,
    projects,
    non_features_columns,
    drop_na=False,
)

In [32]:
# Report DataFrame for the baseline XGBoost run.
report_xg_default = result_xg_default.get_report_df(
    include_overall=True,
)
# Evaluate `report_xg_default` in a cell of its own to inspect the full report.

In [33]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# XGBoost on the log10-discretized data, with drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_xg_log10 = classifier_utils.ProjectsResults(
    xg,
    projects,
    non_features_columns,
    discretized_path,
    drop_na=False,
)
report_xg_log10 = result_xg_log10.get_report_df(
    include_overall=True,
)
# Evaluate `report_xg_log10` to inspect the full report.

In [34]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# XGBoost on the log2-discretized data, with drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_xg_log2 = classifier_utils.ProjectsResults(
    xg,
    projects,
    non_features_columns,
    discretized_path,
    drop_na=False,
)
report_xg_log2 = result_xg_log2.get_report_df(
    include_overall=True,
)
# Evaluate `report_xg_log2` to inspect the full report.

In [35]:
import importlib

# Reload classifier_utils so the current on-disk version of the module is used.
importlib.reload(classifier_utils)

# XGBoost on the mdlp-discretized data, with drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_mdlp'
result_xg_mdlp = classifier_utils.ProjectsResults(
    xg,
    projects,
    non_features_columns,
    discretized_path,
    drop_na=False,
)
report_xg_mdlp = result_xg_mdlp.get_report_df(
    include_overall=True,
)
# Evaluate `report_xg_mdlp` to inspect the full report.

In [36]:
# Suffix every column of the log2 / mdlp reports, then restore the plain
# `project` name so it can serve as the join key.
df_inner_xg_log2 = (
    report_xg_log2
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})
)
df_inner_xg_mdlp = (
    report_xg_mdlp
    .add_suffix("_mdlp")
    .rename(columns={"project_mdlp": "project"})
)

# Inner-join the four XGBoost reports on `project`.
df_inner_xg = (
    report_xg_default
    .merge(report_xg_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_xg_log2, on='project', how='inner')
    .merge(df_inner_xg_mdlp, on='project', how='inner')
)

# Keep the columns matched by the regex, then add one normalized-improvement
# column per discretization, each computed against `accuracy_default`.
accuracy_xg = df_inner_xg.filter(regex=("project|accuracy.*")).copy()
accuracy_xg = accuracy_xg.assign(**{
    improvement_column: accuracy_xg.apply(
        lambda row, source=accuracy_column: classifier_utils.get_normalized_improvement(
            row[source], row['accuracy_default']
        ),
        axis=1,
    )
    for improvement_column, accuracy_column in (
        ('improv._log10', 'accuracy_log10'),
        ('improv._log2', 'accuracy_log2'),
        ('improv._mdlp', 'accuracy_mdlp'),
    )
})
accuracy_xg

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,accuracy_mdlp,improv._log10,improv._log2,improv._mdlp
0,CCI-MIT__XCoLab,0.974,0.972,0.972,0.974,-0.002053,-0.002053,0.0
1,apache__directory-server,0.916,0.909,0.909,0.896,-0.007642,-0.007642,-0.021834
2,jgralab__jgralab,0.816,0.72,0.797,0.783,-0.117647,-0.023284,-0.040441
3,Unidata__thredds,0.904,0.899,0.902,0.847,-0.005531,-0.002212,-0.063053
4,getrailo__railo,0.766,0.745,0.761,0.74,-0.027415,-0.006527,-0.033943
5,apache__accumulo,0.856,0.841,0.85,0.852,-0.017523,-0.007009,-0.004673
6,Ramblurr__Anki-Android,0.717,0.713,0.706,0.715,-0.005579,-0.015342,-0.002789
7,TeamDev-Ltd__OpenFaces,0.962,0.958,0.959,0.959,-0.004158,-0.003119,-0.003119
8,CloudStack-extras__CloudStack-archive,0.706,0.703,0.696,0.675,-0.004249,-0.014164,-0.043909
9,apache__lucene-solr,0.595,0.581,0.592,0.58,-0.023529,-0.005042,-0.02521


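The overall average accuracy did not increase when using the discretized datasets (log2, log10, and mdlp). The same was observed when considering the dataset with and without NA values. (Note: the XGBoost cells shown in this section were all run with `drop_na=False`; unlike the decision tree and random forest sections, no separate "Including NA values" run with `replace_na=True` appears here.)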