Goal of this notebook:

Investigate the impact of using discretized datasets (log2 and log10) on the classification accuracy with decision tree, random forest and XGBoost.

In [57]:
import importlib
import warnings

import numpy as np
import pandas as pd
# NOTE(review): importing display from IPython.core.display is deprecated
# (since IPython 7.14); IPython.display is the supported location — confirm
# against the installed IPython before switching.
from IPython.core.display import display
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRFClassifier

import classifier_utils
import configs

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore", category=UserWarning)

In [58]:
# Columns handed to classifier_utils.ProjectsResults as its third argument;
# judging by their names they identify a chunk rather than describe it and
# are presumably excluded from the feature matrix (confirm in classifier_utils).
non_features_columns = [
    # chunk location / labelling metadata
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    # repository, file and commit identifiers
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [59]:
# Load the labelled dataset; in the cells shown here it is only used to
# obtain the list of project names.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")

# Distinct values of the 'project' column, in order of first appearance.
projects = list(pd.unique(selected_dataset['project']))

# Decision Tree

In [60]:
# Decision tree with default hyper-parameters; the fixed seed keeps runs
# repeatable.
dt = DecisionTreeClassifier(
    random_state=99,
)

In [61]:
# Baseline decision-tree run: no discretized-data path is passed.
result_dt_default = classifier_utils.ProjectsResults(
    dt,
    projects,
    non_features_columns,
)

In [62]:
# Report frame for the baseline decision-tree run; it is merged on its
# 'project' column further down. include_overall=True presumably adds an
# overall summary row (confirm in classifier_utils).
report_dt_default = result_dt_default.get_report_df(
    include_overall=True,
)
# Uncomment to inspect the full report:
# report_dt_default

In [63]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Decision tree evaluated with the log10 discretized data directory passed as
# the fourth argument (presumably where ProjectsResults reads each project's
# data from — confirm in classifier_utils).
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_dt_log10 = classifier_utils.ProjectsResults(
    dt, projects, non_features_columns, discretized_path
)
report_dt_log10 = result_dt_log10.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_dt_log10

In [64]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Decision tree evaluated with the log2 discretized data directory passed as
# the fourth argument (presumably where ProjectsResults reads each project's
# data from — confirm in classifier_utils).
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_dt_log2 = classifier_utils.ProjectsResults(
    dt, projects, non_features_columns, discretized_path
)
report_dt_log2 = result_dt_log2.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_dt_log2

In [65]:
# Combine the three decision-tree reports into one frame keyed by 'project'.
# Overlapping columns get a suffix naming the dataset variant they came from.
df_inner_dt_log2 = (
    report_dt_log2
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})  # keep the join key unsuffixed
)
df_inner_dt = (
    report_dt_default
    .merge(report_dt_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_dt_log2, on='project', how='inner')
)

# Keep the project name and the accuracy columns, then add one column per
# discretized variant with its normalized improvement over the default data.
accuracy_dt = df_inner_dt.filter(regex=("project|accuracy.*")).copy()
for improvement_column, variant_column in (
    ('improv._log10', 'accuracy_log10'),
    ('improv._log2', 'accuracy_log2'),
):
    accuracy_dt[improvement_column] = accuracy_dt.apply(
        lambda row: classifier_utils.get_normalized_improvement(
            row[variant_column], row['accuracy_default']
        ),
        axis=1,
    )
accuracy_dt

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,improv._log10,improv._log2
0,CCI-MIT__XCoLab,0.972,0.968,0.971,-0.004115,-0.001029
1,apache__directory-server,0.931,0.919,0.919,-0.012889,-0.012889
2,jgralab__jgralab,0.836,0.756,0.827,-0.095694,-0.010766
3,Unidata__thredds,0.909,0.899,0.9,-0.011001,-0.009901
4,CloudStack-extras__CloudStack-archive,0.76,0.756,0.748,-0.005263,-0.015789
5,apache__accumulo,0.834,0.834,0.832,0.0,-0.002398
6,Ramblurr__Anki-Android,0.694,0.656,0.675,-0.054755,-0.027378
7,getrailo__railo,0.643,0.636,0.664,-0.010886,0.058824
8,TeamDev-Ltd__OpenFaces,0.965,0.963,0.964,-0.002073,-0.001036
9,apache__lucene-solr,0.559,0.57,0.56,0.024943,0.002268


## Including NA values

In [66]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Baseline decision-tree run for the "Including NA values" comparison:
# replace_na=True is forwarded to ProjectsResults (its exact handling of NA
# values is defined in classifier_utils).
result_dt_default_wna = classifier_utils.ProjectsResults(
    dt, projects, non_features_columns, replace_na=True
)

In [67]:
# Report frame for the baseline decision-tree run with replace_na=True.
report_dt_default_wna = result_dt_default_wna.get_report_df(
    include_overall=True,
)
# Uncomment to inspect the full report:
# report_dt_default_wna

In [68]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Decision tree with the log2 discretized data directory and replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_dt_log2_wna = classifier_utils.ProjectsResults(
    dt, projects, non_features_columns, discretized_path, replace_na=True
)
report_dt_log2_wna = result_dt_log2_wna.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_dt_log2_wna

In [74]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Decision tree with the log10 discretized data directory and replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_dt_log10_wna = classifier_utils.ProjectsResults(
    dt, projects, non_features_columns, discretized_path, replace_na=True
)
report_dt_log10_wna = result_dt_log10_wna.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_dt_log10_wna

In [73]:
# Combine the three decision-tree reports produced with replace_na=True into
# one frame keyed by 'project'. This rebinds df_inner_dt, df_inner_dt_log2 and
# accuracy_dt from the earlier decision-tree cell.
df_inner_dt_log2 = (
    report_dt_log2_wna
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})  # keep the join key unsuffixed
)
df_inner_dt = (
    report_dt_default_wna
    .merge(report_dt_log10_wna, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_dt_log2, on='project', how='inner')
)

# Keep the project name and the accuracy columns. An earlier variant of this
# filter also kept 'observations_default':
#   "project|observations_default|accuracy.*"
accuracy_dt = df_inner_dt.filter(regex=("project|accuracy.*")).copy()
for improvement_column, variant_column in (
    ('improv._log10', 'accuracy_log10'),
    ('improv._log2', 'accuracy_log2'),
):
    accuracy_dt[improvement_column] = accuracy_dt.apply(
        lambda row: classifier_utils.get_normalized_improvement(
            row[variant_column], row['accuracy_default']
        ),
        axis=1,
    )
accuracy_dt

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,improv._log10,improv._log2
0,CCI-MIT__XCoLab,0.971,0.971,0.971,0.0,0.0
1,apache__directory-server,0.907,0.89,0.879,-0.018743,-0.030871
2,jgralab__jgralab,0.819,0.75,0.818,-0.084249,-0.001221
3,CloudStack-extras__CloudStack-archive,0.787,0.758,0.773,-0.036849,-0.017789
4,Unidata__thredds,0.896,0.893,0.886,-0.003348,-0.011161
5,apache__accumulo,0.848,0.848,0.85,0.0,0.013158
6,getrailo__railo,0.73,0.715,0.728,-0.020548,-0.00274
7,Ramblurr__Anki-Android,0.68,0.676,0.667,-0.005882,-0.019118
8,TeamDev-Ltd__OpenFaces,0.955,0.958,0.954,0.066667,-0.001047
9,apache__lucene-solr,0.54,0.537,0.523,-0.005556,-0.031481


For the decision tree, the overall average accuracy did not increase when using the discretized datasets (log2 and log10). The same was observed both in the default run and in the run that includes NA values (`replace_na=True`).

# Random Forest

In [75]:
# Random forest: 400 trees, 30% of the features considered at each split,
# fitted with 5 parallel jobs; the fixed seed keeps runs repeatable.
rf = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

In [76]:
# Baseline random-forest run: no discretized-data path is passed.
result_rf_default = classifier_utils.ProjectsResults(
    rf,
    projects,
    non_features_columns,
)

In [77]:
# Report frame for the baseline random-forest run; merged on 'project' below.
report_rf_default = result_rf_default.get_report_df(
    include_overall=True,
)
# Uncomment to inspect the full report:
# report_rf_default

In [78]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Random forest evaluated with the log10 discretized data directory passed as
# the fourth argument.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_rf_log10 = classifier_utils.ProjectsResults(
    rf, projects, non_features_columns, discretized_path
)
report_rf_log10 = result_rf_log10.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_rf_log10

In [79]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Random forest evaluated with the log2 discretized data directory passed as
# the fourth argument.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_rf_log2 = classifier_utils.ProjectsResults(
    rf, projects, non_features_columns, discretized_path
)
report_rf_log2 = result_rf_log2.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_rf_log2

In [80]:
# Combine the three random-forest reports into one frame keyed by 'project'.
# Overlapping columns get a suffix naming the dataset variant they came from.
df_inner_rf_log2 = (
    report_rf_log2
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})  # keep the join key unsuffixed
)
df_inner_rf = (
    report_rf_default
    .merge(report_rf_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_rf_log2, on='project', how='inner')
)

# Keep the project name and the accuracy columns, then add one column per
# discretized variant with its normalized improvement over the default data.
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
for improvement_column, variant_column in (
    ('improv._log10', 'accuracy_log10'),
    ('improv._log2', 'accuracy_log2'),
):
    accuracy_rf[improvement_column] = accuracy_rf.apply(
        lambda row: classifier_utils.get_normalized_improvement(
            row[variant_column], row['accuracy_default']
        ),
        axis=1,
    )
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,improv._log10,improv._log2
0,CCI-MIT__XCoLab,0.976,0.973,0.975,-0.003074,-0.001025
1,apache__directory-server,0.939,0.922,0.931,-0.018104,-0.00852
2,jgralab__jgralab,0.869,0.812,0.867,-0.065593,-0.002301
3,CloudStack-extras__CloudStack-archive,0.803,0.807,0.807,0.020305,0.020305
4,Unidata__thredds,0.919,0.924,0.924,0.061728,0.061728
5,apache__accumulo,0.864,0.861,0.863,-0.003472,-0.001157
6,Ramblurr__Anki-Android,0.74,0.74,0.734,0.0,-0.008108
7,getrailo__railo,0.701,0.697,0.706,-0.005706,0.016722
8,apache__lucene-solr,0.644,0.634,0.653,-0.015528,0.025281
9,TeamDev-Ltd__OpenFaces,0.969,0.969,0.969,0.0,0.0


## Including NA values

In [81]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Baseline random-forest run for the "Including NA values" comparison:
# replace_na=True is forwarded to ProjectsResults (its exact handling of NA
# values is defined in classifier_utils).
result_rf_default_wna = classifier_utils.ProjectsResults(
    rf, projects, non_features_columns, replace_na=True
)

In [82]:
# Report frame for the baseline random-forest run with replace_na=True.
report_rf_default_wna = result_rf_default_wna.get_report_df(
    include_overall=True,
)
# Uncomment to inspect the full report:
# report_rf_default_wna

In [83]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Random forest with the log2 discretized data directory and replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_rf_log2_wna = classifier_utils.ProjectsResults(
    rf, projects, non_features_columns, discretized_path, replace_na=True
)
report_rf_log2_wna = result_rf_log2_wna.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_rf_log2_wna

In [84]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# Random forest with the log10 discretized data directory and replace_na=True.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_rf_log10_wna = classifier_utils.ProjectsResults(
    rf, projects, non_features_columns, discretized_path, replace_na=True
)
report_rf_log10_wna = result_rf_log10_wna.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_rf_log10_wna

In [85]:
# Combine the three random-forest reports produced with replace_na=True into
# one frame keyed by 'project'. This rebinds df_inner_rf, df_inner_rf_log2 and
# accuracy_rf from the earlier random-forest cell.
df_inner_rf_log2 = (
    report_rf_log2_wna
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})  # keep the join key unsuffixed
)
df_inner_rf = (
    report_rf_default_wna
    .merge(report_rf_log10_wna, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_rf_log2, on='project', how='inner')
)

# Keep the project name and the accuracy columns. An earlier variant of this
# filter also kept 'observations_default':
#   "project|observations_default|accuracy.*"
accuracy_rf = df_inner_rf.filter(regex=("project|accuracy.*")).copy()
for improvement_column, variant_column in (
    ('improv._log10', 'accuracy_log10'),
    ('improv._log2', 'accuracy_log2'),
):
    accuracy_rf[improvement_column] = accuracy_rf.apply(
        lambda row: classifier_utils.get_normalized_improvement(
            row[variant_column], row['accuracy_default']
        ),
        axis=1,
    )
accuracy_rf

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,improv._log10,improv._log2
0,CCI-MIT__XCoLab,0.977,0.975,0.977,-0.002047,0.0
1,apache__directory-server,0.933,0.923,0.925,-0.010718,-0.008574
2,jgralab__jgralab,0.867,0.813,0.863,-0.062284,-0.004614
3,CloudStack-extras__CloudStack-archive,0.82,0.829,0.824,0.05,0.022222
4,Unidata__thredds,0.913,0.919,0.915,0.068966,0.022989
5,apache__accumulo,0.879,0.876,0.877,-0.003413,-0.002275
6,getrailo__railo,0.774,0.761,0.777,-0.016796,0.013274
7,Ramblurr__Anki-Android,0.763,0.746,0.748,-0.02228,-0.019659
8,apache__lucene-solr,0.642,0.626,0.646,-0.024922,0.011173
9,zkoss__zk,0.804,0.791,0.804,-0.016169,0.0


For the random forest, the overall average accuracy did not increase when using the discretized datasets (log2 and log10). The same was observed both in the default run and in the run that includes NA values (`replace_na=True`).

# XGBoost Random Forest

In [98]:
# XGBoost random-forest classifier: 400 trees, subsample=0.9 and
# colsample_bynode=0.4 control the row / per-node column sampling, and
# multi-class log loss is the evaluation metric; seeded for repeatability.
xg = XGBRFClassifier(
    n_estimators=400,
    subsample=0.9,
    colsample_bynode=0.4,
    eval_metric='mlogloss',
    random_state=99,
)

In [99]:
# Baseline XGBoost run: no discretized-data path is passed. drop_na=False is
# forwarded to ProjectsResults (presumably rows with NA values are kept —
# confirm in classifier_utils).
result_xg_default = classifier_utils.ProjectsResults(
    xg,
    projects,
    non_features_columns,
    drop_na=False,
)

In [100]:
# Report frame for the baseline XGBoost run; merged on 'project' below.
report_xg_default = result_xg_default.get_report_df(
    include_overall=True,
)
# Uncomment to inspect the full report:
# report_xg_default

In [101]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# XGBoost evaluated with the log10 discretized data directory, drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log10'
result_xg_log10 = classifier_utils.ProjectsResults(
    xg, projects, non_features_columns, discretized_path, drop_na=False
)
report_xg_log10 = result_xg_log10.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_xg_log10

In [102]:
# Re-import classifier_utils so that edits made to the module on disk are
# picked up before this run.
importlib.reload(classifier_utils)

# XGBoost evaluated with the log2 discretized data directory, drop_na=False.
discretized_path = f'{configs.PROJECTS_DATA}/discretized_log2'
result_xg_log2 = classifier_utils.ProjectsResults(
    xg, projects, non_features_columns, discretized_path, drop_na=False
)
report_xg_log2 = result_xg_log2.get_report_df(include_overall=True)
# Uncomment to inspect the full report:
# report_xg_log2

In [103]:
# Combine the three XGBoost reports into one frame keyed by 'project'.
# Overlapping columns get a suffix naming the dataset variant they came from.
df_inner_xg_log2 = (
    report_xg_log2
    .add_suffix("_log2")
    .rename(columns={"project_log2": "project"})  # keep the join key unsuffixed
)
df_inner_xg = (
    report_xg_default
    .merge(report_xg_log10, on='project', how='inner', suffixes=('_default', '_log10'))
    .merge(df_inner_xg_log2, on='project', how='inner')
)

# Keep the project name and the accuracy columns, then add one column per
# discretized variant with its normalized improvement over the default data.
accuracy_xg = df_inner_xg.filter(regex=("project|accuracy.*")).copy()
for improvement_column, variant_column in (
    ('improv._log10', 'accuracy_log10'),
    ('improv._log2', 'accuracy_log2'),
):
    accuracy_xg[improvement_column] = accuracy_xg.apply(
        lambda row: classifier_utils.get_normalized_improvement(
            row[variant_column], row['accuracy_default']
        ),
        axis=1,
    )
accuracy_xg

Unnamed: 0,project,accuracy_default,accuracy_log10,accuracy_log2,improv._log10,improv._log2
0,CCI-MIT__XCoLab,0.974,0.972,0.972,-0.002053,-0.002053
1,apache__directory-server,0.914,0.908,0.909,-0.006565,-0.00547
2,jgralab__jgralab,0.816,0.719,0.798,-0.118873,-0.022059
3,Unidata__thredds,0.904,0.899,0.902,-0.005531,-0.002212
4,getrailo__railo,0.764,0.746,0.764,-0.02356,0.0
5,apache__accumulo,0.854,0.84,0.848,-0.016393,-0.007026
6,TeamDev-Ltd__OpenFaces,0.962,0.957,0.959,-0.005198,-0.003119
7,Ramblurr__Anki-Android,0.715,0.711,0.704,-0.005594,-0.015385
8,CloudStack-extras__CloudStack-archive,0.708,0.704,0.698,-0.00565,-0.014124
9,cgjones__android-frameworks-base,0.871,0.86,0.862,-0.012629,-0.010333


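For XGBoost, the overall average accuracy did not increase when using the discretized datasets (log2 and log10). Note that every XGBoost run in this section uses `drop_na=False`, so, unlike the decision tree and random forest sections, no separate with/without NA comparison is shown here.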