Goal of this notebook:

Explore the hyperparameters of the Decision Tree, Random Forest and XGBoost Random Forest classifiers to find the combination that works best across all projects.

Outputs: 
- Validation curves for each hyperparameter
- Best combination of decision tree hyperparameters

In [1]:
import numpy as np
import pandas as pd
from IPython.core.display import display
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBRFClassifier
import warnings
import classifier_utils
import seaborn as sns
warnings.filterwarnings("ignore", category=UserWarning)

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
from sklearn.impute import SimpleImputer
import math

In [2]:
# Columns that are not used as model features; they are dropped from a project's
# dataset before an estimator is fitted.
non_features_columns = [
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct project names,
# in order of first appearance.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset["project"].unique().tolist()

In [4]:
df_training = pd.read_csv("../../data/dataset-training.csv")

# Rows with at least one missing value.
rows_with_na = df_training.isna().any(axis=1)
df_na = df_training.loc[rows_with_na]

# Fraction of training rows that contain a missing value.
len(df_na) / len(df_training)

0.28144947636066214

# Decision Tree

### Base classifier

In [5]:
# Baseline decision tree; the fixed random_state keeps its results repeatable between runs.
dt = DecisionTreeClassifier(min_samples_split=5, random_state=99)

In [6]:
# Per-project results for the baseline tree (ProjectsResults lives in classifier_utils,
# outside this notebook).
result_dt = classifier_utils.ProjectsResults(dt, projects, non_features_columns)

In [7]:
# Report table for the decision tree; it is merged with the other algorithms'
# reports in the comparison section further down.
report_dt = result_dt.get_report_df()

report_dt


Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
26,CCI-MIT__XCoLab,5512,3757,0.973,0.971,0.972,0.971,0.573,0.931
1,apache__directory-server,845,652,0.922,0.923,0.923,0.923,0.512,0.843
24,jgralab__jgralab,2072,1802,0.823,0.819,0.82,0.819,0.491,0.643
19,Unidata__thredds,1154,950,0.902,0.901,0.901,0.901,0.777,0.557
18,apache__accumulo,4113,3148,0.838,0.835,0.836,0.835,0.635,0.548
10,CloudStack-extras__CloudStack-archive,1424,1106,0.699,0.7,0.699,0.7,0.437,0.467
0,Ramblurr__Anki-Android,892,759,0.678,0.681,0.679,0.681,0.439,0.432
5,apache__lucene-solr,1256,974,0.573,0.568,0.569,0.568,0.266,0.411
7,getrailo__railo,815,572,0.636,0.631,0.633,0.631,0.378,0.407
11,TeamDev-Ltd__OpenFaces,2979,2859,0.964,0.963,0.964,0.963,0.938,0.403


### Tuning hyperparameters


In [8]:
# List every hyperparameter of the baseline tree together with its current value.
print("Hyperparameters of Decision Tree:")
dt.get_params()

Hyperparameters of Decision Tree:


{'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5,
 'min_weight_fraction_leaf': 0.0,
 'presort': 'deprecated',
 'random_state': 99,
 'splitter': 'best'}

In [9]:
# Hyperparameter grid explored for the decision tree.
# Key order matters: it becomes the leading column order of the medal table built later.
parameters = dict(
    criterion=('gini', 'entropy'),
    min_samples_leaf=[1, 2, 3],
    max_depth=[None, 5, 20, 30],  # earlier experiment used [None, 3, 5, 10]
)

In [10]:
results = classifier_utils.grid_search_all(projects, dt, parameters, non_features_columns)

# Rank the combinations by gold medals first, then silver, bronze and total medals.
medal_priority = ['gold_medals', 'silver_medals', 'bronze_medals', 'total_medals']
results.sort_values(by=medal_priority, ascending=False)

Unnamed: 0,criterion,min_samples_leaf,max_depth,mean_accuracy,total_medals,gold_medals,silver_medals,bronze_medals
0,gini,1,,0.74132,11,8,3,0
3,gini,1,30.0,0.74132,11,8,3,0
2,gini,1,20.0,0.726462,9,7,0,2
12,entropy,1,,0.74483,6,5,1,0
14,entropy,1,20.0,0.74483,6,5,1,0
15,entropy,1,30.0,0.74483,6,5,1,0
9,gini,3,5.0,0.961008,3,2,1,0
17,entropy,2,5.0,0.735483,3,2,0,1
21,entropy,3,5.0,0.751769,3,2,0,1
20,entropy,3,,0.767138,2,2,0,0


#### Validation curves

##### min_samples_leaf

min_samples_leaf : int or float, default=1

The minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression.

In [None]:
# Validation curve of min_samples_leaf (values 1 to 9) for the first three projects.
# The loop index was never used, so the projects are iterated directly.
for project in projects[:3]:
    classifier_utils.get_validation_curve(project, dt, 'min_samples_leaf',
                                          np.arange(1, 10, 1),
                                          non_features_columns)

##### criterion

criterion : {“gini”, “entropy”}, default=”gini”

The function to measure the quality of a split. Supported criteria are “gini” for the Gini impurity and “entropy” for the information gain.

In [None]:
# Validation curve comparing both split criteria for the first three projects.
# The loop index was never used, so the projects are iterated directly.
for project in projects[:3]:
    classifier_utils.get_validation_curve(project, dt, 'criterion',
                                          ('gini', 'entropy'),
                                          non_features_columns)

##### max_depth

max_depth : int, default=None

The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

In [None]:
# Validation curve of max_depth (odd depths 1 to 15) for the first three projects.
# The loop index was never used, so the projects are iterated directly.
for project in projects[:3]:
    classifier_utils.get_validation_curve(project, dt, 'max_depth',
                                          [1, 3, 5, 7, 9, 11, 13, 15],
                                          non_features_columns)

# Random Forest

### Base classifier

In [None]:
# Baseline random forest with library defaults; only the seed is fixed.
rf = RandomForestClassifier(random_state=99)
# Per-project results for the baseline forest (ProjectsResults lives in classifier_utils).
result_rf = classifier_utils.ProjectsResults(rf, projects, non_features_columns)

In [None]:
# Report table for the random forest; it is merged with the other algorithms'
# reports in the comparison section further down.
report_rf = result_rf.get_report_df()

report_rf

### Inspect project

In [None]:
# Pick one project's random-forest results for a closer look.
sample_project_rf = result_rf.get_project('alexo__wro4j')

In [None]:
# Show the selected project's scores as a table, then the plotted classification report.
display(sample_project_rf.get_scores_df())
display(classifier_utils.plot_classification_report(sample_project_rf.scores))

### Tuning hyperparameters

In [None]:
# List every hyperparameter of the baseline forest together with its current value.
print("Hyperparameters of Random Forest:")

rf.get_params()

#### Using GridSearch to explore combinations of hyperparameters in one project

In [None]:
# Grid explored for the random forest on a single project
# ('criterion' and a shorter n_estimators list were tried earlier and dropped).
parameters_rf = {
    'min_samples_leaf': [1, 2, 3],
    'max_depth': [None, 10, 20, 30],
    'n_estimators': [10, 50, 100, 150, 200],
}

proj = projects[16]

# Stored under its own name: assigning the grid-search output to `result_rf` would
# replace the ProjectsResults object that the report/inspection cells above rely on.
gridsearch_result_rf = classifier_utils.grid_search(proj, rf, parameters_rf, non_features_columns)

if gridsearch_result_rf is None:
    # Other cells in this notebook check this helper's result against None, so the
    # same case is handled here instead of failing inside sort_values.
    print(f"No grid-search results for {proj}")
    df_gridsearch_rf = pd.DataFrame()
else:
    # Keep only the parameter and score columns, best-ranked combination first.
    df_gridsearch_rf = (
        pd.DataFrame(gridsearch_result_rf)
        .filter(regex="param_.*|mean_test_score|std_test_score|rank_test_score")
        .sort_values(by=['rank_test_score'])
    )
    df_gridsearch_rf.insert(0, 'project', proj)

df_gridsearch_rf



### GridSearch of parameter n_estimators on first 10 projects

In [None]:
# Only n_estimators is varied in this experiment.
parameters_rf = {
    'n_estimators': [50, 100, 150, 200],
}

dfs = []

for project in projects[:10]:
    print("\n=== ", project, " ===")
    result = classifier_utils.grid_search(project, rf, parameters_rf, non_features_columns)
    if result is None:
        # No grid-search output for this project: nothing to tabulate.
        continue
    # Keep only the parameter and score columns, best-ranked combination first.
    df_gridsearch_proj = (
        pd.DataFrame(result)
        .filter(regex="param_.*|mean_test_score|std_test_score|rank_test_score")
        .sort_values(by=['rank_test_score'])
    )
    df_gridsearch_proj.insert(0, 'project', project.replace("/", "__"))
    display(df_gridsearch_proj)
    dfs.append(df_gridsearch_proj)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no project produced results.
large_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

large_df

In [None]:
# large_df.groupby(large_df['project']).mean()

#### Validation Curves

##### parameter: n_estimators

n_estimators : int, default=100

The number of trees in the forest.

In [None]:
# Validation curve of n_estimators for the first five projects.
# np.arange(10, 300, 50) yields 10, 60, 110, 160, 210, 260.
for project in projects[:5]:
    classifier_utils.get_validation_curve(project, rf, 'n_estimators',
                                      # [10, 50, 100, 150, 200, 300],
                                      np.arange(10, 300, 50),
                                      non_features_columns)



In [None]:
# Development aid: re-import classifier_utils so edits to the module take effect
# without restarting the kernel.
import importlib
importlib.reload(classifier_utils)

In [None]:
# # result = classifier_utils.grid_search(projects[6], rf, parameters_rf, non_features_columns)
# # result
# estimator = rf
# parameters = parameters_rf
#
# proj = projects[6].replace("/", "__")
# print("\n", proj)
# proj_dataset = f"../../data/projects/{proj}-training.csv"
# df_proj = pd.read_csv(proj_dataset)
# df_clean = df_proj.dropna()
# print(f"Length of df_clean: {len(df_clean)}")
# if len(df_clean) >= 10:
#     y = df_clean["developerdecision"].copy()
#     df_clean_features = df_clean.drop(columns=['developerdecision']) \
#                                 .drop(columns=non_features_columns)
#     features = list(df_clean_features.columns)
#     X = df_clean_features[features]
#     clf = GridSearchCV(estimator, parameters, verbose=1, cv=10)
#     clf.fit(X, y)
#     print('\n', "Best params and score:", clf.best_params_, clf.best_score_, '\n',
#           # clf.cv_results_,
#           sep='\n')
# else:
#     print(None)

# Random Forest (XGBoost)

### Base classifier

In [None]:
# Baseline XGBoost random forest with a fixed seed and explicit sampling settings.
rf_xg = XGBRFClassifier(random_state=99, n_estimators=100, subsample=0.9, colsample_bynode=0.2, eval_metric='mlogloss')


In [None]:
# Per-project results for the XGBoost forest. Unlike the scikit-learn runs above,
# drop_na=False is passed here.
# NOTE(review): presumably this keeps rows with missing values - confirm in classifier_utils.
result_rf_xg = classifier_utils.ProjectsResults(rf_xg, projects, non_features_columns, drop_na=False)

In [None]:
# Report table for the XGBoost forest; it is merged with the other algorithms'
# reports in the comparison section further down.
report_rf_xg = result_rf_xg.get_report_df()

report_rf_xg

### Inspect project

In [None]:
# Pick one project's XGBoost results for a closer look.
# Note: `project` is also used as the loop variable in other cells, which rebind it.
project = result_rf_xg.get_project('alexo__wro4j')

In [None]:
# Show the selected project's scores as a table, then the plotted classification report.
# display() is called once per object: wrapping it in a second display() would
# additionally render the inner call's return value (None).
display(project.get_scores_df())
display(classifier_utils.plot_classification_report(project.scores))

### Tuning hyperparameters (TODO)

In [None]:
# List every hyperparameter of the XGBoost forest together with its current value.
# The label names the XGBoost variant so this output is not mistaken for the
# scikit-learn forest's listing.
print("Hyperparameters of Random Forest (XGBoost):")

rf_xg.get_params()

#### Using GridSearch to explore combinations of hyperparameters in one project

In [None]:
# Grid explored for the XGBoost forest on a single project
# ('criterion', 'min_samples_leaf' and a shorter n_estimators list were tried earlier and dropped).
parameters_rf_xg = {
    'max_depth': [None, 10, 20, 30],
    'n_estimators': [50, 100, 150],
}

proj = projects[0]

# Stored under its own name: `result_rf` belongs to the scikit-learn forest, and
# reusing it for XGBoost grid-search output would overwrite that object.
gridsearch_result_rf_xg = classifier_utils.grid_search(proj, rf_xg, parameters_rf_xg, non_features_columns)

if gridsearch_result_rf_xg is None:
    # Other cells in this notebook check this helper's result against None, so the
    # same case is handled here instead of failing inside sort_values.
    print(f"No grid-search results for {proj}")
    df_gridsearch_rf = pd.DataFrame()
else:
    # Keep only the parameter and score columns, best-ranked combination first.
    df_gridsearch_rf = (
        pd.DataFrame(gridsearch_result_rf_xg)
        .filter(regex="param_.*|mean_test_score|std_test_score|rank_test_score")
        .sort_values(by=['rank_test_score'])
    )
    df_gridsearch_rf.insert(0, 'project', proj)

df_gridsearch_rf



### GridSearch of parameter n_estimators on first 10 projects

In [None]:
# Only n_estimators is varied in this experiment.
parameters_rf_xg = {
    'n_estimators': [50, 100, 150, 200],
}

dfs = []

for project in projects[:10]:
    print("\n=== ", project, " ===")
    result = classifier_utils.grid_search(project, rf_xg, parameters_rf_xg, non_features_columns)
    if result is None:
        # No grid-search output for this project: nothing to tabulate.
        continue
    # Keep only the parameter and score columns, best-ranked combination first.
    df_gridsearch_proj = (
        pd.DataFrame(result)
        .filter(regex="param_.*|mean_test_score|std_test_score|rank_test_score")
        .sort_values(by=['rank_test_score'])
    )
    df_gridsearch_proj.insert(0, 'project', project.replace("/", "__"))
    display(df_gridsearch_proj)
    dfs.append(df_gridsearch_proj)

# pd.concat raises ValueError on an empty list, so fall back to an empty frame
# when no project produced results.
large_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

large_df

#### Validation Curves

##### parameter: n_estimators

n_estimators : int, default=100

The number of trees in the forest.

In [None]:
# Validation curve of n_estimators for the first three projects (XGBoost forest).
# np.arange(10, 300, 50) yields 10, 60, 110, 160, 210, 260.
for project in projects[:3]:
    classifier_utils.get_validation_curve(project, rf_xg, 'n_estimators',
                                      # [10, 50, 100, 150, 200, 300],
                                      np.arange(10, 300, 50),
                                      non_features_columns)



# Tuned algorithms' comparison (TODO)

In [None]:
# import importlib
# importlib.reload(classifier_utils)
#
# # setup the algorithms with the desired parameters
# algorithms = {}
# algorithms['sklearn-decisionTree'] = DecisionTreeClassifier(min_samples_split=5, random_state=99)
# # algorithms['sklearn-adaboost'] = AdaBoostClassifier(n_estimators=100, random_state=99)
# algorithms['sklearn-randomForest'] = RandomForestClassifier(random_state=99)
# # algorithms['xgBoost-randomForest'] = XGBRFClassifier(random_state=99, n_estimators=100, subsample=0.9, colsample_bynode=0.2, eval_metric='mlogloss')
#
#
# results = []
# for project in projects:
#     for algorithm_name, algorithm in algorithms.items():
#         row = []
#         project_results = classifier_utils.evaluate_project(project, non_features_columns, algorithm)
#         result = project_results.results
#         precision = result.iloc[0]['precision']
#         recall = result.iloc[0]['recall']
#         f1_score = result.iloc[0]['f1-score']
#         accuracy = result.iloc[0]['accuracy']
#         majority_class = result.iloc[0]['baseline (majority)']
#         improvement = result.iloc[0]['improvement']
#         row = [project, algorithm_name, precision, recall, f1_score, accuracy, majority_class, improvement]
#         results.append(row)
#
# results = pd.DataFrame(results, columns=['project', 'algorithm', 'precision', 'recall', 'f1-score', 'accuracy', 'baseline (majority)', 'improvement'])
# results.sort_values('improvement', ascending=False)
# # print(results)
# # pd.concat(results, ignore_index=True)

## Comparing algorithms' reports

First, let's merge the reports from Decision Tree, Random Forest and Random Forest XGBoost into one big table.

In [None]:
# Decision tree joined with random forest on the project name; columns present in
# both reports receive the _dt / _rf suffixes.
df_inner_dt_rf = report_dt.merge(
    report_rf,
    on='project',
    how='inner',
    suffixes=('_dt', '_rf'),
)

# Suffix every XGBoost column with _xg, then give the join key its plain name back.
df_xg = report_rf_xg.add_suffix("_xg").rename(columns={"project_xg": "project"})

# Inner join: only projects present in all three reports are kept.
df_inner_dt_rf_xg = df_inner_dt_rf.merge(df_xg, on='project', how='inner')

df_inner_dt_rf_xg


### How much accuracy improvement does each algorithm provide to the projects?

In [None]:
# Keep only the columns whose name contains "project" or "improvement".
improvements = df_inner_dt_rf_xg.filter(regex=("project|improvement.*"))

# improvements

In [None]:
# Light-to-dark green colour map used as the background gradient of the table.
cm = sns.light_palette("green", as_cmap=True)

# Render each algorithm's improvement as a percentage with two decimals.
improvements.style.background_gradient(cmap = cm).format({'improvement_dt' : "{:.2%}",
                           'improvement_rf' : "{:.2%}",
                           'improvement_xg': "{:.2%}"})

### For each project, which algorithm provides the highest accuracy improvement?

In [None]:
# For every project (row), the name of the improvement column holding the largest value.
winner_algo = improvements.set_index('project').idxmax(axis=1)

winner_algo

### In how many projects does each algorithm provide the highest accuracy improvement?

In [None]:
# Count how many projects each winning column accounts for, most frequent first.
winner_algo.groupby(winner_algo).size().sort_values(ascending=False)


In [None]:
def get_combination_key(row, parameters):
    """Build a text key from the leading values of ``row``.

    Args:
        row: Sliceable sequence whose first ``len(parameters)`` entries are the
            values to encode.
        parameters: Sized collection; only its length is used.

    Returns:
        The ``str()`` of each leading value followed by a dash, concatenated
        (for example ``'gini-1-None-'``); an empty string when nothing is selected.
    """
    leading_values = row[:len(parameters)]
    return ''.join(str(value) + '-' for value in leading_values)

def grid_search(project, estimator, parameters, non_features_columns):
    """Run a 10-fold grid search for one project's training data.

    Args:
        project: Project name; every "/" is replaced by "__" to form the CSV file name.
        estimator: Estimator handed to GridSearchCV.
        parameters: Parameter grid handed to GridSearchCV.
        non_features_columns: Columns removed from the data, in addition to the
            target column, before fitting.

    Returns:
        The fitted search's ``cv_results_``, or None when fewer than 10 rows
        remain after rows with missing values are dropped.
    """
    proj = project.replace("/", "__")
    complete_rows = pd.read_csv(f"../../data/projects/{proj}-training.csv").dropna()
    print(f"Length of df_clean: {len(complete_rows)}")

    # Too little data for the 10-fold search below.
    if len(complete_rows) < 10:
        return None

    target = complete_rows["developerdecision"].copy()
    feature_frame = complete_rows.drop(columns=['developerdecision']) \
                                 .drop(columns=non_features_columns)

    search = GridSearchCV(estimator, parameters, verbose=1, cv=10)
    search.fit(feature_frame, target)
    print("Best params and score:", search.best_params_, search.best_score_, '\n',
          sep='\n')
    return search.cv_results_

In [None]:
import itertools

estimator = dt
# Own name for the two-project sample: reassigning `projects` here would shrink the
# notebook-wide project list for every cell executed afterwards.
sample_projects = ['jgralab__jgralab', 'Unidata__thredds']

parameter_names = list(parameters.keys())
counter_columns = ['mean_accuracy', 'sum_accuracy', 'total_medals',
                   'gold_medals', 'silver_medals', 'bronze_medals']
results_columns = parameter_names + counter_columns
# The two accuracy counters start as floats so that adding scores keeps the column dtype.
initial_counters = [0.0, 0.0, 0, 0, 0, 0]
medal_by_rank = {1: 'gold_medals', 2: 'silver_medals', 3: 'bronze_medals'}

# One row per hyperparameter combination, with every counter at zero.
combinations = [list(combination) + initial_counters
                for combination in itertools.product(*parameters.values())]
results = pd.DataFrame(combinations, columns=results_columns)

for project in sample_projects:
    project_results = grid_search(project, estimator, parameters, non_features_columns)
    if project_results is None:
        # grid_search returns None when the project has too few complete rows.
        continue
    df_gridsearch_dt = (
        pd.DataFrame(project_results)
        .filter(regex="param_.*|mean_test_score|std_test_score|rank_test_score")
        .sort_values(by=['rank_test_score'])
    )
    # Combinations ranked 1 to 3 for this project earn a medal.
    top_3 = df_gridsearch_dt[df_gridsearch_dt['rank_test_score'] <= 3]
    display(top_3)

    for _, combination in top_3.iterrows():
        # Boolean mask selecting the row of `results` with the same parameter values.
        matches = pd.Series(True, index=results.index)
        for parameter in parameter_names:
            combination_value = combination[f'param_{parameter}']
            if pd.isna(combination_value):
                # A None value (e.g. max_depth=None) is stored as a missing value in `results`.
                matches &= results[parameter].isna()
            else:
                matches &= results[parameter] == combination_value
        if not matches.any():
            continue

        # .loc with a mask updates the matching row; .at only accepts scalar labels.
        medal_column = medal_by_rank[int(combination['rank_test_score'])]
        results.loc[matches, 'sum_accuracy'] += combination['mean_test_score']
        results.loc[matches, medal_column] += 1
        results.loc[matches, 'total_medals'] += 1

# Mean of the accumulated scores over the occasions on which a combination earned a medal.
# NOTE(review): confirm this matches the definition used by classifier_utils.grid_search_all.
medalled = results['total_medals'] > 0
results.loc[medalled, 'mean_accuracy'] = (
    results.loc[medalled, 'sum_accuracy'] / results.loc[medalled, 'total_medals']
)

display(results)
df = pd.DataFrame(results)


In [None]:
# Development aid: re-import classifier_utils so edits to the module take effect
# without restarting the kernel.
import importlib
importlib.reload(classifier_utils)

In [None]:
# Run the module-level medal search for at most the first five entries of `projects`.
estimator = dt
# projects = ['jgralab__jgralab', 'Unidata__thredds']
results = classifier_utils.grid_search_all(projects[:5], dt, parameters, non_features_columns)

In [None]:
# Rank the combinations by gold medals first, then silver, bronze and total medals.
results.sort_values(['gold_medals', 'silver_medals', 'bronze_medals', 'total_medals'], ascending=False)

In [None]:
# `df` is built from the medal table, which already has one row per combination and
# `results_columns` as its columns. Transposing it unconditionally makes the column
# assignment fail with a length mismatch and flips the frame on every re-run, so
# only a frame that is not yet in that layout is reoriented.
if list(df.columns) != list(results_columns):
    df = df.transpose()
    df.columns = results_columns

In [None]:
# Show the medal table held in `df`.
df