In [1]:
# Notebook setup: imports, pandas display options and warning suppression.
import numpy as np
import pandas as pd
# `display` lives in the public `IPython.display` module. The old
# `IPython.core.display` location is deprecated (since IPython 7.14) and
# fails to import on recent IPython releases.
from IPython.display import display
from matplotlib import pyplot as plt

# Render every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier
from sklearn.dummy import DummyClassifier
from ConstantClassifier import ConstantClassifier
import configs
import warnings
import classifier_utils
import seaborn as sns
# Silence all warnings raised from this point on (keeps cell outputs clean,
# but also hides deprecation / convergence warnings).
warnings.filterwarnings("ignore")

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
from sklearn.impute import SimpleImputer
import math

In [2]:
# Identifier / provenance columns that are handed to the classifier helpers
# as "non-feature" columns (presumably dropped before fitting -- confirm in
# classifier_utils).
non_features_columns = [
    # chunk-level identifiers
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    # repository / file / commit identifiers
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct project identifiers,
# kept in order of first appearance.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = list(pd.unique(selected_dataset['project']))

In [4]:
# Training dataset.
df_training = pd.read_csv("../../data/dataset-training.csv")
# Distinct values of the target column, sorted.
target_names = sorted(df_training['developerdecision'].unique())
# Rows that have at least one missing value in any column.
df_na = df_training.loc[df_training.isna().any(axis=1)]

# Fraction of training rows containing at least one missing value.
df_na.shape[0] / df_training.shape[0]

0.28144947636066214

# Class distributions

### Normalized (%)

In [5]:
import importlib
# Reload classifier_utils so that edits made to the module after the kernel
# started are picked up without restarting.
importlib.reload(classifier_utils)
# Class distribution per project; the rendered table below holds percentages.
# NOTE(review): include_overall=True presumably adds an aggregate entry over
# all projects -- confirm in classifier_utils.
class_distribution_normalized = classifier_utils.get_projects_class_distribution(projects, include_overall=True)

# Last expression of the cell: rendered as the cell output.
class_distribution_normalized

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,43.87,29.25,8.43,1.32,0.92,16.07,0.13
1,apache__directory-server,47.39,51.23,0.15,0.0,0.0,1.23,0.0
2,android__platform_frameworks_base,70.77,7.64,9.02,2.68,1.42,8.13,0.33
3,freenet__fred,40.42,16.5,23.32,6.23,0.49,12.94,0.1
4,alexo__wro4j,30.7,14.77,28.65,0.88,1.02,23.61,0.37
5,apache__lucene-solr,25.05,26.59,20.33,5.24,0.31,21.66,0.82
6,elastic__elasticsearch,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,getrailo__railo,31.99,37.76,8.74,7.17,0.35,13.81,0.17
8,atlasapi__atlas,24.04,44.5,12.4,5.37,0.9,12.53,0.26
9,hibernate__hibernate-orm,23.88,20.11,31.15,7.4,1.12,14.11,2.23


### Count

In [6]:
# Same helper as for the normalized table, this time yielding raw counts.
# NOTE(review): the positional `False` presumably disables normalization --
# confirm the parameter name in classifier_utils and consider passing it by keyword.
class_distribution_count = classifier_utils.get_projects_class_distribution(projects, False, include_overall=True)

# Last expression of the cell: rendered as the cell output.
class_distribution_count

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,333,222,64,10,7,122,1
1,apache__directory-server,309,334,1,0,0,8,0
2,android__platform_frameworks_base,1741,188,222,66,35,200,8
3,freenet__fred,409,167,236,63,5,131,1
4,alexo__wro4j,420,202,392,12,14,323,5
5,apache__lucene-solr,244,259,198,51,3,211,8
6,elastic__elasticsearch,0,0,0,0,0,0,0
7,getrailo__railo,183,216,50,41,2,79,1
8,atlasapi__atlas,188,348,97,42,7,98,2
9,hibernate__hibernate-orm,171,144,223,53,8,101,16


# Models

In [7]:
# Majority-class baseline (scikit-learn dummy estimator).
baseline = DummyClassifier(strategy="most_frequent", random_state=99)

# One constant baseline per class label (presumably always predicts the given
# label -- see ConstantClassifier).
baseline_combination = ConstantClassifier('Combination')
baseline_version1 = ConstantClassifier('Version 1')
baseline_version2 = ConstantClassifier('Version 2')
baseline_concatenationv1v2 = ConstantClassifier('ConcatenationV1V2')
baseline_concatenationv2v1 = ConstantClassifier('ConcatenationV2V1')
baseline_manual = ConstantClassifier('Manual')
baseline_none = ConstantClassifier('None')

# Learned models; the scikit-learn / XGBoost estimators all use random_state=99.
model1 = DecisionTreeClassifier(random_state=99)
model2 = RandomForestClassifier(
    n_estimators=100,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)
model3 = XGBRFClassifier(
    n_estimators=400,
    subsample=0.9,
    colsample_bynode=0.4,
    eval_metric='mlogloss',
    random_state=99,
)

# Display name and estimator are declared side by side, then split into the
# two parallel lists used by the rest of the notebook.
_named_models = [
    ('baseline', baseline),
    ('baseline_combination', baseline_combination),
    ('baseline_version1', baseline_version1),
    ('baseline_version2', baseline_version2),
    ('baseline_concatenationv1v2', baseline_concatenationv1v2),
    ('baseline_concatenationv2v1', baseline_concatenationv2v1),
    ('baseline_manual', baseline_manual),
    ('baseline_none', baseline_none),
    ('decision tree', model1),
    ('random forest', model2),
    ('xg boost', model3),
]
models = [estimator for _, estimator in _named_models]
all_models_names = [label for label, _ in _named_models]

## Overall scores per class
Compare the models one class at a time, treating each class as a binary (class vs. rest) problem: how good is each classifier at predicting that specific class?

In [8]:
# Build one ProjectsResults object per model, keyed by the model's display name.
models_results = {}
for model_name, model in zip(all_models_names, models):
    models_results[model_name] = classifier_utils.ProjectsResults(
        model, projects, non_features_columns
    )

## Combination

In [9]:
# Models compared for the 'Combination' class: its constant baseline plus the
# three learned models.
models_names = ['baseline_combination', 'decision tree', 'random forest', 'xg boost']
# Rendered as a table of precision / recall / f1-score per model.
classifier_utils.compare_models_per_class('Combination', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_combination,0.093469,1.0,0.160921
1,decision tree,0.417311,0.429341,0.422194
2,random forest,0.489929,0.469206,0.474462
3,xg boost,0.477433,0.437854,0.442081


## Version 1

In [10]:
# Models compared for the 'Version 1' class: its constant baseline plus the
# three learned models.
models_names = ['baseline_version1', 'decision tree', 'random forest', 'xg boost']
# Rendered as a table of precision / recall / f1-score per model.
classifier_utils.compare_models_per_class('Version 1', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version1,0.563708,1.0,0.68889
1,decision tree,0.854604,0.849497,0.851944
2,random forest,0.864644,0.895911,0.879648
3,xg boost,0.84731,0.882713,0.86342


## Version 2

In [11]:
# Models compared for the 'Version 2' class: its constant baseline plus the
# three learned models.
models_names = ['baseline_version2', 'decision tree', 'random forest', 'xg boost']
# Rendered as a table of precision / recall / f1-score per model.
classifier_utils.compare_models_per_class('Version 2', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version2,0.209794,1.0,0.316688
1,decision tree,0.646872,0.642724,0.644403
2,random forest,0.726145,0.719707,0.721195
3,xg boost,0.694187,0.685041,0.684644


## ConcatenationV1V2

In [12]:
# Models compared for the 'ConcatenationV1V2' class: its constant baseline
# plus the three learned models.
models_names = ['baseline_concatenationv1v2', 'decision tree', 'random forest', 'xg boost']
# Rendered as a table of precision / recall / f1-score per model.
classifier_utils.compare_models_per_class('ConcatenationV1V2', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv1v2,0.033573,1.0,0.062764
1,decision tree,0.379547,0.327256,0.34938
2,random forest,0.565034,0.351631,0.421524
3,xg boost,0.621959,0.360935,0.43992


## ConcatenationV2V1

In [13]:
# Models compared for the 'ConcatenationV2V1' class: its constant baseline
# plus the three learned models.
models_names = ['baseline_concatenationv2v1', 'decision tree', 'random forest', 'xg boost']
# Rendered as a table of precision / recall / f1-score per model.
classifier_utils.compare_models_per_class('ConcatenationV2V1', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv2v1,0.008347,1.0,0.016501
1,decision tree,0.373176,0.410017,0.387261
2,random forest,0.621457,0.46485,0.511638
3,xg boost,0.522976,0.407243,0.433781


## Manual

In [14]:
# Models compared for the 'Manual' class: its constant baseline plus the
# three learned models.
models_names = ['baseline_manual', 'decision tree', 'random forest', 'xg boost']
# Rendered as a table of precision / recall / f1-score per model.
classifier_utils.compare_models_per_class('Manual', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_manual,0.102448,1.0,0.179609
1,decision tree,0.483008,0.482056,0.48213
2,random forest,0.560012,0.500664,0.526022
3,xg boost,0.547885,0.470915,0.50092


## None

In [15]:
# Models compared for the 'None' class: its constant baseline plus the three
# learned models.
models_names = ['baseline_none', 'decision tree', 'random forest', 'xg boost']
# Rendered as a table of precision / recall / f1-score per model.
classifier_utils.compare_models_per_class('None', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_none,0.003681,1.0,0.007293
1,decision tree,0.35,0.363542,0.355128
2,random forest,0.541667,0.2125,0.305195
3,xg boost,0.333333,0.125,0.181818


## Models ranking

In [16]:
import importlib
# Reload classifier_utils so that edits made to the module after the kernel
# started are picked up without restarting.
importlib.reload(classifier_utils)
# One row per model with mean accuracy, medal counts and mean rank (see the
# tables rendered by the following cells).
# NOTE(review): medals are presumably awarded per project for the top-3
# models -- confirm in classifier_utils.compare_models_medals.
results = classifier_utils.compare_models_medals(models, all_models_names, projects, non_features_columns)

### Ranking models by medals

In [17]:
# Medal-table ordering: most gold medals first, ties broken by silver, then
# bronze, then total medals.
results.sort_values(
    by=['gold_medals', 'silver_medals', 'bronze_medals', 'total_medals'],
    ascending=False,
)

Unnamed: 0,model_name,mean_accuracy,total_medals,gold_medals,silver_medals,bronze_medals,mean_rank
9,random forest,0.80888,25,22,3,0,1.12
10,xg boost,0.7904,25,4,17,4,2.0
8,decision tree,0.76904,23,1,4,18,2.92
0,baseline,0.6054,3,1,0,2,3.8
2,baseline_version1,0.56376,3,1,0,2,4.28
1,baseline_combination,0.09344,0,0,0,0,7.32
3,baseline_version2,0.2014,0,0,0,0,6.12
4,baseline_concatenationv1v2,0.02816,0,0,0,0,9.08
5,baseline_concatenationv2v1,0.00768,0,0,0,0,9.72
6,baseline_manual,0.10244,0,0,0,0,7.08


### Ranking models by mean rank

In [18]:
# Ascending mean rank: the model with the lowest mean rank comes first.
results.sort_values(by='mean_rank')

Unnamed: 0,model_name,mean_accuracy,total_medals,gold_medals,silver_medals,bronze_medals,mean_rank
9,random forest,0.80888,25,22,3,0,1.12
10,xg boost,0.7904,25,4,17,4,2.0
8,decision tree,0.76904,23,1,4,18,2.92
0,baseline,0.6054,3,1,0,2,3.8
2,baseline_version1,0.56376,3,1,0,2,4.28
3,baseline_version2,0.2014,0,0,0,0,6.12
6,baseline_manual,0.10244,0,0,0,0,7.08
1,baseline_combination,0.09344,0,0,0,0,7.32
4,baseline_concatenationv1v2,0.02816,0,0,0,0,9.08
5,baseline_concatenationv2v1,0.00768,0,0,0,0,9.72
