In [1]:
import importlib
import math
import warnings

import numpy as np
import pandas as pd
import seaborn as sns
# NOTE(review): IPython has deprecated importing display from IPython.core.display
# in favour of IPython.display - consider switching.
from IPython.core.display import display
from matplotlib import pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRFClassifier

from ConstantClassifier import ConstantClassifier
import configs
import classifier_utils

pd.set_option('display.max_columns', None)
warnings.filterwarnings("ignore")

In [2]:
# Columns that are present in the datasets but are handed to the evaluation
# helpers as "non-feature" columns (identifiers, locations and commit hashes).
non_features_columns = [
    # chunk-level bookkeeping
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # repository / file / commit identification
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct values of its 'project'
# column; these are the projects every later cell iterates over.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
project_column = selected_dataset['project']
projects = [*project_column.unique()]

In [4]:
# Training split and the sorted list of distinct 'developerdecision' labels.
df_training = pd.read_csv("../../data/dataset-training.csv")
target_names = sorted(df_training['developerdecision'].unique())

# Rows that have a missing value in at least one column.
has_missing_value = df_training.isna().any(axis=1)
df_na = df_training[has_missing_value]

# Fraction of training rows affected by missing values.
len(df_na) / len(df_training)

0.17787511605208656

# Class distributions

### Normalized (%)

In [5]:
# Reload classifier_utils so edits made to the module during a live session are
# picked up without restarting the kernel. importlib is imported once, in the
# notebook's import cell, rather than re-imported in the middle of the analysis.
importlib.reload(classifier_utils)

# Per-project class distribution, shown in percent (see the table below).
# NOTE(review): include_overall=True presumably appends an aggregate row over
# all projects - confirm in classifier_utils.
class_distribution_normalized = classifier_utils.get_projects_class_distribution(
    projects,
    include_overall=True,
)

class_distribution_normalized

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,43.87,29.25,8.43,1.32,0.92,16.07,0.13
1,apache__directory-server,47.39,51.23,0.15,0.0,0.0,1.23,0.0
2,android__platform_frameworks_base,70.77,7.64,9.02,2.68,1.42,8.13,0.33
3,freenet__fred,40.42,16.5,23.32,6.23,0.49,12.94,0.1
4,alexo__wro4j,30.7,14.77,28.65,0.88,1.02,23.61,0.37
5,apache__lucene-solr,25.05,26.59,20.33,5.24,0.31,21.66,0.82
6,getrailo__railo,31.99,37.76,8.74,7.17,0.35,13.81,0.17
7,atlasapi__atlas,24.04,44.5,12.4,5.37,0.9,12.53,0.26
8,hibernate__hibernate-orm,23.88,20.11,31.15,7.4,1.12,14.11,2.23
9,CloudStack-extras__CloudStack-archive,43.67,17.72,9.49,16.0,1.27,11.39,0.45


### Count

In [6]:
# Same per-project class distribution, shown as absolute counts.
# NOTE(review): the positional False presumably switches off normalization -
# confirm the parameter name in classifier_utils and consider passing it by keyword.
class_distribution_count = classifier_utils.get_projects_class_distribution(
    projects,
    False,
    include_overall=True,
)

class_distribution_count

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,333,222,64,10,7,122,1
1,apache__directory-server,309,334,1,0,0,8,0
2,android__platform_frameworks_base,1741,188,222,66,35,200,8
3,freenet__fred,409,167,236,63,5,131,1
4,alexo__wro4j,420,202,392,12,14,323,5
5,apache__lucene-solr,244,259,198,51,3,211,8
6,getrailo__railo,183,216,50,41,2,79,1
7,atlasapi__atlas,188,348,97,42,7,98,2
8,hibernate__hibernate-orm,171,144,223,53,8,101,16
9,CloudStack-extras__CloudStack-archive,483,196,105,177,14,126,5


# Models

In [7]:
# --- Baselines -------------------------------------------------------------
# Majority-class dummy classifier plus one constant predictor per class label.
baseline = DummyClassifier(strategy="most_frequent", random_state=99)
baseline_combination = ConstantClassifier('Combination')
baseline_version1 = ConstantClassifier('Version 1')
baseline_version2 = ConstantClassifier('Version 2')
baseline_concatenationv1v2 = ConstantClassifier('ConcatenationV1V2')
baseline_concatenationv2v1 = ConstantClassifier('ConcatenationV2V1')
baseline_manual = ConstantClassifier('Manual')
baseline_none = ConstantClassifier('None')

# --- Trained models --------------------------------------------------------
# All three share random_state=99.
model1 = DecisionTreeClassifier(random_state=99, min_samples_leaf=1, max_depth=30)
model2 = RandomForestClassifier(random_state=99, n_jobs=-2, n_estimators=400, max_features=0.3, min_samples_leaf=1)
# NOTE(review): XGBRFClassifier is XGBoost's random-forest estimator, although it
# is reported under the name 'xg boost' throughout this notebook.
model3 = XGBRFClassifier(random_state=99, subsample=0.9, eval_metric='mlogloss', n_estimators=100, colsample_bynode=0.4)

# Single source of truth for the name -> estimator pairing. `models` and
# `all_models_names` used to be two hand-maintained parallel lists; deriving
# both from one mapping guarantees they have the same length and order, so the
# zip() in the evaluation cell can never silently mis-pair or drop a model.
# (dicts preserve insertion order, so the original ordering is kept.)
named_models = {
    'baseline': baseline,
    'baseline_combination': baseline_combination,
    'baseline_version1': baseline_version1,
    'baseline_version2': baseline_version2,
    'baseline_concatenationv1v2': baseline_concatenationv1v2,
    'baseline_concatenationv2v1': baseline_concatenationv2v1,
    'baseline_manual': baseline_manual,
    'baseline_none': baseline_none,
    'decision tree': model1,
    'random forest': model2,
    'xg boost': model3,
}
all_models_names = list(named_models.keys())
models = list(named_models.values())

## Overall scores per class
Compare the models class by class, evaluating each one as if it were a binary classifier for that class: how well does it predict that specific class?

In [8]:
# Build one ProjectsResults object per model, keyed by the model's display name.
# The per-class comparison cells below look models up in this dict by name.
models_results = dict()
for model_name, model in zip(all_models_names, models):
    models_results[model_name] = classifier_utils.ProjectsResults(
        model,
        projects,
        non_features_columns,
    )

## Combination

In [9]:
# Scores for the 'Combination' class: its constant baseline vs. the three trained models.
models_names = [
    'baseline_combination',
    'decision tree',
    'random forest',
    'xg boost',
]
classifier_utils.compare_models_per_class('Combination', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_combination,0.098399,1.0,0.166855
1,decision tree,0.439115,0.434413,0.436051
2,random forest,0.518415,0.479938,0.492388
3,xg boost,0.515936,0.441387,0.459657


## Version 1

In [10]:
# Scores for the 'Version 1' class: its constant baseline vs. the three trained models.
models_names = [
    'baseline_version1',
    'decision tree',
    'random forest',
    'xg boost',
]
classifier_utils.compare_models_per_class('Version 1', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version1,0.524258,1.0,0.651574
1,decision tree,0.828369,0.829195,0.828705
2,random forest,0.845363,0.879475,0.861606
3,xg boost,0.82661,0.864717,0.843673


## Version 2

In [11]:
# Scores for the 'Version 2' class: its constant baseline vs. the three trained models.
models_names = [
    'baseline_version2',
    'decision tree',
    'random forest',
    'xg boost',
]
classifier_utils.compare_models_per_class('Version 2', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version2,0.239671,1.0,0.353836
1,decision tree,0.669286,0.666911,0.667256
2,random forest,0.751995,0.746245,0.74721
3,xg boost,0.718167,0.709465,0.707613


## ConcatenationV1V2

In [12]:
# Scores for the 'ConcatenationV1V2' class: its constant baseline vs. the three trained models.
models_names = [
    'baseline_concatenationv1v2',
    'decision tree',
    'random forest',
    'xg boost',
]
classifier_utils.compare_models_per_class('ConcatenationV1V2', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv1v2,0.037593,1.0,0.069711
1,decision tree,0.377501,0.341597,0.357253
2,random forest,0.640571,0.39797,0.473962
3,xg boost,0.593772,0.376393,0.438892


## ConcatenationV2V1

In [13]:
# Scores for the 'ConcatenationV2V1' class: its constant baseline vs. the three trained models.
models_names = [
    'baseline_concatenationv2v1',
    'decision tree',
    'random forest',
    'xg boost',
]
classifier_utils.compare_models_per_class('ConcatenationV2V1', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv2v1,0.006743,1.0,0.013358
1,decision tree,0.420911,0.496429,0.451713
2,random forest,0.689698,0.487302,0.554784
3,xg boost,0.52881,0.395238,0.422076


## Manual

In [14]:
# Scores for the 'Manual' class: its constant baseline vs. the three trained models.
models_names = [
    'baseline_manual',
    'decision tree',
    'random forest',
    'xg boost',
]
classifier_utils.compare_models_per_class('Manual', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_manual,0.110169,1.0,0.191139
1,decision tree,0.50428,0.498926,0.500688
2,random forest,0.622602,0.554909,0.584528
3,xg boost,0.594872,0.497271,0.535165


## None

In [15]:
# Scores for the 'None' class: its constant baseline vs. the three trained models.
models_names = [
    'baseline_none',
    'decision tree',
    'random forest',
    'xg boost',
]
classifier_utils.compare_models_per_class('None', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_none,0.003933,1.0,0.007785
1,decision tree,0.227778,0.193333,0.197525
2,random forest,0.535714,0.196875,0.286025
3,xg boost,0.291667,0.09375,0.140909


## Models ranking

In [16]:
# Reload classifier_utils so edits made to the module during a live session are
# picked up without restarting the kernel. importlib comes from the notebook's
# import cell; the duplicate mid-notebook import has been dropped.
importlib.reload(classifier_utils)

# Medal-based comparison of every model (baselines included) across the projects.
# The resulting frame has one row per model with mean accuracy, medal counts and
# mean rank, as shown in the ranking tables below.
results = classifier_utils.compare_models_medals(
    models,
    all_models_names,
    projects,
    non_features_columns,
)

### Ranking models by medals

In [17]:
# Order models by medals: gold first, then silver, bronze and total, all descending.
medal_sort_keys = ['gold_medals', 'silver_medals', 'bronze_medals', 'total_medals']
results.sort_values(by=medal_sort_keys, ascending=False)

Unnamed: 0,model_name,mean_accuracy,total_medals,gold_medals,silver_medals,bronze_medals,mean_rank
9,random forest,0.80535,20,19,1,0,1.05
10,xg boost,0.784,20,3,15,2,1.95
8,decision tree,0.76355,20,1,3,16,2.75
0,baseline,0.57635,1,1,0,0,3.85
2,baseline_version1,0.5243,1,1,0,0,4.45
1,baseline_combination,0.09835,0,0,0,0,7.3
3,baseline_version2,0.22765,0,0,0,0,6.15
4,baseline_concatenationv1v2,0.0301,0,0,0,0,9.1
5,baseline_concatenationv2v1,0.00605,0,0,0,0,9.65
6,baseline_manual,0.11015,0,0,0,0,6.95


### Ranking models by mean rank

In [18]:
# Order models by their mean rank, best (lowest) first.
results.sort_values(by=['mean_rank'], ascending=True)

Unnamed: 0,model_name,mean_accuracy,total_medals,gold_medals,silver_medals,bronze_medals,mean_rank
9,random forest,0.80535,20,19,1,0,1.05
10,xg boost,0.784,20,3,15,2,1.95
8,decision tree,0.76355,20,1,3,16,2.75
0,baseline,0.57635,1,1,0,0,3.85
2,baseline_version1,0.5243,1,1,0,0,4.45
3,baseline_version2,0.22765,0,0,0,0,6.15
6,baseline_manual,0.11015,0,0,0,0,6.95
1,baseline_combination,0.09835,0,0,0,0,7.3
4,baseline_concatenationv1v2,0.0301,0,0,0,0,9.1
5,baseline_concatenationv2v1,0.00605,0,0,0,0,9.65
