In [23]:
# --- Standard library ---
import math
import warnings

# --- Third-party ---
import numpy as np
import pandas as pd
import seaborn as sns
# `display` is imported from the public `IPython.display` module; the old
# `IPython.core.display` re-export has been deprecated since IPython 7.14 and
# is no longer importable in newer IPython releases.
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBRFClassifier

# --- Project-local helpers ---
import classifier_utils

# --- Notebook-wide configuration ---
# Render every column when a DataFrame is displayed (no horizontal truncation).
pd.set_option('display.max_columns', None)
# Silence all warnings for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation/convergence warnings
# raised by the libraries above; consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [24]:
# Columns treated as non-features: this list is handed to the classifier_utils
# comparison helpers further down the notebook.
non_features_columns = [
    # chunk location / conflict metadata
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # repository, file and commit identifiers
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [25]:
# Load the labelled dataset and collect the distinct values of its 'project'
# column, in order of first appearance, as a plain Python list.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = [*selected_dataset['project'].unique()]

In [26]:
# Training split, plus the sorted list of distinct 'developerdecision' labels.
df_training = pd.read_csv("../../data/dataset-training.csv")
target_names = sorted(df_training['developerdecision'].unique())

# Subset of training rows that have a missing value in at least one column.
df_na = df_training.loc[df_training.isna().any(axis=1)]

# Fraction of training rows that contain missing data (displayed as the cell output).
len(df_na) / len(df_training)

0.28144947636066214

# Class distributions

### Normalized (%)

In [27]:
# Per-project class distribution computed by the classifier_utils helper with
# its default settings. Judging by the section heading and the rendered table,
# the values are percentages per project (presumably summing to ~100 per row — TODO confirm).
# NOTE(review): the elastic/elasticsearch row renders as all zeros in the saved
# output; verify that data for that project is actually being loaded.
class_distribution_normalized = classifier_utils.get_projects_class_distribution(projects)

class_distribution_normalized

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr/Anki-Android,43.87,29.25,8.43,1.32,0.92,16.07,0.13
1,apache/directory-server,47.39,51.23,0.15,0.0,0.0,1.23,0.0
2,android/platform_frameworks_base,70.77,7.64,9.02,2.68,1.42,8.13,0.33
3,freenet/fred,40.42,16.5,23.32,6.23,0.49,12.94,0.1
4,alexo/wro4j,30.7,14.77,28.65,0.88,1.02,23.61,0.37
5,apache/lucene-solr,25.05,26.59,20.33,5.24,0.31,21.66,0.82
6,elastic/elasticsearch,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,getrailo/railo,31.99,37.76,8.74,7.17,0.35,13.81,0.17
8,atlasapi/atlas,24.04,44.5,12.4,5.37,0.9,12.53,0.26
9,hibernate/hibernate-orm,23.88,20.11,31.15,7.4,1.12,14.11,2.23


### Count

In [28]:
# Same helper as the normalized table, called with a second positional argument
# of False. Judging by the section heading and the integer output, this appears
# to switch the helper to raw counts (parameter name not visible here — TODO confirm).
# NOTE(review): the elastic/elasticsearch row renders as all zeros in the saved
# output; verify that data for that project is actually being loaded.
class_distribution_count = classifier_utils.get_projects_class_distribution(projects, False)

class_distribution_count

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr/Anki-Android,333,222,64,10,7,122,1
1,apache/directory-server,309,334,1,0,0,8,0
2,android/platform_frameworks_base,1741,188,222,66,35,200,8
3,freenet/fred,409,167,236,63,5,131,1
4,alexo/wro4j,420,202,392,12,14,323,5
5,apache/lucene-solr,244,259,198,51,3,211,8
6,elastic/elasticsearch,0,0,0,0,0,0,0
7,getrailo/railo,183,216,50,41,2,79,1
8,atlasapi/atlas,188,348,97,42,7,98,2
9,hibernate/hibernate-orm,171,144,223,53,8,101,16


# Models

In [29]:
# Candidate estimators; every one is seeded with random_state=99.

# Reference point: always predicts the most frequent class.
baseline = DummyClassifier(
    strategy="most_frequent",
    random_state=99,
)

# Single decision tree with library-default hyper-parameters.
model1 = DecisionTreeClassifier(random_state=99)

# scikit-learn random forest.
model2 = RandomForestClassifier(
    n_estimators=100,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

# XGBRFClassifier is xgboost's random-forest-style estimator
# (it is labelled 'xg boost' in the result tables).
model3 = XGBRFClassifier(
    n_estimators=400,
    subsample=0.9,
    colsample_bynode=0.4,
    eval_metric='mlogloss',
    random_state=99,
)

# Keep the display labels and estimators aligned by deriving both lists from one pairing.
named_models = [
    ('baseline', baseline),
    ('decision tree', model1),
    ('random forest', model2),
    ('xg boost', model3),
]
models = [estimator for _, estimator in named_models]
models_names = [label for label, _ in named_models]

## Overall accuracy per class

In [30]:
# Reload classifier_utils so that edits made to the helper module since it was
# first imported are picked up without restarting the kernel (development aid).
import importlib
importlib.reload(classifier_utils)
# Compare the configured models per target class; the rendered table has one row
# per model and one column per entry of target_names.
# NOTE(review): the name `results` is reassigned by the "Models ranking" cell
# further down, so this per-class table is overwritten there.
results = classifier_utils.compare_models_per_class(models, models_names, projects, non_features_columns, target_names)
results

Unnamed: 0,model,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None,Version 1,Version 2
0,baseline,0.04,0.0,0.0,0.0,0.0,0.76,0.208333
1,decision tree,0.381405,0.274554,0.288087,0.447381,0.046627,0.853784,0.64708
2,random forest,0.409317,0.281445,0.291721,0.468696,0.0375,0.89506,0.684252
3,xg boost,0.366754,0.282134,0.175417,0.431183,0.005952,0.881695,0.656738


## Model ranking

In [31]:
# Reload classifier_utils so that edits made to the helper module since it was
# first imported are picked up without restarting the kernel (development aid).
import importlib
importlib.reload(classifier_utils)
# Medal-style comparison of the models; the two ranking cells below sort this
# frame by its medal columns and by `mean_rank`.
# NOTE(review): this overwrites the per-class `results` table produced by the
# previous section, so the ranking cells depend on this cell having run last.
results = classifier_utils.compare_models_medals(models, models_names, projects, non_features_columns)

### Ranking models by medals

In [32]:
# Order models by gold medals first, then silver, bronze and total medals as
# successive tie-breakers (highest counts on top).
medal_priority = ['gold_medals', 'silver_medals', 'bronze_medals', 'total_medals']
results.sort_values(by=medal_priority, ascending=False)

Unnamed: 0,model_name,mean_accuracy,total_medals,gold_medals,silver_medals,bronze_medals,mean_rank
2,random forest,0.80772,25,22,3,0,1.12
3,xg boost,0.79052,25,5,15,5,2.0
0,baseline,0.6054,3,1,0,2,3.8
1,decision tree,0.77044,24,0,5,19,2.84


### Ranking models by mean rank

In [33]:
# Order models by mean rank, smallest value first.
results.sort_values(by='mean_rank', ascending=True)

Unnamed: 0,model_name,mean_accuracy,total_medals,gold_medals,silver_medals,bronze_medals,mean_rank
2,random forest,0.80772,25,22,3,0,1.12
3,xg boost,0.79052,25,5,15,5,2.0
1,decision tree,0.77044,24,0,5,19,2.84
0,baseline,0.6054,3,1,0,2,3.8
