In [1]:
import numpy as np
import pandas as pd
# NOTE(review): `IPython.display` is the public import path for `display`; importing it
# from `IPython.core.display` is reported as deprecated by recent IPython — confirm and switch.
from IPython.core.display import display
from matplotlib import pyplot as plt

# None removes the column limit, so wide DataFrames are rendered with every column.
pd.set_option('display.max_columns', None)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier
from sklearn.dummy import DummyClassifier
# Project-local modules (not installable packages): ConstantClassifier, configs, classifier_utils.
from ConstantClassifier import ConstantClassifier
import configs
import warnings
import classifier_utils
import seaborn as sns
# NOTE(review): this suppresses every warning category for the rest of the session;
# consider narrowing the filter so genuine problems are not hidden.
warnings.filterwarnings("ignore")

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
from sklearn.impute import SimpleImputer
import math

In [2]:
# Columns passed to ProjectsResults as "non-feature" columns: chunk location and
# identification metadata, followed by project/file/commit identifiers.
non_features_columns = [
    # chunk location / identification
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # project, file and commit identifiers
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [3]:
# Load the labelled dataset and collect the distinct project identifiers,
# in order of first appearance in the file.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
unique_projects = selected_dataset['project'].unique()
projects = list(unique_projects)

In [4]:
# Load the test dataset and list the distinct 'developerdecision' labels in sorted order.
df_test = pd.read_csv("../../data/dataset-test.csv")
target_names = sorted(df_test['developerdecision'].unique())

# Rows that have at least one missing value in any column.
rows_with_nan = df_test.isna().any(axis=1)
df_na = df_test[rows_with_nan]

# Fraction of test rows containing at least one NaN (shown as the cell output).
len(df_na) / len(df_test)

0.2853684585261659

# Class distributions

### Normalized (%)

In [5]:
import importlib

# Re-execute classifier_utils so edits made to the module after the kernel
# started are picked up before it is used below.
importlib.reload(classifier_utils)

# Per-project class distribution for the non-training data (training=False);
# the values are percentages according to the section heading and the output.
class_distribution_normalized = classifier_utils.get_projects_class_distribution(
    projects,
    include_overall=True,
    training=False,
)

class_distribution_normalized

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,44.1,27.18,8.21,0.51,0.51,19.49,0.0
1,apache__directory-server,44.91,53.29,0.0,0.0,0.0,1.8,0.0
2,android__platform_frameworks_base,66.05,8.95,8.61,3.72,1.18,11.15,0.17
3,freenet__fred,43.08,17.79,20.16,7.91,0.4,10.28,0.4
4,alexo__wro4j,32.46,11.3,27.83,1.45,0.0,25.8,1.16
5,apache__lucene-solr,26.72,22.27,18.22,7.29,1.21,21.05,3.24
6,elastic__elasticsearch,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,getrailo__railo,27.87,39.34,9.84,9.02,0.0,13.11,0.82
8,atlasapi__atlas,20.0,46.67,10.77,5.64,2.05,13.85,1.03
9,hibernate__hibernate-orm,23.53,18.82,35.29,6.47,0.0,14.12,1.76


### Count

In [6]:
# Same per-project class distribution, but as raw counts: the second positional
# argument (False) presumably disables normalization — TODO confirm in classifier_utils.
class_distribution_count = classifier_utils.get_projects_class_distribution(
    projects,
    False,
    include_overall=True,
    training=False,
)

class_distribution_count

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,86,53,16,1,1,38,0
1,apache__directory-server,75,89,0,0,0,3,0
2,android__platform_frameworks_base,391,53,51,22,7,66,1
3,freenet__fred,109,45,51,20,1,26,1
4,alexo__wro4j,112,39,96,5,0,89,4
5,apache__lucene-solr,66,55,45,18,3,52,8
6,elastic__elasticsearch,0,0,0,0,0,0,0
7,getrailo__railo,34,48,12,11,0,16,1
8,atlasapi__atlas,39,91,21,11,4,27,2
9,hibernate__hibernate-orm,40,32,60,11,0,24,3


# Classification results

In [7]:
# Random forest used for all per-project experiments below.
model = RandomForestClassifier(
    n_estimators=100,     # number of trees
    max_features=0.3,     # float => fraction of the features considered at each split
    min_samples_leaf=1,
    n_jobs=5,             # trees are fitted in parallel with 5 jobs
    random_state=99,      # fixed seed so repeated runs give the same forest
)

### Training/validation data

Uses 80% of the data with 10-fold cross-validation to calculate the average accuracy for each project.

In [8]:
# Evaluate the model on every project using ProjectsResults' default mode
# (described in this section as 10-fold cross-validation on the training data).
results = classifier_utils.ProjectsResults(
    model,
    projects,
    non_features_columns,
)

In [9]:
# Tabulate the cross-validation metrics per project. include_overall=True presumably
# adds an aggregate entry across all projects — TODO confirm in classifier_utils.
report = results.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table as the cell output.
report

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.975,0.973,0.975,0.573,0.942
1,apache__directory-server,845,652,0.935,0.939,0.936,0.939,0.512,0.874
2,jgralab__jgralab,2072,1802,0.869,0.871,0.869,0.871,0.491,0.747
3,CloudStack-extras__CloudStack-archive,1424,1106,0.802,0.807,0.803,0.807,0.437,0.658
4,Unidata__thredds,1154,950,0.918,0.923,0.919,0.923,0.777,0.656
5,apache__accumulo,4113,3148,0.856,0.86,0.857,0.86,0.635,0.616
6,Ramblurr__Anki-Android,892,759,0.714,0.74,0.722,0.74,0.439,0.538
7,getrailo__railo,815,572,0.697,0.71,0.702,0.71,0.378,0.534
8,apache__lucene-solr,1256,974,0.645,0.65,0.645,0.65,0.266,0.523
9,TeamDev-Ltd__OpenFaces,2979,2859,0.965,0.968,0.967,0.968,0.938,0.483


### Test

Uses 80% of the data to train the model and the remaining 20% of the data to test it. 
The accuracy is calculated based on the 20% of the data that the model has never seen during training.

In [10]:
# Same evaluation as above but with training=False, i.e. the mode this section
# describes as training on 80% of the data and scoring on the held-out 20%.
results_test = classifier_utils.ProjectsResults(
    model,
    projects,
    non_features_columns,
    training=False,
)

In [11]:
# Tabulate the test-mode metrics per project. include_overall=True presumably
# adds an aggregate entry across all projects — TODO confirm in classifier_utils.
report_test = results_test.get_report_df(include_overall=True)
# Bare last expression so the notebook renders the table as the cell output.
report_test

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.976,0.979,0.977,0.979,0.573,0.951
1,apache__directory-server,845,652,0.927,0.928,0.927,0.928,0.533,0.846
2,jgralab__jgralab,2072,1802,0.867,0.871,0.868,0.871,0.504,0.739
3,Unidata__thredds,1154,950,0.908,0.913,0.908,0.913,0.722,0.687
4,CloudStack-extras__CloudStack-archive,1424,1106,0.83,0.818,0.82,0.818,0.428,0.682
5,Ramblurr__Anki-Android,892,759,0.79,0.81,0.797,0.81,0.441,0.661
6,apache__accumulo,4113,3148,0.849,0.857,0.85,0.857,0.642,0.6
7,getrailo__railo,815,572,0.727,0.73,0.718,0.73,0.393,0.554
8,TeamDev-Ltd__OpenFaces,2979,2859,0.971,0.972,0.97,0.972,0.942,0.512
9,zkoss__zk,1087,881,0.777,0.782,0.777,0.782,0.565,0.5


# Summary of the results

In [12]:
# Pair each project's cross-validation report with its test report; overlapping
# column names are disambiguated with the '_cv' / '_test' suffixes.
df_inner = pd.merge(
    report,
    report_test,
    on='project',
    how='inner',
    suffixes=('_cv', '_test'),
)

# Keep the project name, both accuracy columns, and the test precision/recall/F1,
# then strip the '_test' suffix from those three metrics.
kept_columns_pattern = "project|accuracy.*|precision_test|recall_test|f1-score_test"
test_metric_names = {"precision_test": "precision", 'recall_test': 'recall', 'f1-score_test': 'f1-score'}
df = df_inner.filter(regex=kept_columns_pattern).copy()
df = df.rename(columns=test_metric_names)


def _row_normalized_improvement(row):
    """Return get_normalized_improvement(test accuracy, CV accuracy) for one summary row."""
    return classifier_utils.get_normalized_improvement(row['accuracy_test'], row['accuracy_cv'])


df['norm._improv.'] = df.apply(_row_normalized_improvement, axis=1)

# Column order used for the displayed summary.
columns = ['project', 'accuracy_cv', 'accuracy_test', 'norm._improv.', 'precision', 'recall', 'f1-score']
df[columns]

Unnamed: 0,project,accuracy_cv,accuracy_test,norm._improv.,precision,recall,f1-score
0,CCI-MIT__XCoLab,0.975,0.979,0.16,0.976,0.979,0.977
1,apache__directory-server,0.939,0.928,-0.011715,0.927,0.928,0.927
2,jgralab__jgralab,0.871,0.871,0.0,0.867,0.871,0.868
3,CloudStack-extras__CloudStack-archive,0.807,0.818,0.056995,0.83,0.818,0.82
4,Unidata__thredds,0.923,0.913,-0.010834,0.908,0.913,0.908
5,apache__accumulo,0.86,0.857,-0.003488,0.849,0.857,0.85
6,Ramblurr__Anki-Android,0.74,0.81,0.269231,0.79,0.81,0.797
7,getrailo__railo,0.71,0.73,0.068966,0.727,0.73,0.718
8,apache__lucene-solr,0.65,0.599,-0.078462,0.587,0.599,0.587
9,TeamDev-Ltd__OpenFaces,0.968,0.972,0.125,0.971,0.972,0.97


# Results per class

In [13]:
# One ConstantClassifier baseline per resolution class (presumably each always
# predicts the label it is constructed with — see the ConstantClassifier module).
baseline_combination = ConstantClassifier('Combination')
baseline_version1 = ConstantClassifier('Version 1')
baseline_version2 = ConstantClassifier('Version 2')
baseline_concatenationv1v2 = ConstantClassifier('ConcatenationV1V2')
baseline_concatenationv2v1 = ConstantClassifier('ConcatenationV2V1')
baseline_manual = ConstantClassifier('Manual')
baseline_none = ConstantClassifier('None')

# The actual classifier being compared against the baselines.
model = RandomForestClassifier(
    random_state=99,
    n_jobs=5,
    n_estimators=100,
    max_features=0.3,
    min_samples_leaf=1,
)

# Single source of truth pairing each display name with its estimator; the two
# parallel lists consumed later are derived from it so they cannot get out of step.
named_models = [
    ('baseline_combination', baseline_combination),
    ('baseline_version1', baseline_version1),
    ('baseline_version2', baseline_version2),
    ('baseline_concatenationv1v2', baseline_concatenationv1v2),
    ('baseline_concatenationv2v1', baseline_concatenationv2v1),
    ('baseline_manual', baseline_manual),
    ('baseline_none', baseline_none),
    ('random forest', model),
]
models = [estimator for _, estimator in named_models]
all_models_names = [name for name, _ in named_models]

In [14]:
# Evaluate every estimator (the constant baselines plus the random forest) with
# training=False and index the results by display name for the per-class comparisons.
#
# `models` and `all_models_names` are parallel lists; zip() would silently drop
# entries if they ever differed in length, so fail loudly instead.
assert len(models) == len(all_models_names), "models and all_models_names must have the same length"

# A comprehension keeps the iteration variables local, so the notebook-level
# `model` defined in an earlier cell is not rebound as a side effect of this cell.
models_results = {
    name: classifier_utils.ProjectsResults(estimator, projects, non_features_columns, training=False)
    for name, estimator in zip(all_models_names, models)
}

## Combination

In [15]:
# Metrics on the 'Combination' class: its constant baseline vs. the random forest.
models_names = ['baseline_combination', 'random forest']
classifier_utils.compare_models_per_class(
    'Combination',
    models_results,
    models_names,
)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_combination,0.09283,1.0,0.159925
1,random forest,0.524936,0.543399,0.521583


## Version 1

In [16]:
# Metrics on the 'Version 1' class: its constant baseline vs. the random forest.
models_names = ['baseline_version1', 'random forest']
classifier_utils.compare_models_per_class(
    'Version 1',
    models_results,
    models_names,
)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version1,0.560208,1.0,0.687091
1,random forest,0.86691,0.894088,0.879521


## Version 2

In [17]:
# Metrics on the 'Version 2' class: its constant baseline vs. the random forest.
models_names = ['baseline_version2', 'random forest']
classifier_utils.compare_models_per_class(
    'Version 2',
    models_results,
    models_names,
)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version2,0.209655,1.0,0.317844
1,random forest,0.728455,0.714802,0.718313


## ConcatenationV1V2

In [18]:
# Metrics on the 'ConcatenationV1V2' class: its constant baseline vs. the random forest.
models_names = ['baseline_concatenationv1v2', 'random forest']
classifier_utils.compare_models_per_class(
    'ConcatenationV1V2',
    models_results,
    models_names,
)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv1v2,0.037805,1.0,0.070036
1,random forest,0.711437,0.453653,0.533302


## ConcatenationV2V1

In [19]:
# Metrics on the 'ConcatenationV2V1' class: its constant baseline vs. the random forest.
models_names = ['baseline_concatenationv2v1', 'random forest']
classifier_utils.compare_models_per_class(
    'ConcatenationV2V1',
    models_results,
    models_names,
)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv2v1,0.009071,1.0,0.017923
1,random forest,0.572917,0.436012,0.462807


## Manual

In [20]:
# Metrics on the 'Manual' class: its constant baseline vs. the random forest.
models_names = ['baseline_manual', 'random forest']
classifier_utils.compare_models_per_class(
    'Manual',
    models_results,
    models_names,
)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_manual,0.109701,1.0,0.190792
1,random forest,0.592045,0.528561,0.543306


## None

In [21]:
# Metrics on the 'None' class: its constant baseline vs. the random forest.
models_names = ['baseline_none', 'random forest']
classifier_utils.compare_models_per_class(
    'None',
    models_results,
    models_names,
)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_none,0.009719,1.0,0.019112
1,random forest,0.5,0.333333,0.4
