In [1]:
import numpy as np
import pandas as pd
from IPython.core.display import display
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier
from sklearn.dummy import DummyClassifier
from ConstantClassifier import ConstantClassifier
import configs
import warnings
import classifier_utils
import seaborn as sns
warnings.filterwarnings("ignore")

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
from sklearn.impute import SimpleImputer
import math

In [2]:
# Columns that identify or locate a conflict chunk rather than describe it;
# this list is handed to the evaluation helpers as the "non-feature" columns.
non_features_columns = [
    # chunk location / bookkeeping
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    # repository, file and commit identifiers
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

In [3]:
# Load the labelled dataset; the distinct values of its `project` column (in order of
# first appearance) are the projects passed to the helpers in the cells below.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = list(selected_dataset['project'].unique())

In [4]:
# Test dataset, the sorted labels found in `developerdecision`, and the subset of
# rows that contain at least one missing value.
df_test = pd.read_csv("../../data/dataset-test.csv")
target_names = sorted(df_test['developerdecision'].unique())
df_na = df_test.loc[df_test.isna().any(axis=1)]

# Fraction of test rows with at least one NaN.
len(df_na) / len(df_test)

0.2530394472142045

# Class distributions

### Normalized (%)

In [5]:
# Re-import classifier_utils so that edits made to the module on disk take effect
# without restarting the kernel.
import importlib
importlib.reload(classifier_utils)
# Per-project class distribution. The second positional argument is left at its default
# here (the "Count" cell passes False for it) - presumably that yields percentages,
# as the heading says; confirm in classifier_utils.
class_distribution_normalized = classifier_utils.get_projects_class_distribution(projects, include_overall=True, training=False)

class_distribution_normalized

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,44.1,27.18,8.21,0.51,0.51,19.49,0.0
1,apache__directory-server,44.91,53.29,0.0,0.0,0.0,1.8,0.0
2,android__platform_frameworks_base,66.05,8.95,8.61,3.72,1.18,11.15,0.17
3,freenet__fred,43.08,17.79,20.16,7.91,0.4,10.28,0.4
4,alexo__wro4j,32.46,11.3,27.83,1.45,0.0,25.8,1.16
5,apache__lucene-solr,26.72,22.27,18.22,7.29,1.21,21.05,3.24
6,elastic__elasticsearch,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,getrailo__railo,27.87,39.34,9.84,9.02,0.0,13.11,0.82
8,atlasapi__atlas,20.0,46.67,10.77,5.64,2.05,13.85,1.03
9,hibernate__hibernate-orm,23.53,18.82,35.29,6.47,0.0,14.12,1.76


### Count

In [6]:
# Same per-project class distribution, with the second positional argument set to False.
# NOTE(review): presumably this is the "normalized" switch, so False gives raw counts -
# confirm against classifier_utils.get_projects_class_distribution.
class_distribution_count = classifier_utils.get_projects_class_distribution(projects, False, include_overall=True, training=False)

class_distribution_count

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,86,53,16,1,1,38,0
1,apache__directory-server,75,89,0,0,0,3,0
2,android__platform_frameworks_base,391,53,51,22,7,66,1
3,freenet__fred,109,45,51,20,1,26,1
4,alexo__wro4j,112,39,96,5,0,89,4
5,apache__lucene-solr,66,55,45,18,3,52,8
6,elastic__elasticsearch,0,0,0,0,0,0,0
7,getrailo__railo,34,48,12,11,0,16,1
8,atlasapi__atlas,39,91,21,11,4,27,2
9,hibernate__hibernate-orm,40,32,60,11,0,24,3


# Classification results

In [7]:
# Random forest used for the per-project evaluation; the seed is fixed so that
# repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,      # a float is read as a fraction of the features per split
    min_samples_leaf=1,
    random_state=99,
    n_jobs=-2,             # joblib convention: all CPU cores but one
)

### Training/validation data

Uses 80% of the data with 10-fold cross-validation to calculate the average accuracy for each project.

In [8]:
# Evaluate `model` on every project with the helper's default mode (the test-set cell
# passes training=False instead).
# NOTE(review): presumably this is the cross-validated run described in the text above -
# confirm in classifier_utils.ProjectsResults.
results = classifier_utils.ProjectsResults(model, projects, non_features_columns)

In [9]:
# Tabulate the per-project metrics; include_overall=True also asks the helper for an
# aggregate entry. The bare name on the last line renders the table.
report = results.get_report_df(include_overall=True)
report

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.973,0.976,0.573,0.943
1,apache__directory-server,845,652,0.934,0.937,0.936,0.937,0.512,0.871
2,jgralab__jgralab,2072,1802,0.864,0.866,0.864,0.866,0.491,0.737
3,CloudStack-extras__CloudStack-archive,1424,1106,0.799,0.806,0.8,0.806,0.437,0.655
4,apache__accumulo,4113,3148,0.858,0.863,0.859,0.863,0.635,0.625
5,Unidata__thredds,1154,950,0.908,0.916,0.911,0.916,0.777,0.623
6,Ramblurr__Anki-Android,892,759,0.722,0.742,0.724,0.742,0.439,0.54
7,getrailo__railo,815,572,0.699,0.712,0.703,0.712,0.378,0.537
8,apache__lucene-solr,1256,974,0.639,0.646,0.64,0.646,0.266,0.517
9,TeamDev-Ltd__OpenFaces,2979,2859,0.966,0.969,0.967,0.969,0.938,0.494


### Test

Uses 80% of the data to train the model and the remaining 20% of the data to test it. 
The accuracy is calculated based on the 20% of the data that the model has never seen during training.

In [10]:
# Same per-project evaluation with training=False.
# NOTE(review): presumably this scores the model on the held-out 20% described in the
# text above - confirm in classifier_utils.ProjectsResults.
results_test = classifier_utils.ProjectsResults(model, projects, non_features_columns, training=False)

In [11]:
# Tabulate the per-project metrics of the training=False run; include_overall=True also
# asks the helper for an aggregate entry. The bare name on the last line renders the table.
report_test = results_test.get_report_df(include_overall=True)
report_test

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.978,0.98,0.978,0.98,0.573,0.953
1,apache__directory-server,845,652,0.922,0.922,0.921,0.922,0.533,0.833
2,jgralab__jgralab,2072,1802,0.877,0.882,0.877,0.882,0.504,0.761
3,Unidata__thredds,1154,950,0.908,0.913,0.908,0.913,0.722,0.687
4,CloudStack-extras__CloudStack-archive,1424,1106,0.83,0.814,0.817,0.814,0.428,0.675
5,Ramblurr__Anki-Android,892,759,0.794,0.81,0.8,0.81,0.441,0.661
6,apache__accumulo,4113,3148,0.863,0.87,0.864,0.87,0.642,0.636
7,getrailo__railo,815,572,0.705,0.705,0.695,0.705,0.393,0.514
8,zkoss__zk,1087,881,0.781,0.787,0.781,0.787,0.565,0.511
9,apache__lucene-solr,1256,974,0.613,0.623,0.609,0.623,0.267,0.486


# Summary of the results

In [35]:
def fix_project_name(project_name):
    """Return the project identifier with every ``__`` replaced by ``/``.

    Example: ``'apache__accumulo'`` -> ``'apache/accumulo'``.
    """
    return project_name.replace('__', '/')


# Join the two reports on `project`, keeping only projects present in both; metric
# columns that exist in both reports get the `_cv` / `_test` suffixes.
df_inner = pd.merge(report, report_test, on='project', how='inner', suffixes=('_cv', '_test'))

# Keep the project name, both accuracies, and the test-side baseline/precision/recall/f1.
# `.copy()` makes `df` an independent frame before columns are added to it.
df = df_inner.filter(regex=("project|accuracy.*|baseline.*_test|precision_test|recall_test|f1-score_test")).copy()
df = df.rename(columns={"precision_test": "precision", 'recall_test':'recall', 'f1-score_test':'f1-score', 'baseline (majority)_test': 'baseline'})

# Row-wise normalized improvement of the test accuracy over the test baseline.
df['norm._improv.'] = df.apply(lambda x: classifier_utils.get_normalized_improvement(x['accuracy_test'], x['baseline']), axis=1)

columns = ['project', 'accuracy_cv', 'baseline', 'accuracy_test', 'norm._improv.', 'precision', 'recall', 'f1-score']

# Take an explicit copy before editing the `project` column: assigning into the bare
# `df[columns]` selection is the SettingWithCopy pattern, and the warning it raises is
# hidden here by the global warnings filter.
summary = df[columns].copy()
summary['project'] = summary['project'].apply(fix_project_name)
summary = summary.round(3).dropna()
summary

Unnamed: 0,project,accuracy_cv,baseline,accuracy_test,norm._improv.,precision,recall,f1-score
0,CCI-MIT/XCoLab,0.976,0.573,0.98,0.953,0.978,0.98,0.978
1,apache/directory-server,0.937,0.533,0.922,0.833,0.922,0.922,0.921
2,jgralab/jgralab,0.866,0.504,0.882,0.762,0.877,0.882,0.877
3,CloudStack-extras/CloudStack-archive,0.806,0.428,0.814,0.675,0.83,0.814,0.817
4,apache/accumulo,0.863,0.642,0.87,0.637,0.863,0.87,0.864
5,Unidata/thredds,0.916,0.722,0.913,0.687,0.908,0.913,0.908
6,Ramblurr/Anki-Android,0.742,0.441,0.81,0.66,0.794,0.81,0.8
7,getrailo/railo,0.712,0.393,0.705,0.514,0.705,0.705,0.695
8,apache/lucene-solr,0.646,0.267,0.623,0.486,0.613,0.623,0.609
9,TeamDev-Ltd/OpenFaces,0.969,0.942,0.968,0.448,0.961,0.968,0.964


In [36]:
# Persist the summary table without the row index. `index` is documented as a bool,
# so pass False explicitly instead of relying on None being falsy.
summary.to_csv('../../data/results/experiment_results.csv', index=False)

# Results per class

In [14]:
# One constant baseline per resolution class, each built with that class label.
baseline_combination = ConstantClassifier('Combination')
baseline_version1 = ConstantClassifier('Version 1')
baseline_version2 = ConstantClassifier('Version 2')
baseline_concatenationv1v2 = ConstantClassifier('ConcatenationV1V2')
baseline_concatenationv2v1 = ConstantClassifier('ConcatenationV2V1')
baseline_manual = ConstantClassifier('Manual')
baseline_none = ConstantClassifier('None')

# NOTE(review): this rebinds the notebook-level `model` to a smaller forest
# (100 trees, n_jobs=5) than the RandomForestClassifier defined under
# "Classification results" (400 trees, n_jobs=-2) - confirm this is intended.
model = RandomForestClassifier(
    n_estimators=100,
    max_features=0.3,
    min_samples_leaf=1,
    random_state=99,
    n_jobs=5,
)

# `models` and `all_models_names` are parallel lists: entry i of one belongs to
# entry i of the other.
models = [
    baseline_combination,
    baseline_version1,
    baseline_version2,
    baseline_concatenationv1v2,
    baseline_concatenationv2v1,
    baseline_manual,
    baseline_none,
    model,
]
all_models_names = [
    'baseline_combination',
    'baseline_version1',
    'baseline_version2',
    'baseline_concatenationv1v2',
    'baseline_concatenationv2v1',
    'baseline_manual',
    'baseline_none',
    'random forest',
]

In [15]:
# Evaluate every model (training=False) and index the results by display name.
# The loop variables are notebook globals: once the loop ends, `model` and
# `model_name` hold the last pair.
models_results = {}
for model_name, model in zip(all_models_names, models):
    models_results[model_name] = classifier_utils.ProjectsResults(
        model, projects, non_features_columns, training=False
    )

## Combination

In [16]:
# Metrics for the 'Combination' class: the ConstantClassifier('Combination') baseline
# versus the random forest, both looked up by name in `models_results`.
models_names = ['baseline_combination', 'random forest']
classifier_utils.compare_models_per_class('Combination', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_combination,0.100855,1.0,0.170993
1,random forest,0.580946,0.557375,0.55248


## Version 1

In [17]:
# Metrics for the 'Version 1' class: the ConstantClassifier('Version 1') baseline
# versus the random forest, both looked up by name in `models_results`.
models_names = ['baseline_version1', 'random forest']
classifier_utils.compare_models_per_class('Version 1', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version1,0.519473,1.0,0.649132
1,random forest,0.845934,0.868452,0.856048


## Version 2

In [18]:
# Metrics for the 'Version 2' class: the ConstantClassifier('Version 2') baseline
# versus the random forest, both looked up by name in `models_results`.
models_names = ['baseline_version2', 'random forest']
classifier_utils.compare_models_per_class('Version 2', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_version2,0.237367,1.0,0.351846
1,random forest,0.746494,0.739228,0.74035


## ConcatenationV1V2

In [19]:
# Metrics for the 'ConcatenationV1V2' class: the ConstantClassifier('ConcatenationV1V2')
# baseline versus the random forest, both looked up by name in `models_results`.
models_names = ['baseline_concatenationv1v2', 'random forest']
classifier_utils.compare_models_per_class('ConcatenationV1V2', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv1v2,0.041066,1.0,0.075335
1,random forest,0.786429,0.510038,0.593751


## ConcatenationV2V1

In [20]:
# Metrics for the 'ConcatenationV2V1' class: the ConstantClassifier('ConcatenationV2V1')
# baseline versus the random forest, both looked up by name in `models_results`.
models_names = ['baseline_concatenationv2v1', 'random forest']
classifier_utils.compare_models_per_class('ConcatenationV2V1', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_concatenationv2v1,0.008445,1.0,0.016695
1,random forest,0.766667,0.501984,0.541667


## Manual

In [21]:
# Metrics for the 'Manual' class: the ConstantClassifier('Manual') baseline
# versus the random forest, both looked up by name in `models_results`.
models_names = ['baseline_manual', 'random forest']
classifier_utils.compare_models_per_class('Manual', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_manual,0.118856,1.0,0.204728
1,random forest,0.622234,0.561165,0.582043


## None

In [22]:
# Metrics for the 'None' class: the ConstantClassifier('None') baseline
# versus the random forest, both looked up by name in `models_results`.
models_names = ['baseline_none', 'random forest']
classifier_utils.compare_models_per_class('None', models_results, models_names)

Unnamed: 0,model,precision,recall,f1-score
0,baseline_none,0.009719,1.0,0.019112
1,random forest,0.25,0.333333,0.285714
