In [1]:
import numpy as np
import pandas as pd
from IPython.core.display import display
from matplotlib import pyplot as plt

pd.set_option('display.max_columns', None)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRFClassifier
from sklearn.dummy import DummyClassifier
from ConstantClassifier import ConstantClassifier
import configs
import warnings
import classifier_utils
import seaborn as sns
warnings.filterwarnings("ignore")

from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_predict, GridSearchCV, validation_curve
from sklearn.impute import SimpleImputer
import math

In [2]:
# Columns that identify or locate a conflict chunk rather than describe it;
# they are handed to the classifiers' helper as columns that are not features.
non_features_columns = [
    # chunk location / bookkeeping
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    # repository, file and commit identifiers
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
    # explicitly excluded indicator column
    "has_branch_merge_message_indicator",
]

In [3]:
# Load the labelled dataset and collect the distinct project identifiers,
# in order of first appearance.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset['project'].unique().tolist()

In [4]:
# Load the test dataset and list the distinct class labels in sorted order.
df_test = pd.read_csv("../../data/dataset-test.csv")
target_names = sorted(df_test['developerdecision'].unique())

# Keep only the rows that have a missing value in at least one column.
has_missing_value = df_test.isna().any(axis=1)
df_na = df_test[has_missing_value]

# Fraction of test rows with at least one missing value.
len(df_na) / len(df_test)

0.18596892574587742

# Class distributions

### Normalized (%)

In [5]:
# Reload classifier_utils so that edits made to the module after the kernel
# started are picked up without restarting.
import importlib
importlib.reload(classifier_utils)
# Class distribution per project, shown under the "Normalized (%)" heading.
# NOTE(review): include_overall=True presumably adds an aggregate row and
# training=False presumably selects the held-out test split — confirm in classifier_utils.
class_distribution_normalized = classifier_utils.get_projects_class_distribution(projects, include_overall=True, training=False)

class_distribution_normalized

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,44.1,27.18,8.21,0.51,0.51,19.49,0.0
1,apache__directory-server,44.91,53.29,0.0,0.0,0.0,1.8,0.0
2,android__platform_frameworks_base,66.05,8.95,8.61,3.72,1.18,11.15,0.17
3,freenet__fred,43.08,17.79,20.16,7.91,0.4,10.28,0.4
4,alexo__wro4j,32.46,11.3,27.83,1.45,0.0,25.8,1.16
5,apache__lucene-solr,26.72,22.27,18.22,7.29,1.21,21.05,3.24
6,getrailo__railo,27.87,39.34,9.84,9.02,0.0,13.11,0.82
7,atlasapi__atlas,20.0,46.67,10.77,5.64,2.05,13.85,1.03
8,hibernate__hibernate-orm,23.53,18.82,35.29,6.47,0.0,14.12,1.76
9,CloudStack-extras__CloudStack-archive,42.8,18.18,7.95,18.18,1.14,11.36,0.38


### Count

In [6]:
# Same call as the normalized distribution, with an extra positional False.
# NOTE(review): the second positional argument presumably switches normalization off,
# yielding raw counts per class — confirm against the classifier_utils signature.
class_distribution_count = classifier_utils.get_projects_class_distribution(projects, False, include_overall=True, training=False)

class_distribution_count

Unnamed: 0,Project,Version 1,Version 2,Combination,ConcatenationV1V2,ConcatenationV2V1,Manual,None
0,Ramblurr__Anki-Android,86,53,16,1,1,38,0
1,apache__directory-server,75,89,0,0,0,3,0
2,android__platform_frameworks_base,391,53,51,22,7,66,1
3,freenet__fred,109,45,51,20,1,26,1
4,alexo__wro4j,112,39,96,5,0,89,4
5,apache__lucene-solr,66,55,45,18,3,52,8
6,getrailo__railo,34,48,12,11,0,16,1
7,atlasapi__atlas,39,91,21,11,4,27,2
8,hibernate__hibernate-orm,40,32,60,11,0,24,3
9,CloudStack-extras__CloudStack-archive,113,48,21,48,3,30,1


# Classification results

In [7]:
# Random forest used for the cross-validation and test reports below.
# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(
    n_estimators=400,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=-2,
    random_state=99,
)

### Training/validation data

Uses 80% of the data with 10-fold cross-validation to calculate the average accuracy for each project.

In [8]:
# Evaluate `model` on every project, passing the non-feature columns to the helper.
# NOTE(review): with no `training` argument this presumably runs the
# cross-validation described above — confirm the default in classifier_utils.ProjectsResults.
results = classifier_utils.ProjectsResults(model, projects, non_features_columns)

In [9]:
# Tabulate the per-project metrics of the run above.
# NOTE(review): include_overall=True presumably adds an aggregate row — confirm in classifier_utils.
report = results.get_report_df(include_overall=True)
report

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.972,0.976,0.974,0.976,0.573,0.943
1,apache__directory-server,845,652,0.938,0.942,0.94,0.942,0.512,0.881
2,jgralab__jgralab,2072,1802,0.864,0.867,0.864,0.867,0.491,0.738
3,CloudStack-extras__CloudStack-archive,1424,1106,0.799,0.806,0.8,0.806,0.437,0.655
4,Unidata__thredds,1154,950,0.912,0.918,0.913,0.918,0.777,0.632
5,apache__accumulo,4113,3148,0.856,0.862,0.858,0.862,0.635,0.621
6,Ramblurr__Anki-Android,892,759,0.717,0.739,0.72,0.739,0.439,0.535
7,getrailo__railo,815,572,0.696,0.71,0.701,0.71,0.378,0.534
8,apache__lucene-solr,1256,974,0.639,0.646,0.639,0.646,0.266,0.517
9,TeamDev-Ltd__OpenFaces,2979,2859,0.966,0.969,0.967,0.969,0.938,0.494


### Test

Uses 80% of the data to train the model and the remaining 20% of the data to test it. 
The accuracy is calculated based on the 20% of the data that the model has never seen during training.

In [10]:
# Same evaluation as `results`, but with training=False.
# NOTE(review): presumably this scores the model on the held-out 20% split
# described above — confirm in classifier_utils.ProjectsResults.
results_test = classifier_utils.ProjectsResults(model, projects, non_features_columns, training=False)

In [11]:
# Tabulate the per-project metrics of the training=False run.
# NOTE(review): include_overall=True presumably adds an aggregate row — confirm in classifier_utils.
report_test = results_test.get_report_df(include_overall=True)
report_test

Unnamed: 0,project,observations,observations (wt NaN),precision,recall,f1-score,accuracy,baseline (majority),improvement
0,CCI-MIT__XCoLab,5512,3757,0.979,0.981,0.979,0.981,0.573,0.956
1,apache__directory-server,845,652,0.922,0.922,0.921,0.922,0.533,0.833
2,jgralab__jgralab,2072,1802,0.879,0.884,0.88,0.884,0.504,0.765
3,CloudStack-extras__CloudStack-archive,1424,1106,0.833,0.822,0.824,0.822,0.428,0.689
4,Unidata__thredds,1154,950,0.903,0.909,0.905,0.909,0.722,0.672
5,Ramblurr__Anki-Android,892,759,0.793,0.81,0.799,0.81,0.441,0.661
6,apache__accumulo,4113,3148,0.859,0.866,0.859,0.866,0.642,0.625
7,getrailo__railo,815,572,0.729,0.73,0.721,0.73,0.393,0.554
8,zkoss__zk,1087,881,0.785,0.787,0.783,0.787,0.565,0.511
9,apache__lucene-solr,1256,974,0.608,0.611,0.598,0.611,0.267,0.47


# Summary of the results

In [12]:
# Join the cross-validation and test reports on project; columns present in
# both reports get the '_cv' / '_test' suffixes.
df_inner = report.merge(report_test, on='project', how='inner', suffixes=('_cv', '_test'))

# Keep the project name, both accuracy columns and the test-side baseline,
# precision, recall and f1-score columns.
kept_columns_pattern = "project|accuracy.*|baseline.*_test|precision_test|recall_test|f1-score_test"
df = df_inner.filter(regex=kept_columns_pattern).copy()

# Drop the '_test' suffix from the test-only metrics and shorten the baseline name.
short_names = {
    "precision_test": "precision",
    'recall_test': 'recall',
    'f1-score_test': 'f1-score',
    'baseline (majority)_test': 'baseline',
}
df = df.rename(columns=short_names)

# Normalized improvement of the test accuracy over the baseline, row by row.
df['norm._improv.'] = df.apply(
    lambda row: classifier_utils.get_normalized_improvement(row['accuracy_test'], row['baseline']),
    axis=1,
)

# Column order used for the final summary table.
columns = [
    'project', 'accuracy_cv', 'baseline', 'accuracy_test',
    'norm._improv.', 'precision', 'recall', 'f1-score',
]
def fix_project_name(project_name):
    """Convert an ``owner__repo`` identifier into the ``owner/repo`` display form.

    Only the first ``__`` is replaced: it is the owner/repository separator,
    and any later double underscore belongs to the repository name itself and
    must be kept. A name without ``__`` is returned unchanged.

    Args:
        project_name: Project identifier such as ``'apache__lucene-solr'``.

    Returns:
        The identifier with the first ``__`` replaced by ``/``.
    """
    return project_name.replace('__', '/', 1)
# Take an explicit copy of the selected columns so the assignment below
# modifies an independent frame rather than a selection of `df`
# (the original pattern triggers pandas' SettingWithCopyWarning).
summary = df[columns].copy()
summary['project'] = summary['project'].apply(fix_project_name)
# Round for presentation and drop rows that have a missing value in any column.
summary = summary.round(3).dropna()
summary

Unnamed: 0,project,accuracy_cv,baseline,accuracy_test,norm._improv.,precision,recall,f1-score
0,CCI-MIT/XCoLab,0.976,0.573,0.981,0.956,0.979,0.981,0.979
1,apache/directory-server,0.942,0.533,0.922,0.833,0.922,0.922,0.921
2,jgralab/jgralab,0.867,0.504,0.884,0.766,0.879,0.884,0.88
3,CloudStack-extras/CloudStack-archive,0.806,0.428,0.822,0.689,0.833,0.822,0.824
4,Unidata/thredds,0.918,0.722,0.909,0.673,0.903,0.909,0.905
5,apache/accumulo,0.862,0.642,0.866,0.626,0.859,0.866,0.859
6,Ramblurr/Anki-Android,0.739,0.441,0.81,0.66,0.793,0.81,0.799
7,getrailo/railo,0.71,0.393,0.73,0.555,0.729,0.73,0.721
8,apache/lucene-solr,0.646,0.267,0.611,0.469,0.608,0.611,0.598
9,TeamDev-Ltd/OpenFaces,0.969,0.942,0.968,0.448,0.961,0.968,0.964


In [13]:
# Persist the summary table without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
summary.to_csv('../../data/results/experiment_results.csv', index=False)

# Results per class

In [14]:
# One ConstantClassifier per resolution class, built in the order the names are unpacked.
# NOTE(review): presumably each one always predicts its single class — confirm in ConstantClassifier.
(
    baseline_combination,
    baseline_version1,
    baseline_version2,
    baseline_concatenationv1v2,
    baseline_concatenationv2v1,
    baseline_manual,
    baseline_none,
) = (
    ConstantClassifier(label)
    for label in (
        'Combination', 'Version 1', 'Version 2',
        'ConcatenationV1V2', 'ConcatenationV2V1', 'Manual', 'None',
    )
)

# NOTE(review): this forest uses 100 trees and n_jobs=5, unlike the 400-tree,
# n_jobs=-2 model configured earlier in the notebook — confirm this is intentional.
model = RandomForestClassifier(
    n_estimators=100,
    max_features=0.3,
    min_samples_leaf=1,
    n_jobs=5,
    random_state=99,
)

# `models` and `all_models_names` are parallel lists: position i of one
# corresponds to position i of the other.
models = [
    baseline_combination,
    baseline_version1,
    baseline_version2,
    baseline_concatenationv1v2,
    baseline_concatenationv2v1,
    baseline_manual,
    baseline_none,
    model,
]
all_models_names = [
    'baseline_combination',
    'baseline_version1',
    'baseline_version2',
    'baseline_concatenationv1v2',
    'baseline_concatenationv2v1',
    'baseline_manual',
    'baseline_none',
    'random forest',
]

In [15]:
# Evaluate every model (constant baselines and the random forest) with
# training=False, keyed by its display name.
# A comprehension is used so the iteration variables stay local: a plain
# `for model, model_name in ...` loop would rebind the notebook-level `model`
# on every iteration and leave `model_name` behind in the kernel namespace.
models_results = {
    name: classifier_utils.ProjectsResults(estimator, projects, non_features_columns, training=False)
    for estimator, name in zip(models, all_models_names)
}

## Combination

In [16]:
# 'Combination' class: constant baseline vs. random forest, tagged with the class name.
models_names = ['baseline_combination', 'random forest']
combination = classifier_utils.compare_models_per_class(
    'Combination', models_results, models_names
).assign(target_class='Combination')
combination

Unnamed: 0,model,precision,recall,f1-score,target_class
0,baseline_combination,0.100855,1.0,0.170993,Combination
1,random forest,0.574964,0.553709,0.545107,Combination


## Version 1

In [17]:
# 'Version 1' class: constant baseline vs. random forest, tagged with the class name.
models_names = ['baseline_version1', 'random forest']
version1 = classifier_utils.compare_models_per_class(
    'Version 1', models_results, models_names
).assign(target_class='Version 1')
version1

Unnamed: 0,model,precision,recall,f1-score,target_class
0,baseline_version1,0.519473,1.0,0.649132,Version 1
1,random forest,0.847366,0.874226,0.859793,Version 1


## Version 2

In [18]:
# 'Version 2' class: constant baseline vs. random forest, tagged with the class name.
models_names = ['baseline_version2', 'random forest']
version2 = classifier_utils.compare_models_per_class(
    'Version 2', models_results, models_names
).assign(target_class='Version 2')
version2

Unnamed: 0,model,precision,recall,f1-score,target_class
0,baseline_version2,0.237367,1.0,0.351846,Version 2
1,random forest,0.740505,0.733832,0.73432,Version 2


## ConcatenationV1V2

In [19]:
# 'ConcatenationV1V2' class: constant baseline vs. random forest, tagged with the class name.
models_names = ['baseline_concatenationv1v2', 'random forest']
concatv1v2 = classifier_utils.compare_models_per_class(
    'ConcatenationV1V2', models_results, models_names
).assign(target_class='ConcatenationV1V2')
concatv1v2

Unnamed: 0,model,precision,recall,f1-score,target_class
0,baseline_concatenationv1v2,0.041066,1.0,0.075335,ConcatenationV1V2
1,random forest,0.793799,0.545609,0.632351,ConcatenationV1V2


## ConcatenationV2V1

In [20]:
# 'ConcatenationV2V1' class: constant baseline vs. random forest, tagged with the class name.
models_names = ['baseline_concatenationv2v1', 'random forest']
concatv2v1 = classifier_utils.compare_models_per_class(
    'ConcatenationV2V1', models_results, models_names
).assign(target_class='ConcatenationV2V1')
concatv2v1

Unnamed: 0,model,precision,recall,f1-score,target_class
0,baseline_concatenationv2v1,0.008445,1.0,0.016695,ConcatenationV2V1
1,random forest,0.666667,0.446429,0.495238,ConcatenationV2V1


## Manual

In [21]:
# 'Manual' class: constant baseline vs. random forest, tagged with the class name.
models_names = ['baseline_manual', 'random forest']
manual = classifier_utils.compare_models_per_class(
    'Manual', models_results, models_names
).assign(target_class='Manual')
manual

Unnamed: 0,model,precision,recall,f1-score,target_class
0,baseline_manual,0.118856,1.0,0.204728,Manual
1,random forest,0.626303,0.556366,0.580444,Manual


## None

In [22]:
# Per-project precision of the random forest for the 'None' class.
# In the recorded output most projects have no value for this class.
models_results['random forest'].get_class_score_df('None', 'precision')

Unnamed: 0,project,None
0,Ramblurr__Anki-Android,
1,apache__directory-server,
2,android__platform_frameworks_base,
3,freenet__fred,
4,alexo__wro4j,
5,apache__lucene-solr,1.0
6,getrailo__railo,
7,atlasapi__atlas,
8,hibernate__hibernate-orm,
9,CloudStack-extras__CloudStack-archive,


In [23]:
# 'None' class: constant baseline vs. random forest, tagged with the class name.
models_names = ['baseline_none', 'random forest']
none = classifier_utils.compare_models_per_class(
    'None', models_results, models_names
).assign(target_class='None')
none

Unnamed: 0,model,precision,recall,f1-score,target_class
0,baseline_none,0.009719,1.0,0.019112,
1,random forest,1.0,0.125,0.222222,


There are not enough "None" instances in the projects to evaluate this class reliably.

In [24]:
# Stack the per-class comparison tables into one frame. Each input frame is
# indexed 0..1, so ignore_index=True is needed to give the result a unique
# 0..n-1 index instead of repeating labels 0, 1, 0, 1, ...
per_class_tables = [combination, version1, version2, concatv1v2, concatv2v1, manual, none]
summary_classes = pd.concat(per_class_tables, ignore_index=True)

# Put the class label first, followed by the model name and its metrics.
summary_classes = summary_classes[['target_class', 'model', 'precision', 'recall', 'f1-score']]
display(summary_classes)

# The row index is not written, so the CSV content is unaffected by the re-indexing.
summary_classes.to_csv('../../data/results/experiment_by_class.csv', index=False)

Unnamed: 0,target_class,model,precision,recall,f1-score
0,Combination,baseline_combination,0.100855,1.0,0.170993
1,Combination,random forest,0.574964,0.553709,0.545107
0,Version 1,baseline_version1,0.519473,1.0,0.649132
1,Version 1,random forest,0.847366,0.874226,0.859793
0,Version 2,baseline_version2,0.237367,1.0,0.351846
1,Version 2,random forest,0.740505,0.733832,0.73432
0,ConcatenationV1V2,baseline_concatenationv1v2,0.041066,1.0,0.075335
1,ConcatenationV1V2,random forest,0.793799,0.545609,0.632351
0,ConcatenationV2V1,baseline_concatenationv2v1,0.008445,1.0,0.016695
1,ConcatenationV2V1,random forest,0.666667,0.446429,0.495238
