In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns that are removed from the data frames before fitting a model
# (they are dropped alongside the 'developerdecision' target in later cells).
non_features_columns = [
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

# Decision tree

## Per project

In [3]:
# Cross-validate one decision tree per project named in the selected dataset.
# Each entry of `results` is:
#   [project, rows read, rows without NaN, mean CV accuracy, std of CV accuracy]
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset['project'].unique()
results = []
for project in projects:
    # The per-project training files are named with "__" in place of "/".
    project = project.replace("/", "__")
    project_dataset = f"../../data/projects/{project}-training.csv"
    df = pd.read_csv(project_dataset)
    df_clean = df.dropna()

    if len(df_clean) < 10:
        # Fewer complete rows than the 10 folds requested below:
        # skip the evaluation and record zeros as placeholders.
        results.append([project, len(df), len(df_clean), 0, 0])
        continue

    # Separate the target, then discard it and every non-feature column.
    y = df_clean["developerdecision"].copy()
    df_clean = df_clean.drop(columns=['developerdecision', *non_features_columns])
    features = list(df_clean.columns)
    X = df_clean[features]

    dt = DecisionTreeClassifier(min_samples_split=5, random_state=99)
    scores = cross_val_score(dt, X, y, cv=10)
    accuracy, std_dev = scores.mean(), scores.std()
    results.append([project, len(df), len(df_clean), accuracy, std_dev])

In [4]:
# Per-project results as a table, highest mean accuracy first.
per_project_columns = [
    'project',
    'observations',
    'observations (without NaN)',
    'accuracy',
    'std_dev',
]
accuracy_results = pd.DataFrame(results, columns=per_project_columns)
accuracy_results.sort_values(by='accuracy', ascending=False)

Unnamed: 0,project,observations,observations (without NaN),accuracy,std_dev
25,sebastianbenz__Jnario,8298,8173,0.998654,0.000856
26,CCI-MIT__XCoLab,5512,3757,0.971259,0.012398
11,TeamDev-Ltd__OpenFaces,2979,2859,0.963282,0.011613
15,alkacon__opencms-core,923,840,0.941667,0.016366
16,SINTEF-9012__ThingML,1022,977,0.937576,0.022146
1,apache__directory-server,845,652,0.920233,0.028958
19,Unidata__thredds,1154,950,0.904211,0.031736
18,apache__accumulo,4113,3148,0.833878,0.028614
17,cgjones__android-frameworks-base,828,565,0.81416,0.027711
24,jgralab__jgralab,2072,1802,0.814095,0.038592


## Overall

In [5]:
# Cross-validate a decision tree on the full training set for several values
# of min_samples_split. Each entry of `results_overall` is:
#   ['Overall', rows read, rows without NaN, min_samples_split,
#    mean CV accuracy, std of CV accuracy]
overall_dataset = pd.read_csv("../../data/dataset-training.csv")
df_clean = overall_dataset.dropna()
results_overall = []
if len(df_clean) >= 10:
    # Separate the target, then discard it and every non-feature column.
    y = df_clean["developerdecision"].copy()
    df_clean = df_clean.drop(columns=['developerdecision'])
    df_clean = df_clean.drop(columns=non_features_columns)
    features = list(df_clean.columns)
    X = df_clean[features]
    # Sweep min_samples_split over 5, 10, ..., 45. After the loop `dt` is
    # left bound to the last estimator of the sweep.
    for splits in range(5, 50, 5):
        dt = DecisionTreeClassifier(min_samples_split=splits, random_state=99)
        scores = cross_val_score(dt, X, y, cv=10)
        accuracy = scores.mean()
        std_dev = scores.std()
        results_overall.append(['Overall', len(overall_dataset), len(df_clean), splits, accuracy, std_dev])
else:
    # Fewer complete rows than the 10 folds requested above, so no sweep ran.
    # `splits` is only bound by the loop in the other branch; reading it here
    # raised NameError on a fresh kernel. Record None for the splits column
    # and zeros as placeholder scores instead.
    results_overall.append(['Overall', len(overall_dataset), len(df_clean), None, 0, 0])

In [6]:
# Results of the min_samples_split sweep as a table, highest mean accuracy first.
overall_columns = [
    'project',
    'observations',
    'observations (without NaN)',
    'splits',
    'accuracy',
    'std_dev',
]
accuracy_results = pd.DataFrame(results_overall, columns=overall_columns)
accuracy_results.sort_values(by='accuracy', ascending=False)

Unnamed: 0,project,observations,observations (without NaN),splits,accuracy,std_dev
0,Overall,56241,40412,5,0.718855,0.187843
3,Overall,56241,40412,20,0.707102,0.18189
1,Overall,56241,40412,10,0.706459,0.186651
4,Overall,56241,40412,25,0.706112,0.177977
2,Overall,56241,40412,15,0.705569,0.185633
5,Overall,56241,40412,30,0.705196,0.177131
6,Overall,56241,40412,35,0.703167,0.175569
7,Overall,56241,40412,40,0.700024,0.176493
8,Overall,56241,40412,45,0.694976,0.176407


In [7]:
# Report how many complete rows remain and the relative frequency of each
# target class among them.
clean_df = overall_dataset.dropna()
n_complete_rows = len(clean_df)
class_shares = clean_df['developerdecision'].value_counts(normalize=True)

print(f"Clean dataset size (without NaN values): {n_complete_rows}")
print("Dataset target classes distribution: ")
print(class_shares)

Clean dataset size (without NaN values): 40412
Dataset target classes distribution: 
Version 1            0.674503
Version 2            0.143596
Manual               0.089132
Combination          0.066020
ConcatenationV1V2    0.018435
ConcatenationV2V1    0.006211
None                 0.002103
Name: developerdecision, dtype: float64


In [8]:
# Confusion matrix built from 10-fold cross-validated predictions.
# Labels are taken in their order of first appearance in y; following
# scikit-learn's convention, rows are true classes and columns are predictions.
# NOTE(review): `dt`, `X` and `y` are whatever an earlier cell left bound --
# confirm `dt` is the configuration this matrix is meant to describe.
class_labels = y.unique()
y_pred = cross_val_predict(dt, X, y, cv=10)
conf_mat = confusion_matrix(y, y_pred, labels=class_labels)
display(pd.DataFrame(conf_mat, index=class_labels, columns=class_labels))

Unnamed: 0,Manual,Version 2,Combination,Version 1,ConcatenationV1V2,None,ConcatenationV2V1
Manual,1483,391,899,734,77,2,16
Version 2,431,3101,889,1281,85,3,13
Combination,523,396,936,717,70,3,23
Version 1,1967,1712,1068,22258,226,20,7
ConcatenationV1V2,82,109,157,151,234,1,11
,11,12,23,17,7,10,5
ConcatenationV2V1,37,31,49,46,25,0,63


# Random forest

In [9]:
# 10-fold cross-validation of a random forest on the overall feature matrix;
# prints the mean accuracy followed by its standard deviation.
clf = RandomForestClassifier(random_state=99)
scores = cross_val_score(clf, X, y, cv=10)
accuracy, std_dev = scores.mean(), scores.std()
print(accuracy, std_dev, sep="\n")

0.7215775375630857
0.1972118956701879


# AdaBoost

In [10]:
# 10-fold cross-validation of AdaBoost (100 estimators) on the overall feature
# matrix; prints the mean accuracy followed by its standard deviation.
clf = AdaBoostClassifier(n_estimators=100, random_state=99)
scores = cross_val_score(clf, X, y, cv=10)
accuracy, std_dev = scores.mean(), scores.std()
print(accuracy, std_dev, sep="\n")

0.6686967612158454
0.13731481839612333
