In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns that identify or locate a conflict chunk rather than describe it;
# they are dropped from every dataset before a model is trained.
non_features_columns = [
    # chunk location / bookkeeping
    "chunk_id", "line_start", "line_end", "line_separator", "kind_conflict", "url", "project",
    # repository, file and commit identifiers
    "project_user", "project_name", "path", "file_name", "sha", "leftsha", "rightsha", "basesha",
]

# Decision tree

## Per project

In [3]:
# Evaluate a decision tree separately on each project's training set with
# 10-fold cross-validation. Projects with fewer than 10 complete (NaN-free)
# rows are not evaluated and get a placeholder row with accuracy/std_dev of 0.
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")
projects = selected_dataset['project'].unique()
results = []
for project in projects:
    # Per-project CSV names use "__" where the project id has "/".
    project = project.replace("/", "__")
    project_dataset = f"../../data/projects/{project}-training.csv"
    df = pd.read_csv(project_dataset)
    df_clean = df.dropna()

    if len(df_clean) < 10:
        results.append([project, len(df), len(df_clean), 0, 0])
        continue

    # Target is the developer's resolution; everything that is neither the
    # target nor an identifier column is used as a feature.
    y = df_clean["developerdecision"].copy()
    df_clean = df_clean.drop(columns=['developerdecision'] + non_features_columns)
    features = list(df_clean.columns)
    X = df_clean[features]

    dt = DecisionTreeClassifier(min_samples_split=5, random_state=99)
    scores = cross_val_score(dt, X, y, cv=10)
    accuracy, std_dev = scores.mean(), scores.std()
    results.append([project, len(df), len(df_clean), accuracy, std_dev])

In [4]:
# Tabulate the per-project cross-validation results, highest accuracy first.
accuracy_results = pd.DataFrame(
    results,
    columns=['project', 'observations', 'observations (without NaN)', 'accuracy', 'std_dev'],
)
accuracy_results.sort_values(by='accuracy', ascending=False)

Unnamed: 0,project,observations,observations (without NaN),accuracy,std_dev
25,sebastianbenz__Jnario,8298,8259,0.998789,0.000938
26,CCI-MIT__XCoLab,5512,3747,0.97091,0.008311
11,TeamDev-Ltd__OpenFaces,2979,2854,0.968468,0.010609
15,alkacon__opencms-core,923,837,0.939056,0.015617
16,SINTEF-9012__ThingML,1022,983,0.921697,0.015621
1,apache__directory-server,845,710,0.916901,0.035881
17,cgjones__android-frameworks-base,828,591,0.823927,0.049475
24,jgralab__jgralab,2072,1851,0.815248,0.030368
22,AOKP__frameworks_base_disabled,1999,1396,0.79517,0.034238
28,allwinner-ics__platform_frameworks_base,1780,1230,0.785366,0.018611


## Overall

In [5]:
# Evaluate a decision tree on the combined training set, sweeping
# min_samples_split over 5, 10, ..., 45 with 10-fold cross-validation.
# One row per setting is appended to `results_overall`.
#
# `X`, `y` and `dt` stay in the notebook namespace after this cell and are
# reused by later cells; `dt` ends up as the last classifier built in the
# sweep (min_samples_split=45).
overall_dataset = pd.read_csv("../../data/dataset-training.csv")
df_clean = overall_dataset.dropna()
results_overall = []
if len(df_clean) >= 10:
    y = df_clean["developerdecision"].copy()
    df_clean = df_clean.drop(columns=['developerdecision'])
    df_clean = df_clean.drop(columns=non_features_columns)
    features = list(df_clean.columns)
    X = df_clean[features]
    for splits in range(5, 50, 5):
        dt = DecisionTreeClassifier(min_samples_split=splits, random_state=99)
        scores = cross_val_score(dt, X, y, cv=10)
        accuracy = scores.mean()
        std_dev = scores.std()
        results_overall.append(['Overall', len(overall_dataset), len(df_clean), splits, accuracy, std_dev])
else:
    # Too few complete rows to cross-validate: record a placeholder row.
    # `splits` is only bound inside the loop above, so referencing it here
    # raised NameError; no split setting was evaluated, hence None.
    results_overall.append(['Overall', len(overall_dataset), len(df_clean), None, 0, 0])

In [6]:
# Tabulate the min_samples_split sweep on the combined dataset, highest accuracy first.
accuracy_results = pd.DataFrame(
    results_overall,
    columns=['project', 'observations', 'observations (without NaN)', 'splits', 'accuracy', 'std_dev'],
)
accuracy_results.sort_values(by='accuracy', ascending=False)

Unnamed: 0,project,observations,observations (without NaN),splits,accuracy,std_dev
0,Overall,56241,34833,5,0.700352,0.248087
1,Overall,56241,34833,10,0.699604,0.242961
3,Overall,56241,34833,20,0.696877,0.23742
2,Overall,56241,34833,15,0.696619,0.240708
4,Overall,56241,34833,25,0.692915,0.237948
5,Overall,56241,34833,30,0.688436,0.239959
6,Overall,56241,34833,35,0.686196,0.238314
7,Overall,56241,34833,40,0.680828,0.23876
8,Overall,56241,34833,45,0.678244,0.238925


In [7]:
# Class distribution of the target among rows that have no missing values.
clean_df = overall_dataset.dropna(axis=0, how='any')
clean_df.loc[:, 'developerdecision'].value_counts()

Version 1            24190
Version 2             4426
Manual                3141
Combination           2142
ConcatenationV1V2      680
ConcatenationV2V1      183
None                    71
Name: developerdecision, dtype: int64

In [8]:
# Out-of-fold predictions from 10-fold cross-validation, shown as a confusion
# matrix (rows: true label, columns: predicted label), with classes ordered by
# first appearance in y.
# NOTE(review): `dt` is whichever classifier an earlier cell assigned last --
# confirm that this is the model the matrix is meant to describe.
y_pred = cross_val_predict(dt, X, y, cv=10)
class_labels = y.unique()
conf_mat = confusion_matrix(y, y_pred, labels=class_labels)
display(pd.DataFrame(conf_mat, index=class_labels, columns=class_labels))

Unnamed: 0,Version 1,Version 2,Manual,Combination,ConcatenationV2V1,ConcatenationV1V2,None
Version 1,18756,2333,1994,865,16,220,6
Version 2,1068,2381,334,505,16,118,4
Manual,643,378,1479,534,13,92,2
Combination,498,344,455,750,9,85,1
ConcatenationV2V1,42,15,29,28,46,23,0
ConcatenationV1V2,133,89,97,133,14,213,1
,23,12,10,16,1,9,0


# Random forest

In [9]:
# Random forest (default hyperparameters, fixed seed) scored with 10-fold
# cross-validation on the X / y left in the namespace by the earlier cells.
clf = RandomForestClassifier(random_state=99)
scores = cross_val_score(clf, X, y, cv=10)
accuracy, std_dev = scores.mean(), scores.std()
print(accuracy, std_dev, sep="\n")

0.7150199113753436
0.25028594569801765


# AdaBoost

In [10]:
# AdaBoost with 100 estimators (fixed seed) scored with 10-fold
# cross-validation on the X / y left in the namespace by the earlier cells.
clf = AdaBoostClassifier(n_estimators=100, random_state=99)
scores = cross_val_score(clf, X, y, cv=10)
accuracy, std_dev = scores.mean(), scores.std()
print(accuracy, std_dev, sep="\n")

0.6320617725656484
0.1697651099464596
