In [78]:
# Data handling; show every column when a DataFrame is displayed.
import pandas as pd
pd.set_option('display.max_columns', None)
# Classifiers compared in this notebook.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBRFClassifier
# Evaluation helpers.
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
# NOTE(review): plot_confusion_matrix appears to have been removed in
# scikit-learn 1.2, so this import would fail on newer versions -- confirm the
# pinned version, or switch to ConfusionMatrixDisplay if plotting is needed.
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
# NOTE(review): duplicate of the confusion_matrix import above.
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import warnings
# Project-local helper module; evaluate_model is called from it below.
import classifier_utils
import random
# Suppress UserWarning messages for the rest of the session.
warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Columns passed to classifier_utils.evaluate_model as the "non-feature" set
# (identifiers, locations and commit hashes rather than model inputs).
non_features_columns = [
    # chunk identity and position
    "chunk_id",
    "line_start",
    "line_end",
    "line_separator",
    "kind_conflict",
    "url",
    "project",
    # repository / file identity and commit hashes
    "project_user",
    "project_name",
    "path",
    "file_name",
    "sha",
    "leftsha",
    "rightsha",
    "basesha",
]

In [59]:
# Load the labelled dataset (path is relative to the notebook's directory).
selected_dataset = pd.read_csv("../../data/SELECTED_LABELLED_DATASET.csv")

# Distinct values of the 'project' column, in order of first appearance.
projects = selected_dataset["project"].unique().tolist()

In [90]:
# Reload the local helper module so edits made to classifier_utils while the
# kernel is running are picked up by this cell.
import importlib
importlib.reload(classifier_utils)

# Candidate classifiers, keyed by the label shown in the results table.
# Every model is constructed with random_state=99.
models = {
    'sklearn-decisionTree': DecisionTreeClassifier(min_samples_split=5, random_state=99),
    'sklearn-adaboost': AdaBoostClassifier(n_estimators=100, random_state=99),
    'sklearn-randomForest': RandomForestClassifier(random_state=99),
    'xgBoost-randomForest': XGBRFClassifier(
        random_state=99,
        n_estimators=100,
        subsample=0.9,
        colsample_bynode=0.2,
        eval_metric='mlogloss',
    ),
}

# Evaluate every (project, model) pair, one project per evaluate_model call,
# and collect one record per pair.
records = []
for project in projects:
    for model_name, model in models.items():
        result = classifier_utils.evaluate_model([project], non_features_columns, model)
        # Only one project was passed in, so the metrics are read from the
        # first row of the returned table (looked up once, not per metric).
        metrics = result.iloc[0]
        records.append({
            'project': project,
            'model': model_name,
            'accuracy': metrics['accuracy'],
            'std_dev': metrics['std_dev'],
            'baseline': metrics['baseline (majority)'],
            'improvement': metrics['improvement'],
        })

# `results` is bound exactly once, as a DataFrame; the explicit column list
# fixes the column order and keeps the schema even when `records` is empty.
results = pd.DataFrame(
    records,
    columns=['project', 'model', 'accuracy', 'std_dev', 'baseline', 'improvement'],
)

# Displayed as the cell output: best improvement over the baseline first.
results.sort_values('improvement', ascending=False)

Unnamed: 0,project,model,accuracy,std_dev,baseline,improvement
22,apache/lucene-solr,sklearn-randomForest,0.650,0.045,0.266,144.424
23,apache/lucene-solr,xgBoost-randomForest,0.603,0.047,0.266,126.646
20,apache/lucene-solr,sklearn-decisionTree,0.568,0.049,0.266,113.543
38,hibernate/hibernate-orm,sklearn-randomForest,0.590,0.057,0.311,89.311
18,alexo/wro4j,sklearn-randomForest,0.581,0.029,0.307,89.282
...,...,...,...,...,...,...
49,eucalyptus/eucalyptus,sklearn-adaboost,0.325,0.088,0.423,-23.081
73,apache/accumulo,sklearn-adaboost,0.477,0.191,0.635,-24.876
41,CloudStack-extras/CloudStack-archive,sklearn-adaboost,0.297,0.091,0.437,-32.088
97,jgralab/jgralab,sklearn-adaboost,0.326,0.124,0.491,-33.569


In [91]:
# Single decision tree, with the same hyperparameters as the
# 'sklearn-decisionTree' entry of the model comparison.
dt = DecisionTreeClassifier(
    random_state=99,
    min_samples_split=5,
)

# Evaluate it on all projects in one call; the returned table is the cell output.
classifier_utils.evaluate_model(projects, non_features_columns, dt)

Unnamed: 0,project,observations,observations (without NaN),accuracy,std_dev,baseline (majority),improvement
5,apache__lucene-solr,1256,974,0.568,0.049,0.266,113.543
1,apache__directory-server,845,652,0.923,0.028,0.512,80.234
26,CCI-MIT__XCoLab,5512,3757,0.971,0.013,0.573,69.392
7,getrailo__railo,815,572,0.631,0.08,0.378,67.131
24,jgralab__jgralab,2072,1802,0.819,0.044,0.491,66.666
9,hibernate__hibernate-orm,1000,716,0.503,0.042,0.311,61.487
4,alexo__wro4j,1663,1368,0.495,0.037,0.307,61.172
10,CloudStack-extras__CloudStack-archive,1424,1106,0.7,0.027,0.437,60.245
0,Ramblurr__Anki-Android,892,759,0.681,0.059,0.439,55.231
3,freenet__fred,1268,1012,0.606,0.038,0.404,49.874
