In [6]:
# Data Load 
import pandas as pd
import os
import traceback

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def prepare_and_split_data(commits_df:pd.DataFrame ):
    #removed zero valued columns
    commits_df = commits_df.loc[:, (commits_df != 0).any(axis=0)]
    #split data
    X = commits_df.drop(['LABEL'], axis = 1)
    y = commits_df['LABEL']
    training, testing, training_labels, testing_labels = train_test_split(X, y, test_size = .25, random_state = 42)
    # return training, training_labels, testing, testing_labels
    # Normalize the data
    sc = StandardScaler()
    normed_train_data = pd.DataFrame(sc.fit_transform(training), columns = X.columns)
    normed_test_data = pd.DataFrame(sc.fit_transform(testing), columns = X.columns)

    return normed_train_data,training_labels,normed_test_data,testing_labels

In [8]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,f1_score,roc_auc_score,precision_score,recall_score,balanced_accuracy_score

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "Logistic Regression",
    "Ridge Classifier",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear"),
    SVC(),
    GaussianProcessClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression(),
    RidgeClassifier(),
]


In [9]:
def generate_all_clf_perf(normed_train_data,training_labels,normed_test_data,testing_labels):
    classifiers_perf = []
    for i, clf in enumerate(classifiers):
        try:
            clf.fit(normed_train_data, training_labels)
            preds = clf.predict(normed_test_data)
            
            print(f'\t\t {clf} accuracy = {accuracy_score(testing_labels,preds)}')
            #print(classification_report(testing_labels,preds))
            #print(confusion_matrix(testing_labels, preds))
            classifiers_perf.append({
                "classifier":names[i],
                "accuracy_score":accuracy_score(testing_labels,preds),
                "balanced_accuracy_score":balanced_accuracy_score(testing_labels,preds),
                "f1_score":f1_score(testing_labels,preds),
                "roc_auc_score":roc_auc_score(testing_labels,preds),
                "precision_score":precision_score(testing_labels,preds),
                "recall_score":recall_score(testing_labels,preds),
                })
        
        except Exception as e:
            print(f'\tProcessing Classifer {names[i]} failed with error {e}')
            traceback.print_exc()
    return classifiers_perf

In [10]:
import numpy as np

processed_files_path = 'processed_data/'
processed_files = os.listdir(processed_files_path)

for p_file in processed_files:
    try:
        print(f'Processing {p_file} started ...')
        commits_df = pd.read_csv (processed_files_path+p_file)
        print(f'\tData Shape : {commits_df.shape}')
        print(f'\tAvailable cols : {commits_df.columns}')
        #print(f'Available cols : {commits_df.describe()}')
        notna=np.all(commits_df.notna())
        if not notna:
            print(f"\tDataset has Nan in following columns :")
            for f in commits_df.columns:
                if not np.all(commits_df[f].notna()):
                    print(f"\t\t\t{f} ")
            continue
                
        x_train,y_train,x_test,y_test  = prepare_and_split_data(commits_df)
        classifiers_perf = generate_all_clf_perf(x_train,y_train,x_test,y_test)
        pd.DataFrame(classifiers_perf).to_csv( f'ml_perf_report/{p_file}',index=False)
        print(f'Processing {p_file} successfully complete!\n')
    except Exception as e:
        print(f'Processing {p_file} failed with error {e}')
        traceback.print_exc()
    

Processing dropwizard_dropwizard.csv started ...
	Data Shape : (5150, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_bot', 'CM_com', 'CM_dependabot', 'CM_github', 'CM_https',
       'CM_maven', 'CM_merge', 'CM_pull', 'CM_release', 'CM_request'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.9728260869565217
		 SVC(kernel='linear') accuracy = 0.9751552795031055


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 SVC() accuracy = 0.9751552795031055
		 GaussianProcessClassifier() accuracy = 0.9743788819875776
		 DecisionTreeClassifier() accuracy = 0.968167701863354
		 RandomForestClassifier() accuracy = 0.9720496894409938
		 MLPClassifier() accuracy = 0.9751552795031055
		 AdaBoostClassifier() accuracy = 0.9751552795031055
		 GaussianNB() accuracy = 0.9751552795031055
		 QuadraticDiscriminantAnalysis() accuracy = 0.9751552795031055


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 LogisticRegression() accuracy = 0.9751552795031055
		 RidgeClassifier() accuracy = 0.9751552795031055
Processing dropwizard_dropwizard.csv successfully complete!

Processing eBay_parallec.csv started ...
	Data Shape : (146, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_add', 'CM_cn', 'CM_doc', 'CM_fix', 'CM_merge', 'CM_pull',
       'CM_readme', 'CM_request', 'CM_test', 'CM_update'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.972972972972973
		 SVC(kernel='linear') accuracy = 1.0
		 SVC() accuracy = 1.0
		 GaussianProcessClassifier() accuracy = 0.972972972972973
		 DecisionTreeClassifier() accuracy = 0.9459459459459459
		 RandomForestClassifier() accuracy = 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Traceback (most recent call last):
  File "C:\Users\debon\AppData\Local\Temp\ipykernel_20108\1945774016.py", line 21, in <module>
    x_train,y_train,x_test,y_test  = prepare_and_split_data(commits_df)
  File "C:\Users\debon\AppData\Local\Temp\ipykernel_20108\2754141126.py", line 8, in prepare_and_split_data
    X = commits_df.drop(['LABEL'], axis = 1)
  File "c:\Users\debon\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py", line 331, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\debon\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\core\frame.py", line 5388, in drop
    return super().drop(
  File "c:\Users\debon\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py", lin

		 MLPClassifier() accuracy = 1.0
		 AdaBoostClassifier() accuracy = 0.9459459459459459
		 GaussianNB() accuracy = 0.5675675675675675
		 QuadraticDiscriminantAnalysis() accuracy = 0.5675675675675675
		 LogisticRegression() accuracy = 1.0
		 RidgeClassifier() accuracy = 1.0
Processing eBay_parallec.csv successfully complete!

Processing gradle_tooling-commons.csv started ...
	Data Shape : (893, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_add', 'CM_api', 'CM_build', 'CM_fix', 'CM_gradle',
       'CM_review', 'CM_test', 'CM_toolingcommons', 'CM_use', 'CM_version'],
      dtype='object')
Processing gradle_tooling-commons.csv failed with error "['LABEL'] not found in axis"
Processing GrammarViz2_grammarviz2_src.csv started ...
	Data Shape : (446, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 0.8839285714285714
		 AdaBoostClassifier() accuracy = 0.8214285714285714
		 GaussianNB() accuracy = 0.9107142857142857
		 QuadraticDiscriminantAnalysis() accuracy = 0.9107142857142857
		 LogisticRegression() accuracy = 0.875
		 RidgeClassifier() accuracy = 0.875
Processing GrammarViz2_grammarviz2_src.csv successfully complete!

Processing jMotif_GI.csv started ...
	Data Shape : (383, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_adding', 'CM_fixing', 'CM_gi', 'CM_jmotif', 'CM_maven',
       'CM_next', 'CM_plugin', 'CM_prepare', 'CM_release', 'CM_working'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.875
		 SVC(kernel='linear') accuracy = 0.9375
		 SVC() accuracy = 0.875
		 GaussianProcessClassifier() accuracy = 0.96875
		 DecisionTreeClassifier() accuracy = 0.7916666666666666
		 Ra

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 0.96875
		 AdaBoostClassifier() accuracy = 0.8645833333333334
		 GaussianNB() accuracy = 0.875
		 QuadraticDiscriminantAnalysis() accuracy = 0.875
		 LogisticRegression() accuracy = 0.96875
		 RidgeClassifier() accuracy = 0.8645833333333334
Processing jMotif_GI.csv successfully complete!

Processing jMotif_SAX.csv started ...
	Data Shape : (670, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_adding', 'CM_jmotif', 'CM_maven', 'CM_plugin',
       'CM_prepare', 'CM_readme', 'CM_release', 'CM_sax', 'CM_update',
       'CM_updating'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.9583333333333334
		 SVC(kernel='linear') accuracy = 0.9523809523809523
		 SVC() accuracy = 0.9523809523809523
		 GaussianProcessClassifier() accuracy = 0.9583333333333334
		 DecisionTreeClassifier() accuracy = 0.9

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 0.9523809523809523
		 AdaBoostClassifier() accuracy = 0.8690476190476191
		 GaussianNB() accuracy = 0.8690476190476191
		 QuadraticDiscriminantAnalysis() accuracy = 0.8690476190476191
		 LogisticRegression() accuracy = 0.9523809523809523
		 RidgeClassifier() accuracy = 0.9523809523809523
Processing jMotif_SAX.csv successfully complete!

Processing ksclarke_solr-iso639-filter.csv started ...
	Data Shape : (421, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_filter', 'CM_iso639', 'CM_maven', 'CM_plugin', 'CM_pom',
       'CM_prepare', 'CM_release', 'CM_script', 'CM_solr', 'CM_version'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.7735849056603774
		 SVC(kernel='linear') accuracy = 0.8207547169811321
		 SVC() accuracy = 0.8207547169811321
		 GaussianProcessClassifier() accuracy = 0.830

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 0.839622641509434
		 AdaBoostClassifier() accuracy = 0.5566037735849056
		 GaussianNB() accuracy = 0.5943396226415094
		 QuadraticDiscriminantAnalysis() accuracy = 0.5943396226415094
		 LogisticRegression() accuracy = 0.8018867924528302
		 RidgeClassifier() accuracy = 0.8018867924528302
Processing ksclarke_solr-iso639-filter.csv successfully complete!

Processing mtsar_mtsar.csv started ...
	Data Shape : (399, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_add', 'CM_csv', 'CM_dependencies', 'CM_dropwizard',
       'CM_fix', 'CM_make', 'CM_mtsar', 'CM_process', 'CM_update', 'CM_use'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.67
		 SVC(kernel='linear') accuracy = 0.64
		 SVC() accuracy = 0.69
		 GaussianProcessClassifier() accuracy = 0.67
		 DecisionTreeClassifier() accuracy = 0.66

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 0.64
		 AdaBoostClassifier() accuracy = 0.61
		 GaussianNB() accuracy = 0.3
		 QuadraticDiscriminantAnalysis() accuracy = 0.67
		 LogisticRegression() accuracy = 0.67
		 RidgeClassifier() accuracy = 0.65
Processing mtsar_mtsar.csv successfully complete!

Processing steve-community_steve.csv started ...
	Data Shape : (1721, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_bump', 'CM_com', 'CM_dependabot', 'CM_flyway', 'CM_github',
       'CM_https', 'CM_maven', 'CM_release', 'CM_spring', 'CM_version'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.9443155452436195
		 SVC(kernel='linear') accuracy = 0.951276102088167
		 SVC() accuracy = 0.951276102088167
		 GaussianProcessClassifier() accuracy = 0.9489559164733179
		 DecisionTreeClassifier() accuracy = 0.9118329466357309
		 RandomForestCl

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 0.9443155452436195
		 AdaBoostClassifier() accuracy = 0.951276102088167
		 GaussianNB() accuracy = 0.951276102088167
		 QuadraticDiscriminantAnalysis() accuracy = 0.951276102088167
		 LogisticRegression() accuracy = 0.951276102088167
		 RidgeClassifier() accuracy = 0.951276102088167
Processing steve-community_steve.csv successfully complete!

Processing tracee_contextlogger.csv started ...
	Data Shape : (370, 23)
	Available cols : Index(['DIFF_ND', 'DIFF_NF', 'DIFF_EN', 'SIZE_LA', 'SIZE_LD', 'PURP_FIX',
       'PURP_MR', 'SKIP_DOC', 'SKIP_MET', 'SKIP_COM', 'SKIP_FRM', 'SKIP_BLD',
       'LABEL', 'CM_added', 'CM_context', 'CM_java', 'CM_logger', 'CM_md',
       'CM_merge', 'CM_readme', 'CM_test', 'CM_tracee', 'CM_update'],
      dtype='object')
		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.956989247311828
		 SVC(kernel='linear') accuracy = 0.9247311827956989
		 SVC() accuracy = 0.956989247311828
		 GaussianProcessClassifier() accuracy = 0.935483870967

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Traceback (most recent call last):
  File "C:\Users\debon\AppData\Local\Temp\ipykernel_20108\1945774016.py", line 9, in <module>
    commits_df = pd.read_csv (processed_files_path+p_file)
  File "c:\Users\debon\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\debon\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util\_decorators.py", line 331, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\debon\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py", line 950, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "c:\Users\debon\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\parsers\readers.py", line 605, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "c:

		 GaussianProcessClassifier() accuracy = 0.925
		 DecisionTreeClassifier() accuracy = 0.8916666666666667
		 RandomForestClassifier() accuracy = 0.9375


  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 0.9375
		 AdaBoostClassifier() accuracy = 0.925
		 GaussianNB() accuracy = 0.9375
		 QuadraticDiscriminantAnalysis() accuracy = 0.9375
		 LogisticRegression() accuracy = 0.9333333333333333
		 RidgeClassifier() accuracy = 0.9375
Processing zixpo_candybar.csv successfully complete!



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
