In [1]:
# Data Load 
import pandas as pd
import os

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def prepare_and_split_data(commits_df:pd.DataFrame ):
    #removed zero valued columns
    commits_df = commits_df.loc[:, (commits_df != 0).any(axis=0)]
    #split data
    X = commits_df.drop(['LABEL'], axis = 1)
    y = commits_df['LABEL']
    training, testing, training_labels, testing_labels = train_test_split(X, y, test_size = .25, random_state = 42)
    
    # Normalize the data
    sc = StandardScaler()
    normed_train_data = pd.DataFrame(sc.fit_transform(training), columns = X.columns)
    normed_test_data = pd.DataFrame(sc.fit_transform(testing), columns = X.columns)

    return normed_train_data,training_labels,normed_test_data,testing_labels

In [3]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,f1_score,roc_auc_score,precision_score,recall_score,balanced_accuracy_score

names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear"),
    SVC(),
    GaussianProcessClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    MLPClassifier(),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
]


In [4]:
def generate_all_clf_perf(normed_train_data,training_labels,normed_test_data,testing_labels):
    classifiers_perf = []
    for i, clf in enumerate(classifiers):
        try:
            clf.fit(normed_train_data, training_labels)
            preds = clf.predict(normed_test_data)
            
            print(f'\t\t {clf} accuracy = {accuracy_score(testing_labels,preds)}')
            #print(classification_report(testing_labels,preds))
            #print(confusion_matrix(testing_labels, preds))
            classifiers_perf.append({
                "classifier":names[i],
                "accuracy_score":accuracy_score(testing_labels,preds),
                "balanced_accuracy_score":balanced_accuracy_score(testing_labels,preds),
                "f1_score":f1_score(testing_labels,preds),
                "roc_auc_score":roc_auc_score(testing_labels,preds),
                "precision_score":precision_score(testing_labels,preds),
                "recall_score":recall_score(testing_labels,preds),
                })
        
        except Exception as e:
            print(f'\tProcessing Classifer {names[i]} failed with error {e}')
    return classifiers_perf

In [5]:

processed_files_path = 'processed_data/'
processed_files = os.listdir(processed_files_path)

for p_file in processed_files:
    try:
        print(f'Processing {p_file} started ...', end='')
        commits_df = pd.read_csv (processed_files_path+p_file)
        x_train,y_train,x_test,y_test  = prepare_and_split_data(commits_df)
        classifiers_perf = generate_all_clf_perf(x_train,y_train,x_test,y_test)
        pd.DataFrame(classifiers_perf).to_csv( f'ml_perf_report/{p_file}',index=False)
        print(f'Processing {p_file} successfully complete!\n')
    except Exception as e:
        print(f'Processing {p_file} failed with error {e}')
    

Processing eBay_parallec.csv started ...		 KNeighborsClassifier(n_neighbors=3) accuracy = 0.972972972972973
		 SVC(kernel='linear') accuracy = 1.0
		 SVC() accuracy = 1.0
		 GaussianProcessClassifier() accuracy = 0.972972972972973
		 DecisionTreeClassifier() accuracy = 1.0
		 RandomForestClassifier() accuracy = 1.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


		 MLPClassifier() accuracy = 1.0
		 AdaBoostClassifier() accuracy = 0.9459459459459459
		 GaussianNB() accuracy = 0.5675675675675675
		 QuadraticDiscriminantAnalysis() accuracy = 0.5675675675675675
Processing eBay_parallec.csv successfully complete!

Processing gradle_tooling-commons.csv started ...Processing gradle_tooling-commons.csv failed with error "['LABEL'] not found in axis"
Processing GrammarViz2_grammarviz2_src.csv started ...	Processing Classifer Nearest Neighbors failed with error Input X contains NaN.
KNeighborsClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all e



	Processing Classifer Neural Net failed with error Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
	Processing Classifer AdaBoost failed with error Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively.



	Processing Classifer Neural Net failed with error Input X contains NaN.
MLPClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
	Processing Classifer AdaBoost failed with error Input X contains NaN.
AdaBoostClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively.

