In [27]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
import time
import re

import warnings
warnings.filterwarnings('ignore')

In [28]:
def read_csv_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : path or file-like object
        Forwarded unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(filename)

In [29]:
def apply_PCA(train, test, columns_name):
    """Standardize both splits and project them onto the leading principal components.

    The scaler and the PCA are fitted on ``train`` only and then applied to
    both ``train`` and ``test``. The loading matrix (one row per retained
    component, one column per original feature) is printed as a side effect.

    Parameters
    ----------
    train, test : array-like
        Feature matrices sharing the same columns.
    columns_name : list of str
        Original feature names, used as column headers of the printed
        loading table; its length must equal the number of features.

    Returns
    -------
    tuple
        ``(train, test)`` after scaling and PCA projection.
    """
    # PCA is affected by feature scale, so standardize first.
    # Fit on the training split only, then transform both splits.
    scaler = StandardScaler()
    scaler.fit(train)
    train = scaler.transform(train)
    test = scaler.transform(test)

    # A float in (0, 1) asks PCA for the minimum number of components that
    # retains that share of the variance (here 95%). Fit on train only.
    pca = PCA(0.95)
    pca.fit(train)

    train = pca.transform(train)
    test = pca.transform(test)

    # The number of retained components is data-dependent, so build the row
    # labels from the fitted model instead of assuming exactly five.
    component_labels = ['PC-{}'.format(rank + 1) for rank in range(pca.components_.shape[0])]
    print (pd.DataFrame(pca.components_, columns=columns_name, index=component_labels))

    return train, test

In [30]:
def perfrom_CART(train, test, train_label, test_label):
    """Fit a CART decision tree on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(recall, precision, f1, fit_time, predict_time)``. The three scores
        are whatever ``measure_performance`` returns for the test
        predictions; the two timings are ``time.perf_counter`` deltas (in
        seconds) around ``fit`` and ``predict`` respectively.
    """
    model = tree.DecisionTreeClassifier(
        criterion='gini',
        splitter='best',
        min_samples_split=2,
        min_weight_fraction_leaf=0.0,
    )

    # Time training only.
    tick = time.perf_counter()
    model.fit(train, train_label)
    fit_time = time.perf_counter() - tick

    # Time inference only.
    tick = time.perf_counter()
    guesses = model.predict(test)
    predict_time = time.perf_counter() - tick

    recall, precision, f1 = measure_performance(test_label, guesses)
    return recall, precision, f1, fit_time, predict_time

In [31]:
def perfrom_KNN(train, test, train_label, test_label):
    """Fit a distance-weighted 5-nearest-neighbours classifier and score it on the test split.

    Returns
    -------
    tuple
        ``(recall, precision, f1, fit_time, predict_time)``. The three scores
        are whatever ``measure_performance`` returns for the test
        predictions; the two timings are ``time.perf_counter`` deltas (in
        seconds) around ``fit`` and ``predict`` respectively.
    """
    model = KNeighborsClassifier(weights='distance', n_neighbors=5)

    # Time training only.
    tick = time.perf_counter()
    model.fit(train, train_label)
    fit_time = time.perf_counter() - tick

    # Time inference only.
    tick = time.perf_counter()
    guesses = model.predict(test)
    predict_time = time.perf_counter() - tick

    recall, precision, f1 = measure_performance(test_label, guesses)
    return recall, precision, f1, fit_time, predict_time

In [32]:
def perfrom_SVM(train, test, train_label, test_label):
    """Fit an RBF-kernel SVC (``C=20.0``, ``gamma='auto'``) and score it on the test split.

    Returns
    -------
    tuple
        ``(recall, precision, f1, fit_time, predict_time)``. The three scores
        are whatever ``measure_performance`` returns for the test
        predictions; the two timings are ``time.perf_counter`` deltas (in
        seconds) around ``fit`` and ``predict`` respectively.
    """
    model = svm.SVC(kernel='rbf', C=20.0, gamma='auto')

    # Time training only.
    tick = time.perf_counter()
    model.fit(train, train_label)
    fit_time = time.perf_counter() - tick

    # Time inference only.
    tick = time.perf_counter()
    guesses = model.predict(test)
    predict_time = time.perf_counter() - tick

    recall, precision, f1 = measure_performance(test_label, guesses)
    return recall, precision, f1, fit_time, predict_time

In [33]:
def perfrom_NB(train, test, train_label, test_label):
    """Fit a Gaussian naive Bayes classifier and score it on the test split.

    Returns
    -------
    tuple
        ``(recall, precision, f1, fit_time, predict_time)``. The three scores
        are whatever ``measure_performance`` returns for the test
        predictions; the two timings are ``time.perf_counter`` deltas (in
        seconds) around ``fit`` and ``predict`` respectively.
    """
    model = GaussianNB()

    # Time training only.
    tick = time.perf_counter()
    model.fit(train, train_label)
    fit_time = time.perf_counter() - tick

    # Time inference only.
    tick = time.perf_counter()
    guesses = model.predict(test)
    predict_time = time.perf_counter() - tick

    recall, precision, f1 = measure_performance(test_label, guesses)
    return recall, precision, f1, fit_time, predict_time

In [34]:
def perfrom_RF(train, test, train_label, test_label):
    """Fit a 10-tree Gini random forest and score it on the test split.

    No ``random_state`` is passed to the forest, so results can differ
    between calls.

    Returns
    -------
    tuple
        ``(recall, precision, f1, fit_time, predict_time)``. The three scores
        are whatever ``measure_performance`` returns for the test
        predictions; the two timings are ``time.perf_counter`` deltas (in
        seconds) around ``fit`` and ``predict`` respectively.
    """
    model = RandomForestClassifier(criterion='gini', n_estimators=10)

    # Time training only.
    tick = time.perf_counter()
    model.fit(train, train_label)
    fit_time = time.perf_counter() - tick

    # Time inference only.
    tick = time.perf_counter()
    guesses = model.predict(test)
    predict_time = time.perf_counter() - tick

    recall, precision, f1 = measure_performance(test_label, guesses)
    return recall, precision, f1, fit_time, predict_time

In [35]:
def measure_performance(true_label, predicted_label):
    """Compute per-class recall, precision and F1 for the labels 0 and 1.

    Parameters
    ----------
    true_label : array-like
        Ground-truth class labels.
    predicted_label : array-like
        Labels predicted by a classifier, aligned with ``true_label``.

    Returns
    -------
    tuple
        ``(recall, precision, f1)`` -- note the order. Each item is the
        array returned by the corresponding scikit-learn scorer with
        ``average=None`` and ``labels=[0, 1]``, i.e. one score per class
        (position 0 for label 0, position 1 for label 1).
    """
    # Fixing the label list keeps the output shape stable even when a fold
    # happens to contain only one of the two classes.
    class_labels = [0, 1]
    precision = precision_score(true_label, predicted_label, average=None, labels=class_labels)
    recall = recall_score(true_label, predicted_label, average=None, labels=class_labels)
    f1 = f1_score(true_label, predicted_label, average=None, labels=class_labels)
    return recall, precision, f1

In [36]:
def kfold_cv(data, true_label, columns_name, n_splits=10, random_state=7):
    """Run shuffled k-fold cross-validation of five classifiers on PCA-reduced data.

    For every fold the train/test split is passed through ``apply_PCA`` and
    then evaluated, in this order, with ``perfrom_CART``, ``perfrom_KNN``,
    ``perfrom_SVM``, ``perfrom_NB`` and ``perfrom_RF``. Per-fold results are
    averaged with ``np.mean`` (``axis=0`` for recall/precision/F1, plain
    mean for the two timings).

    Parameters
    ----------
    data : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    true_label : array-like
        Class labels aligned positionally with the rows of ``data``.
    columns_name : list of str
        Feature names, forwarded to ``apply_PCA``.
    n_splits : int, default 10
        Number of folds.
    random_state : int, default 7
        Seed used by ``KFold`` when shuffling.

    Returns
    -------
    tuple
        25 values laid out as five consecutive groups of
        ``(recall, precision, f1, fit_time, predict_time)`` in the order
        DT, KNN, SVM, NB, RF.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # KFold yields positional indices. Converting to an ndarray guarantees
    # positional selection even if the caller passes a pandas Series whose
    # index is not 0..n-1 (or a plain list).
    labels = np.asarray(true_label)

    # The order here fixes both the evaluation order inside a fold and the
    # layout of the returned tuple -- do not reorder.
    evaluators = (
        ('DT', perfrom_CART),
        ('KNN', perfrom_KNN),
        ('SVM', perfrom_SVM),
        ('NB', perfrom_NB),
        ('RF', perfrom_RF),
    )
    metric_names = ('recall', 'precision', 'f1', 'fit_time', 'predict_time')
    per_fold = {tag: {metric: [] for metric in metric_names} for tag, _ in evaluators}

    for train_index, test_index in splitter.split(data):
        train, test = data.iloc[train_index], data.iloc[test_index]
        train_label, test_label = labels[train_index], labels[test_index]

        train, test = apply_PCA(train, test, columns_name)

        for tag, evaluate in evaluators:
            recall, precision, f1, fit_time, predict_time = evaluate(train, test, train_label, test_label)
            fold_values = (recall, precision, f1, fit_time, predict_time)
            for metric, value in zip(metric_names, fold_values):
                per_fold[tag][metric].append(value)

    averaged = []
    for tag, _ in evaluators:
        collected = per_fold[tag]
        # Scores are per-class arrays, so average across folds (axis 0);
        # timings are scalars.
        averaged.append(np.mean(collected['recall'], axis=0))
        averaged.append(np.mean(collected['precision'], axis=0))
        averaged.append(np.mean(collected['f1'], axis=0))
        averaged.append(np.mean(collected['fit_time']))
        averaged.append(np.mean(collected['predict_time']))

    return tuple(averaged)

In [37]:
def repeated_test(data, true_label, columns_name, repeat=10):
    """Run ``kfold_cv`` several times and print the median of each metric per model.

    Parameters
    ----------
    data, true_label, columns_name
        Forwarded unchanged to ``kfold_cv``.
    repeat : int, default 10
        How many times ``kfold_cv`` is invoked.

    Returns
    -------
    None
        Results are printed: for each of DT, KNN, SVM, NB and RF the median
        across repetitions of recall, precision and F1 (``axis=0``) and of
        the fit and predict times.

    Raises
    ------
    ValueError
        If ``kfold_cv`` does not return exactly 25 values.
    """
    # NOTE(review): kfold_cv seeds its KFold shuffle with a fixed default, so
    # every repetition appears to see the same folds; run-to-run variation
    # would then come only from the unseeded classifiers -- confirm intent.
    model_tags = ('DT', 'KNN', 'SVM', 'NB', 'RF')
    metric_names = ('recall', 'precision', 'f1', 'fit_time', 'predict_time')
    group_size = len(metric_names)
    history = {tag: {metric: [] for metric in metric_names} for tag in model_tags}

    for _ in range(repeat):
        flat = kfold_cv(data, true_label, columns_name)
        if len(flat) != group_size * len(model_tags):
            raise ValueError(
                "kfold_cv returned {} values, expected {}".format(len(flat), group_size * len(model_tags))
            )
        # kfold_cv returns five consecutive groups of five values, one group
        # per model in the order given by model_tags.
        for position, tag in enumerate(model_tags):
            group = flat[position * group_size:(position + 1) * group_size]
            for metric, value in zip(metric_names, group):
                history[tag][metric].append(value)

    for tag in model_tags:
        runs = history[tag]
        print("-------{}-------".format(tag))
        print("Recall:", np.median(runs['recall'], axis= 0))
        print("Precision:", np.median(runs['precision'], axis= 0))
        print("f1 score:", np.median(runs['f1'], axis= 0))
        print("Fit time:", np.median(runs['fit_time']))
        print("Predict time:", np.median(runs['predict_time']))

In [38]:
def calcFeatureImp(feature_vec, label_vec, feature_names_param, repeat=10):
    """Fit a random forest and print the features ranked by importance.

    Parameters
    ----------
    feature_vec : array-like
        Feature matrix used to fit the forest.
    label_vec : array-like
        Class labels aligned with ``feature_vec``.
    feature_names_param : sequence of str
        Names of the columns of ``feature_vec``, in column order.
    repeat : int, default 10
        Accepted for backward compatibility and otherwise ignored. It
        previously only controlled how many times the same importance row
        was copied into a debug string that was never printed or returned.

    Returns
    -------
    None
        Prints the feature names and their importances, both sorted from
        most to least important.
    """
    theRndForestModel = RandomForestClassifier()
    theRndForestModel.fit(feature_vec, label_vec)
    feat_imp_vector = list(theRndForestModel.feature_importances_)

    # Rank column positions rather than looking values up with list.index():
    # index() always returns the first match, so tied importances used to
    # repeat one feature name and silently drop the others. sorted() is
    # stable, so ties keep their original column order.
    ranked_positions = sorted(
        range(len(feat_imp_vector)),
        key=lambda position: feat_imp_vector[position],
        reverse=True,
    )
    sorted_feat_imp_vector = [feat_imp_vector[position] for position in ranked_positions]
    sorted_feature_name = [feature_names_param[position] for position in ranked_positions]

    print ("sorted feature names: ", sorted_feature_name)
    print ("sorted feature importance: ", sorted_feat_imp_vector)

In [None]:
# Load the process-level and code-level metric tables.
process_data = pd.read_csv('..//FINAL_PROCESS_METRICS.csv') 
print("Initial process data shape: ", process_data.shape)
code_data = pd.read_csv('..//FINAL_CODE_METRICS.csv') 
print("Initial code data shape: ", code_data.shape)

# Keep only the part of each path that follows the first 'V5/' so both tables
# share one key format. Whitespace is stripped here, on the stored column,
# so the intersection below and the row filter agree on the same strings.
# A path without 'V5/' raises IndexError, as before.
process_data['file_'] = [re.split('V5/', item)[1].strip() for item in process_data['file_']]
code_data['FILE_PATH'] = [re.split('V5/', item)[1].strip() for item in code_data['FILE_PATH']]

formatted_process_file_name = set(process_data['file_'])
formatted_code_file_name = set(code_data['FILE_PATH'])

# Files present in both tables; empty names are ignored.
common_files = {name for name in formatted_process_file_name & formatted_code_file_name if name}

# Select the first process row for every common file in one vectorized pass.
# Rows keep the order of the CSV, so the result (and therefore the CV folds)
# is reproducible across interpreter runs, unlike iterating over a set.
common_rows = (
    process_data[process_data['file_'].isin(common_files)]
    .drop_duplicates(subset='file_', keep='first')
    .reset_index(drop=True)
)

true_label = np.array(common_rows['defect_status'])

data = common_rows.drop(columns=['org', 'file_', 'FILE_PATH', 'MT_PP', 'MT_NON_PP', 'defect_status'])

columns_name = list(data.columns.values) # read all column names
calcFeatureImp(data, true_label, columns_name) # find feature importance

repeated_test(data, true_label, columns_name) # repeat k-fold 10 times and report median performance

Initial process data shape:  (6477, 12)
Initial code data shape:  (6396, 10)
sorted feature names:  ['AVGCHNG', 'SCTR', 'COMM', 'OWN', 'DEV', 'MINOR']
sorted feature importance:  [0.41659168547968917, 0.38569896988609637, 0.08778956745997522, 0.08492126916772946, 0.014903482830879637, 0.010095025175630119]
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.429279  0.594391 -0.013228  0.508089 -0.418105 -0.171107
PC-2  0.195399  0.105998  0.256950  0.325938  0.422262  0.774610
PC-3 -0.021551 -0.031417  0.964178 -0.110793 -0.159742 -0.176399
PC-4  0.781161 -0.232138 -0.050534 -0.504961 -0.213910  0.180562
PC-5  0.394456 -0.036573  0.040096  0.119424  0.722789 -0.552062
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.425281  0.595204 -0.008288  0.518066 -0.412317 -0.162489
PC-2  0.198313  0.100322  0.234813  0.316548  0.435760  0.778062
PC-3 -0.016334 -0.031599  0.969736 -0.103948 -0.150318 -0.157944
PC-4  0.836005 -0.252352 -0.047577 -0.4625

          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.425281  0.595204 -0.008288  0.518066 -0.412317 -0.162489
PC-2  0.198313  0.100322  0.234813  0.316548  0.435760  0.778062
PC-3 -0.016334 -0.031599  0.969736 -0.103948 -0.150318 -0.157944
PC-4  0.836005 -0.252352 -0.047577 -0.462532 -0.116599  0.087294
PC-5  0.272071 -0.004105  0.046290  0.192754  0.743612 -0.577672
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.425737  0.596505 -0.007542  0.519319 -0.405968 -0.168435
PC-2  0.211639  0.105215  0.200758  0.315357  0.455727  0.772462
PC-3 -0.013185 -0.028605  0.977740 -0.089083 -0.133407 -0.131526
PC-4  0.824084 -0.245601 -0.045951 -0.476232 -0.146748  0.100610
PC-5  0.293099 -0.011670  0.039392  0.158408  0.736029 -0.587855
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.423321  0.595615 -0.010076  0.518426 -0.413014 -0.163075
PC-2  0.200228  0.105406  0.215697  0.315555  0.439887  0.780511
PC-3 -0.028102 -0.024183 

          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.423321  0.595615 -0.010076  0.518426 -0.413014 -0.163075
PC-2  0.200228  0.105406  0.215697  0.315555  0.439887  0.780511
PC-3 -0.028102 -0.024183  0.974763 -0.088440 -0.141949 -0.143148
PC-4  0.850166 -0.245319 -0.026633 -0.456575 -0.074097  0.048743
PC-5  0.221625  0.010985  0.050009  0.220183  0.748320 -0.582921
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.419695  0.596997 -0.007397  0.513992 -0.415971 -0.173713
PC-2  0.210283  0.105873  0.210635  0.325277  0.438166  0.776151
PC-3 -0.010462 -0.030767  0.975580 -0.096928 -0.132373 -0.142375
PC-4  0.824477 -0.239305 -0.050520 -0.477843 -0.145034  0.105115
PC-5  0.301770 -0.017155  0.035627  0.174752  0.736592 -0.578157
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.436066  0.592013 -0.005906  0.513276 -0.409404 -0.168130
PC-2  0.228090  0.080116  0.446843  0.259288  0.377936  0.729261
PC-3 -0.022134 -0.074388 

          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.436066  0.592013 -0.005906  0.513276 -0.409404 -0.168130
PC-2  0.228090  0.080116  0.446843  0.259288  0.377936  0.729261
PC-3 -0.022134 -0.074388  0.874966 -0.228234 -0.316652 -0.275775
PC-4  0.750466 -0.234802 -0.160545 -0.503612 -0.250764  0.198460
PC-5  0.429791 -0.065382  0.094621  0.074163  0.688215 -0.568253
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.425786  0.594601 -0.007823  0.515858 -0.414069 -0.165930
PC-2  0.194877  0.108012  0.228373  0.317377  0.433772  0.780587
PC-3 -0.013081 -0.032979  0.971398 -0.101913 -0.144064 -0.154876
PC-4  0.814139 -0.245145 -0.051018 -0.479657 -0.164324  0.131932
PC-5  0.331187 -0.021561  0.039591  0.161052  0.736270 -0.565908
          COMM       DEV   AVGCHNG     MINOR       OWN      SCTR
PC-1  0.419148  0.595110 -0.008228  0.512093 -0.420080 -0.177159
PC-2  0.189319  0.115770  0.206054  0.336905  0.430285  0.780799
PC-3 -0.007458 -0.030887 