In [1]:
# import pickle
from my_util import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, matthews_corrcoef, precision_recall_fscore_support, classification_report, auc

from imblearn.over_sampling import SMOTE, RandomOverSampler

import numpy as np
from scipy.optimize import differential_evolution
import pandas as pd
import time, pickle, math, warnings, os
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Project names used in this study; the cells below pass them to the functions one at a time.
projects = ['openstack','qt']
# Tag embedded in the prediction-result CSV and pickled-model file names.
sampling_methods = 'DE_SMOTE_min_df_3'

# Forwarded to prepare_data() as its remove_python_common_tokens argument.
remove_python_common_tokens = True

# Output directories used by run_experiment().
# NOTE(review): create_path_if_not_exist presumably comes from the my_util star import
# and creates the directory when it is missing — confirm.
create_path_if_not_exist('./data/')
create_path_if_not_exist('./final_model/')

In [2]:
def get_combined_df(code_commit, commit_id, label, metrics_df, count_vect):
    """Combine bag-of-words code features with change metrics into one matrix.

    Both the code rows and the metric rows are sorted by ``commit_id`` and then
    concatenated column-wise, so row ``i`` of the result describes the ``i``-th
    commit id in sorted order.

    Parameters
    ----------
    code_commit : sequence of str
        Code text of each commit, passed to ``count_vect.transform``.
    commit_id : sequence
        Commit id of each entry in ``code_commit``.
    label : sequence
        Label of each entry in ``code_commit``.
    metrics_df : pandas.DataFrame
        Change metrics with a ``commit_id`` column; every other column is used
        as a feature. The caller's frame is not modified.
    count_vect : fitted vectorizer
        Object whose ``transform`` returns a sparse matrix.

    Returns
    -------
    tuple
        ``(final_features, commit_ids, labels)`` where ``final_features`` is a
        2-D numpy array (token counts followed by metric columns) and the two
        lists are ordered like its rows.

    Raises
    ------
    ValueError
        If the sorted commit ids of the code rows and of ``metrics_df`` differ.
        The two halves are joined purely by position, so a mismatch would
        silently attach metrics to the wrong commit.
    """
    code_df = pd.DataFrame()
    code_df['commit_id'] = commit_id
    code_df['code'] = code_commit
    code_df['label'] = label

    code_df = code_df.sort_values(by='commit_id')
    metrics_df = metrics_df.sort_values(by='commit_id')

    # Positional concatenation below is only valid when both sides list the
    # same commits in the same order, so verify that before building features.
    sorted_commit_ids = list(code_df['commit_id'])
    if sorted_commit_ids != list(metrics_df['commit_id']):
        raise ValueError(
            'commit ids of the code changes and of metrics_df do not line up '
            '({} code rows vs {} metric rows)'.format(len(sorted_commit_ids), len(metrics_df))
        )

    metrics_df = metrics_df.drop('commit_id', axis=1)

    code_change_arr = count_vect.transform(code_df['code']).astype(np.int16).toarray()
    metrics_df_arr = metrics_df.to_numpy(dtype=np.float32)

    final_features = np.concatenate((code_change_arr, metrics_df_arr), axis=1)

    return final_features, sorted_commit_ids, list(code_df['label'])


In [3]:
def objective_func(k, train_feature, train_label, valid_feature, valid_label):
    """Objective for tuning SMOTE's ``k_neighbors``: negative validation AUC.

    The training data is resampled with SMOTE using ``k`` (rounded to the
    nearest integer), a random forest is fitted on the resampled data, and the
    ROC AUC on the validation data is computed.

    Parameters
    ----------
    k : float or array-like with one element
        Candidate ``k_neighbors`` value. An optimizer such as
        ``scipy.optimize.differential_evolution`` passes a 1-D array.
    train_feature, train_label
        Training features / labels that are resampled and used for fitting.
    valid_feature, valid_label
        Validation features / labels used for scoring.

    Returns
    -------
    float
        ``-AUC``, so that a minimizer maximizes the validation AUC.
    """
    # Pull the scalar out explicitly: calling int() on a 1-element ndarray is
    # deprecated in recent NumPy releases.
    k_neighbors = int(np.round(np.asarray(k).ravel()[0]))

    # NOTE(review): SMOTE's n_jobs argument is deprecated in newer
    # imbalanced-learn releases — confirm against the installed version.
    smote = SMOTE(random_state=42, k_neighbors=k_neighbors, n_jobs=32)
    train_feature_res, train_label_res = smote.fit_resample(train_feature, train_label)

    clf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)
    clf.fit(train_feature_res, train_label_res)

    # Probability of class index 1 is used as the ranking score.
    prob = clf.predict_proba(valid_feature)[:,1]
    valid_auc = roc_auc_score(valid_label, prob)

    return -valid_auc

## The code below this cell is used to:

1. Obtain the best `k_neighbors` value for SMOTE (the value is rounded to an integer).
2. Resample the training data using SMOTE with the best `k_neighbors` value.
3. Train the RF model and obtain the prediction results from the model.


In [15]:
def run_experiment(cur_proj):
    """Train and evaluate the base classifiers and their voting ensemble.

    Steps performed for project ``cur_proj``:

    1. Load the train/test code changes and the change metrics.
    2. Fit a ``CountVectorizer`` (``min_df=3``, unigrams) on the training code
       and build the combined feature matrices with ``get_combined_df``.
    3. Balance the training set with ``RandomOverSampler``.
    4. Train/evaluate a random forest, an SVM and a logistic regression, then
       a soft-voting ensemble of the three.
    5. Write the ensemble's predictions to ``./data/`` and pickle the fitted
       ensemble to ``./final_model/``.

    Parameters
    ----------
    cur_proj : str
        Project name forwarded to ``prepare_data`` and
        ``load_change_metrics_df`` and used in the output file names.

    Returns
    -------
    None
    """
    from sklearn.ensemble import VotingClassifier

    data_path = './data/'
    model_path = './final_model/'

    train_code, train_commit, train_label = prepare_data(cur_proj, mode='train',
                                                         remove_python_common_tokens=remove_python_common_tokens)
    test_code, test_commit, test_label = prepare_data(cur_proj, mode='test',
                                                      remove_python_common_tokens=remove_python_common_tokens)

    commit_metrics = load_change_metrics_df(cur_proj)
    train_commit_metrics = commit_metrics[commit_metrics['commit_id'].isin(train_commit)]
    test_commit_metrics = commit_metrics[commit_metrics['commit_id'].isin(test_commit)]

    # The vocabulary is learned from the training code only.
    count_vect = CountVectorizer(min_df=3, ngram_range=(1,1))
    count_vect.fit(train_code)

    train_feature, train_commit_id, new_train_label = get_combined_df(train_code, train_commit, train_label,
                                                                      train_commit_metrics, count_vect)
    test_feature, test_commit_id, new_test_label = get_combined_df(test_code, test_commit, test_label,
                                                                   test_commit_metrics, count_vect)

    print('load data of',cur_proj, 'finish')

    # --- Disabled: DE-tuned SMOTE (kept for reference) ----------------------
    # The 80/20 split is only needed by this search, so it lives here too.
    # percent_80 = int(len(new_train_label)*0.8)
    # final_train_feature = train_feature[:percent_80]
    # final_new_train_label = new_train_label[:percent_80]
    # valid_feature = train_feature[percent_80:]
    # valid_label = new_train_label[percent_80:]
    #
    # bounds = [(1,20)]
    # result = differential_evolution(objective_func, bounds, args=(final_train_feature, final_new_train_label,
    #                                                               valid_feature, valid_label),
    #                                popsize=10, mutation=0.7, recombination=0.3,seed=0)
    #
    # smote = SMOTE(random_state=42, n_jobs=32, k_neighbors=int(np.round(result.x)))
    # train_feature_res, train_label_res = smote.fit_resample(final_train_feature, final_new_train_label)
    # k_of_smote = result.x
    # best_AUC_of_obj_func = result.fun
    # ------------------------------------------------------------------------

    # Active sampler: random over-sampling of the whole training set.
    ros = RandomOverSampler(random_state=0)
    train_feature_res, train_label_res = ros.fit_resample(train_feature, new_train_label)

    # Kept as 'RF' (and sampling_methods kept as-is) because eval_result() reads
    # '<proj>_RF_<sampling_method>_prediction_result.csv', even though the
    # saved predictions/model now come from the ensemble.
    clf_name = 'RF'
    clf_rf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1, verbose=True)
    clf_svm = SVC(C=0.1, gamma=0.1, probability=True, verbose=True)
    clf_log_reg = LogisticRegression(verbose=True)

    print("training first model")
    trained_clf_rf, _ = train_eval_model(clf_rf, train_feature_res, train_label_res,
                                         test_feature, new_test_label)
    print("finished training the first model")
    trained_clf_svm, _ = train_eval_model(clf_svm, train_feature_res, train_label_res,
                                          test_feature, new_test_label)
    print("finished training the second model")
    trained_clf_log_reg, _ = train_eval_model(clf_log_reg, train_feature_res, train_label_res,
                                              test_feature, new_test_label)
    print("finished training the third model")

    # Soft voting is required here: a hard-voting VotingClassifier does not
    # provide predict_proba, and the traceback recorded in this notebook shows
    # train_eval_model calling predict_proba on the classifier it is given.
    # All three base models expose predict_proba (SVC via probability=True).
    # Note that VotingClassifier clones and refits its estimators on fit().
    estimators = [('rf', trained_clf_rf), ('svm', trained_clf_svm), ('log_reg', trained_clf_log_reg)]
    clf_ensemble = VotingClassifier(estimators, voting='soft')

    trained_clf, pred_df = train_eval_model(clf_ensemble, train_feature_res, train_label_res,
                                            test_feature, new_test_label)
    print("finished training model")

    # test_commit_id follows the row order of test_feature (both come from get_combined_df).
    pred_df['test_commit'] = test_commit_id
    pred_df.to_csv(data_path+cur_proj+'_'+clf_name+'_'+sampling_methods+'_prediction_result.csv')

    model_file = model_path+cur_proj+'_'+clf_name+'_'+sampling_methods+'.pkl'
    # Context manager guarantees the file is flushed and closed.
    with open(model_file, 'wb') as model_fh:
        pickle.dump(trained_clf, model_fh)

    print('finished',cur_proj)
    print('-'*100)

In [14]:
# Run the full train/evaluate/save pipeline for the OpenStack project.
# NOTE(review): the traceback recorded under this cell ("probability=False") appears to
# come from an earlier revision of run_experiment — the current definition builds SVC
# with probability=True, and this cell's execution count precedes the definition's.
# Re-run with Restart & Run All to refresh the output.
run_experiment('openstack')

100%|██████████| 11973/11973 [00:23<00:00, 508.65it/s]
100%|██████████| 1331/1331 [00:02<00:00, 608.57it/s]


load data of openstack finish
training first model


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    7.1s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   38.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.0min finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 300 out of 300 | elapsed:    0.2s finished


finished training the first model
[LibSVM]..........
*
optimization finished, #iter = 10509
obj = -1627.170000, rho = -0.700000
nSV = 21018, nBSV = 21018
Total nSV = 21018


AttributeError: predict_proba is not available when  probability=False

In [6]:
# print('The best k_neighbors of Openstack:', openstack_k_of_smote)

In [7]:
# run_experiment('qt')

In [8]:
# print('The best k_neighbors of Qt:', qt_k_of_smote)

## RQ1-RQ2 result

In [10]:
# Directory that eval_result() reads the prediction-result CSV files from.
RF_data_dir = './data/'

def get_recall_at_k_percent_effort(percent_effort, result_df_arg, real_buggy_commits):
    """Return the share of buggy commits found within ``percent_effort``% of the LOC.

    ``result_df_arg`` must already be ranked and carry a running ``cum_LOC``
    column; its last row's ``cum_LOC`` is taken as the total effort. Rows whose
    ``cum_LOC`` does not exceed the budget and whose ``label`` equals 1 are
    counted and divided by ``len(real_buggy_commits)``.
    """
    total_loc = result_df_arg.iloc[-1]['cum_LOC']
    loc_budget = (percent_effort/100)*total_loc

    within_budget = result_df_arg['cum_LOC'] <= loc_budget
    is_buggy = result_df_arg['label'] == 1
    found_buggy = result_df_arg[within_budget & is_buggy]

    return len(found_buggy)/float(len(real_buggy_commits))

def eval_metrics(result_df):
    """Compute classification and effort-aware metrics for one prediction table.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Must contain the columns ``defective_commit_pred``,
        ``defective_commit_prob``, ``label`` and ``LOC``. The frame passed in
        is left untouched; all derived columns are added to an internal copy.

    Returns
    -------
    tuple
        ``(ACC, prec, rec, f1, AUC, FAR, dist_heaven, recall_20_percent_effort,
        effort_at_20_percent_LOC_recall, p_opt)``.
    """
    # Work on a copy so the helper columns added below never leak into the
    # caller's DataFrame.
    result_df = result_df.copy()

    pred = result_df['defective_commit_pred']
    y_test = result_df['label']

    ACC = accuracy_score(y_test, pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_test, pred, average='binary')
    tn, fp, fn, tp = confusion_matrix(y_test, pred, labels=[0, 1]).ravel()

    FAR = fp/(fp+tn) # false alarm rate
    dist_heaven = math.sqrt((pow(1-rec,2)+pow(0-FAR,2))/2.0) # distance to heaven

    AUC = roc_auc_score(y_test, result_df['defective_commit_prob'])

    result_df['defect_density'] = result_df['defective_commit_prob']/result_df['LOC'] # predicted defect density
    result_df['actual_defect_density'] = result_df['label']/result_df['LOC'] # actual defect density

    # Three rankings: by the model, by the best possible order, by the worst possible order.
    result_df = result_df.sort_values(by='defect_density',ascending=False)
    actual_result_df = result_df.sort_values(by='actual_defect_density',ascending=False)
    actual_worst_result_df = result_df.sort_values(by='actual_defect_density',ascending=True)

    result_df['cum_LOC'] = result_df['LOC'].cumsum()
    actual_result_df['cum_LOC'] = actual_result_df['LOC'].cumsum()
    actual_worst_result_df['cum_LOC'] = actual_worst_result_df['LOC'].cumsum()

    # Buggy commits in model-ranked order (cum_LOC already attached).
    real_buggy_commits = result_df[result_df['label'] == 1]

    # Recall@20%Effort: same computation as every other effort level below.
    recall_20_percent_effort = get_recall_at_k_percent_effort(20, result_df, real_buggy_commits)

    # Effort@20%Recall: cumulative LOC reached at the last of the first 20% of
    # buggy commits (model order), as a fraction of the total LOC.
    buggy_20_percent = real_buggy_commits.head(math.ceil(0.2 * len(real_buggy_commits)))
    buggy_20_percent_LOC = buggy_20_percent.iloc[-1]['cum_LOC']
    effort_at_20_percent_LOC_recall = int(buggy_20_percent_LOC) / float(result_df.iloc[-1]['cum_LOC'])

    # P_opt: recall-vs-effort curves sampled at 10%, 20%, ..., 100% effort.
    percent_effort_list = []
    predicted_recall_at_percent_effort_list = []
    actual_recall_at_percent_effort_list = []
    actual_worst_recall_at_percent_effort_list = []

    for percent_effort in np.arange(10,101,10):
        percent_effort_list.append(percent_effort/100)
        predicted_recall_at_percent_effort_list.append(
            get_recall_at_k_percent_effort(percent_effort, result_df, real_buggy_commits))
        actual_recall_at_percent_effort_list.append(
            get_recall_at_k_percent_effort(percent_effort, actual_result_df, real_buggy_commits))
        actual_worst_recall_at_percent_effort_list.append(
            get_recall_at_k_percent_effort(percent_effort, actual_worst_result_df, real_buggy_commits))

    # Each area is needed more than once, so compute it a single time.
    optimal_area = auc(percent_effort_list, actual_recall_at_percent_effort_list)
    predicted_area = auc(percent_effort_list, predicted_recall_at_percent_effort_list)
    worst_area = auc(percent_effort_list, actual_worst_recall_at_percent_effort_list)

    p_opt = 1 - ((optimal_area - predicted_area) / (optimal_area - worst_area))

    return ACC, prec, rec, f1, AUC, FAR, dist_heaven, recall_20_percent_effort, effort_at_20_percent_LOC_recall, p_opt

def eval_result(proj_name,sampling_method = 'DE_SMOTE_min_df_3'):
    """Load a project's saved predictions, attach LOC per commit and print the metrics.

    The prediction CSV is read from ``RF_data_dir``, its columns are renamed to
    the names ``eval_metrics`` expects, the number of lines of each test
    commit's code is joined in on the commit id, and the resulting metrics are
    printed on a single line.
    """
    csv_path = ''.join((RF_data_dir, proj_name, '_RF_', sampling_method, '_prediction_result.csv'))
    predictions = pd.read_csv(csv_path)

    # Column layout of the newer result files.
    predictions.columns = ['Unnamed', 'defective_commit_prob','defective_commit_pred','label','test_commit']

    test_code, test_commit, test_label = prepare_data(
        proj_name,
        mode='test',
        remove_python_common_tokens=remove_python_common_tokens,
    )

    # LOC of a commit = number of lines in its code text.
    loc_table = pd.DataFrame().assign(
        commit_id=test_commit,
        LOC=[len(code.splitlines()) for code in test_code],
    )

    merged = loc_table.merge(predictions, how='inner', left_on='commit_id', right_on='test_commit')
    metric_values = eval_metrics(merged)

    print('Accuracy: {:.2f}, Precision: {:.2f}, Recall: {:.2f}, F1: {:.2f}, AUC: {:.2f}, FAR: {:.2f}, d2h: {:.2f}, PCI@20%LOC: {:.2f}, Effort@20%Recall: {:.2f}, POpt: {:.2f}'.format(*metric_values))

In [12]:
# Print the RQ1-RQ2 metrics for the saved OpenStack predictions (default sampling tag).
eval_result('openstack')

100%|██████████| 1331/1331 [00:02<00:00, 588.66it/s]


Accuracy: 0.88, Precision: 0.51, Recall: 0.29, F1: 0.37, AUC: 0.83, FAR: 0.04, d2h: 0.50, PCI@20%LOC: 0.58, Effort@20%Recall: 0.04, POpt: 0.83


In [12]:
# eval_result('qt')

## RQ3 result

Note: `the_best_k_neighbors` is obtained from the model training phase.

In [13]:
def check_train_time(cur_proj, the_best_k_neighbors):
    """Measure and print how long fitting the random forest takes for one project.

    The training data is prepared the same way as for the DE-SMOTE setup: the
    first 80% of the (commit-id sorted) training rows are resampled with SMOTE
    using ``the_best_k_neighbors``. Only the ``clf.fit`` call is timed.

    Parameters
    ----------
    cur_proj : str
        Project name forwarded to ``prepare_data`` / ``load_change_metrics_df``.
    the_best_k_neighbors : int
        ``k_neighbors`` value passed to SMOTE.

    Returns
    -------
    None
        The elapsed time is printed, not returned.
    """
    train_code, train_commit, train_label = prepare_data(cur_proj, mode='train',
                                                         remove_python_common_tokens=remove_python_common_tokens)

    commit_metrics = load_change_metrics_df(cur_proj)
    train_commit_metrics = commit_metrics[commit_metrics['commit_id'].isin(train_commit)]

    count_vect = CountVectorizer(min_df=3, ngram_range=(1,1))
    count_vect.fit(train_code)

    print('fit countvectorizer finished')

    train_feature, _, new_train_label = get_combined_df(train_code, train_commit, train_label,
                                                        train_commit_metrics, count_vect)

    # Keep the first 80% of the rows; the remainder is not used here.
    percent_80 = int(len(new_train_label)*0.8)
    final_train_feature = train_feature[:percent_80]
    final_new_train_label = new_train_label[:percent_80]

    smote = SMOTE(random_state=42, n_jobs=1, k_neighbors=the_best_k_neighbors)
    train_feature_res, train_label_res = smote.fit_resample(final_train_feature, final_new_train_label)

    clf = RandomForestClassifier(n_estimators=300, random_state=42, n_jobs=-1)

    # perf_counter is monotonic, so the interval cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    clf.fit(train_feature_res, train_label_res)
    train_time = time.perf_counter() - start

    print('train time of {} is {:.3f} secs'.format(cur_proj,train_time))

In [17]:
# 9 is passed as the_best_k_neighbors (SMOTE k_neighbors) for OpenStack;
# per the note above, the value comes from the model training phase.
check_train_time('openstack', 9)

fit countvectorizer finished
train time of openstack is 35.702 secs


In [18]:
# 14 is passed as the_best_k_neighbors (SMOTE k_neighbors) for Qt;
# per the note above, the value comes from the model training phase.
check_train_time('qt', 14)

fit countvectorizer finished
train time of qt is 174.545 secs
