In [1]:
import pandas as pd 
import numpy as np
import time
import re
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report, roc_auc_score, matthews_corrcoef
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.svm import LinearSVC,SVR
from sklearn.feature_selection import RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import Binarizer
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

In [2]:
# Words to exclude from the vocabulary; an empty list means nothing is filtered.
stop_words = []


def clean_words(text):
    """Return every run of word characters found in ``text``, in order."""
    return re.findall(r'\b\w+\b', text)

def remove_stopwords(text, stopwords=None):
    """Lower-case ``text`` and drop every word that is a stop word.

    Parameters
    ----------
    text : str or list/tuple of str
        Either raw text or an already tokenized sequence. A sequence is
        joined with spaces first, so the result never contains the Python
        repr of a list (brackets, quotes, commas).
    stopwords : iterable of str, optional
        Words to remove (compared case-insensitively). Defaults to the
        module-level ``stop_words`` list.

    Returns
    -------
    str
        The remaining words, lower-cased and separated by single spaces.
    """
    if stopwords is None:
        stopwords = stop_words

    if isinstance(text, (list, tuple)):
        text = " ".join(str(token) for token in text)

    # Compare whole words; iterating the string directly would walk it
    # character by character and never match a multi-letter stop word.
    excluded = {str(word).lower() for word in stopwords}
    words = str(text).lower().split()
    return " ".join(word for word in words if word not in excluded)
def remove_comments(text):
    """Strip comments, digits and underscores from a piece of source code.

    The steps run in this order:

    1. ``/* ... */`` block comments are removed, including multi-line ones.
    2. Everything from a ``#`` to the end of its line is removed.
    3. Every run of digits is removed.
    4. Underscores become spaces, so snake_case names split into words.

    NOTE(review): ``//`` line comments are not handled here; stripping them
    naively would also cut URLs inside string literals.

    Parameters
    ----------
    text : str
        Source code of one test case.

    Returns
    -------
    str
        The cleaned text.
    """
    # Block comments go first so that a '#' inside one cannot swallow its
    # closing '*/'. The match is non-greedy so two separate comments do not
    # erase the code between them, and DOTALL lets '.' cross line breaks.
    no_comments = re.sub(r'/\*.*?\*/', '', text, flags=re.DOTALL)
    no_comments = re.sub(r'#.*', '', no_comments)
    no_comments = re.sub(r'\d+', '', no_comments)
    no_comments = re.sub(r'_', ' ', no_comments)
    return no_comments

def remove_cammel(text):
    """Split camelCase identifiers in ``text`` and return lower-cased tokens.

    A space is inserted before every run of capitals, then before every
    capitalised word; the result is lower-cased and split on whitespace.
    """
    spaced_capitals = re.sub('([A-Z]+)', r' \1', text)
    spaced_words = re.sub('([A-Z][a-z]+)', r' \1', spaced_capitals)
    return spaced_words.lower().split()



In [3]:
import warnings
# Install a blanket 'ignore' filter: every warning raised after this cell is
# suppressed for the rest of the session.
# NOTE(review): this also hides any warnings emitted by the estimators fitted
# further down - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [4]:
# Models under comparison, keyed by the name written to the results table.
classifiers = {
    'randomForest': RandomForestClassifier(random_state=1),
    # Seeded like the forest: tree fitting permutes the features at every
    # split, so an unseeded tree can differ between otherwise identical runs.
    'decisionTree': DecisionTreeClassifier(min_samples_leaf=1, random_state=1),
    'naiveBayes': GaussianNB(),
    # LinearSVC has no predict_proba; the calibration wrapper provides one.
    'smo': CalibratedClassifierCV(LinearSVC(fit_intercept=False, tol=0.001, C=1, dual=False, max_iter=100000), method='sigmoid'),
    'knn': KNeighborsClassifier(n_neighbors=1, metric='euclidean'),
    'logisticRegression': LogisticRegression(max_iter=1000),
    # Perceptron has no predict_proba either, hence the same wrapper.
    'perceptron': CalibratedClassifierCV(Perceptron()),
    'lda': LinearDiscriminantAnalysis(),
}

def round_float(value):
    """Return ``value`` rounded to three decimal places, as a float."""
    three_decimals = format(value, ".3f")
    return float(three_decimals)


def get_time(start_time):
    """Return the seconds elapsed between ``start_time`` and now.

    ``start_time`` is expected to be a value previously read from
    ``time.time()``.
    """
    return time.time() - start_time
def classify(x_train, x_test, y_train, y_test, classifiers, normalize=()):
    """Fit every classifier on the training split and score it on the test split.

    Parameters
    ----------
    x_train, x_test : array-like of shape (n_samples, n_features)
        Feature matrices of the training and evaluation sets. Both must have
        the same columns.
    y_train, y_test : array-like
        Labels of the two sets. Single-column frames are flattened to 1-D
        before they are handed to the estimators and metrics.
    classifiers : dict
        Maps a display name to an estimator that provides ``fit``,
        ``predict``, ``predict_proba`` and ``score``. The estimators are
        fitted in place.
    normalize : collection of str, optional
        Keys of ``classifiers`` that should be trained AND evaluated on
        binarized features (every value greater than 0 becomes 1) instead of
        the raw ones.

    Returns
    -------
    pandas.DataFrame
        One row per classifier with the columns ``classifier``, ``f1score``,
        ``accuracy``, ``confusionMatrix``, ``classificationReport``, ``AUC``
        and ``MCC``.
    """
    # Binarizer is stateless, so one instance serves both splits and
    # guarantees they are transformed identically.
    binarizer = Binarizer(threshold=0.0).fit(x_train)
    x_train_norm = binarizer.transform(x_train)
    x_test_norm = binarizer.transform(x_test)

    # Estimators expect 1-D targets; callers pass (n, 1) frames.
    y_train = np.ravel(y_train)
    y_test = np.ravel(y_test)

    records = []
    for key, classifier in classifiers.items():
        if key in normalize:
            # Both splits must use the binarized variant; evaluating a model
            # trained on 0/1 features against raw counts skews its scores.
            x_train_exec, x_test_exec = x_train_norm, x_test_norm
        else:
            x_train_exec, x_test_exec = x_train, x_test

        classifier.fit(x_train_exec, y_train)

        predict = classifier.predict(x_test_exec)
        # Probability assigned to the second entry of classifier.classes_.
        y_probs = classifier.predict_proba(x_test_exec)[:, 1]

        records.append({
            'classifier': key,
            'f1score': f1_score(y_test, predict, average='weighted'),
            'accuracy': classifier.score(x_test_exec, y_test),
            'confusionMatrix': confusion_matrix(y_test, predict),
            'classificationReport': classification_report(y_test, predict, output_dict=True),
            'AUC': roc_auc_score(y_test, y_probs),
            'MCC': matthews_corrcoef(y_test, predict),
        })

    # Build the frame once from all records instead of appending row by row
    # through the private DataFrame._append.
    return pd.DataFrame(records)

In [5]:
# Load the dataset and keep only the columns that are used downstream.
data = pd.read_excel('./flaky_data.xlsx')
df = pd.DataFrame(data, columns=['Language', 'project name', 'test case name', 'label','test case content', 'tokens'])
df = df.drop(columns=['project name', 'test case name'])

# Strip comments, digits and underscores from each test body, then split
# camelCase identifiers; every cell ends up holding a list of lower-case tokens.
df['test case content'] = (
    df['test case content']
    .apply(remove_comments)
    .apply(remove_cammel)
)

# The 'tokens' column is overwritten with the cleaned content.
df['tokens'] = df['test case content']

In [6]:
# Bag-of-words vectorizer capped at 1500 vocabulary entries.
tokenizer = CountVectorizer(analyzer='word', max_features=1500, stop_words=stop_words)

# Turn each token column entry into a single string, then count term occurrences.
df['tokens'] = df['tokens'].apply(remove_stopwords)
bow_token = tokenizer.fit_transform(df['tokens'])

# Dense table with one column per vocabulary term.
bow_data = pd.DataFrame(
    bow_token.toarray(),
    columns=tokenizer.get_feature_names_out(),
)


In [7]:
# Keep a copy of the labels, then append the bag-of-words columns to the
# metadata columns side by side.
# NOTE(review): re-running this cell appends the feature columns again.
y = df['label'].copy()
df = pd.concat([df, bow_data], axis='columns')

In [8]:
# Write the saved labels back into the 'label' column.
df['label'] = y

In [9]:
# One sub-frame per programming language, selected on the 'Language' column.
# NOTE(review): 'go' is lower-case while the other values are capitalised -
# confirm this matches the spelling used in the dataset.
java_df = df.loc[df['Language'].eq('Java')]
python_df = df.loc[df['Language'].eq('Python')]
go_df = df.loc[df['Language'].eq('go')]
cpp_df = df.loc[df['Language'].eq('C++')]
js_df = df.loc[df['Language'].eq('JS')]

In [10]:
def sample_data(df, n):
    """Draw ``n`` rows from ``df`` and split them into features and labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from. Its second column must hold the label, and it
        must contain the columns 'Language', 'label', 'test case content'
        and 'tokens' next to the feature columns.
    n : int
        Number of rows to draw without replacement.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The feature columns of the sampled rows, and a single-column frame
        with their labels. Both share the same index.

    Raises
    ------
    ValueError
        Raised by pandas when ``n`` exceeds the number of rows in ``df``.
    """
    # Sample from the frame that was passed in, not from a fixed global one,
    # so callers really get rows of the language they asked for.
    sampled = df.sample(n, replace=False, random_state=123, axis=0)

    # The label is selected by position rather than by name - presumably so a
    # feature column that is also called 'label' cannot interfere (TODO confirm).
    labels = sampled.iloc[:, 1:2]
    features = sampled.drop(columns=['Language', 'label', 'test case content', 'tokens'])

    return features, labels
    

In [11]:
# Train on Java, test on each other language. The Java sample is sized so
# that every pair is split roughly 70/30 between training and test data.

# Java (490) -> Python (210)
java_py_x, java_py_y = sample_data(java_df, 490)
python_x = python_df.drop(columns=['Language', 'label', 'test case content', 'tokens'])
python_y = python_df.iloc[:, 1:2]

# Java (189) -> C++ (78)
java_cpp_x, java_cpp_y = sample_data(java_df, 189)
cpp_x = cpp_df.drop(columns=['Language', 'label', 'test case content', 'tokens'])
cpp_y = cpp_df.iloc[:, 1:2]

# Java (210) -> Go (90)
java_go_x, java_go_y = sample_data(java_df, 210)
go_x = go_df.drop(columns=['Language', 'label', 'test case content', 'tokens'])
go_y = go_df.iloc[:, 1:2]

# Java (140) -> JS (61)
java_js_x, java_js_y = sample_data(java_df, 140)
js_x = js_df.drop(columns=['Language', 'label', 'test case content', 'tokens'])
js_y = js_df.iloc[:, 1:2]

In [12]:
# Models trained on the Java sample, evaluated on each other language.
py_results = classify(
    x_train=java_py_x, x_test=python_x,
    y_train=java_py_y, y_test=python_y,
    classifiers=classifiers, normalize=['knn'],
)
cpp_results = classify(
    x_train=java_cpp_x, x_test=cpp_x,
    y_train=java_cpp_y, y_test=cpp_y,
    classifiers=classifiers, normalize=['knn'],
)
go_results = classify(
    x_train=java_go_x, x_test=go_x,
    y_train=java_go_y, y_test=go_y,
    classifiers=classifiers, normalize=['knn'],
)
js_results = classify(
    x_train=java_js_x, x_test=js_x,
    y_train=java_js_y, y_test=js_y,
    classifiers=classifiers, normalize=['knn'],
)

In [13]:
# Test samples for the models trained on the 210 Python cases.
py_java_x, py_java_y = sample_data(df=java_df, n=90)   # python=210, java=90
py_cpp_x, py_cpp_y = sample_data(df=cpp_df, n=30)      # python=210, cpp=30
py_go_x, py_go_y = sample_data(df=go_df, n=30)         # python=210, go=30
py_js_x, py_js_y = sample_data(df=js_df, n=30)         # python=210, js=30

In [14]:
# Models trained on Python, evaluated on the sampled cases of each other language.
java_results1 = classify(
    x_train=python_x, x_test=py_java_x,
    y_train=python_y, y_test=py_java_y,
    classifiers=classifiers, normalize=['knn'],
)
cpp_results1 = classify(
    x_train=python_x, x_test=py_cpp_x,
    y_train=python_y, y_test=py_cpp_y,
    classifiers=classifiers, normalize=['knn'],
)
go_results1 = classify(
    x_train=python_x, x_test=py_go_x,
    y_train=python_y, y_test=py_go_y,
    classifiers=classifiers, normalize=['knn'],
)
js_results1 = classify(
    x_train=python_x, x_test=py_js_x,
    y_train=python_y, y_test=py_js_y,
    classifiers=classifiers, normalize=['knn'],
)

In [15]:
# Train on C++ (78 cases); test on 33 sampled cases of each other language.
cpp_java_x, cpp_java_y = sample_data(df=java_df, n=33)     # cpp=78, java=33
cpp_py_x, cpp_py_y = sample_data(df=python_df, n=33)       # cpp=78, python=33
cpp_go_x, cpp_go_y = sample_data(df=go_df, n=33)           # cpp=78, go=33
cpp_js_x, cpp_js_y = sample_data(df=js_df, n=33)           # cpp=78, js=33

java_results2 = classify(
    x_train=cpp_x, x_test=cpp_java_x,
    y_train=cpp_y, y_test=cpp_java_y,
    classifiers=classifiers, normalize=['knn'],
)
py_results2 = classify(
    x_train=cpp_x, x_test=cpp_py_x,
    y_train=cpp_y, y_test=cpp_py_y,
    classifiers=classifiers, normalize=['knn'],
)
go_results2 = classify(
    x_train=cpp_x, x_test=cpp_go_x,
    y_train=cpp_y, y_test=cpp_go_y,
    classifiers=classifiers, normalize=['knn'],
)
js_results2 = classify(
    x_train=cpp_x, x_test=cpp_js_x,
    y_train=cpp_y, y_test=cpp_js_y,
    classifiers=classifiers, normalize=['knn'],
)

In [16]:
# Train on Go (90 cases); test on 39 sampled cases of each other language.
go_java_x, go_java_y = sample_data(java_df, 39)    # go=90, java=39
go_py_x, go_py_y = sample_data(python_df, 39)      # go=90, python=39
go_cpp_x, go_cpp_y = sample_data(cpp_df, 39)       # go=90, cpp=39
go_js_x, go_js_y = sample_data(js_df, 39)          # go=90, js=39
java_results3 = classify(go_x, go_java_x, go_y, go_java_y, classifiers, normalize=['knn'])
py_results3 = classify(go_x, go_py_x, go_y, go_py_y, classifiers, normalize=['knn'])
# Evaluate on the C++ sample; passing the JS sample here would merely
# duplicate js_results3 under the C++ name.
cpp_results3 = classify(go_x, go_cpp_x, go_y, go_cpp_y, classifiers, normalize=['knn'])
js_results3 = classify(go_x, go_js_x, go_y, go_js_y, classifiers, normalize=['knn'])

In [19]:
# Train on JS (61 cases); test on 27 sampled cases of each other language.
js_java_x, js_java_y = sample_data(df=java_df, n=27)     # js=61, java=27
js_py_x, js_py_y = sample_data(df=python_df, n=27)       # js=61, python=27
js_cpp_x, js_cpp_y = sample_data(df=cpp_df, n=27)        # js=61, cpp=27
js_go_x, js_go_y = sample_data(df=go_df, n=27)           # js=61, go=27

java_results4 = classify(
    x_train=js_x, x_test=js_java_x,
    y_train=js_y, y_test=js_java_y,
    classifiers=classifiers, normalize=['knn'],
)
py_results4 = classify(
    x_train=js_x, x_test=js_py_x,
    y_train=js_y, y_test=js_py_y,
    classifiers=classifiers, normalize=['knn'],
)
cpp_results4 = classify(
    x_train=js_x, x_test=js_cpp_x,
    y_train=js_y, y_test=js_cpp_y,
    classifiers=classifiers, normalize=['knn'],
)
go_results4 = classify(
    x_train=js_x, x_test=js_go_x,
    y_train=js_y, y_test=js_go_y,
    classifiers=classifiers, normalize=['knn'],
)

In [20]:
# Stack all per-pair result tables, grouped by training language.
# Rows keep their original per-table index, so the order below is the only
# thing that identifies which train/test pair a row belongs to.
cross_test_results = pd.concat([
    # trained on Java
    py_results, cpp_results, go_results, js_results,
    # trained on Python
    java_results1, cpp_results1, go_results1, js_results1,
    # trained on C++
    java_results2, py_results2, go_results2, js_results2,
    # trained on Go
    java_results3, py_results3, cpp_results3, js_results3,
    # trained on JS
    java_results4, py_results4, cpp_results4, go_results4,
])

In [21]:
# Write the combined results table to an Excel workbook; the path is
# relative to the current working directory.
cross_test_results.to_excel('./cross_test_results.xlsx')