In [11]:
import pandas as pd 
import numpy as np
import time
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import learning_curve
from sklearn.metrics import classification_report, roc_auc_score, matthews_corrcoef
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.svm import LinearSVC,SVR
from sklearn.feature_selection import RFECV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import Binarizer
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

In [12]:
stop_words = []

def remove_stopwords(text):
    text = str(text).lower().strip()
    tokens = []
    for token in text:
        if token not in stop_words:
            tokens.append(token)
            
    return "".join(tokens)
def remove_comments(text):
    no_comments = re.sub(r'#.*', '', text)
    no_comments = re.sub(r'/\*.*\/\*', '', no_comments)
    no_comments = re.sub(r'\d+','',no_comments)
    no_comments = re.sub(r'_', ' ', no_comments)
    no_comments = re.sub(r'[][@$%:!#~/\\?*""()={}''<>.+-;]','', no_comments)
    return no_comments

def remove_cammel(text):
    no_cammel = re.sub('([A-Z][a-z]+)', r' \1', re.sub('([A-Z]+)', r' \1', text)).lower().split()
    return no_cammel


In [13]:
import warnings
warnings.filterwarnings('ignore')

In [14]:
classifiers = {
    'randomForest': RandomForestClassifier(random_state=1),
    'decisionTree': DecisionTreeClassifier(min_samples_leaf=1),
    'naiveBayes': GaussianNB(),
    'smo': CalibratedClassifierCV(LinearSVC(fit_intercept=False, tol=0.001, C=1, dual=False, max_iter=100000), method='sigmoid'),
    'knn': KNeighborsClassifier(n_neighbors=1, metric='euclidean'),
    'logisticRegression': LogisticRegression(max_iter=1000),
    'perceptron': CalibratedClassifierCV(Perceptron()),
    'lda': LinearDiscriminantAnalysis(),
}

def round_float(value):
    return float("{:.3f}".format(value))


def get_time(start_time):
    end_time = time.time()
    return end_time - start_time
def classify_with_cv(classifiers,x, y, random_state=123, cv = None, shuffle=True, normalize=[]):
    
    labels = ['Flaky', 'NonFlaky']
    results = pd.DataFrame()
    
    scaler = Binarizer(threshold=0.0,).fit(x)
    x_norm = scaler.transform(x)
    
    for key, classifier in classifiers.items():
        if (key in normalize):
            x = x_norm
         
        # classifier.fit(x_train, y_train)
        # classifier.score(x_test, y_test)
        score = cross_val_score(classifier, x, y, cv = cv, scoring='accuracy')
        predict = cross_val_predict(classifier, x, y, cv=cv, method='predict')
        # y_probs = classifier.predict_proba(x_test)[:,1]
        
        result = {
                'classifier': key,
                'f1score': f1_score(y, predict, average='weighted'),
                'accuracy': np.mean(score),
                'confusionMatrix': confusion_matrix(y, predict),
                # 'execution': round_float(get_time(start_time)),
                'classificationReport': classification_report(y, predict, output_dict=True),
                # 'AUC': roc_auc_score(y_test, y_probs),
                'MCC': matthews_corrcoef(y, predict),
                
            }
            # results = pd.concat([results, result], ignore_index=True)
        # print( result)
        # print('\n')
        results = results._append(result, ignore_index=True)
        
        
    return results

### feature importance selection
def feature_importance(x, y):
    
    estimator = RandomForestClassifier(random_state=123)
    estimator.fit(x, y)
    importance = estimator.feature_importances_
    
    return importance

In [15]:
# load data and drop unused columns

filename = './C++_data.xlsx'
data = pd.read_excel(filename)
df = pd.DataFrame(data, columns=['Language', 'project name', 'test case name', 'label','test case content', 'tokens'])
df.drop('project name', axis=1, inplace=True)
df.drop('test case name', axis=1, inplace=True)


# preprocessing 

df['test case content'] = df['test case content'].apply(lambda x: remove_comments(x))
df['test case content'] = df['test case content'].apply(lambda x: remove_cammel(x))

df['tokens'] = df['test case content']

In [16]:
# tokenize

tokenizer = TfidfVectorizer(analyzer='word', max_features=1500,stop_words=stop_words)

    
df.tokens = df.tokens.apply(lambda x: remove_stopwords(x))
bow_token = tokenizer.fit_transform(df.tokens)

bow_data = pd.DataFrame(bow_token.toarray(), columns=tokenizer.get_feature_names_out())

In [17]:
y = df.label.copy()
df = pd.concat([df, bow_data], axis=1)
df.label = y
x = df.drop(['Language', 'label', 'label', 'test case content', 'tokens'], axis=1)

In [19]:
res = classify_with_cv(classifiers, x, y, cv=10, normalize=['knn'])

In [23]:
res.to_excel('./C++_results.xlsx')

In [22]:
top_features = feature_importance(x,y)
print("The top 20 features in this project is:", x.columns[np.argsort(top_features)[:20]])

The top 20 features in this project is: Index(['aaa', 'outputfloatsaved', 'outside', 'overhead', 'overlong', 'ovl',
       'packet', 'pad', 'padding', 'parameters', 'parse', 'passes',
       'outputfloatrunning', 'patchwork', 'peanut', 'percent', 'pipe',
       'places', 'platform', 'player'],
      dtype='object')
