In [1]:
import matplotlib
import os
import sys
import random
import pandas
import numpy as np
from __future__ import division, print_function
from glob import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_files
from sklearn.metrics import *

  from numpy.core.umath_tests import inner1d


In [2]:
np.seterr(divide='ignore', invalid='ignore')
root = os.getcwd()
dir = os.path.abspath(os.path.join(root, "datasets"))
if root not in sys.path:
    sys.path.append(root)

### Utility Functions

In [3]:
def get_all_projects():
    projects = dict()
    for datapath in os.listdir(dir):
        formatted_path = os.path.join(dir, datapath)
        if os.path.isdir(formatted_path):
            projects.update({datapath: dict()})
            files = glob(os.path.join(formatted_path, "*.csv"))
            for f in files:
                fname = f.split('\\')[-1].split("-")[0]
                dframe = pandas.read_csv(f)
                projects[datapath].update({fname: dframe})
    return projects

def abcd(actual, predicted, distribution, as_percent=True):

    """
    Confusion Matrix:

    |`````````````|`````````````|
    |  TN[0][0]   |  FP[0][1]   |
    |             |             |
    |`````````````|`````````````|
    |  FN[1][0]   |  TP[1][1]   |
    |             |             |
    `````````````````````````````
    """

    c_mtx = confusion_matrix(actual, predicted)

    "Probablity of Detection: Pd"
    try:
        p_d = c_mtx[1][1] / (c_mtx[1][1] + c_mtx[1][0])  # TP/(TP+FN)
    except ZeroDivisionError:
        p_d = 0

    "Probability of False Alarm: Pf"
    try:
        p_f = c_mtx[0][1] / (c_mtx[0][1] + c_mtx[0][0])  # FP/(FP+TN)
    except ZeroDivisionError:
        p_f = 0

    "Precision"
    try:
        p_r = c_mtx[1][1] / (c_mtx[1][1] + c_mtx[0][1])  # TP/(TP+FP)
        if not np.isfinite(p_r): p_r = 0
    except ZeroDivisionError:
        p_r = 0

    "Recall (Same as Pd)"
    r_c = p_d

    "F1 measure"
    try:
        f1 = 2 * c_mtx[1][1] / (2 * c_mtx[1][1] + c_mtx[0][1] + 1 * c_mtx[1][0])  # F1 = 2*TP/(2*TP+FP+FN)
    except ZeroDivisionError:
        f1 = 0

    "G-Score"
    e_d = 2 * p_d * (1 - p_f) / (1 + p_d - p_f)
    g = np.sqrt(p_d - p_d * p_f)  # Harmonic Mean between True positive rate and True negative rate

    try:
        auroc = round(roc_auc_score(actual, distribution), 2)
    except ValueError:
        auroc = 0

    if as_percent is True:
        return p_d * 100, p_f * 100, p_r * 100, r_c * 100, f1 * 100, e_d * 100, g * 100, auroc * 100
    else:
        return p_d, p_f, p_r, r_c, f1, e_d, g, auroc

In [4]:
projects = get_all_projects()
god_class = projects['class']
feature_envy = projects['method']

In [5]:
aoi = god_class['aoi']
wct = god_class['wct']

### Using Random Forest Classifier for Bellwether Discovery

In [6]:
def rf_model(source, target, seed):
    clf = RandomForestClassifier(n_estimators=seed, random_state=1)
    # Binarize source
    source.loc[source[source.columns[-1]] > 0, source.columns[-1]] = 1
    source.loc[source[source.columns[-1]] == 0, source.columns[-1]] = 0
    target.loc[target[target.columns[-1]] > 0, target.columns[-1]] = 1
    target.loc[target[target.columns[-1]] == 0, target.columns[-1]] = 0
    # Train with rforest
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    distr = clf.predict_proba(target[target.columns[:-1]])
    return preds, distr[:, 1]

def weight_training(test_instance, training_instance):
    # Use standard score to replace original data and drop NaN
    head = training_instance.columns
    new_train = training_instance[head[:-1]]
    new_train = (new_train - test_instance[head[:-1]].mean()) / test_instance[head[:-1]].std()
    new_train[head[-1]] = training_instance[head[-1]]
    new_train.dropna(axis=1, inplace=True)
    tgt = new_train.columns
    new_test = (test_instance[tgt[:-1]] - test_instance[tgt[:-1]].mean()) / (test_instance[tgt[:-1]].std())
    new_test[tgt[-1]] = test_instance[tgt[-1]]
    new_test.dropna(axis=1, inplace=True)
    columns = list(set(tgt[:-1]).intersection(new_test.columns[:-1])) + [tgt[-1]]
    return new_train[columns], new_test[columns]

def predict_smell(train, test, seed):
    """
    Perform Code-Smell Prediction
    :param train:
    :type train:
    :param test:
    :type test:
    :return:
    """
    actual = test[test.columns[-1]].values.tolist()
    predicted, distr = rf_model(train, test, seed)
    return actual, predicted, distr

def bellw(source, target, verbose=True, n_rep=30):
    result = dict()
    for tgt_name, tgt in target.items():
        stats = []
        charts = []
        if verbose: print("{} \r".format(tgt_name[0].upper() + tgt_name[1:]))
        val = []
        for src_name, src in source.items():
            if not src_name == tgt_name:
                pd, pf, pr, f1, g, auc = [], [], [], [], [], []
                for _ in range(n_rep):
                    rseed = random.randint(1,100)
                    _train, __test = weight_training(test_instance=tgt, training_instance=src)
                    actual, predicted, distribution = predict_smell(train=_train, test=__test, seed=rseed)
                    p_d, p_f, p_r, rc, f_1, e_d, _g, auroc = abcd(actual, predicted, distribution)

                    pd.append(p_d)
                    pf.append(p_f)
                    pr.append(p_r)
                    f1.append(f_1)
                    g.append(_g)
                    auc.append(int(auroc))

                stats.append([src_name, int(np.mean(pd)), int(np.mean(pf)),
                              int(np.mean(pr)), int(np.mean(f1)),
                              int(np.mean(g)), int(np.mean(auc))])  # ,

        stats = pandas.DataFrame(sorted(stats, key=lambda lst: lst[-2], reverse=True),  # Sort by G Score
                                 columns=["Name", "Pd", "Pf", "Prec", "F1", "G", "AUC"])  # ,

        if verbose: print(tabulate(stats,
                       headers=["Name", "Pd", "Pf", "Prec", "F1", "G", "AUC"],
                       showindex="never",
                       tablefmt="fancy_grid"))

        result.update({tgt_name: stats})

    return result

In [7]:
bw_class = bellw(god_class, god_class, False)
bw_method = bellw(feature_envy, feature_envy, False)

In [8]:
print(bellwether2)

NameError: name 'bellwether2' is not defined