In [57]:
import os
import csv
from sklearn.naive_bayes import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC, NuSVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import RidgeClassifier, LogisticRegression
from hyperopt import fmin, tpe, hp, Trials
from sklearn.model_selection import cross_val_score
from functools import partial
import pandas as pd
import numpy as np
import scipy
import random
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.cluster import DBSCAN, KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score 
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
import scipy.spatial.distance as dist
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform
from scipy.stats import mannwhitneyu
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, confusion_matrix


# Domain Adaptation Techniques

## Bruakfilter

In [43]:
class Bruakfilter(object):
    def __init__(self, n_neighbors=10):
        self.n_neighbors = n_neighbors

    def run(self, Xsource, Ysource, Xtarget, Ytarget):
        Xsource = np.log(Xsource + 1)
        Xtarget = np.log(Xtarget + 1)

        if self.n_neighbors > Xsource.shape[0]:
            return 0, 0, 0, 0

        knn = NearestNeighbors()
        knn.fit(Xsource)
        data = []
        ysel = []

        for item in Xtarget:
            tmp = knn.kneighbors(item.reshape(1, -1), self.n_neighbors, return_distance=False)
            tmp = tmp[0]
            for i in tmp:
                if list(Xsource[i]) not in data:
                    data.append(list(Xsource[i]))
                    ysel.append(Ysource[i])
        Xsource = np.asanyarray(data)
        Ysource = np.asanyarray(ysel)

        return Xsource, Ysource, Xtarget, Ytarget

## Data Selection

In [44]:
class DataSelection(object):
    def __init__(self, topN=5, FSS=0.2):
        self.topN = topN
        self.FSS = FSS

    def _sample(self, Xsource, Xtarget):
        K = min(500, Xsource.shape[0], Xtarget.shape[0])
        Ltrain = np.ones(K)
        Ltest = np.ones(K) * -1

        Train = random.sample(range(Xsource.shape[0]), Xsource.shape[0] - K)
        Test = random.sample(range(Xtarget.shape[0]), Xtarget.shape[0] - K)
        Train = np.delete(Xsource, Train, axis=0)
        Test = np.delete(Xtarget, Test, axis=0)

        data = np.concatenate((Train, Test), axis=0)
        label = np.concatenate((Ltrain, Ltest), axis=0)

        return data, label

    def _calDistance(self, Xsource, Xtarget):
        acc = np.zeros(10)
        for i in range(10):
            x, y = self._sample(Xsource, Xtarget)
            lr = LogisticRegression(solver='lbfgs', max_iter=10000)
            acc[i] = np.mean(cross_val_score(lr, x, y, scoring='accuracy', cv=5))
        return 2 * abs((np.mean(acc) - 0.5))

    def run(self, Xsource, Ysource, Xtarget, Ytarget, loc):
        self.topN = min(self.topN, len(loc))
        dist = dict()

        for i in range(len(loc)):
            if i < len(loc) - 1:
                train = Xsource[loc[i]:loc[i + 1]]
                dist[i] = self._calDistance(train, Xtarget)
            else:
                train = Xsource[loc[i]:]
                dist[i] = self._calDistance(train, Xtarget)

        dist = sorted(dist.items(), key=lambda d: d[1])
        i = dist[0][0]
        if i != len(loc) - 1:
            x = Xsource[loc[i]:loc[i + 1] ]
            y = Ysource[loc[i]:loc[i + 1] ]
        else:
            x = Xsource[loc[i]:]
            y = Ysource[loc[i]:]

        for i in range(1, self.topN):
            index = dist[i][0]
            if index < len(loc) - 1:
                tmp = Xsource[loc[index]:loc[index + 1] ]
                temp = Ysource[loc[index]:loc[index + 1] ]
            else:
                tmp = Xsource[loc[index]:]
                temp = Ysource[loc[index]:]
            x = np.concatenate((x, tmp), axis=0)
            y = np.concatenate((y, temp), axis=0)

        fx, fy = self._sample(x, Xtarget)
        lr = LogisticRegression(solver='lbfgs', max_iter=10000)
        lr.fit(fx, fy)
        coef = dict()
        for i in range(Xsource.shape[1]):
            coef[i] = lr.coef_[0][i]
        coef = sorted(coef.items(), key=lambda d: d[1], reverse=True)

        dump = []
        for i in range(int(Xsource.shape[1] * self.FSS)):
            dump.append(coef[i][0])

        x = np.delete(x, dump, axis=1)
        Xtarget = np.delete(Xtarget, dump, axis=1)

        return x, y, Xtarget, Ytarget

## DSBF

In [45]:
#DSBF
class DSBF(object):
    def __init__(self, topK=1, neighbors=10):
        self.topK = int(topK)
        self.neighbors = neighbors

    def featureReduction(self, source, target):
        d = pdist(target.T, metric='euclidean')
        D = squareform(d)
        dist = D.copy()
        D = np.zeros(D.shape)

        for i in range(target.shape[1]):
            index = np.argsort(dist[i])
            count = 0
            for j in range(len(index)):
                if count < self.topK and index[j] != i:
                    D[i][index[j]] = 1
                    count += 1

        V = np.sum(D, axis=0)
        V[V < 1e-6] = 0
        index = np.where(V != 0)
        target = np.delete(target, index, axis=1)
        source = np.delete(source, index, axis=1)

        return source, target

    def outlierRemove(self, target, ys):
        d = pdist(target, metric='euclidean')
        D = squareform(d)
        dist = D.copy()
        D = np.zeros(D.shape)
        for i in range(target.shape[0]):
            index = np.argsort(dist[i])
            count = 0
            for j in range(len(index)):
                if count < self.topK and index[j] != i:
                    D[i][index[j]] = 1
                    count += 1
        V = np.sum(D, axis=0)
        V[V < 1e-6] = 0
        index = np.where(V == 0)
        target = np.delete(target, index, axis=0)
        ys = np.delete(ys, index, axis=0)
        return target, ys

    def Bruakfilter(self, Xsource, Ysource, Xtarget, Ytarget):
        Xsource = np.log(Xsource + 1)
        Xtarget = np.log(Xtarget + 1)

        if self.neighbors > Xsource.shape[0]:
            return 0, 0, 0, 0

        knn = NearestNeighbors()
        knn.fit(Xsource)
        data = []
        ysel = []

        for item in Xtarget:
            tmp = knn.kneighbors(item.reshape(1, -1), self.neighbors, return_distance=False)
            tmp = tmp[0]
            for i in tmp:
                if list(Xsource[i]) not in data:
                    data.append(list(Xsource[i]))
                    ysel.append(Ysource[i])
        Xsource = np.asanyarray(data)
        Ysource = np.asanyarray(ysel)

        return Xsource, Ysource, Xtarget, Ytarget

    def run(self, Xsource, Ysource, Xtarget, Ytarget):
        Xsource, Xtarget = self.featureReduction(Xsource, Xtarget)
        if Xsource.shape[1] == 0:
            return 0, 0, 0, 0
        Xsource, Ysource = self.outlierRemove(Xsource, Ysource)
        if len(Xsource) == 0:
            return 0, 0, 0, 0
        Xtarget, Ytarget = self.outlierRemove(Xtarget, Ytarget)
        if len(Xtarget) == 0:
            return 0, 0, 0, 0
        Xsource, Ysource, Xtarget, Ytarget = self.Bruakfilter(Xsource, Ysource, Xtarget, Ytarget)
        if len(Xsource) == 0 or len(Xtarget) == 0:
            return 0, 0, 0, 0
        Xsource, Ysource = self.outlierRemove(Xsource, Ysource)
        if len(Xsource) == 0 or len(Xtarget) == 0:
            return 0, 0, 0, 0

        return Xsource, Ysource, Xtarget, Ytarget

## DTB

In [46]:
class DTB(object):
    def __init__(self, n_neighbors=10, iter=20):
        self.n_neighbors = int(n_neighbors)
        self.iter = iter

    def _NNfilter(self, Xsource, Ysource, Xtarget, Ytarget):
        knn = NearestNeighbors()
        knn.fit(Xsource)
        data = []
        ysel = []

        for item in Xtarget:
            tmp = knn.kneighbors(item.reshape(1, -1), self.n_neighbors, return_distance=False)
            tmp = tmp[0]
            for i in tmp:
                if list(Xsource[i]) not in data:
                    data.append(list(Xsource[i]))
                    ysel.append(Ysource[i])
        Xsource = np.asanyarray(data)
        Ysource = np.asanyarray(ysel)
        return Xsource, Ysource

    # oversample for minor part
    def _SMOTE(self, Xsource, Ysource):
        smote = SMOTE()
        Xsource, Ysource = smote.fit_resample(Xsource, Ysource)
        return Xsource, Ysource

    def run(self, Xsource, Ysource, Xtarget, Ytarget):
      Xsource, Ysource = self._NNfilter(Xsource, Ysource, Xtarget, Ytarget)
      Xsource, Ysource = self._SMOTE(Xsource, Ysource)
      return Xsource, Ysource, Xtarget, Ytarget


## Peterfilter

In [47]:
class Peterfilter(object):
    def __init__(self, eachCluster=50):
        self.eachCluster = eachCluster

    def run(self, Xsource, Ysource, Xtarget, Ytarget):
        self.Xsource = Xsource
        self.Xtarget = Xtarget
        self.Ysource = Ysource
        self.Ytarget = Ytarget
        data = np.concatenate((self.Xsource, self.Xtarget), axis=0)
        if self.eachCluster == 0:
            return 0,0,0,0
        n_cluster = int(self.Xsource.shape[0] / self.eachCluster)
        if n_cluster == 0:
            return 0, 0, 0, 0
        kmeans = KMeans(n_clusters=n_cluster)
        kmeans.fit(data)
        labels = kmeans.labels_

        # remove the clusters where have no test instance
        cluster = dict()
        for i in range(n_cluster):
            cluster[i] = []

        for i in range(len(labels)):
            cluster[labels[i]].append(i)

        chosenCluster = []
        for i in range(self.Xsource.shape[0], data.shape[0]):
            for j in range(n_cluster):
                if i in cluster[j] and (j not in chosenCluster):
                    chosenCluster.append(j)

        # choose train instance in each cluster
        out = []
        for i in range(len(chosenCluster)):
            test = []
            indexTest = []
            train = []
            indexTrain = []
            for item in cluster[chosenCluster[i]]:
                if item >= self.Xsource.shape[0] and item < data.shape[0]:
                    test.append(list(data[item]))
                    indexTest.append(item)
                else:
                    train.append(list(self.Xsource[item]))
                    indexTrain.append(item)

            if len(train) == 0:
                break
            Testfans = np.zeros((len(indexTest), len(indexTrain)))

            neigh = NearestNeighbors(n_neighbors=1)
            neigh.fit(np.asarray(test))
            for item in train:
                index = neigh.kneighbors(np.asarray(item).reshape(1, -1), return_distance=False)
                Testfans[index[0][0], train.index(item)] += 1

            for i in range(len(test)):
                index = np.argmax(Testfans[i])
                if indexTrain[index] not in out:
                    out.append(indexTrain[index])

        tmp = np.zeros((len(out), self.Xsource.shape[1]))
        tmpl = np.zeros(len(out))
        for i in range(len(out)):
            tmp[i] = self.Xsource[out[i]]
            tmpl[i] = self.Ysource[out[i]]

        return tmp, tmpl, Xtarget, Ytarget

## TCA

In [48]:
def kernel(ker, X, X2, gamma):
    if not ker or ker == 'primal':
        return X
    elif ker == 'linear':
        if not X2:
            K = np.dot(X.T, X)
        else:
            K = np.dot(X.T, X2)
    elif ker == 'rbf':
        n1sq = np.sum(X ** 2, axis=0)
        n1 = X.shape[1]
        if not X2:
            D = (np.ones((n1, 1)) * n1sq).T + np.ones((n1, 1)) * n1sq - 2 * np.dot(X.T, X)
        else:
            n2sq = np.sum(X2 ** 2, axis=0)
            n2 = X2.shape[1]
            D = (np.ones((n2, 1)) * n1sq).T + np.ones((n1, 1)) * n2sq - 2 * np.dot(X.T, X)
        K = np.exp(-gamma * D)
    elif ker == 'sam':
        if not X2:
            D = np.dot(X.T, X)
        else:
            D = np.dot(X.T, X2)
        K = np.exp(-gamma * np.arccos(D) ** 2)
        K[K != K] = 0
    return K


class TCA(object):
    def __init__(self, kernel_type='primal', dim=5, lamb=1, gamma=1):
        '''
        Init func
        :param kernel_type: kernel, values: 'primal' | 'linear' | 'rbf' | 'sam'
        :param dim: dimension after transfer
        :param lamb: lambda value in equation
        :param gamma: kernel bandwidth for rbf kernel
        '''
        self.kernel_type = kernel_type
        self.dim = dim
        self.lamb = lamb
        self.gamma = gamma

    def _normalization(self, type):
        ss = self.Xsource.shape
        tt = self.Xtarget.shape

        if type == 'N1':
            # normalization for source data
            res = np.zeros((ss[0], ss[1]))
            for i in range(ss[1]):
                tmp = self.Xsource[:, i]
                minm = np.min(tmp)
                maxm = np.max(tmp)
                res[:, i] = (tmp - minm) / (maxm - minm)
            self.Xsource = res

            # normalization for target data
            res = np.zeros((tt[0], tt[1]))
            for i in range(tt[1]):
                tmp = self.Xtarget[:, i]
                minm = np.min(tmp)
                maxm = np.max(tmp)
                res[:, i] = (tmp - minm) / (maxm - minm)
            self.Xtarget = res

        elif type == 'N2':
            # normalization for source data
            res = np.zeros((ss[0], ss[1]))
            for i in range(ss[1]):
                tmp = self.Xsource[:, i]
                mean = np.mean(tmp)
                std = np.std(tmp)
                res[:, i] = (tmp - mean) / std
            self.Xsource = res

            # normalization for target data
            res = np.zeros((tt[0], tt[1]))
            for i in range(tt[1]):
                tmp = self.Xtarget[:, i]
                mean = np.mean(tmp)
                std = np.std(tmp)
                res[:, i] = (tmp - mean) / std
            self.Xtarget = res

        elif type == 'N3':
            Smean = []
            Sstd = []
            # normalization for source data
            res = np.zeros((ss[0], ss[1]))
            for i in range(ss[1]):
                tmp = self.Xsource[:, i]
                mean = np.mean(tmp)
                std = np.std(tmp)
                Smean.append(mean)
                Sstd.append(std)
                res[:, i] = (tmp - mean) / std
            self.Xsource = res

            # normalization for target data
            res = np.zeros((tt[0], tt[1]))
            for i in range(tt[1]):
                tmp = self.Xtarget[:, i]
                mean = Smean[i]
                std = Sstd
                res[:, i] = (tmp - mean) / std
            self.Xtarget = res

        elif type == 'N4':
            Smean = []
            Sstd = []

            # normalization for target data
            res = np.zeros((tt[0], tt[1]))
            for i in range(tt[1]):
                tmp = self.Xtarget[:, i]
                mean = np.mean(tmp)
                std = np.std(tmp)
                Smean.append(mean)
                Sstd.append(std)
                res[:, i] = (tmp - mean) / std
            self.Xtarget = res

            # normalization for source data
            res = np.zeros((ss[0], ss[1]))
            for i in range(ss[1]):
                tmp = self.Xsource[:, i]
                mean = Smean[i]
                std = Sstd
                res[:, i] = (tmp - mean) / std
            self.Xsource = res

        elif type == 'N0':
            return

    def _computDCV(self):
        ss = self.Xsource.shape
        tt = self.Xtarget.shape
        Sdist = []
        Tdist = []
        SDCV = []
        TDCV = []

        # compute DCV (dataset characteristic vector) of source dataset
        for i in range(ss[0]):
            for j in range(i + 1, ss[0]):
                Sdist.append(dist.euclidean(self.Xsource[i], self.Xsource[j]))
        SDCV.append(np.mean(np.asarray(Sdist)))
        SDCV.append((np.median(np.asarray(Sdist))))
        SDCV.append(np.min(np.asarray(Sdist)))
        SDCV.append(np.max(np.asarray(Sdist)))
        SDCV.append(np.std(np.asarray(Sdist)))
        SDCV.append(ss[0])

        # compute DCV (dataset characteristic vector) of target dataset
        for i in range(tt[0]):
            for j in range(i + 1, tt[0]):
                Tdist.append(dist.euclidean(self.Xtarget[i], self.Xtarget[j]))
        TDCV.append(np.mean(np.asarray(Tdist)))
        TDCV.append((np.median(np.asarray(Tdist))))
        TDCV.append(np.min(np.asarray(Tdist)))
        TDCV.append(np.max(np.asarray(Tdist)))
        TDCV.append(np.std(np.asarray(Tdist)))
        TDCV.append(ss[0])

        return np.asarray(SDCV), np.asarray(TDCV)

    def _chooseNormalization(self):
        SDCV, TDCV = self._computDCV()

        nominal = []
        for i in range(0, 6):
            if SDCV[i] * 1.6 < TDCV[i]:
                nominal.append('much-more')
            elif TDCV[i] < SDCV[i] * 0.4:
                nominal.append('much-less')
            elif (SDCV[i] * 1.3 < TDCV[i]) and (TDCV[i] <= SDCV[i] * 1.6):
                nominal.append('more')
            elif (SDCV[i] * 1.1 < TDCV[i]) and (TDCV[i] <= SDCV[i] * 1.3):
                nominal.append('slight-more')
            elif (SDCV[i] * 0.9 <= TDCV[i]) and (TDCV[i] <= SDCV[i] * 1.1):
                nominal.append('same')
            elif (SDCV[i] * 0.7 <= TDCV[i]) and (TDCV[i] < SDCV[i] * 0.9):
                nominal.append('slight-less')
            elif (SDCV[i] * 0.4 <= TDCV[i]) and (TDCV[i] < SDCV[i] * 0.7):
                nominal.append('less')

        if (nominal[5] == nominal[2] == nominal[3] == 'much-less') or (
                nominal[5] == nominal[2] == nominal[3] == 'much-more'):
            self._normalization('N1')

        elif ((nominal[4] == 'much-more') and ('less' in nominal[5])) or (
                (nominal[4] == 'much-less') and ('more' in nominal[5])):
            self._normalization('N3')

        elif (nominal[4] == nominal[5] == 'much-more') or (nominal[4] == nominal[5] == 'much-less'):
            self._normalization('N4')

        elif nominal[0] == nominal[4] == 'same':
            self._normalization('N0')

        else:
            self._normalization('N2')

    def run(self, Xs, Ys, Xt, Yt):
        '''
        Transform Xs and Xt
        :param Xs: ns * n_feature, source feature
        :param Xt: nt * n_feature, target feature
        :return: Xs_new and Xt_new after TCA
        '''
        self.Xsource = Xs
        self.Xtarget = Xt
        self._chooseNormalization()
        Xs = self.Xsource
        Xt = self.Xtarget

        X = np.hstack((Xs.T, Xt.T))
        X /= np.linalg.norm(X, axis=0)
        m, n = X.shape
        ns, nt = len(Xs), len(Xt)
        e = np.vstack((1 / ns * np.ones((ns, 1)), -1 / nt * np.ones((nt, 1))))
        M = e * e.T
        M = M / np.linalg.norm(M, 'fro')
        H = np.eye(n) - 1 / n * np.ones((n, n))
        K = kernel(self.kernel_type, X, None, gamma=self.gamma)
        n_eye = m if self.kernel_type == 'primal' else n
        a, b = np.linalg.multi_dot([K, M, K.T]) + self.lamb * np.eye(n_eye), np.linalg.multi_dot([K, H, K.T])
        w, V = scipy.linalg.eig(a, b)
        ind = np.argsort(w)
        A = V[:, ind[:self.dim]]
        Z = np.dot(A.T, K)
        Z /= np.linalg.norm(Z, axis=0)
        Xs_new, Xt_new = Z[:, :ns].T, Z[:, ns:].T
        return Xs_new, Ys, Xt_new, Yt

## Universal

In [49]:
def cliffsDelta(lst1, lst2, **dull):

    """Returns delta and true if there are more than 'dull' differences"""
    if not dull:
        dull = {'small': 0.147, 'medium': 0.33, 'large': 0.474} # effect sizes from (Hess and Kromrey, 2004)
    m, n = len(lst1), len(lst2)
    lst2 = sorted(lst2)
    j = more = less = 0
    for repeats, x in runs(sorted(lst1)):
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        more += j*repeats
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        less += (n - j)*repeats
    d = (more - less) / (m*n)
    size = lookup_size(d, dull)
    return d, size

def lookup_size(delta: float, dull: dict) -> str:
    """
    :type delta: float
    :type dull: dict, a dictionary of small, medium, large thresholds.
    """
    delta = abs(delta)
    if delta < dull['small']:
        return 'negligible'
    if dull['small'] <= delta < dull['medium']:
        return 'small'
    if dull['medium'] <= delta < dull['large']:
        return 'medium'
    if delta >= dull['large']:
        return 'large'


def runs(lst):
    """Iterator, chunks repeated values"""
    for j, two in enumerate(lst):
        if j == 0:
            one, i = two, 0
        if one != two:
            yield j - i, one
            i = j
        one = two
    yield j - i + 1, two


def cohen(c0, c1):
    cohens_d = (mean(c0) - mean(c1)) / (sqrt((stdev(c0) ** 2 + stdev(c1) ** 2) / 2))
    t = abs(cohens_d)
    if t <= 0.2:
        res = 'negligible'
    elif t <= 0.5:
        res = 'small'
    elif t <= 0.8:
        res = 'medium'
    else:
        res = 'large'

    return res
    
class Universal(object):
    def __init__(self, pvalue=0.05, QuantifyType='cliff'):
        self.p = pvalue
        self.type = QuantifyType

    def _compareMetricDistribution(self, x1, x2):
        s, p = mannwhitneyu(x1, x2)
        if p < self.p:
            sig_diff = 1
        else:
            sig_diff = 0
        return sig_diff

    def _quantifyDifference(self, x1, x2):
        if self.type == 'cliff':
            d, res = cliffsDelta(x1, x2)
        else:
            res = cohen(x1, x2)
        return res

    def cluster(self, No_metric, numGroup, group):
        indexOfCluster = 0
        clusterOfGroup = np.zeros(numGroup)

        for i in range(0, numGroup-1):
            indexNewCluster = indexOfCluster + 1
            for j in range(i+1, numGroup):
                if self._compareMetricDistribution(group[i][:, No_metric], group[j][:, No_metric]) == 1:
                    if self._quantifyDifference(group[i][:, No_metric], group[j][:, No_metric]) == 'large':
                        clusterOfGroup[j] = indexNewCluster
                        indexOfCluster = indexNewCluster

        return clusterOfGroup

    def rankTransform(self, xsource, xtarget):
        #xsource = xsource.to_numpy()
        #xtarget = xtarget.to_numpy()
        group = [xsource, xtarget]
        resGroup = group.copy()

        for i in range(xsource.shape[1]):
            clusterIndex = self.cluster(i, len(group), group)
            cluster = np.unique(clusterIndex)
            for item in cluster:
                tmp = np.asarray(np.where(clusterIndex == item))[0]
                tmp_data = np.asarray([])
                for ncs in tmp:
                    tmp_data = np.concatenate((tmp_data, group[int(ncs)][:, i]))

                percentiles = np.percentile(sorted(tmp_data), [10, 20, 30, 40, 50, 60, 70, 80, 90])
                for ncs in tmp:
                    ncs = int(ncs)
                    t = resGroup[ncs][:, i]
                    for it in range(len(t)):
                        if t[it] <= percentiles[0]:
                            resGroup[ncs][:, i][it] = 1
                        elif t[it] <= percentiles[1]:
                            resGroup[ncs][:, i][it] = 2
                        elif t[it] <= percentiles[2]:
                            resGroup[ncs][:, i][it] = 3
                        elif t[it] <= percentiles[3]:
                            resGroup[ncs][:, i][it] = 4
                        elif t[it] <= percentiles[4]:
                            resGroup[ncs][:, i][it] = 5
                        elif t[it] <= percentiles[5]:
                            resGroup[ncs][:, i][it] = 6
                        elif t[it] <= percentiles[6]:
                            resGroup[ncs][:, i][it] = 7
                        elif t[it] <= percentiles[7]:
                            resGroup[ncs][:, i][it] = 8
                        elif t[it] <= percentiles[8]:
                            resGroup[ncs][:, i][it] = 9
                        else:
                            resGroup[ncs][:, i][it] = 10
        return resGroup

    def run(self, Xsource, Ysource, Xtarget, Ytarget):
        res = self.rankTransform(Xsource, Xtarget)
        source = np.asarray(res[0])
        target = res[1]
        source = pd.DataFrame(data=source)
        target = pd.DataFrame(data=target)

        return source, Ysource, target, Ytarget

# Data Preprocessing

In [9]:
# code to find ML files from the project

def count_ml_files(directory, keywords, output_csv, project_folder):
    ml_file_count = 0
    total_python_files_count = 0
    total_files = 0
    
    project_root = os.path.abspath(directory)
    
    with open(output_csv, mode='w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['ML_Files'])

    for root, _, files in os.walk(directory):
        for file in files:
            total_files += 1
            if file.endswith(".py"):
                total_python_files_count += 1
                with open(os.path.join(root, file), 'r', errors='ignore') as f:
                    for line in f:
                        if any(keyword in line for keyword in keywords):
                            ml_file_count += 1
                            
                            relative_path = os.path.relpath(root, project_root)
                            relative_path = relative_path.replace('.', project_folder)
                            if not relative_path.startswith(project_folder):
                                relative_path = project_folder + '/' + relative_path        
                            relative_path = os.path.join(relative_path, file)
                            
                            with open(output_csv, mode='a', newline='') as csv_file:
                                csv_writer = csv.writer(csv_file)
                                csv_writer.writerow([relative_path])
                            break  # Stop searching once a keyword is found

    print("Total Files: ", total_files)
    print("Total Python Files: ", total_python_files_count)
    #print("Total ML files: ", ml_file_count)
    return ml_file_count




In [10]:
# Jax
jax_directory = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/Jax_Versions/jax-jax-v0.3.15'
jax_libraries = ['ml_dtypes', 'numpy', 'opt_einsum', 'scipy', 'jax', 'scikit-learn', 'matplotlib']
# jax_libraries = ['ml_dtypes', 'opt_einsum', 'scipy', 'jax', 'scikit-learn']
output_csv = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/jax_ml_files.csv'
ml_count = count_ml_files(jax_directory, jax_libraries, output_csv, 'jax-main')
print(f"Number of ML-related files in Jax: {ml_count}")
print(f"Processed files saved to {output_csv}")

Total Files:  696
Total Python Files:  417
Number of ML-related files in Jax: 387
Processed files saved to /home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/jax_ml_files.csv


In [11]:
lightning_directory = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/Lightning_Versions/lightning-1.8.0'
lightning_libraries = ['torch', 'numpy', 'torchmetrics', 'gym', 'matplotlib', 'tensorboardX', 'scikit-learn', 'tensorboard', 'pytorch-lightning', 'torchdata', 'torchvision', 'torchmetrics', 'lightning-colossalai', 'neptune', 'comet-ml', 'mlflow', 'onnx' ] # Add your ML-related keywords to this list for lightning
output_csv = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_ml_files.csv'  # Change this to your desired output file path
ml_count = count_ml_files(lightning_directory, lightning_libraries, output_csv, 'lightning-master')
print(f"Number of ML-related files in lightning: {ml_count}")
print(f"Processed files saved to {output_csv}")



Total Files:  2095
Total Python Files:  1061
Number of ML-related files in lightning: 579
Processed files saved to /home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_ml_files.csv


In [12]:
ray_directory = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_versions/ray-ray-2.0.0'
ray_libraries = ['numpy', 'scipy', 'gymnasium', 'scikit-learn', 'scikit-image', 'pandas', 'tensorboardX'] # Add your ML-related keywords to this list for ray
output_csv = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_ml_files.csv'  # Change this to your desired output file path
ml_count = count_ml_files(ray_directory, ray_libraries, output_csv, 'ray-master')
print(f"Number of ML-related files in Ray: {ml_count}")
print(f"Processed files saved to {output_csv}")


Total Files:  6621
Total Python Files:  3463
Number of ML-related files in Ray: 998
Processed files saved to /home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_ml_files.csv


In [13]:
transformers_directory = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_versions/transformers-4.23.0'
transformers_libraries = ['deepspeed', 'diffusers', 'evaluate', 'flax', 'huggingface-hub', 'jax', 'jaxlib', 'jieba', 'keras', 'keras-nlp', 'nltk', 'numpy', 'onnxconverter-common', 'onnxruntime-tools', 'onnxruntime', 'opencv-python', 'optuna', 'safetensors', 'sagemaker', 'scikit-learn', 'sentencepiece', 'sigopt', 'tensorboard', 'tensorflow', 'torch', 'torchaudio', 'torchvision' ] # Add your ML-related keywords to this list for transformers
output_csv = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_ml_files.csv'  # Change this to your desired output file path
ml_count = count_ml_files(transformers_directory, transformers_libraries, output_csv, 'transformers-main')
print(f"Number of ML-related files in transfomers: {ml_count}")
print(f"Processed files saved to {output_csv}")

Total Files:  2583
Total Python Files:  1904
Number of ML-related files in transfomers: 1401
Processed files saved to /home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_ml_files.csv


In [14]:
yolov5_directory = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5-versions/yolov5-7.0'
yolov5_libraries = ['numpy', 'scipy', 'matplotlib', 'opencv-python', 'opencv', 'torch', 'torchvision', 'ultralytics', 'tensorboard', 'clearml', 'comet', 'coremltools', 'onnx', 'onnx-simplifier', 'scikit-learn', 'tensorflow', 'tensorflowjs', 'openvino-dev' ] # Add your ML-related keywords to this list for yolov5
output_csv = '/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_ml_files.csv'  # Change this to your desired output file path
ml_count = count_ml_files(yolov5_directory, yolov5_libraries, output_csv, 'yolov5-master')
print(f"Number of ML-related files in yolov5: {ml_count}")
print(f"Processed files saved to {output_csv}")



Total Files:  144
Total Python Files:  53
Number of ML-related files in yolov5: 44
Processed files saved to /home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_ml_files.csv


In [15]:
jax_ml_files = pd.read_csv("/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/jax_ml_files.csv")
#removing "jax-main" from the path of ML files
jax_ml_files['ML_Files'] = jax_ml_files['ML_Files'].str.replace('jax-main/', '')

lightning_ml_files = pd.read_csv("/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_ml_files.csv")
#removing "lightning-master" from the path of ML files
lightning_ml_files['ML_Files'] = lightning_ml_files['ML_Files'].str.replace('lightning-master/', '')

ray_ml_files = pd.read_csv("/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_ml_files.csv")
#removing "ray-master" from the path of ML files
ray_ml_files['ML_Files'] = ray_ml_files['ML_Files'].str.replace('ray-master/', '')


transformers_ml_files = pd.read_csv("/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_ml_files.csv")
#removing "transformer-main" from the path of ML files
transformers_ml_files['ML_Files'] = transformers_ml_files['ML_Files'].str.replace('transformers-main/', '')

yolov5_ml_files = pd.read_csv("/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_ml_files.csv")
#removing "ylov5-master" from the path of ML files
yolov5_ml_files['ML_Files'] = yolov5_ml_files['ML_Files'].str.replace('yolov5-master/', '')


In [16]:
jax_full_data = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/jax_0.3.15.csv')
lightning_full_data = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_1.8.0.csv')
ray_full_data = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_2.0.0.csv')
transformers_full_data = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_4.23.0.csv')
yolov5_full_data = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_7.0.csv')


In [17]:
ml_file_names = jax_ml_files['ML_Files']
jax_ml_full_data = jax_full_data[jax_full_data['Files'].isin(ml_file_names)]
jax_non_ml_full_data = jax_full_data[~jax_full_data['Files'].isin(ml_file_names)]
print(jax_ml_full_data.shape)
print(jax_non_ml_full_data.shape)

(387, 21)
(30, 21)


In [131]:
import numpy as np

def calculate_correct_incorrect(confusion_matrix):
    class_label = 0
    # Extract the row corresponding to the class label (true labels)
    true_labels = confusion_matrix[class_label, :]
    
    # Extract the column corresponding to the class label (predicted labels)
    predicted_labels = confusion_matrix[:, class_label]
    
    # Calculate the correct predictions for the class by getting the diagonal element
    correct_predictions = true_labels[class_label]
    
    # Calculate the incorrect predictions for the class by summing all elements in the column
    incorrect_predictions = np.sum(predicted_labels) - correct_predictions
    
    # Total predictions for the class is the sum of true labels
    total_predictions = np.sum(true_labels)
    
    # Calculate percentages
    correct_percentage = (correct_predictions / total_predictions) * 100
    incorrect_percentage = (incorrect_predictions / total_predictions) * 100
    
    print("Number of correct predictions (%):", correct_percentage)
    print("Number of incorrect predictions (%):", incorrect_percentage)


# WPDP

## Jax

In [91]:

# For data loading()
jax_1_73 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/jax_0.1.73.csv')
jax_2_21 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/jax_0.2.21.csv') 
jax_2_28 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/jax_0.2.28.csv')
  
jax_train_data = pd.concat([jax_1_73, jax_2_21, jax_2_28])
jax_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)


jax_test_data1 =  jax_ml_full_data.copy()
jax_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

jax_test_data2 =  jax_non_ml_full_data.copy()
jax_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
  #jax_test_data.shape

X_source = jax_train_data.drop(columns='Buggy')
Y_source = jax_train_data['Buggy']
# For ML Files
X_target1 = jax_test_data1.drop(columns='Buggy')
Y_target1 = jax_test_data1['Buggy']
# For Non-ML Files
X_target2 = jax_test_data2.drop(columns='Buggy')
Y_target2 = jax_test_data2['Buggy']

print("X_source = ", X_source.shape)
print("Y_source = ", Y_source.shape)
print("X_target1 = ", X_target1.shape)
print("Y_target1 = ", Y_target1.shape)
print("X_target2 = ", X_target2.shape)
print("Y_target2 = ", Y_target2.shape)

X_source =  (849, 17)
Y_source =  (849,)
X_target1 =  (387, 17)
Y_target1 =  (387,)
X_target2 =  (30, 17)
Y_target2 =  (30,)


In [92]:
# Setting up classifiers for ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 5, max_features='log2', min_samples_leaf=7, min_samples_split=0.45931853796211086, n_estimators=25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9650430032505362, n_estimators=45, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='log_loss', max_depth=6, max_features='log2', min_samples_split=8, splitter='random', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=12)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.14172511059817, max_iter=10912, solver='sparse_cg', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.006986252579354192, hidden_layer_sizes=(100,), learning_rate='constant', max_iter= 29, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C=2.012999083052283, degree=1, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  0.5
precision =  1.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[328   0]
 [ 59   0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  0.5
precision =  1.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[328   0]
 [ 59   0]]
conf_matrix =  [[328   0]
 [ 59   0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.4912153782554775
precision =  0.125
recall =  0.06779661016949153
f1 =  0.08791208791208792
conf_matrix =  [[300  28]
 [ 55   4]]
conf_matrix =  [[300  28]
 [ 55   4]]
Number of correct predictions (%): 6.779661016949152
Number of incorrect predictions (%): 47.45762711864407
CART
roc_auc =  0.5223749483257545
precision =  0.6
recall =  0.05084745762711865
f1 =  0.09375000000000001
conf_matrix =  [[326   2]
 [ 56   3]]
conf_matrix =  [[326   2]
 [ 56   3]]
Number of correct predictions (%): 5.084745762711865
Number of incorrect predictions (%): 3.38983050847457

In [132]:
# Setting up classifiers for Non-ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 5, max_features='log2', min_samples_leaf=7, min_samples_split=0.45931853796211086, n_estimators=25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate=0.9650430032505362, n_estimators=45, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='log_loss', max_depth=6, max_features='log2', min_samples_split=8, splitter='random', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='ball_tree', n_neighbors=12)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.14172511059817, max_iter=10912, solver='sparse_cg', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.006986252579354192, hidden_layer_sizes=(100,), learning_rate='constant', max_iter= 29, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C=2.012999083052283, degree=1, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.9615384615384616
precision =  0.13333333333333333
recall =  1.0
f1 =  0.23529411764705882
conf_matrix =  [[468  39]
 [  0   6]]
Number of correct predictions (%): 92.3076923076923
Number of incorrect predictions (%): 0.0
CART
roc_auc =  0.5498027613412229
precision =  0.02857142857142857
recall =  0.16666666666666666
f1 =  0.048780487804878044
conf_matrix =  [[473  34]
 [  5   1]]
Number of correct predictions (%): 93.29388560157791
Number of incorrect predictions (%): 0.9861932938856016
KNN
roc_auc =  0.46745562130177515
precision =  0.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[474  33]
 [  6

## Lightning

In [94]:
ml_file_names = lightning_ml_files['ML_Files']
lightning_ml_full_data = lightning_full_data[lightning_full_data['Files'].isin(ml_file_names)]
lightning_non_ml_full_data = lightning_full_data[~lightning_full_data['Files'].isin(ml_file_names)]
print(lightning_ml_full_data.shape)
print(lightning_non_ml_full_data.shape)

(577, 21)
(484, 21)


In [95]:

# For data loading()
lightning_0_5 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_0.5.1.csv')
lightning_1_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_1.0.0.csv')
lightning_1_5 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_1.5.0.csv') 


lightning_train_data = pd.concat([lightning_0_5, lightning_1_0, lightning_1_5])
lightning_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

lightning_test_data1 = lightning_ml_full_data.copy()
lightning_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

lightning_test_data2 =  lightning_non_ml_full_data.copy()
lightning_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
  #jax_test_data.shape

X_source = lightning_train_data.drop(columns='Buggy')
Y_source = lightning_train_data['Buggy']
# For ML Files
X_target1 = lightning_test_data1.drop(columns='Buggy')
Y_target1 = lightning_test_data1['Buggy']
# For Non-ML Files
X_target2 = lightning_test_data2.drop(columns='Buggy')
Y_target2 = lightning_test_data2['Buggy']

print("X_source = ", X_source.shape)
print("Y_source = ", Y_source.shape)
print("X_target1 = ", X_target1.shape)
print("Y_target1 = ", Y_target1.shape)
print("X_target2 = ", X_target2.shape)
print("Y_target2 = ", Y_target2.shape)

X_source =  (759, 17)
Y_source =  (759,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (484, 17)
Y_target2 =  (484,)


In [96]:
# Setting up classifiers for ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 6, max_features='sqrt', min_samples_leaf=4, min_samples_split=0.6173188814474816, n_estimators=3, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME.R', learning_rate= 0.788035723318785, n_estimators=49, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='gini', max_depth=6, max_features='log2', min_samples_split= 2, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='brute', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.15782926553250476, max_iter=4094, solver='svd', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.002941741563655937, hidden_layer_sizes=(100,), learning_rate='constant', max_iter= 80, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C=0.2863651711831961, degree=3, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[446   0]
 [  0 131]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[446   0]
 [  0 131]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.9921524663677129
precision =  0.9492753623188406
recall =  1.0
f1 =  0.9739776951672863
conf_matrix =  [[439   7]
 [  0 131]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 5.343511450381679
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[446   0]
 [  0 131]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.6038150823263616
precision =  0.2983050847457627
recall =  0.6717557251908397
f1 =  0.41314553990610337
conf_matrix =  [[239 207]
 [ 43  88]]
Number of correct predictions (%): 67.

In [97]:
# Setting up classifiers for Non-ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 6, max_features='sqrt', min_samples_leaf=4, min_samples_split=0.6173188814474816, n_estimators=3, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME.R', learning_rate= 0.788035723318785, n_estimators=49, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='gini', max_depth=6, max_features='log2', min_samples_split= 2, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='brute', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.15782926553250476, max_iter=4094, solver='svd', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.002941741563655937, hidden_layer_sizes=(100,), learning_rate='constant', max_iter= 80, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C=0.2863651711831961, degree=3, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[391   0]
 [  0  93]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[391   0]
 [  0  93]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.9946236559139785
precision =  1.0
recall =  0.989247311827957
f1 =  0.9945945945945946
conf_matrix =  [[391   0]
 [  1  92]]
Number of correct predictions (%): 98.9247311827957
Number of incorrect predictions (%): 0.0
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[391   0]
 [  0  93]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.6450375381569177
precision =  0.3355263157894737
recall =  0.5483870967741935
f1 =  0.41632653061224484
conf_matrix =  [[290 101]
 [ 42  51]]
Number of correct predictions (%): 54.8387

## Ray

In [98]:
ml_file_names = ray_ml_files['ML_Files']
ray_ml_full_data = ray_full_data[ray_full_data['Files'].isin(ml_file_names)]
ray_non_ml_full_data = ray_full_data[~ray_full_data['Files'].isin(ml_file_names)]
print(ray_ml_full_data.shape)
print(ray_non_ml_full_data.shape)

(695, 21)
(1915, 21)


In [99]:

# For data loading()
ray_0_3 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_0.3.0.csv')
ray_0_6 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_0.6.1.csv')
ray_0_8 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_0.8.0.csv') 
ray_1_1 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_1.1.0.csv')
ray_1_9 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_1.9.0.csv')


ray_train_data = pd.concat([ray_0_3, ray_0_6, ray_0_8, ray_1_1, ray_1_9])
ray_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

ray_test_data1 = ray_ml_full_data.copy()
ray_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

ray_test_data2 =  ray_non_ml_full_data.copy()
ray_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
  #jax_test_data.shape

X_source = ray_train_data.drop(columns='Buggy')
Y_source = ray_train_data['Buggy']
# For ML Files
X_target1 = ray_test_data1.drop(columns='Buggy')
Y_target1 = ray_test_data1['Buggy']
# For Non-ML Files
X_target2 = ray_test_data2.drop(columns='Buggy')
Y_target2 = ray_test_data2['Buggy']

print("X_source = ", X_source.shape)
print("Y_source = ", Y_source.shape)
print("X_target1 = ", X_target1.shape)
print("Y_target1 = ", Y_target1.shape)
print("X_target2 = ", X_target2.shape)
print("Y_target2 = ", Y_target2.shape)

X_source =  (4168, 17)
Y_source =  (4168,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (1915, 17)
Y_target2 =  (1915,)


In [100]:
# Setting up classifiers for ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 4, max_features='log2', min_samples_leaf=9, min_samples_split= 0.2941037470966731, n_estimators=43, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME.R', learning_rate= 0.5467481993881079, n_estimators=43, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='log2', min_samples_split= 4, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='brute', n_neighbors=16)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.6328583171856641, max_iter=14462, solver='cholesky', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.004894991445063436, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 90, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C=0.5761836015907289, degree=3, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[628   0]
 [  0  67]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[628   0]
 [  0  67]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.8957006369426752
precision =  0.3383838383838384
recall =  1.0
f1 =  0.5056603773584906
conf_matrix =  [[497 131]
 [  0  67]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 195.5223880597015
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[628   0]
 [  0  67]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.6663418575910257
precision =  0.1941747572815534
recall =  0.5970149253731343
f1 =  0.29304029304029305
conf_matrix =  [[462 166]
 [ 27  40]]
Number of correct predictions (%): 59.

In [101]:
# Setting up classifiers for Non-ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 4, max_features='log2', min_samples_leaf=9, min_samples_split= 0.2941037470966731, n_estimators=43, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME.R', learning_rate= 0.5467481993881079, n_estimators=43, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=5, max_features='log2', min_samples_split= 4, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='brute', n_neighbors=16)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.6328583171856641, max_iter=14462, solver='cholesky', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.004894991445063436, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 90, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C=0.5761836015907289, degree=3, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[1770    0]
 [   0  145]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[1770    0]
 [   0  145]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.964406779661017
precision =  0.5350553505535055
recall =  1.0
f1 =  0.6971153846153846
conf_matrix =  [[1644  126]
 [   0  145]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 86.89655172413792
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[1770    0]
 [   0  145]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.627907656341321
precision =  0.19
recall =  0.3931034482758621
f1 =  0.25617977528089886
conf_matrix =  [[1527  243]
 [  88   57]]
Number of correct predictions (%):

## Transformers

In [102]:
ml_file_names = transformers_ml_files['ML_Files']
transformers_ml_full_data = transformers_full_data[transformers_full_data['Files'].isin(ml_file_names)]
transformers_non_ml_full_data = transformers_full_data[~transformers_full_data['Files'].isin(ml_file_names)]
print(transformers_ml_full_data.shape)
print(transformers_non_ml_full_data.shape)

(1391, 21)
(513, 21)


In [103]:

# For data loading()
transformers_2_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_2.0.0.csv')
transformers_3_5 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_3.5.0.csv')
transformers_4_13 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_4.13.0.csv')

transformers_train_data = pd.concat([transformers_2_0, transformers_3_5, transformers_4_13])
transformers_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

transformers_test_data1 = transformers_ml_full_data.copy()
transformers_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

transformers_test_data2 =  transformers_non_ml_full_data.copy()
transformers_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
  #jax_test_data.shape

X_source = transformers_train_data.drop(columns='Buggy')
Y_source = transformers_train_data['Buggy']
# For ML Files
X_target1 = transformers_test_data1.drop(columns='Buggy')
Y_target1 = transformers_test_data1['Buggy']
# For Non-ML Files
X_target2 = transformers_test_data2.drop(columns='Buggy')
Y_target2 = transformers_test_data2['Buggy']

print("X_source = ", X_source.shape)
print("Y_source = ", Y_source.shape)
print("X_target1 = ", X_target1.shape)
print("Y_target1 = ", Y_target1.shape)
print("X_target2 = ", X_target2.shape)
print("Y_target2 = ", Y_target2.shape)

X_source =  (1935, 17)
Y_source =  (1935,)
X_target1 =  (1391, 17)
Y_target1 =  (1391,)
X_target2 =  (513, 17)
Y_target2 =  (513,)


In [104]:
# Setting up classifiers for ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.14925335668960113, n_estimators=28, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split= 5, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.6275778011613018, max_iter=12807, solver='cholesky', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.004894991445063436, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 90, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C= 1.2704047468360506, degree=2, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[1288    0]
 [   0  103]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[1288    0]
 [   0  103]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.6876017608394138
precision =  0.21951219512195122
recall =  0.5242718446601942
f1 =  0.3094555873925502
conf_matrix =  [[1096  192]
 [  49   54]]
Number of correct predictions (%): 52.42718446601942
Number of incorrect predictions (%): 186.40776699029126
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[1288    0]
 [   0  103]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.5240833986612796
precision =  0.17391304347826086
recall =  0.07766990291262135
f1 =  0.10738255033557047
conf_matrix =  [[1250   38]
 

In [105]:
# Setting up classifiers for Non-ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.14925335668960113, n_estimators=28, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split= 5, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.6328583171856641, max_iter=14462, solver='cholesky', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='identity', alpha=0.004894991445063436, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 90, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C= 1.2704047468360506, degree=2, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.4921104536489152
precision =  0.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[499   8]
 [  6   0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 133.33333333333331
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.5
precision =  1.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[507   0]
 [  6   0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 0.0
Ridge
roc_auc =  0.5
precision =  1.0
recall =  0.

## Yolov5

In [106]:
ml_file_names = yolov5_ml_files['ML_Files']
yolov5_ml_full_data = yolov5_full_data[yolov5_full_data['Files'].isin(ml_file_names)]
yolov5_non_ml_full_data = yolov5_full_data[~yolov5_full_data['Files'].isin(ml_file_names)]
print(yolov5_ml_full_data.shape)
print(yolov5_non_ml_full_data.shape)

(44, 21)
(9, 21)


In [107]:

# For data loading()
yolov5_4_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_4.0.csv')
yolov5_6_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_6.0.csv')


yolov5_train_data = pd.concat([yolov5_4_0, yolov5_6_0])
yolov5_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

yolov5_test_data1 = yolov5_ml_full_data.copy()
yolov5_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

yolov5_test_data2 =  transformers_non_ml_full_data.copy()
yolov5_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
  #jax_test_data.shape

X_source = yolov5_train_data.drop(columns='Buggy')
Y_source = yolov5_train_data['Buggy']
# For ML Files
X_target1 = yolov5_test_data1.drop(columns='Buggy')
Y_target1 = yolov5_test_data1['Buggy']
# For Non-ML Files
X_target2 = yolov5_test_data2.drop(columns='Buggy')
Y_target2 = yolov5_test_data2['Buggy']

print("X_source = ", X_source.shape)
print("Y_source = ", Y_source.shape)
print("X_target1 = ", X_target1.shape)
print("Y_target1 = ", Y_target1.shape)
print("X_target2 = ", X_target2.shape)
print("Y_target2 = ", Y_target2.shape)

X_source =  (50, 17)
Y_source =  (50,)
X_target1 =  (44, 17)
Y_target1 =  (44,)
X_target2 =  (513, 17)
Y_target2 =  (513,)


In [108]:
# Setting up classifiers for ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 2, max_features='log2', min_samples_leaf=4, min_samples_split= 0.07335409800854176, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.3618390422647574, n_estimators=11, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='log_loss', max_depth=3, max_features='log2', min_samples_split=9, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=9)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C= 0.6728812217893367, degree=2, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[10  0]
 [ 0 34]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[10  0]
 [ 0 34]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.7588235294117648
precision =  0.9545454545454546
recall =  0.6176470588235294
f1 =  0.75
conf_matrix =  [[ 9  1]
 [13 21]]
Number of correct predictions (%): 61.76470588235294
Number of incorrect predictions (%): 2.941176470588235
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[10  0]
 [ 0 34]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.4852941176470588
precision =  0.7674418604651163
recall =  0.9705882352941176
f1 =  0.8571428571428571
conf_matrix =  [[ 0 10]
 [ 1 33]]
Number of correct predictions (%): 97.05882352

In [109]:
# Setting up classifiers for Non-ML Files
# Random Forest 
print("Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 2, max_features='log2', min_samples_leaf=4, min_samples_split= 0.07335409800854176, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# AdaBoost
print("AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.3618390422647574, n_estimators=11, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Gaussian NB
print("Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#CART
print("CART")
model = DecisionTreeClassifier(criterion='log_loss', max_depth=3, max_features='log2', min_samples_split=9, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#KNN
print("KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=9)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Ridge
print("Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#MLP
print("MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

#SVM
print("SVM")
model = SVC(C= 0.6728812217893367, degree=2, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target2)
print("roc_auc = ", roc_auc_score(Y_target2, y_pred))
print("precision = ", precision_score(Y_target2, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target2, y_pred))
print("f1 = ", f1_score(Y_target2, y_pred))
conf_mat = confusion_matrix(Y_target2, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
AdaBoost
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
Gaussian NB
roc_auc =  0.6341222879684418
precision =  0.05714285714285714
recall =  0.3333333333333333
f1 =  0.09756097560975609
conf_matrix =  [[474  33]
 [  4   2]]
Number of correct predictions (%): 33.33333333333333
Number of incorrect predictions (%): 550.0
CART
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 0.0
KNN
roc_auc =  0.7652859960552268
precision =  0.02459016393442623
recall =  1.0
f1 =  0.048
conf_matrix =  [[269 238]
 [  0   6]]
Number of correct predictions (%): 100.0
Number o

# CPDP

## Jax

In [110]:
# Data loading functions
def data_loading_lj():
    # Load your dataset (replace X and y with your features and labels)
    lightning_0_5 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_0.5.1.csv')
    lightning_1_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_1.0.0.csv')
    lightning_1_5 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_1.5.0.csv')
    lightning_1_8 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/lightning_1.8.0.csv')
    
    
    lightning_train_data = pd.concat([lightning_0_5, lightning_1_0, lightning_1_5, lightning_1_8])
    lightning_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    
    jax_test_data1 =  jax_ml_full_data.copy()
    jax_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    jax_test_data2 =  jax_non_ml_full_data.copy()
    jax_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = lightning_train_data.drop(columns='Buggy')
    Y_source = lightning_train_data['Buggy']
    # For ML Files
    X_target1 = jax_test_data1.drop(columns='Buggy')
    Y_target1 = jax_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = jax_test_data2.drop(columns='Buggy')
    Y_target2 = jax_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2
   

def data_loading_rj():
    # Load your dataset (replace X and y with your features and labels)
    ray_0_3 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_0.3.0.csv')
    ray_0_6 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_0.6.1.csv')
    ray_0_8 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_0.8.0.csv')
    ray_1_1 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_1.1.0.csv')
    ray_1_9 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_1.9.0.csv')
    ray_2_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/ray_2.0.0.csv')
    
    ray_train_data = pd.concat([ray_0_3, ray_0_6, ray_0_8, ray_1_1, ray_1_9, ray_2_0])
    ray_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    jax_test_data1 =  jax_ml_full_data.copy()
    jax_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    jax_test_data2 =  jax_non_ml_full_data.copy()
    jax_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = ray_train_data.drop(columns='Buggy')
    Y_source = ray_train_data['Buggy']
    # For ML Files
    X_target1 = jax_test_data1.drop(columns='Buggy')
    Y_target1 = jax_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = jax_test_data2.drop(columns='Buggy')
    Y_target2 = jax_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_tj():
    # Load your dataset (replace X and y with your features and labels)
    transformers_2_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_2.0.0.csv')
    transformers_3_5 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_3.5.0.csv')
    transformers_4_13 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_4.13.0.csv')
    transformers_4_23 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/transformers_4.23.0.csv')
    
    
    transformers_train_data = pd.concat([transformers_2_0, transformers_3_5, transformers_4_13, transformers_4_23])
    transformers_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    jax_test_data1 =  jax_ml_full_data.copy()
    jax_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    jax_test_data2 =  jax_non_ml_full_data.copy()
    jax_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = transformers_train_data.drop(columns='Buggy')
    Y_source = transformers_train_data['Buggy']
    # For ML Files
    X_target1 = jax_test_data1.drop(columns='Buggy')
    Y_target1 = jax_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = jax_test_data2.drop(columns='Buggy')
    Y_target2 = jax_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_yj():
    # Load your dataset (replace X and y with your features and labels)
    yolov5_4_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_4.0.csv')
    yolov5_6_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_6.0.csv')
    yolov5_7_0 = pd.read_csv('/home/user/CS21D002_Eashaan/MSR_2024_Experiments/Experiment_Type1/MSR_2024/Dataset/yolov5_7.0.csv')
    
    yolov5_train_data = pd.concat([yolov5_4_0, yolov5_6_0, yolov5_7_0])
    yolov5_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    jax_test_data1 =  jax_ml_full_data.copy()
    jax_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    jax_test_data2 =  jax_non_ml_full_data.copy()
    jax_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = yolov5_train_data.drop(columns='Buggy')
    Y_source = yolov5_train_data['Buggy']
    # For ML Files
    X_target1 = jax_test_data1.drop(columns='Buggy')
    Y_target1 = jax_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = jax_test_data2.drop(columns='Buggy')
    Y_target2 = jax_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

In [129]:
# ML Files

# Peterfilter-SVM
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Peterfilter-SVM")
model = SVC(C= 0.6728812217893367, degree=2, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# #TCA-MLP
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
# tca = TCA()
# X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
# print("TCA-MLP")
# model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))

# # TCA-SVM
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
# tca = TCA()
# X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
# print("Peterfilter-SVM")
# model = SVC(C= 0.6728812217893367, degree=2, kernel='linear')
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))

# Peterfilter-ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tj()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Peterfilter-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# DTB-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter- Random Forest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yj()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Bruakfilter-Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 2, max_features='log2', min_samples_leaf=4, min_samples_split= 0.07335409800854176, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter - Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Peterfilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yj()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target1, Y_target1)
print("DSBF-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rj()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DSBF-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# # TCA - Naive Bayes
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
# tca = TCA()
# X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
# print("TCA-Gaussian NB")
# model = GaussianNB(var_smoothing=1e-09)
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))

X_source =  (1820, 17)
Y_source =  (1820,)
X_target1 =  (387, 17)
Y_target1 =  (387,)
X_target2 =  (30, 17)
Y_target2 =  (30,)
Peterfilter-SVM
roc_auc =  0.5025578751550227
precision =  0.16666666666666666
recall =  0.05084745762711865
f1 =  0.07792207792207792
conf_matrix =  [[313  15]
 [ 56   3]]
Number of correct predictions (%): 5.084745762711865
Number of incorrect predictions (%): 25.423728813559322
X_source =  (3839, 17)
Y_source =  (3839,)
X_target1 =  (387, 17)
Y_target1 =  (387,)
X_target2 =  (30, 17)
Y_target2 =  (30,)
Peterfilter-Ridge
roc_auc =  0.4917062835882596
precision =  0.08333333333333333
recall =  0.01694915254237288
f1 =  0.028169014084507043
conf_matrix =  [[317  11]
 [ 58   1]]
Number of correct predictions (%): 1.694915254237288
Number of incorrect predictions (%): 18.64406779661017
X_source =  (1820, 17)
Y_source =  (1820,)
X_target1 =  (387, 17)
Y_target1 =  (387,)
X_target2 =  (30, 17)
Y_target2 =  (30,)
DTB-Gaussian NB
roc_auc =  0.49560768912773867
precis

In [115]:
# Non-ML Files

# Peterfilter-SVM
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Peterfilter-SVM")
model = SVC(C= 0.6728812217893367, degree=2, kernel='linear')
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# #TCA-MLP
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
# tca = TCA()
# X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
# print("TCA-MLP")
# model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))

# # TCA-SVM
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
# tca = TCA()
# X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
# print("Peterfilter-SVM")
# model = SVC(C= 0.6728812217893367, degree=2, kernel='linear')
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))

# Peterfilter-ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tj()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Peterfilter-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# DTB-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter- Random Forest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yj()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Bruakfilter-Random FOrest")
model = RandomForestClassifier(criterion='entropy', max_depth= 2, max_features='log2', min_samples_leaf=4, min_samples_split= 0.07335409800854176, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter - Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Peterfilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF - MLP
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yj()
# dsbf = DSBF()
# X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target2, Y_target2)
# print("DSBF-MLP")
# model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# conf_mat = confusion_matrix(Y_target1, y_pred)
# print("conf_matrix = ", conf_mat)
# calculate_correct_incorrect(conf_mat)


# DTB - MLP
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rj()
# dtb = DTB()
# X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target1)
# print("DTB-MLP")
# model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# conf_mat = confusion_matrix(Y_target1, y_pred)
# print("conf_matrix = ", conf_mat)
# calculate_correct_incorrect(conf_mat)

# # TCA - Naive Bayes
# X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lj()
# tca = TCA()
# X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
# print("TCA-Gaussian NB")
# model = GaussianNB(var_smoothing=1e-09)
# model.fit(X_source, Y_source)
# y_pred = model.predict(X_target1)
# print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
# print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
# print("recall = ", recall_score(Y_target1, y_pred))
# print("f1 = ", f1_score(Y_target1, y_pred))
# print("conf_matrix = ", confusion_matrix(Y_target1, y_pred))

X_source =  (1820, 17)
Y_source =  (1820,)
X_target1 =  (387, 17)
Y_target1 =  (387,)
X_target2 =  (30, 17)
Y_target2 =  (30,)
Peterfilter-SVM
roc_auc =  0.5
precision =  1.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[23  0]
 [ 7  0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 0.0
X_source =  (3839, 17)
Y_source =  (3839,)
X_target1 =  (387, 17)
Y_target1 =  (387,)
X_target2 =  (30, 17)
Y_target2 =  (30,)
Peterfilter-Ridge
roc_auc =  0.5
precision =  1.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[23  0]
 [ 7  0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 0.0
X_source =  (1820, 17)
Y_source =  (1820,)
X_target1 =  (387, 17)
Y_target1 =  (387,)
X_target2 =  (30, 17)
Y_target2 =  (30,)
DTB-Gaussian NB
roc_auc =  0.4782608695652174
precision =  0.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[22  1]
 [ 7  0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 14.285714285714285
X_source =  (103, 17)
Y_sourc

## Lightning

In [116]:
def data_loading_jl():
    # Load your dataset (replace X and y with your features and labels)
    jax_1_73 = pd.read_csv('../Dataset/jax_0.1.73.csv')
    jax_2_21 = pd.read_csv('../Dataset/jax_0.2.21.csv')
    jax_2_28 = pd.read_csv('../Dataset/jax_0.2.28.csv')
    jax_3_15 = pd.read_csv('../Dataset/jax_0.3.15.csv')
    
    
    jax_train_data = pd.concat([jax_1_73, jax_2_21, jax_2_28, jax_3_15])
    jax_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    lightning_test_data1 = lightning_ml_full_data.copy()
    lightning_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    lightning_test_data2 =  lightning_non_ml_full_data.copy()
    lightning_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = jax_train_data.drop(columns='Buggy')
    Y_source = jax_train_data['Buggy']
    # For ML Files
    X_target1 = lightning_test_data1.drop(columns='Buggy')
    Y_target1 = lightning_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = lightning_test_data2.drop(columns='Buggy')
    Y_target2 = lightning_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2             


def data_loading_rl():
    # Load your dataset (replace X and y with your features and labels)
    ray_0_3 = pd.read_csv('../Dataset/ray_0.3.0.csv')
    ray_0_6 = pd.read_csv('../Dataset/ray_0.6.1.csv')
    ray_0_8 = pd.read_csv('../Dataset/ray_0.8.0.csv')
    ray_1_1 = pd.read_csv('../Dataset/ray_1.1.0.csv')
    ray_1_9 = pd.read_csv('../Dataset/ray_1.9.0.csv')
    ray_2_0 = pd.read_csv('../Dataset/ray_2.0.0.csv')
    
    ray_train_data = pd.concat([ray_0_3, ray_0_6, ray_0_8, ray_1_1, ray_1_9, ray_2_0])
    ray_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

    lightning_test_data1 = lightning_ml_full_data.copy()
    lightning_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    lightning_test_data2 =  lightning_non_ml_full_data.copy()
    lightning_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = ray_train_data.drop(columns='Buggy')
    Y_source = ray_train_data['Buggy']
    # For ML Files
    X_target1 = lightning_test_data1.drop(columns='Buggy')
    Y_target1 = lightning_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = lightning_test_data2.drop(columns='Buggy')
    Y_target2 = lightning_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 

def data_loading_tl():
    # Load your dataset (replace X and y with your features and labels)
    transformers_2_0 = pd.read_csv('../Dataset/transformers_2.0.0.csv')
    transformers_3_5 = pd.read_csv('../Dataset/transformers_3.5.0.csv')
    transformers_4_13 = pd.read_csv('../Dataset/transformers_4.13.0.csv')
    transformers_4_23 = pd.read_csv('../Dataset/transformers_4.23.0.csv')
    
    
    transformers_train_data = pd.concat([transformers_2_0, transformers_3_5, transformers_4_13, transformers_4_23])
    transformers_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    lightning_test_data1 = lightning_ml_full_data.copy()
    lightning_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    lightning_test_data2 =  lightning_non_ml_full_data.copy()
    lightning_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = transformers_train_data.drop(columns='Buggy')
    Y_source = transformers_train_data['Buggy']
    # For ML Files
    X_target1 = lightning_test_data1.drop(columns='Buggy')
    Y_target1 = lightning_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = lightning_test_data2.drop(columns='Buggy')
    Y_target2 = lightning_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 

def data_loading_yl():
    # Load your dataset (replace X and y with your features and labels)
    yolov5_4_0 = pd.read_csv('../Dataset/yolov5_4.0.csv')
    yolov5_6_0 = pd.read_csv('../Dataset/yolov5_6.0.csv')
    yolov5_7_0 = pd.read_csv('../Dataset/yolov5_7.0.csv')
    
    yolov5_train_data = pd.concat([yolov5_4_0, yolov5_6_0, yolov5_7_0])
    yolov5_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    lightning_test_data1 = lightning_ml_full_data.copy()
    lightning_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    lightning_test_data2 =  lightning_non_ml_full_data.copy()
    lightning_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = yolov5_train_data.drop(columns='Buggy')
    Y_source = yolov5_train_data['Buggy']
    # For ML Files
    X_target1 = lightning_test_data1.drop(columns='Buggy')
    Y_target1 = lightning_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = lightning_test_data2.drop(columns='Buggy')
    Y_target2 = lightning_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 

In [117]:
# ML Files


# DS-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jl()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target1, Y_target1, loc)
print("DS-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#Universal-KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yl()
universal = Universal()
X_source, Y_source, X_target1, Y_target1 = universal.run(X_source, Y_source, X_target1, Y_target1)
print("Universal-KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=9)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yl()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Bruakfilter-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rl()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target1, Y_target1)
print("DSBF-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA-CART
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rl()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
print("CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split= 5, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)



# DS-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target1, Y_target1, loc)
print("DS-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter_Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Peterfilter-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Bruakfilter-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rl()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target1, Y_target1)
print("TCA-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (1266, 17)
Y_source =  (1266,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (484, 17)
Y_target2 =  (484,)
DS-MLP
roc_auc =  0.5
precision =  1.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[446   0]
 [131   0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 0.0
X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (484, 17)
Y_target2 =  (484,)
Universal-KNN
roc_auc =  0.6927224181015301
precision =  0.3501577287066246
recall =  0.8473282442748091
f1 =  0.49553571428571425
conf_matrix =  [[240 206]
 [ 20 111]]
Number of correct predictions (%): 84.7328244274809
Number of incorrect predictions (%): 157.25190839694656
X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (484, 17)
Y_target2 =  (484,)
Bruakfilter-MLP
roc_auc =  0.5
precision =  0.2270363951473137
recall =  1.0
f1 =  0.3700564971751412
conf_matrix =  [[  0 446]
 [  0 131]]
Number of correct p

In [118]:
# Non-ML Files


# DS-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jl()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target2, Y_target2, loc)
print("DS-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#Universal-KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yl()
universal = Universal()
X_source, Y_source, X_target1, Y_target1 = universal.run(X_source, Y_source, X_target2, Y_target2)
print("Universal-KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=9)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yl()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Bruakfilter-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rl()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target2, Y_target2)
print("DSBF-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA-CART
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rl()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
print("CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split= 5, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)



# DS-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target2, Y_target2, loc)
print("DS-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter_Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tl()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Peterfilter-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)

# Bruakfilter-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rl()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target2, Y_target2)
print("TCA-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (1266, 17)
Y_source =  (1266,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (484, 17)
Y_target2 =  (484,)
DS-MLP
roc_auc =  0.5
precision =  1.0
recall =  0.0
f1 =  0.0
conf_matrix =  [[391   0]
 [ 93   0]]
Number of correct predictions (%): 0.0
Number of incorrect predictions (%): 0.0
X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (484, 17)
Y_target2 =  (484,)
Universal-KNN
roc_auc =  0.7583120204603581
precision =  0.32978723404255317
recall =  1.0
f1 =  0.496
conf_matrix =  [[202 189]
 [  0  93]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 203.2258064516129
X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (484, 17)
Y_target2 =  (484,)
Bruakfilter-MLP
roc_auc =  0.5
precision =  0.1921487603305785
recall =  1.0
f1 =  0.3223570190641248
conf_matrix =  [[  0 391]
 [  0  93]]
Number of correct predictions (%): 100.0
Number of incorrec

## Ray

In [119]:
def data_loading_tr():
    # Load your dataset (replace X and y with your features and labels)
    transformers_2_0 = pd.read_csv('../Dataset/transformers_2.0.0.csv')
    transformers_3_5 = pd.read_csv('../Dataset/transformers_3.5.0.csv')
    transformers_4_13 = pd.read_csv('../Dataset/transformers_4.13.0.csv')
    transformers_4_23 = pd.read_csv('../Dataset/transformers_4.23.0.csv')
    
    
    transformers_train_data = pd.concat([transformers_2_0, transformers_3_5, transformers_4_13, transformers_4_23])
    transformers_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data1 = ray_ml_full_data.copy()
    ray_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data2 =  ray_non_ml_full_data.copy()
    ray_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = transformers_train_data.drop(columns='Buggy')
    Y_source = transformers_train_data['Buggy']
    # For ML Files
    X_target1 = ray_test_data1.drop(columns='Buggy')
    Y_target1 = ray_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = ray_test_data2.drop(columns='Buggy')
    Y_target2 = ray_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_jr():
    # Load your dataset (replace X and y with your features and labels)
    jax_1_73 = pd.read_csv('../Dataset/jax_0.1.73.csv')
    jax_2_21 = pd.read_csv('../Dataset/jax_0.2.21.csv')
    jax_2_28 = pd.read_csv('../Dataset/jax_0.2.28.csv')
    jax_3_15 = pd.read_csv('../Dataset/jax_0.3.15.csv')
    
    
    jax_train_data = pd.concat([jax_1_73, jax_2_21, jax_2_28, jax_3_15])
    jax_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data1 = ray_ml_full_data.copy()
    ray_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data2 =  ray_non_ml_full_data.copy()
    ray_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = jax_train_data.drop(columns='Buggy')
    Y_source = jax_train_data['Buggy']
    # For ML Files
    X_target1 = ray_test_data1.drop(columns='Buggy')
    Y_target1 = ray_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = ray_test_data2.drop(columns='Buggy')
    Y_target2 = ray_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_lr():
    # Load your dataset (replace X and y with your features and labels)
    lightning_0_5 = pd.read_csv('../Dataset/lightning_0.5.1.csv')
    lightning_1_0 = pd.read_csv('../Dataset/lightning_1.0.0.csv')
    lightning_1_5 = pd.read_csv('../Dataset/lightning_1.5.0.csv')
    lightning_1_8 = pd.read_csv('../Dataset/lightning_1.8.0.csv')
    
    
    lightning_train_data = pd.concat([lightning_0_5, lightning_1_0, lightning_1_5, lightning_1_8])
    lightning_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data1 = ray_ml_full_data.copy()
    ray_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data2 =  ray_non_ml_full_data.copy()
    ray_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = lightning_train_data.drop(columns='Buggy')
    Y_source = lightning_train_data['Buggy']
    # For ML Files
    X_target1 = ray_test_data1.drop(columns='Buggy')
    Y_target1 = ray_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = ray_test_data2.drop(columns='Buggy')
    Y_target2 = ray_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_yr():
    # Load your dataset (replace X and y with your features and labels)
    yolov5_4_0 = pd.read_csv('../Dataset/yolov5_4.0.csv')
    yolov5_6_0 = pd.read_csv('../Dataset/yolov5_6.0.csv')
    yolov5_7_0 = pd.read_csv('../Dataset/yolov5_7.0.csv')
    
    yolov5_train_data = pd.concat([yolov5_4_0, yolov5_6_0, yolov5_7_0])
    yolov5_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data1 = ray_ml_full_data.copy()
    ray_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    ray_test_data2 =  ray_non_ml_full_data.copy()
    ray_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = yolov5_train_data.drop(columns='Buggy')
    Y_source = yolov5_train_data['Buggy']
    # For ML Files
    X_target1 = ray_test_data1.drop(columns='Buggy')
    Y_target1 = ray_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = ray_test_data2.drop(columns='Buggy')
    Y_target2 = ray_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

In [120]:
# ML Files

# DSBF-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yr()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target1, Y_target1)
print("DSBF-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#DTB-KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA_AdaBoost
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
print("TCA-AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.3618390422647574, n_estimators=11, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter-KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lr()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Bruakfilter - KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lr()
tca= TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
print("TCA-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter_NB
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("PeterFilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
print("TCA-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yr()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Bruakfilter-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lr()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tr()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (1915, 17)
Y_target2 =  (1915,)
DSBF-Ridge
roc_auc =  0.9148936170212766
precision =  1.0
recall =  0.8297872340425532
f1 =  0.9069767441860465
conf_matrix =  [[445   0]
 [  8  39]]
Number of correct predictions (%): 82.97872340425532
Number of incorrect predictions (%): 0.0
X_source =  (1266, 17)
Y_source =  (1266,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (1915, 17)
Y_target2 =  (1915,)
DTB-KNN
roc_auc =  0.5388701397471243
precision =  0.11507936507936507
recall =  0.43283582089552236
f1 =  0.18181818181818182
conf_matrix =  [[405 223]
 [ 38  29]]
Number of correct predictions (%): 43.28358208955223
Number of incorrect predictions (%): 332.8358208955224
X_source =  (1266, 17)
Y_source =  (1266,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (1915, 17)
Y_target2 =  (1915,)
TCA-AdaBoost
roc_auc =  0.917910447761194
precision =  1.0
recall =  0.835820895522388
f1 = 

In [122]:
# Non-ML Files

# DSBF-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yr()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target2, Y_target2)
print("DSBF-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#DTB-KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA_AdaBoost
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
print("TCA-AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.3618390422647574, n_estimators=11, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter-KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lr()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Bruakfilter - KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lr()
tca= TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
print("TCA-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter_NB
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("PeterFilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jr()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
print("TCA-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yr()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Bruakfilter-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lr()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_tr()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (1915, 17)
Y_target2 =  (1915,)
DSBF-Ridge
roc_auc =  0.9555555555555555
precision =  1.0
recall =  0.9111111111111111
f1 =  0.9534883720930233
conf_matrix =  [[1023    0]
 [   8   82]]
Number of correct predictions (%): 91.11111111111111
Number of incorrect predictions (%): 0.0
X_source =  (1266, 17)
Y_source =  (1266,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (1915, 17)
Y_target2 =  (1915,)
DTB-KNN
roc_auc =  0.5403370348723944
precision =  0.09344262295081968
recall =  0.3931034482758621
f1 =  0.1509933774834437
conf_matrix =  [[1217  553]
 [  88   57]]
Number of correct predictions (%): 39.310344827586206
Number of incorrect predictions (%): 381.37931034482756
X_source =  (1266, 17)
Y_source =  (1266,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (1915, 17)
Y_target2 =  (1915,)
TCA-AdaBoost
roc_auc =  0.8931034482758621
precision =  1.0
recall =  0.786206896551

## Transformers

In [123]:
def data_loading_jt():
    # Load your dataset (replace X and y with your features and labels)

    jax_1_73 = pd.read_csv('../Dataset/jax_0.1.73.csv')
    jax_2_21 = pd.read_csv('../Dataset/jax_0.2.21.csv')
    jax_2_28 = pd.read_csv('../Dataset/jax_0.2.28.csv')
    jax_3_15 = pd.read_csv('../Dataset/jax_0.3.15.csv')
    
    
    jax_train_data = pd.concat([jax_1_73, jax_2_21, jax_2_28, jax_3_15])
    jax_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    
    transformers_test_data1 = transformers_ml_full_data.copy()
    transformers_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    transformers_test_data2 =  transformers_non_ml_full_data.copy()
    transformers_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = jax_train_data.drop(columns='Buggy')
    Y_source = jax_train_data['Buggy']
    # For ML Files
    X_target1 = transformers_test_data1.drop(columns='Buggy')
    Y_target1 = transformers_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = transformers_test_data2.drop(columns='Buggy')
    Y_target2 = transformers_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_lt():
    # Load your dataset (replace X and y with your features and labels)
    lightning_0_5 = pd.read_csv('../Dataset/lightning_0.5.1.csv')
    lightning_1_0 = pd.read_csv('../Dataset/lightning_1.0.0.csv')
    lightning_1_5 = pd.read_csv('../Dataset/lightning_1.5.0.csv')
    lightning_1_8 = pd.read_csv('../Dataset/lightning_1.8.0.csv')
    
    
    lightning_train_data = pd.concat([lightning_0_5, lightning_1_0, lightning_1_5, lightning_1_8])
    lightning_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    
    transformers_test_data1 = transformers_ml_full_data.copy()
    transformers_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    transformers_test_data2 =  transformers_non_ml_full_data.copy()
    transformers_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = jax_train_data.drop(columns='Buggy')
    Y_source = jax_train_data['Buggy']
    # For ML Files
    X_target1 = lightning_test_data1.drop(columns='Buggy')
    Y_target1 = lightning_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = transformers_test_data2.drop(columns='Buggy')
    Y_target2 = transformers_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_rt():
    # Load your dataset (replace X and y with your features and labels)
    
    ray_0_3 = pd.read_csv('../Dataset/ray_0.3.0.csv')
    ray_0_6 = pd.read_csv('../Dataset/ray_0.6.1.csv')
    ray_0_8 = pd.read_csv('../Dataset/ray_0.8.0.csv')
    ray_1_1 = pd.read_csv('../Dataset/ray_1.1.0.csv')
    ray_1_9 = pd.read_csv('../Dataset/ray_1.9.0.csv')
    ray_2_0 = pd.read_csv('../Dataset/ray_2.0.0.csv')
    
    ray_train_data = pd.concat([ray_0_3, ray_0_6, ray_0_8, ray_1_1, ray_1_9, ray_2_0])
    ray_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    transformers_test_data1 = transformers_ml_full_data.copy()
    transformers_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    transformers_test_data2 =  transformers_non_ml_full_data.copy()
    transformers_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = jax_train_data.drop(columns='Buggy')
    Y_source = jax_train_data['Buggy']
    # For ML Files
    X_target1 = ray_test_data1.drop(columns='Buggy')
    Y_target1 = ray_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = transformers_test_data2.drop(columns='Buggy')
    Y_target2 = transformers_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_yt():
    # Load your dataset (replace X and y with your features and labels)
    yolov5_4_0 = pd.read_csv('../Dataset/yolov5_4.0.csv')
    yolov5_6_0 = pd.read_csv('../Dataset/yolov5_6.0.csv')
    yolov5_7_0 = pd.read_csv('../Dataset/yolov5_7.0.csv')
    
    yolov5_train_data = pd.concat([yolov5_4_0, yolov5_6_0, yolov5_7_0])
    yolov5_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    transformers_2_0 = pd.read_csv('../Dataset/transformers_2.0.0.csv')
    transformers_3_5 = pd.read_csv('../Dataset/transformers_3.5.0.csv')
    transformers_4_13 = pd.read_csv('../Dataset/transformers_4.13.0.csv')
    transformers_4_23 = pd.read_csv('../Dataset/transformers_4.23.0.csv')
    
    
    transformers_test_data1 = transformers_ml_full_data.copy()
    transformers_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    transformers_test_data2 =  transformers_non_ml_full_data.copy()
    transformers_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = yolov5_train_data.drop(columns='Buggy')
    Y_source = yolov5_train_data['Buggy']
    # For ML Files
    X_target1 = transformers_test_data1.drop(columns='Buggy')
    Y_target1 = transformers_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = transformers_test_data2.drop(columns='Buggy')
    Y_target2 = transformers_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

In [124]:
# ML Files

# TCA-CART
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lt()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
print("TCA-CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split= 5, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#TCA-RandomForest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
print("TCA-Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter-RandomForest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yt()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Peterfilter-Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Universal_Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yt()
universal = Universal()
X_source, Y_source, X_target1, Y_target1 = universal.run(X_source, Y_source, X_target1, Y_target1)
print("Universal-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yt()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target1, Y_target1)
print("DSBF-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter- KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Bruakfilter - KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("PeterFilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jt()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target1, Y_target1)
print("TCA-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (849, 17)
Y_source =  (849,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
TCA-CART
roc_auc =  0.9640998870365933
precision =  0.8936170212765957
recall =  0.9618320610687023
f1 =  0.9264705882352942
conf_matrix =  [[431  15]
 [  5 126]]
Number of correct predictions (%): 96.18320610687023
Number of incorrect predictions (%): 11.450381679389313
X_source =  (849, 17)
Y_source =  (849,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
TCA-Random FOrest
roc_auc =  0.9029850746268657
precision =  1.0
recall =  0.8059701492537313
f1 =  0.8925619834710743
conf_matrix =  [[628   0]
 [ 13  54]]
Number of correct predictions (%): 80.59701492537313
Number of incorrect predictions (%): 0.0
X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (1391, 17)
Y_target1 =  (1391,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
Peterfilter-Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix 

In [125]:
# Non-ML Files

# TCA-CART
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_lt()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
print("TCA-CART")
model = DecisionTreeClassifier(criterion='entropy', max_depth=7, max_features='sqrt', min_samples_split= 5, splitter='best', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


#TCA-RandomForest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
print("TCA-Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter-RandomForest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yt()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Peterfilter-Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Universal_Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yt()
universal = Universal()
X_source, Y_source, X_target1, Y_target1 = universal.run(X_source, Y_source, X_target2, Y_target2)
print("Universal-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_yt()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target2, Y_target2)
print("DSBF-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter- KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Bruakfilter - KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB-MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter-Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("PeterFilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# TCA - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_jt()
tca = TCA()
X_source, Y_source, X_target1, Y_target1 = tca.run(X_source, Y_source, X_target2, Y_target2)
print("TCA-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_rt()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (849, 17)
Y_source =  (849,)
X_target1 =  (577, 17)
Y_target1 =  (577,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
TCA-CART
roc_auc =  0.980276134122288
precision =  0.23076923076923078
recall =  1.0
f1 =  0.375
conf_matrix =  [[487  20]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 333.33333333333337
X_source =  (849, 17)
Y_source =  (849,)
X_target1 =  (695, 17)
Y_target1 =  (695,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
TCA-Random FOrest
roc_auc =  0.9921104536489153
precision =  0.42857142857142855
recall =  1.0
f1 =  0.6
conf_matrix =  [[499   8]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 133.33333333333331
X_source =  (103, 17)
Y_source =  (103,)
X_target1 =  (1391, 17)
Y_target1 =  (1391,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
Peterfilter-Random FOrest
roc_auc =  1.0
precision =  1.0
recall =  1.0
f1 =  1.0
conf_matrix =  [[507   0]
 [  0   6]]
Number of correct predict

## Yolov5

In [126]:
def data_loading_jy():
    # Load your dataset (replace X and y with your features and labels)
    jax_1_73 = pd.read_csv('../Dataset/jax_0.1.73.csv')
    jax_2_21 = pd.read_csv('../Dataset/jax_0.2.21.csv')
    jax_2_28 = pd.read_csv('../Dataset/jax_0.2.28.csv')
    jax_3_15 = pd.read_csv('../Dataset/jax_0.3.15.csv')
    
    
    jax_train_data = pd.concat([jax_1_73, jax_2_21, jax_2_28, jax_3_15])
    jax_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    yolov5_test_data1 = yolov5_ml_full_data.copy()
    yolov5_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    yolov5_test_data2 =  transformers_non_ml_full_data.copy()
    yolov5_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = jax_train_data.drop(columns='Buggy')
    Y_source = jax_train_data['Buggy']
    # For ML Files
    X_target1 = yolov5_test_data1.drop(columns='Buggy')
    Y_target1 = yolov5_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = yolov5_test_data2.drop(columns='Buggy')
    Y_target2 = yolov5_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_ly():
    # Load your dataset (replace X and y with your features and labels)
    lightning_0_5 = pd.read_csv('../Dataset/lightning_0.5.1.csv')
    lightning_1_0 = pd.read_csv('../Dataset/lightning_1.0.0.csv')
    lightning_1_5 = pd.read_csv('../Dataset/lightning_1.5.0.csv')
    lightning_1_8 = pd.read_csv('../Dataset/lightning_1.8.0.csv')
    
    
    lightning_train_data = pd.concat([lightning_0_5, lightning_1_0, lightning_1_5, lightning_1_8])
    lightning_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    yolov5_test_data1 = yolov5_ml_full_data.copy()
    yolov5_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    yolov5_test_data2 =  transformers_non_ml_full_data.copy()
    yolov5_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = lightning_train_data.drop(columns='Buggy')
    Y_source = lightning_train_data['Buggy']
    # For ML Files
    X_target1 = yolov5_test_data1.drop(columns='Buggy')
    Y_target1 = yolov5_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = yolov5_test_data2.drop(columns='Buggy')
    Y_target2 = yolov5_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2


def data_loading_ry():
    # Load your dataset (replace X and y with your features and labels)    
    ray_0_3 = pd.read_csv('../Dataset/ray_0.3.0.csv')
    ray_0_6 = pd.read_csv('../Dataset/ray_0.6.1.csv')
    ray_0_8 = pd.read_csv('../Dataset/ray_0.8.0.csv')
    ray_1_1 = pd.read_csv('../Dataset/ray_1.1.0.csv')
    ray_1_9 = pd.read_csv('../Dataset/ray_1.9.0.csv')
    ray_2_0 = pd.read_csv('../Dataset/ray_2.0.0.csv')
    
    ray_train_data = pd.concat([ray_0_3, ray_0_6, ray_0_8, ray_1_1, ray_1_9, ray_2_0])
    ray_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)

    yolov5_test_data1 = yolov5_ml_full_data.copy()
    yolov5_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    yolov5_test_data2 =  transformers_non_ml_full_data.copy()
    yolov5_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = ray_train_data.drop(columns='Buggy')
    Y_source = ray_train_data['Buggy']
    # For ML Files
    X_target1 = yolov5_test_data1.drop(columns='Buggy')
    Y_target1 = yolov5_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = yolov5_test_data2.drop(columns='Buggy')
    Y_target2 = yolov5_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

def data_loading_ty():
    # Load your dataset (replace X and y with your features and labels)
    transformers_2_0 = pd.read_csv('../Dataset/transformers_2.0.0.csv')
    transformers_3_5 = pd.read_csv('../Dataset/transformers_3.5.0.csv')
    transformers_4_13 = pd.read_csv('../Dataset/transformers_4.13.0.csv')
    transformers_4_23 = pd.read_csv('../Dataset/transformers_4.23.0.csv')
    
    
    transformers_train_data = pd.concat([transformers_2_0, transformers_3_5, transformers_4_13, transformers_4_23])
    transformers_train_data.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    yolov5_test_data1 = yolov5_ml_full_data.copy()
    yolov5_test_data1.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
    
    yolov5_test_data2 =  transformers_non_ml_full_data.copy()
    yolov5_test_data2.drop(['Unnamed: 0','Project', 'Files'], axis=1, inplace=True)
      #jax_test_data.shape
    
    X_source = transformers_train_data.drop(columns='Buggy')
    Y_source = transformers_train_data['Buggy']
    # For ML Files
    X_target1 = yolov5_test_data1.drop(columns='Buggy')
    Y_target1 = yolov5_test_data1['Buggy']
    # For Non-ML Files
    X_target2 = yolov5_test_data2.drop(columns='Buggy')
    Y_target2 = yolov5_test_data2['Buggy']
    
    print("X_source = ", X_source.shape)
    print("Y_source = ", Y_source.shape)
    print("X_target1 = ", X_target1.shape)
    print("Y_target1 = ", Y_target1.shape)
    print("X_target2 = ", X_target2.shape)
    print("Y_target2 = ", Y_target2.shape)
    X_source = X_source.to_numpy()
    X_target1 = X_target1.to_numpy()
    Y_source = Y_source.to_numpy()
    Y_target1 = Y_target1.to_numpy()
    X_target2 = X_target2.to_numpy()
    Y_target2 = Y_target2.to_numpy()
    return X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2

In [127]:
# ML Files


# DTB-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Data Selection - Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ly()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target1, Y_target1, loc)
print("DS-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter - KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Bruakfilter - KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ty()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter - NB
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2= data_loading_jy()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("PeterFilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ly()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target1, Y_target1)
print("DBSF-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Universal - AdaBoost
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
universal = Universal()
X_source, Y_source, X_target1, Y_target1 = universal.run(X_source, Y_source, X_target1, Y_target1)
print("Universal-AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.3618390422647574, n_estimators=11, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DS - Random Forest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ty()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target1, Y_target1, loc)
print("DS-Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB - Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target1, Y_target1)
print("DTB-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ly()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target1, Y_target1)
print("Peterfilter-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (6778, 17)
Y_source =  (6778,)
X_target1 =  (44, 17)
Y_target1 =  (44,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
DTB-Ridge
roc_auc =  0.4294117647058824
precision =  0.5
recall =  0.058823529411764705
f1 =  0.10526315789473684
conf_matrix =  [[ 8  2]
 [32  2]]
Number of correct predictions (%): 5.88235294117647
Number of incorrect predictions (%): 5.88235294117647
X_source =  (1820, 17)
Y_source =  (1820,)
X_target1 =  (44, 17)
Y_target1 =  (44,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
DS-Gaussian NB
roc_auc =  0.8705882352941177
precision =  0.9411764705882353
recall =  0.9411764705882353
f1 =  0.9411764705882353
conf_matrix =  [[ 8  2]
 [ 2 32]]
Number of correct predictions (%): 94.11764705882352
Number of incorrect predictions (%): 5.88235294117647
X_source =  (6778, 17)
Y_source =  (6778,)
X_target1 =  (44, 17)
Y_target1 =  (44,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
Bruakfilter - KNN
roc_auc =  0.9264705882352942
precision =  1.0
recall =  0.8529411764705882

In [130]:
# Non-ML Files


# DTB-Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Data Selection - Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ly()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target2, Y_target2, loc)
print("DS-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Bruakfilter - KNN
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
bruakfilter = Bruakfilter()
X_source, Y_source, X_target1, Y_target1 = bruakfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Bruakfilter - KNN")
model = KNeighborsClassifier(algorithm='kd_tree', n_neighbors=5)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF - MLP
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ty()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-MLP")
model = MLPClassifier(activation='logistic', alpha=0.0062202779219760335, hidden_layer_sizes=(50,), learning_rate='invscaling', max_iter= 25, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter - NB
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2= data_loading_jy()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("PeterFilter-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DSBF - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ly()
dsbf = DSBF()
X_source, Y_source, X_target1, Y_target1 = dsbf.run(X_source, Y_source, X_target2, Y_target2)
print("DBSF-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Universal - AdaBoost
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
universal = Universal()
X_source, Y_source, X_target1, Y_target1 = universal.run(X_source, Y_source, X_target2, Y_target2)
print("Universal-AdaBoost")
model = AdaBoostClassifier(algorithm='SAMME', learning_rate= 0.3618390422647574, n_estimators=11, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DS - Random Forest
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ty()
ds = DataSelection()
loc = [0]
X_source, Y_source, X_target1, Y_target1 = ds.run(X_source, Y_source, X_target2, Y_target2, loc)
print("DS-Random FOrest")
model = RandomForestClassifier(criterion='gini', max_depth= 4, max_features='log2', min_samples_leaf=5, min_samples_split= 0.1794337157967658, n_estimators=6, random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# DTB - Naive Bayes
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ry()
dtb = DTB()
X_source, Y_source, X_target1, Y_target1 = dtb.run(X_source, Y_source, X_target2, Y_target2)
print("DTB-Gaussian NB")
model = GaussianNB(var_smoothing=1e-09)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


# Peterfilter - Ridge
X_source, Y_source, X_target1, Y_target1, X_target2, Y_target2 = data_loading_ly()
peterfilter = Peterfilter()
X_source, Y_source, X_target1, Y_target1 = peterfilter.run(X_source, Y_source, X_target2, Y_target2)
print("Peterfilter-Ridge")
model = RidgeClassifier(alpha=0.1986668460759633, max_iter=4107, solver='saga', random_state=42)
model.fit(X_source, Y_source)
y_pred = model.predict(X_target1)
print("roc_auc = ", roc_auc_score(Y_target1, y_pred))
print("precision = ", precision_score(Y_target1, y_pred, zero_division=1))
print("recall = ", recall_score(Y_target1, y_pred))
print("f1 = ", f1_score(Y_target1, y_pred))
conf_mat = confusion_matrix(Y_target1, y_pred)
print("conf_matrix = ", conf_mat)
calculate_correct_incorrect(conf_mat)


X_source =  (6778, 17)
Y_source =  (6778,)
X_target1 =  (44, 17)
Y_target1 =  (44,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
DTB-Ridge
roc_auc =  0.9230769230769231
precision =  0.07142857142857142
recall =  1.0
f1 =  0.13333333333333333
conf_matrix =  [[429  78]
 [  0   6]]
Number of correct predictions (%): 100.0
Number of incorrect predictions (%): 1300.0
X_source =  (1820, 17)
Y_source =  (1820,)
X_target1 =  (44, 17)
Y_target1 =  (44,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
DS-Gaussian NB
roc_auc =  0.6094674556213017
precision =  0.03333333333333333
recall =  0.3333333333333333
f1 =  0.0606060606060606
conf_matrix =  [[449  58]
 [  4   2]]
Number of correct predictions (%): 33.33333333333333
Number of incorrect predictions (%): 966.6666666666666
X_source =  (6778, 17)
Y_source =  (6778,)
X_target1 =  (44, 17)
Y_target1 =  (44,)
X_target2 =  (513, 17)
Y_target2 =  (513,)
Bruakfilter - KNN
roc_auc =  0.9146942800788955
precision =  0.7142857142857143
recall =  0.8333333333333