In [1]:
import pandas as pd
import numpy as np

import math 
from IPython.display import display

from scipy.io import arff

from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import pairwise_distances

from sklearn.metrics import roc_auc_score

In [2]:
def readArff(file):
    """Load an ARFF file into a DataFrame whose last column is a boolean label.

    The last attribute of the file is treated as the class label. Its values
    are decoded to text, stripped of surrounding quotes/whitespace, lower-cased
    and mapped to ``True``/``False``.

    Parameters
    ----------
    file : str or file-like
        Passed straight to ``scipy.io.arff.loadarff``.

    Returns
    -------
    pandas.DataFrame
        One column per ARFF attribute. Labels that are not recognised are left
        as NaN in the last column (same as the previous behaviour).
    """
    records, _meta = arff.loadarff(file)
    df = pd.DataFrame(records)

    def _normalize(label):
        # Nominal values arrive as bytes; decode them instead of relying on the
        # textual repr of a bytes object (the old ``str(x)[1:]`` trick).
        text = label.decode("utf-8") if isinstance(label, bytes) else str(label)
        return text.strip().strip("'\"").lower()

    label_map = {
        "true": True, "false": False,
        "yes": True, "no": False,
        "y": True, "n": False,
    }

    class_col = df.columns[-1]
    df[class_col] = df[class_col].map(_normalize).map(label_map)
    return df

In [3]:
def getMetrics(df, target, pred ):
    """Compute binary classification metrics and print the AUC.

    Parameters
    ----------
    df : any
        Unused; kept so existing callers keep working.
    target : array-like with ``astype``
        Ground-truth labels, cast to bool.
    pred : array-like with ``astype``
        Predicted labels, cast to bool.

    Returns
    -------
    tuple
        ``(acc, f1, tn, fp, fn, tp)``.
    """
    target = target.astype(bool)
    pred = pred.astype(bool)

    acc = accuracy_score(target, pred)
    f1 = f1_score(target, pred)

    # Pin the label set so the matrix is always 2x2. Without it, input that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(target, pred, labels=[False, True]).ravel()

    # AUC is undefined when the ground truth holds only one class.
    if np.unique(target).size < 2:
        auc = float("nan")
    else:
        auc = roc_auc_score(target, pred)

    print("AUC: ", auc)
    return acc, f1, tn, fp, fn, tp

In [4]:
def getOneClass(df, frac=0.8, random_state=200):
    """Standardise the features and split off a one-class training sample.

    The last column of ``df`` is treated as the boolean target. All other
    columns are standardised in place with a ``StandardScaler`` fitted on the
    whole frame. A random fraction of the rows whose target equals ``False``
    becomes the training set; those rows are removed from ``df`` in place, and
    what remains is the test set.

    Parameters
    ----------
    df : pandas.DataFrame
        Features plus target in the last column. Modified in place.
    frac : float, default 0.8
        Fraction of the ``False``-labelled rows sampled for training.
    random_state : int, default 200
        Seed forwarded to ``DataFrame.sample``.

    Returns
    -------
    tuple
        ``(df, oneclass, target)``: the test frame (still containing the target
        column), the training features, and the training target.
    """
    scaler = preprocessing.StandardScaler()
    df.iloc[:, :-1] = scaler.fit_transform(df.iloc[:, :-1])

    display(df.head())

    # Select the normal class through the last column, consistent with the
    # positional handling of the target everywhere else in this function.
    normal_rows = df[df.iloc[:, -1].eq(False)]
    oneclass = normal_rows.sample(frac=frac, random_state=random_state)

    target = oneclass.iloc[:, -1].copy()

    # Everything that was not sampled for training remains as the test set.
    df.drop(oneclass.index, inplace=True)

    # Drop the target column; copy so later in-place edits by the caller work
    # on an independent frame rather than on a slice.
    oneclass = oneclass.iloc[:, :-1].copy()

    return df, oneclass, target  # test set, training set, training target

In [5]:
def NumericalMissingTransformer(df):
    """Fill missing values in float/int columns with that column's maximum.

    The maximum is used as the "worst possible value" for the metric.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    columns = df.select_dtypes(include=['float', 'int']).columns
    for column in columns:
        # Assign the result back: ``df[column].fillna(..., inplace=True)`` is
        # chained assignment, which operates on a temporary under pandas
        # copy-on-write and would leave ``df`` unchanged.
        df[column] = df[column].fillna(df[column].max())
    return df
        
def euclideanDistance(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors.

    Both arguments must support element-wise subtraction (NumPy arrays,
    pandas Series, ...).
    """
    delta = vector1 - vector2
    squared = np.power(delta, 2)
    total = np.sum(squared)
    return np.sqrt(total)

def getMedianDistance(row_df, data):
    """Return the median Euclidean distance from ``row_df`` to the rows of ``data``.

    Parameters
    ----------
    row_df : pandas.Series or 1-D array-like
        Reference point. A Series is aligned with the columns of ``data`` by
        label.
    data : pandas.DataFrame
        One point per row.

    Returns
    -------
    float
        The 50th percentile of the row-wise distances.
    """
    # One vectorised pass instead of growing an array with np.append per row.
    squared = data.sub(row_df, axis="columns") ** 2
    distances = np.sqrt(squared.sum(axis=1))
    return np.percentile(distances.to_numpy(), 50)  # median
    
def setClass(rows_M, data, limits):
    """Flag as defective every row that lies outside all medoid radii.

    A row is predicted ``True`` only when its Euclidean distance to *every*
    medoid is strictly greater than that medoid's limit.

    Parameters
    ----------
    rows_M : pandas.DataFrame or 2-D array-like
        Medoid feature rows, one per cluster.
    data : pandas.DataFrame
        Rows to classify. Modified in place: a boolean ``"defect"`` column and
        a ``"distance"`` column (distance to the last medoid processed) are
        written to it.
    limits : sequence of float
        ``limits[i]`` is the radius of the i-th medoid in ``rows_M``.

    Returns
    -------
    pandas.Series
        The ``"defect"`` column of ``data``.
    """
    # Helper columns written by this function are never features; excluding
    # them keeps a second call on the same frame correct.
    feature_cols = [c for c in data.columns if c not in ("defect", "distance")]
    M = pd.DataFrame(data=rows_M, columns=feature_cols)
    features = data.loc[:, feature_cols]

    data["defect"] = True
    for position, (_, row_M) in enumerate(M.iterrows()):
        limit = limits[position]
        # Distance from every row to this medoid, computed in one pass rather
        # than cell by cell.
        squared = features.sub(row_M, axis="columns") ** 2
        data["distance"] = np.sqrt(squared.sum(axis=1)).to_numpy()
        data["defect"] = data["defect"] & (data["distance"] > limit)

    return data.loc[:, "defect"]  # predicted labels

def kMedoids(D, k, tmax=100, random_state=None):
    """Partition points into ``k`` clusters around medoids.

    Parameters
    ----------
    D : numpy.ndarray
        2-D matrix of pairwise distances between points.
    k : int
        Number of medoids / clusters.
    tmax : int, default 100
        Maximum number of update iterations.
    random_state : int or None, default None
        Seed for the random initialisation. ``None`` uses NumPy's global
        random state, which is what this function did before the parameter
        existed.

    Returns
    -------
    M : numpy.ndarray
        Indices of the ``k`` medoids.
    C : dict
        Maps cluster number ``0..k-1`` to an array of member indices;
        ``C[i]`` is the cluster of medoid ``M[i]``.

    Raises
    ------
    ValueError
        If ``k`` exceeds the number of points, or the number of points left
        after discarding zero-distance duplicates.
    """
    # dimensions of the distance matrix D
    m, n = D.shape

    if k > n:
        raise ValueError('muitos medoids')

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Find the set of valid initial medoid candidates.
    valid_medoid_inds = set(range(n))
    invalid_medoid_inds = set()
    rs, cs = np.where(D == 0)
    # Shuffle the zero-distance pairs, because the first element of each
    # duplicate pair encountered is the one that is kept.
    index_shuf = list(range(len(rs)))
    rng.shuffle(index_shuf)
    rs = rs[index_shuf]
    cs = cs[index_shuf]
    for r, c in zip(rs, cs):
        # If two points are at distance 0, keep the first one.
        if r < c and r not in invalid_medoid_inds:
            invalid_medoid_inds.add(c)
    valid_medoid_inds = list(valid_medoid_inds - invalid_medoid_inds)

    if k > len(valid_medoid_inds):
        raise ValueError('muitos medoids (depois de remover {} pontos em duplicidade)'.format(
            len(invalid_medoid_inds)))

    # Randomly pick k initial medoid indices.
    M = np.array(valid_medoid_inds)
    rng.shuffle(M)
    M = np.sort(M[:k])

    # Working copy of the medoid indices.
    Mnew = np.copy(M)

    # Cluster number -> array of member indices.
    C = {}
    for t in range(tmax):
        # Assign every point to its closest current medoid.
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]
        # For each cluster, the new medoid is the member with the smallest
        # mean distance to the other members.
        for kappa in range(k):
            J = np.mean(D[np.ix_(C[kappa], C[kappa])], axis=1)
            j = np.argmin(J)
            Mnew[kappa] = C[kappa][j]
        # (A former ``np.sort(Mnew)`` here discarded its result and had no
        # effect; it was removed.)
        # Converged when no medoid changed.
        if np.array_equal(M, Mnew):
            break
        M = np.copy(Mnew)
    else:
        # tmax was exhausted without convergence: refresh the memberships so
        # that C matches the final medoids.
        J = np.argmin(D[:, M], axis=1)
        for kappa in range(k):
            C[kappa] = np.where(J == kappa)[0]

    return M, C

In [6]:
files = ["./Data/cm1.arff","./Data/kc1.arff"]

for file in files:

    df = readArff(file)
    df = NumericalMissingTransformer(df)

    # Number of clusters. Alternative heuristic that was tried:
    # k = round(math.log(df.shape[0]))
    k = 1

    # df becomes the test set; oneclass holds the training features.
    df, oneclass, oneclassTarget = getOneClass(df)

    # Drop highly correlated features: build the absolute correlation matrix
    # of the training data, keep only its strict upper triangle, and remove
    # every column correlated above 0.95 with an earlier column.
    corr_matrix = oneclass.corr().abs()

    # `np.bool` was removed in NumPy 1.24; the builtin `bool` is equivalent.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

    df.drop(to_drop, axis=1, inplace=True)
    oneclass.drop(to_drop, axis=1, inplace=True)

    # Use positional labels so the indices returned by kMedoids can be passed
    # to .loc below.
    oneclass.reset_index(drop=True, inplace=True)
    oneclassTarget.reset_index(drop=True, inplace=True)

    data = oneclass.values

    # Pairwise distance matrix of the training points.
    D = pairwise_distances(data, metric='euclidean')

    # kMedoids initialises randomly from NumPy's global RNG; seed it so the
    # run is reproducible.
    np.random.seed(200)

    # M holds the medoid indices; C maps each cluster to its member indices.
    M, C = kMedoids(D, k)

    print('clustering result:')
    # limits[i] is the median distance from medoid i to the members of its cluster.
    limits = np.array([
        getMedianDistance(oneclass.loc[M[label]], oneclass.loc[C[label]])
        for label in C
    ])

    print("limits: ", limits)

    realClass = df.loc[:,"defects"]
    df = df.drop(['defects'], axis=1)

    print(df.shape)
    pred = setClass(oneclass.loc[M], df, limits)

    acc, f1, tn, fp, fn, tp = getMetrics(df, realClass, pred)
    print("ACC: ",acc)
    print ("F1: ",f1)
    print ("TP: ",tp)
    print ("FP: ",fp)

Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,-0.66833,-0.477556,-0.298416,-0.389989,-0.646007,-0.532158,7.24775,-0.948668,-1.005309,-0.260268,...,-0.21025,-0.39853,-0.477629,19.935901,-1.457012,-0.715579,-0.646896,-0.625798,-0.527871,False
1,-0.670671,-0.525524,-0.40785,-0.463264,-0.647366,-0.532335,5.363055,-0.968256,-1.013426,-0.26027,...,-0.327896,-0.437286,-0.527725,9.937836,-1.477828,-0.72148,-0.64838,-0.6281,-0.554436,True
2,-0.132163,-0.045848,-0.40785,-0.09689,-0.366604,-0.349914,-0.228207,-0.413265,-0.160051,-0.238366,...,-0.327896,-0.476041,-0.277242,-0.060229,-0.020732,-0.3084,-0.329346,-0.420923,-0.023125,False
3,-0.225817,-0.165767,0.412905,-0.280077,-0.439059,-0.405352,-0.542323,0.01114,-0.676027,-0.234552,...,-0.445541,-0.476041,-0.427532,-0.060229,0.083346,-0.51494,-0.425798,-0.455452,-0.155953,False
4,-0.132163,0.07407,0.960075,-0.280077,-0.325849,-0.328009,-0.542323,0.09798,-0.500157,-0.215514,...,-0.445541,-0.476041,-0.427532,-0.060229,0.083346,-0.396917,-0.314508,-0.340354,0.109703,False


clustering result:
limits:  [1.47853256]
(139, 14)
AUC:  0.5607709750566894
ACC:  0.539568345323741
F1:  0.4920634920634921
TP:  31
FP:  46


Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,-0.647864,-0.36874,-0.124739,-0.339674,-0.580634,-0.498642,3.093249,-0.695929,-0.927649,-0.300506,...,-0.51795,0.341722,0.062345,2.652867,-1.122654,-0.683788,-0.576537,-0.54844,-0.419224,False
1,-0.651226,-0.471309,-0.306546,-0.458191,-0.584224,-0.499223,2.146739,-0.734088,-0.941606,-0.300524,...,-0.559302,0.017524,-0.196995,1.232121,-1.157565,-0.700191,-0.5804,-0.554677,-0.47057,True
2,2.105319,2.0929,-0.306546,2.504721,1.449754,1.296396,-0.882093,2.069348,0.885308,0.925197,...,2.087232,2.935302,1.099707,-0.188624,1.809801,1.268178,1.467364,1.409972,2.096706,True
3,0.861512,1.323638,1.96604,1.615847,1.090817,0.990097,-0.787442,1.028871,1.42217,0.355163,...,0.929373,0.341722,0.840366,-0.188624,1.460699,1.514224,1.11963,1.035753,1.326523,True
4,0.155568,0.041533,-0.306546,0.134392,0.097757,-0.007646,-0.661241,0.328012,0.279593,-0.164008,...,0.26774,-0.306673,0.062345,-0.188624,0.587944,0.037947,0.192341,-0.055719,0.042885,True


clustering result:
limits:  [1.26017086]
(683, 13)
AUC:  0.6589421044491417
ACC:  0.6515373352855052
F1:  0.6925064599483204
TP:  268
FP:  180


In [7]:
# k-medoids implementation taken from: https://github.com/letiantian/kmedoids

# cm1 (498 rows) - class distribution: the class value (defects) is discrete
#%    false:   449 = 90.16%
#%    true:   49 =  9.83%

# kc1 (2109 rows) - class distribution: the class value (problems) is discrete
#%    yes:  326 = 15.45%
#%    no:  1783 = 84.54%
