In [566]:
# !pip3 install sklearn
# !pip3 install scipy
# !pip3 install pandas
# !pip3 install numpy
# !pip3 install kneed
# !pip3 install plotly

In [567]:
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from scipy.io import arff

from kneed import KneeLocator

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

## Utils: Visualização

In [568]:
def reduce_dimensions(df, n_dimensions):
    pca = PCA(n_components=n_dimensions)
    df = pca.fit_transform(df[df.columns].values)
    return df

def plot_clusters(df, kmeans):
    fig = go.Figure(
        data=[go.Bar(x=[1, 2, 3], y=[1, 3, 2])],
        layout=dict(title=dict(text="A Figure Specified By A Graph Object"))
    )
    fig.data = []
    fig.layout = {}
        
    label = kmeans.fit_predict(df)
    u_labels = np.unique(label)
    
    for i in u_labels:
        fig.add_trace(
            go.Scatter(
                x=df[label == i , 0],
                y=df[label == i , 1],
                mode="markers",
                text=u_labels))
    fig.show()

## pré-processamento

In [569]:
def group_by_class(X):
    lines_to_remove = []
    for index, row in X.iterrows():
        if row['defects'] == b'true':
            lines_to_remove.append(index)
    defects_false = X.drop(lines_to_remove)
    defects_true = X.iloc[lines_to_remove]
    return defects_false, defects_true

def normalize_data(X):
    arr = X.values
    scaler = preprocessing.MinMaxScaler()
    X_scaled = scaler.fit_transform(arr)
    return pd.DataFrame(X_scaled)

def randomize_dataframe(df):
    df = df.sample(frac=1, random_state=20)
    df.reset_index(drop=True, inplace=True)
    return df

In [570]:
resultados = {
    "defects_false": {},
    "defects_true": {},
    "naive_bayes": []
}

In [571]:
data = arff.loadarff('./datasets/CM1.arff')
df = pd.DataFrame(data[0])
df_false, df_true = group_by_class(df)

df_false = df_false.drop(columns=['defects'])
df_true = df_true.drop(columns=['defects'])

df_false = normalize_data(df_false)
df_true = normalize_data(df_true)

## treinamento

In [572]:
def get_sse(df, k_range, args):
    sse = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, **args)
        kmeans.fit(df)
        sse.append(kmeans.inertia_)
        
    return sse

def get_best_k(k_range, sse):
    kl = KneeLocator(k_range, sse, curve="convex", direction="decreasing")
    return kl.elbow

def elbow_plot(k_range, sse):
    fig = px.line(x=k_range, y=sse)
    fig.show()
    
def generate_metaclasses(df, args):
    kmeans = KMeans(**args)
    metaclass = kmeans.fit_predict(df)
    df['metaclass'] = metaclass
    return df

def nb_fit(df):
    gnb = GaussianNB()
    
    y = df['metaclass']
    X = df.drop(columns=['metaclass'])
    
    return gnb.fit(X, y)

def nb_predict(gnb, test):
    test = test.drop(columns=['metaclass'])
    predicted_metaclasses = gnb.predict(test)
    return predicted_metaclasses

def nb_score(gnb, test_samples, true_labels):
    return gnb.score(test_samples, true_labels)


### clusters gerados com defects = false

In [573]:
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
sse = get_sse(df_false, k_range, args)
best_k = get_best_k(k_range, sse) #if get_best_k(k_range, sse) else 4

elbow_plot(k_range, sse)
resultados["defects_false"] = {"best_k": best_k}

In [574]:
args["n_clusters"] = best_k
df_false = generate_metaclasses(df_false, args)
df_false_copy = df_false.copy()

### clusters gerados com defects = true

In [575]:
# criar meta classes de todo mundo
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
sse = get_sse(df_true, k_range, args)
best_k = get_best_k(k_range, sse)
print("Melhor k: {}".format(best_k))
elbow_plot(k_range, sse)
resultados["defects_true"] = {"best_k": best_k}

Melhor k: 3


In [576]:
args["n_clusters"] = best_k
df_true = generate_metaclasses(df_true, args)
df_true["metaclass"] = df_true["metaclass"].apply(lambda x: x + 4)
df_true_copy = df_true.copy()

In [577]:
args["n_clusters"] = 7
df_true_false = pd.concat([df_false_copy, df_true_copy])
df_true_false_2d = reduce_dimensions(df_true_false, 2)
plot_clusters(df_true_false_2d, KMeans(**args))

## Pós processamento

In [578]:
df_true_copy["metaclass"] = df_true_copy["metaclass"].apply(lambda x: x + 4)
df_final = pd.concat([df_false_copy, df_true_copy])
df_final = df_final.sample(frac=1, random_state=20)

In [579]:
# Verificar acertos por cluster

kf = KFold(n_splits=5)

results = []
i = 0
for train, test in kf.split(df_final):
    fold_results = {}
    fold_results["fold"] = i
    result = {}
    print("Running fold {}".format(i))
    result["fold"] = i
    training_set = df_final.iloc[train]
    gnb = nb_fit(training_set)

    test_set = df_final.iloc[test]
    
    real_labels = test_set['metaclass']
    result["real_labels"] = real_labels
    
    predicted_classes = nb_predict(gnb, test_set)
    result["predicted_classes"] = predicted_classes
    
    test_set = test_set.drop(columns=['metaclass'])
    acc = nb_score(gnb, test_set, real_labels)
    fold_results["cluster_hit"] = acc
    resultados["naive_bayes"].append(fold_results)
    results.append(result)
    i+=1
    

Running fold 0
Running fold 1
Running fold 2
Running fold 3
Running fold 4


In [580]:
### Verificar acertos por label
label_hits = 0
label_false = set([0,1,2,3])
label_true = set([4,5,6])

for result in results:
    predicted_classes = result["predicted_classes"]
    real_labels = result["real_labels"]
    
    for i, j in zip(predicted_classes, real_labels):
        if i == j:
            label_hits += 1
        elif ((i in label_false and j in label_false) or (i in label_true and j in label_true)):
            label_hits += 1
    
    acc = label_hits/len(predicted_classes)
    resultados["naive_bayes"][result["fold"]]["label_hit"] = acc
    acc = 0
    label_hits = 0

In [581]:
hit_rate = []
cluster_hit = []
label_hit = []
for i in range(len(resultados['naive_bayes'])):
    label_hit.append(resultados['naive_bayes'][i]['label_hit'])
    cluster_hit.append(resultados['naive_bayes'][i]['cluster_hit'])

resultados['std_cluster_hit'] = np.std(cluster_hit)
resultados['std_label_hit'] = np.std(label_hit)
resultados['mean_cluster_hit'] = np.mean(cluster_hit)
resultados['mean_label_hit'] = np.mean(label_hit)


In [582]:
s = "{'defects_false': {'best_k': 4}, 'defects_true': {'best_k': 3}, 'naive_bayes': [{'fold': 0, 'hit_rate': 0.95, 'cluster_hit': 0.95, 'label_hit': 0.95}, {'fold': 1, 'hit_rate': 0.82, 'cluster_hit': 0.82, 'label_hit': 0.83}, {'fold': 2, 'hit_rate': 0.9, 'cluster_hit': 0.9, 'label_hit': 0.92}, {'fold': 3, 'hit_rate': 0.8585858585858586, 'cluster_hit': 0.8585858585858586, 'label_hit': 0.9090909090909091}, {'fold': 4, 'hit_rate': 0.7676767676767676, 'cluster_hit': 0.7676767676767676, 'label_hit': 0.8787878787878788}], 'std_hit_rate': 0.06296472890445597, 'std_cluster_hit': 0.06296472890445597, 'std_label_hit': 0.04075098516050726, 'mean_hit_rate': 0.8592525252525253, 'mean_cluster_hit': 0.8592525252525253, 'mean_label_hit': 0.8975757575757575}"
s = s.replace("'", "\"")
s

'{"defects_false": {"best_k": 4}, "defects_true": {"best_k": 3}, "naive_bayes": [{"fold": 0, "hit_rate": 0.95, "cluster_hit": 0.95, "label_hit": 0.95}, {"fold": 1, "hit_rate": 0.82, "cluster_hit": 0.82, "label_hit": 0.83}, {"fold": 2, "hit_rate": 0.9, "cluster_hit": 0.9, "label_hit": 0.92}, {"fold": 3, "hit_rate": 0.8585858585858586, "cluster_hit": 0.8585858585858586, "label_hit": 0.9090909090909091}, {"fold": 4, "hit_rate": 0.7676767676767676, "cluster_hit": 0.7676767676767676, "label_hit": 0.8787878787878788}], "std_hit_rate": 0.06296472890445597, "std_cluster_hit": 0.06296472890445597, "std_label_hit": 0.04075098516050726, "mean_hit_rate": 0.8592525252525253, "mean_cluster_hit": 0.8592525252525253, "mean_label_hit": 0.8975757575757575}'

-------------------------------------------

## Naïve Bayes

In [583]:
data = arff.loadarff('./datasets/CM1.arff')
df = pd.DataFrame(data[0])

In [584]:
defects = df['defects']# = df['defects'].apply(lambda x: 0 if x == "b'false'" else 1)
df = df.drop(columns='defects')
df = normalize_data(df)
df['defects'] = defects
df['defects'] = df['defects'].apply(str).str.replace("b|'", '')
df['defects'] = df['defects'].apply(lambda x: 1 if x == 'true' else 0)
df = randomize_dataframe(df)


The default value of regex will change from True to False in a future version.



In [585]:
kf = KFold(n_splits=5)

results = []
i = 0
for train, test in kf.split(df):
    result = {}
    
    result["fold"] = i
    
    training_set = df.iloc[train]
    target_values = training_set['defects']
    training_set = training_set.drop(columns=['defects'])
    
    gnb = GaussianNB()
    gnb.fit(training_set, target_values)

    test_set = df.iloc[test]
    real_labels = test_set['defects']
    result["real_labels"] = real_labels
    
    test_set = test_set.drop(columns=['defects'])
    predicted_classes = gnb.predict(test_set)
    result["predicted_classes"] = predicted_classes
    

    acc = gnb.score(test_set, real_labels)
    results.append(result)
    i+=1

## 1NN

In [586]:

kf = KFold(n_splits=5)

results = []
i = 0
for train, test in kf.split(df):
    result = {}
    result["fold"] = i
    
    
    training_set = df.iloc[train]
    target_values = training_set['defects']
    training_set = training_set.drop(columns=['defects'])
    
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(training_set, target_values)
    
    test_set = df.iloc[test]
    real_labels = test_set['defects']
    result["real_labels"] = real_labels
    
    test_set = test_set.drop(columns=['defects'])
    predicted_classes = knn.predict(test_set)
    result["predicted_classes"] = predicted_classes
    
    acc = 0
    for j, k in  zip(predicted_classes, real_labels):
        if j == k:
            acc += 1
    
    acc = acc/len(predicted_classes)
    i+=1
