In [89]:
!pip3 install sklearn
!pip3 install scipy
!pip3 install pandas
!pip3 install numpy
!pip3 install kneed
!pip3 install plotly



In [90]:
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from scipy.io import arff

from kneed import KneeLocator

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

## Utils: Visualização

In [91]:
def reduce_dimensions(df, n_dimensions):
    """Project the rows of ``df`` with PCA.

    Args:
        df: pandas DataFrame; every column is fed to the PCA.
        n_dimensions: value passed to ``PCA(n_components=...)``.

    Returns:
        The result of ``PCA.fit_transform`` on the frame's values.
    """
    feature_matrix = df[df.columns].values
    projector = PCA(n_components=n_dimensions)
    return projector.fit_transform(feature_matrix)

def plot_clusters(df, kmeans):
    """Cluster 2-D points with ``kmeans`` and show one scatter trace per cluster.

    Args:
        df: array indexed as ``df[mask, column]``; column 0 is drawn on the
            x axis and column 1 on the y axis.
        kmeans: estimator exposing ``fit_predict``; it is fitted on ``df`` here.
    """
    labels = kmeans.fit_predict(df)

    # Start from an empty figure instead of creating a placeholder bar chart
    # and wiping its data and layout afterwards.
    fig = go.Figure()
    for cluster_id in np.unique(labels):
        members = labels == cluster_id
        fig.add_trace(
            go.Scatter(
                x=df[members, 0],
                y=df[members, 1],
                mode="markers",
                # Name each trace after its own cluster; the previous version
                # attached the full list of unique labels to every trace as
                # per-point text.
                name="cluster {}".format(cluster_id)))
    fig.show()

## Pré-processamento

In [92]:
def group_by_class(X):
    """Split ``X`` into non-defective and defective rows.

    A row counts as defective when its ``'defects'`` value equals ``b'true'``.

    Args:
        X: pandas DataFrame with a ``'defects'`` column.

    Returns:
        Tuple ``(defects_false, defects_true)``. Both frames keep the original
        row order, index labels and columns.
    """
    # A boolean mask works for any index. The previous version collected index
    # labels and then used them with both ``drop`` (label-based) and ``iloc``
    # (position-based), which only agrees on a default 0..n-1 index.
    is_defective = X['defects'] == b'true'
    defects_false = X[~is_defective]
    defects_true = X[is_defective]
    return defects_false, defects_true

def normalize_data(X):
    """Min-max scale the values of ``X``.

    Args:
        X: pandas DataFrame whose ``.values`` are passed to ``MinMaxScaler``.

    Returns:
        A new DataFrame built from the scaled array alone, so the original
        column names and index are not carried over.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(X.values)
    return pd.DataFrame(scaled_values)

def randomize_dataframe(df, random_state=20):
    """Return a shuffled copy of ``df`` with a fresh 0..n-1 index.

    Args:
        df: pandas DataFrame to shuffle; it is not modified.
        random_state: seed forwarded to ``DataFrame.sample``. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        A DataFrame holding all rows of ``df`` in shuffled order.
    """
    shuffled = df.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

In [93]:
# Load the CM1 dataset and split it on the value of the 'defects' column.
data = arff.loadarff('./datasets/CM1.arff')
df = pd.DataFrame(data[0])
df_false, df_true = group_by_class(df)

# Remove the class column and min-max scale each group on its own.
df_false = normalize_data(df_false.drop(columns=['defects']))
df_true = normalize_data(df_true.drop(columns=['defects']))

## Treinamento

In [94]:
def get_sse(df, k_range, args):
    """Fit K-Means once per candidate k and collect each model's inertia.

    Args:
        df: data passed to ``KMeans.fit``.
        k_range: iterable of cluster counts to try.
        args: extra keyword arguments forwarded to ``KMeans``.

    Returns:
        List of ``inertia_`` values, in the order of ``k_range``.
    """
    return [KMeans(n_clusters=k, **args).fit(df).inertia_ for k in k_range]

def get_best_k(k_range, sse):
    """Find the elbow of a convex, decreasing SSE curve with ``KneeLocator``.

    Args:
        k_range: candidate cluster counts (x values).
        sse: SSE value for each entry of ``k_range`` (y values).

    Returns:
        The locator's ``elbow`` attribute.
        NOTE(review): presumably ``None`` when no elbow is detected - confirm
        against the kneed documentation.
    """
    locator = KneeLocator(
        k_range,
        sse,
        curve="convex",
        direction="decreasing",
    )
    return locator.elbow

def elbow_plot(k_range, sse):
    """Show a line plot of SSE against the candidate number of clusters.

    Args:
        k_range: candidate cluster counts, drawn on the x axis.
        sse: SSE value for each entry of ``k_range``, drawn on the y axis.
    """
    # Plot against the caller's k values. The previous version ignored
    # ``k_range`` and passed a hard-coded ``range(2, 7)`` as the first
    # positional argument of ``px.line``, which is ``data_frame`` and not ``x``.
    fig = px.line(x=list(k_range), y=sse, labels={"x": "k", "y": "SSE"})
    fig.show()
    
def generate_metaclasses(df, args):
    """Cluster ``df`` with K-Means and return it with a ``'metaclass'`` column.

    Args:
        df: pandas DataFrame of features. It is not modified; if it already
            has a ``'metaclass'`` column, that column is excluded from the
            clustering and replaced in the result.
        args: keyword arguments forwarded to ``KMeans``.

    Returns:
        A new DataFrame equal to ``df`` plus the ``'metaclass'`` column holding
        the labels returned by ``KMeans.fit_predict``.
    """
    # Never cluster on a previous labelling: re-running the calling cell would
    # otherwise feed the old 'metaclass' column back in as a feature.
    features = df.drop(columns=['metaclass'], errors='ignore')
    metaclass = KMeans(**args).fit_predict(features)
    # assign() returns a copy, so the caller's frame is left untouched.
    return df.assign(metaclass=metaclass)

def nb_fit(df):
    """Fit a Gaussian Naive Bayes model that predicts ``'metaclass'``.

    Args:
        df: pandas DataFrame; ``'metaclass'`` is the target and all remaining
            columns are the features.

    Returns:
        The value returned by ``GaussianNB.fit``.
    """
    features = df.drop(columns=['metaclass'])
    targets = df['metaclass']
    return GaussianNB().fit(features, targets)

def nb_predict(gnb, test):
    """Predict metaclasses for ``test`` with a fitted model.

    Args:
        gnb: object exposing ``predict``.
        test: pandas DataFrame containing a ``'metaclass'`` column, which is
            dropped before predicting; the input frame itself is not modified.

    Returns:
        Whatever ``gnb.predict`` returns for the remaining columns.
    """
    features = test.drop(columns=['metaclass'])
    return gnb.predict(features)

def nb_score(gnb, test_samples, true_labels):
    """Score a fitted model on labelled samples.

    Args:
        gnb: object exposing ``score(samples, labels)``.
        test_samples: feature rows to evaluate.
        true_labels: expected label for each row.

    Returns:
        The value returned by ``gnb.score``.
    """
    accuracy = gnb.score(test_samples, true_labels)
    return accuracy


### clusters gerados com defects = false

In [95]:
# Create metaclasses for the non-defective samples: compute the K-Means SSE
# for each candidate k and pick the elbow of the curve.
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
sse = get_sse(df_false, k_range, args)
# Run the elbow search once (it used to be evaluated twice) and fall back to
# 4 when no elbow is reported.
best_k = get_best_k(k_range, sse) or 4
print("Melhor k: {}".format(best_k))
elbow_plot(k_range, sse)

Melhor k: 4


In [96]:
# Cluster the non-defective samples with the selected k and keep a snapshot of
# the labelled frame for the post-processing step.
args.update(n_clusters=best_k)
df_false = generate_metaclasses(df_false, args)
df_false_copy = df_false.copy()

In [97]:
# Project only the feature columns. 'metaclass' holds cluster ids and must not
# leak into the PCA that feeds the plot.
df_false_2d = reduce_dimensions(df_false.drop(columns=['metaclass']), 2)
plot_clusters(df_false_2d, KMeans(**args))

In [98]:
# 5-fold cross-validation of Gaussian Naive Bayes predicting the K-Means
# metaclass of the non-defective samples.
kf = KFold(n_splits=5)
for i, (train, test) in enumerate(kf.split(df_false)):
    print("Running fold {}".format(i))
    gnb = nb_fit(df_false.iloc[train])

    test_set = df_false.iloc[test]
    real_labels = test_set['metaclass']

    # Only the score is reported here, so the unused nb_predict call that
    # preceded it was removed.
    acc = nb_score(gnb, test_set.drop(columns=['metaclass']), real_labels)
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')

Running fold 0
Acurácia: 92.22%


Running fold 1
Acurácia: 95.56%


Running fold 2
Acurácia: 98.89%


Running fold 3
Acurácia: 87.78%


Running fold 4
Acurácia: 96.63%




### clusters gerados com defects = true

In [99]:
# Create metaclasses for the defective samples: compute the K-Means SSE for
# each candidate k and pick the elbow of the curve.
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
sse = get_sse(df_true, k_range, args)
# Guard against a missing elbow, as the non-defective cell does. The fallback
# of 3 matches the three defective metaclasses assumed by post-processing.
best_k = get_best_k(k_range, sse) or 3
print("Melhor k: {}".format(best_k))
elbow_plot(k_range, sse)

Melhor k: 3


In [100]:
# Cluster the defective samples with the selected k and keep a snapshot of the
# labelled frame for the post-processing step.
args.update(n_clusters=best_k)
df_true = generate_metaclasses(df_true, args)
df_true_copy = df_true.copy()

In [101]:
# Project only the feature columns. 'metaclass' holds cluster ids and must not
# leak into the PCA that feeds the plot.
df_true_2d = reduce_dimensions(df_true.drop(columns=['metaclass']), 2)
plot_clusters(df_true_2d, KMeans(**args))

In [102]:
# 5-fold cross-validation of Gaussian Naive Bayes predicting the K-Means
# metaclass of the defective samples.
kf = KFold(n_splits=5)
for i, (train, test) in enumerate(kf.split(df_true)):
    print("Running fold {}".format(i))
    gnb = nb_fit(df_true.iloc[train])

    test_set = df_true.iloc[test]
    real_labels = test_set['metaclass']

    # Only the score is reported here, so the unused nb_predict call that
    # preceded it was removed.
    acc = nb_score(gnb, test_set.drop(columns=['metaclass']), real_labels)
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')

Running fold 0
Acurácia: 90.0%


Running fold 1
Acurácia: 90.0%


Running fold 2
Acurácia: 70.0%


Running fold 3
Acurácia: 100.0%


Running fold 4
Acurácia: 100.0%




## Pós-processamento

In [103]:
# Shift the defective metaclass ids past the non-defective ones so the two
# label ranges stay disjoint after the frames are merged.
METACLASS_OFFSET = 4
assert df_false_copy["metaclass"].max() < METACLASS_OFFSET, (
    "non-defective metaclass ids overlap the shifted defective ids")

# Build a shifted copy instead of editing df_true_copy in place, so re-running
# this cell does not add the offset a second time.
df_true_shifted = df_true_copy.assign(
    metaclass=df_true_copy["metaclass"] + METACLASS_OFFSET)

df_final = pd.concat([df_false_copy, df_true_shifted])
df_final = df_final.sample(frac=1, random_state=20)

In [104]:
# K-Means settings for visualising the merged data set (7 clusters).
args = {
    "n_clusters": 7,
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}


# Project only the feature columns. 'metaclass' holds cluster ids and must not
# leak into the PCA that feeds the plot.
reduced = reduce_dimensions(df_final.drop(columns=['metaclass']), 2)
plot_clusters(reduced, KMeans(**args))

In [105]:
# 5-fold cross-validation of Gaussian Naive Bayes on the merged metaclasses.
# Each fold's true and predicted labels are stored in `results` for the
# per-cluster and per-label analyses.
kf = KFold(n_splits=5)

results = []
for i, (train, test) in enumerate(kf.split(df_final)):
    print("Running fold {}".format(i))
    gnb = nb_fit(df_final.iloc[train])

    test_set = df_final.iloc[test]
    real_labels = test_set['metaclass']
    predicted_classes = nb_predict(gnb, test_set)

    results.append({
        "fold": i,
        "real_labels": real_labels,
        "predicted_classes": predicted_classes,
    })

    # Derive the accuracy from the predictions already made; calling the
    # model's score method would predict the same fold a second time.
    acc = np.mean(predicted_classes == real_labels.to_numpy())
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')

Running fold 0
Acurácia: 95.0%


Running fold 1
Acurácia: 82.0%


Running fold 2
Acurácia: 90.0%


Running fold 3
Acurácia: 85.86%


Running fold 4
Acurácia: 76.77%




In [106]:
# Check hits per cluster: a prediction counts only when it matches the exact
# metaclass.
print("Acertos por cluster")
print('\n')
for result in results:
    print('Avaliando fold {}'.format(result["fold"]))

    predicted_classes = result["predicted_classes"]
    real_labels = result["real_labels"]

    # Count per fold inside the loop; the counter used to live outside it and
    # depended on a manual reset at the end of every iteration.
    cluster_hits = sum(
        1 for i, j in zip(predicted_classes, real_labels) if i == j)

    acc = cluster_hits/len(predicted_classes)
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')

Acertos por cluster


Avaliando fold 0
Acurácia: 95.0%


Avaliando fold 1
Acurácia: 82.0%


Avaliando fold 2
Acurácia: 90.0%


Avaliando fold 3
Acurácia: 85.86%


Avaliando fold 4
Acurácia: 76.77%




In [107]:
# Check hits per label: a prediction counts when it matches the true metaclass
# exactly or falls in the same group (non-defective vs. defective) as it.
print("Acertos por label")
print('\n')
label_false = {0, 1, 2, 3}   # metaclass ids of the non-defective clusters
label_true = {4, 5, 6}       # metaclass ids of the defective clusters

for result in results:
    print('Avaliando fold {}'.format(result["fold"]))

    predicted_classes = result["predicted_classes"]
    real_labels = result["real_labels"]

    # Count per fold inside the loop; the counter used to live outside it and
    # depended on a manual reset at the end of every iteration.
    label_hits = sum(
        1 for i, j in zip(predicted_classes, real_labels)
        if i == j
        or (i in label_false and j in label_false)
        or (i in label_true and j in label_true))

    acc = label_hits/len(predicted_classes)
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')

Acertos por label


Avaliando fold 0
Acurácia: 96.0%


Avaliando fold 1
Acurácia: 84.0%


Avaliando fold 2
Acurácia: 92.0%


Avaliando fold 3
Acurácia: 90.91%


Avaliando fold 4
Acurácia: 87.88%




-------------------------------------------

## Naïve Bayes

In [108]:
# Reload the raw CM1 dataset for the baseline classifiers.
data = arff.loadarff('./datasets/CM1.arff')
records = data[0]
df = pd.DataFrame(records)

In [109]:
# Scale the feature columns, then re-attach the class as a 0/1 integer column
# and shuffle the rows.
defects = df['defects']
df = normalize_data(df.drop(columns='defects'))
# Compare against the raw bytes value directly. The previous
# str.replace("b|'", '') relied on the pattern being read as a regex, which is
# exactly what the FutureWarning says will change; as a literal pattern it
# matches nothing and every label would silently become 0.
df['defects'] = (defects == b'true').astype(int)
df = randomize_dataframe(df)


The default value of regex will change from True to False in a future version.



In [110]:
# 5-fold cross-validation of Gaussian Naive Bayes on the binary 'defects' label.
kf = KFold(n_splits=5)

results = []
for i, (train, test) in enumerate(kf.split(df)):
    print("Running fold {}".format(i))

    training_set = df.iloc[train]
    gnb = GaussianNB()
    gnb.fit(training_set.drop(columns=['defects']), training_set['defects'])

    test_set = df.iloc[test]
    real_labels = test_set['defects']
    predicted_classes = gnb.predict(test_set.drop(columns=['defects']))

    results.append({
        "fold": i,
        "real_labels": real_labels,
        "predicted_classes": predicted_classes,
    })

    # Derive the accuracy from the predictions already made; gnb.score would
    # predict the same fold a second time.
    acc = np.mean(predicted_classes == real_labels.to_numpy())
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')

Running fold 0
Acurácia: 87.0%


Running fold 1
Acurácia: 89.0%


Running fold 2
Acurácia: 68.0%


Running fold 3
Acurácia: 87.88%


Running fold 4
Acurácia: 79.8%




## 1NN

In [111]:

# 5-fold cross-validation of a 1-nearest-neighbour classifier on the binary
# 'defects' label.
kf = KFold(n_splits=5)

results = []
for i, (train, test) in enumerate(kf.split(df)):
    print("Running fold {}".format(i))

    training_set = df.iloc[train]
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(training_set.drop(columns=['defects']), training_set['defects'])

    test_set = df.iloc[test]
    real_labels = test_set['defects']
    predicted_classes = knn.predict(test_set.drop(columns=['defects']))

    # Store the fold. The per-fold dict used to be built but never appended,
    # so `results` stayed empty after this cell.
    results.append({
        "fold": i,
        "real_labels": real_labels,
        "predicted_classes": predicted_classes,
    })

    # Fraction of test rows whose prediction equals the true label.
    acc = np.mean(predicted_classes == real_labels.to_numpy())
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')


Running fold 0
Acurácia: 84.0%


Running fold 1
Acurácia: 90.0%


Running fold 2
Acurácia: 82.0%


Running fold 3
Acurácia: 88.89%


Running fold 4
Acurácia: 83.84%


