In [185]:
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from scipy.io import arff
from kneed import KneeLocator

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

## Utils: Visualização

In [186]:
def reduce_dimensions(df, n_dimensions):
    pca = PCA(n_components=n_dimensions)
    df = pca.fit_transform(df[df.columns].values)
    return df

def plot_clusters(df, kmeans):
    fig.data = []
    fig.layout = {}
        
    label = kmeans.fit_predict(df)
    u_labels = np.unique(label)

    for i in u_labels:
        fig.add_trace(
            go.Scatter(
                x=df[label == i , 0],
                y=df[label == i , 1],
                mode="markers",
                text=u_labels))
    fig.show()

## pré-processamento

In [187]:
def group_by_class(X):
    lines_to_remove = []
    for index, row in X.iterrows():
        if row['defects'] == b'true':
            lines_to_remove.append(index)
    defects_false = X.drop(lines_to_remove)
    defects_true = X.iloc[lines_to_remove]
    return defects_false, defects_true

def normalize_data(X):
    arr = X.values
    scaler = preprocessing.MinMaxScaler()
    X_scaled = scaler.fit_transform(arr)
    return pd.DataFrame(X_scaled)

def randomize_dataframe(df):
    df = df.sample(frac=1, random_state=20)
    df.reset_index(drop=True, inplace=True)
    return df

In [188]:
data = arff.loadarff('./datasets/CM1.arff')
df = pd.DataFrame(data[0])
df_false, df_true = group_by_class(df)

df_false = df_false.drop(columns=['defects'])
df_true = df_true.drop(columns=['defects'])

df_false = normalize_data(df_false)
df_true = normalize_data(df_true)

## treinamento

In [189]:
def get_sse(df, k_range, args):
    sse = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, **args)
        kmeans.fit(df)
        sse.append(kmeans.inertia_)
        
    return sse

def get_best_k(k_range, sse):
    kl = KneeLocator(k_range, sse, curve="convex", direction="decreasing")
    return kl.elbow

def elbow_plot(k_range, sse):
    fig = px.line(range(2,7), y=sse)
    fig.show()
    
def generate_metaclasses(df, args):
    kmeans = KMeans(**args)
    metaclass = kmeans.fit_predict(df)
    df['metaclass'] = metaclass
    return df

def nb_fit(df):
    gnb = GaussianNB()
    
    y = df['metaclass']
    X = df.drop(columns=['metaclass'])
    
    return gnb.fit(X, y)

def nb_predict(gnb, test):
    test = test.drop(columns=['metaclass'])
    predicted_metaclasses = gnb.predict(test)
    return predicted_metaclasses

def nb_score(gnb, test_samples, true_labels):
    return gnb.score(test_samples, true_labels)


### clusters gerados com defects = false

In [190]:
# criar meta classes de todo mundo
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
sse = get_sse(df_false, k_range, args)
best_k = get_best_k(k_range, sse)
print("Melhor k: {}".format(best_k))
elbow_plot(k_range, sse)

Melhor k: 4


In [191]:
args["n_clusters"] = best_k
df_false = generate_metaclasses(df_false, args)
df_false_copy = df_false.copy()

In [192]:
df_false_2d = reduce_dimensions(df_false, 2)
plot_clusters(df_false_2d, KMeans(**args))

In [225]:
kf = KFold(n_splits=5)
i = 0
for train, test in kf.split(df_false):
    print("Running fold {}".format(i))
    training_set = df_false.iloc[train]
    gnb = nb_fit(training_set)

    test_set = df_false.iloc[test]
    real_labels = test_set['metaclass']
    predicted_classes = nb_predict(gnb, test_set)
    
    test_set = test_set.drop(columns=['metaclass'])
    acc = nb_score(gnb, test_set, real_labels)
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')
    i += 1

Running fold 0
Acurácia: 93.33%


Running fold 1
Acurácia: 96.67%


Running fold 2
Acurácia: 100.0%


Running fold 3
Acurácia: 88.89%


Running fold 4
Acurácia: 97.75%




### clusters gerados com defects = true

In [216]:
# criar meta classes de todo mundo
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
sse = get_sse(df_true, k_range, args)
best_k = get_best_k(k_range, sse)
print("Melhor k: {}".format(best_k))
elbow_plot(k_range, sse)

Melhor k: 3


In [217]:
args["n_clusters"] = best_k
df_true = generate_metaclasses(df_true, args)
df_true_copy = df_true.copy()

In [218]:
df_true_2d = reduce_dimensions(df_true, 2)
plot_clusters(df_true_2d, KMeans(**args))

In [224]:
kf = KFold(n_splits=5)
i = 0
for train, test in kf.split(df_true):
    print("Running fold {}".format(i))
    training_set = df_true.iloc[train]
    gnb = nb_fit(training_set)

    test_set = df_true.iloc[test]
    real_labels = test_set['metaclass']
    predicted_classes = nb_predict(gnb, test_set)
    
    test_set = test_set.drop(columns=['metaclass'])
    acc = nb_score(gnb, test_set, real_labels)
    print("Acurácia: {}%".format(round(acc*100, 2)))
    print('\n')
    i+=1

Running fold 0
Acurácia: 90.0%


Running fold 1
Acurácia: 90.0%


Running fold 2
Acurácia: 70.0%


Running fold 3
Acurácia: 100.0%


Running fold 4
Acurácia: 100.0%




## Pós processamento

-------------------------------------------

In [220]:
kf = KFold(n_splits=5)
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
i = 0
n_clusters = [5,5,5,5,4]

total_sse = []
for train, test in kf.split(df_false):
    # encontrar o melhor k
    training_set = df_false.iloc[train]
    sse = get_sse(training_set, k_range, args)
    total_sse.append(sse)
    args["n_clusters"] = n_clusters[i]
    
    # aplicar o kmeans com o melhor k encontrado
    training_set = generate_metaclasses(training_set, args)
    
    # aplicar o naïve bayes
    gnb = nb_fit(training_set)
    predicted_classes = nb_predict(gnb, df_false.iloc[test])
    
    print(predicted_classes)
    args.pop("n_clusters")
    i += 1
        

    
    



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



[4 4 4 4 4 2 0 4 4 4 0 4 4 4 4 0 2 4 2 0 2 0 0 0 2 2 4 3 1 4 1 3 3 4 4 4 4
 4 4 4 4 4 4 3 4 0 4 2 4 3 4 4 4 4 0 4 0 4 4 4 4 4 0 2 4 4 4 4 4 3 4 0 3 0
 4 0 0 0 4 4 2 4 2 0 0 0 2 4 0 4]
[0 0 4 2 4 2 2 2 0 0 0 3 4 4 4 0 0 4 0 2 2 3 3 4 0 0 4 3 3 0 3 0 4 4 4 3 4
 2 2 4 2 2 0 0 4 4 2 4 3 2 4 0 4 2 0 0 4 0 4 2 0 4 2 2 0 0 3 3 2 2 3 2 2 2
 2 2 4 0 4 4 0 4 2 4 4 0 2 4 0 0]
[2 2 2 1 1 1 1 4 1 1 2 1 2 2 2 2 2 2 2 2 1 2 1 2 1 4 2 2 1 2 4 1 1 1 1 2 1
 2 2 2 1 1 1 2 4 2 2 2 2 2 2 2 2 2 1 2 2 2 2 4 4 2 1 1 1 2 2 2 2 2 1 2 4 1
 4 4 2 2 1 1 2 1 2 2 4 2 2 1 2 1]
[2 1 4 4 1 4 1 2 1 1 2 4 4 4 4 1 1 1 1 4 4 1 4 1 1 1 4 1 1 4 2 4 1 1 2 1 1
 4 1 2 1 2 1 2 2 2 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2 1 4 1 2 1 2 2 2 1 1 2 1 1
 4 1 1 2 2 1 1 1 1 3 1 1 4 4 1 4]
[0 3 3 3 0 3 3 3 3 3 0 0 2 2 1 1 3 2 2 3 1 1 2 2 2 2 2 3 1 2 2 2 1 2 2 2 1
 1 2 1 2 3 1 1 2 2 2 3 3 3 2 2 2 3 3 2 3 3 3 3 3 2 3 3 3 3 3 3 3 2 1 3 2 0
 3 3 1 2 3 3 1 2 1 2 2 2 1 1 2]




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [221]:
for sse in total_sse:
    print(sse)
    fig = px.line(range(2,7), y=sse)
    fig.show()


[82.47343939162562, 31.299035621511898, 28.719698141767918, 22.610943254334263, 20.91446564132181]


[89.05069606211828, 39.340982734254915, 36.964130922230076, 21.02001682098744, 19.10042457368181]


[93.05601758776793, 39.91286113888512, 37.59104397770374, 24.37288191488862, 20.04212622741805]


[71.06604572320302, 27.527117841454583, 24.809193675107466, 21.67774541729494, 18.019580142447403]


[94.69665752165488, 39.25315589634228, 36.789801126047784, 21.021909502413457, 20.436116898095282]
