In [113]:
!pip3 install sklearn
!pip3 install scipy
!pip3 install pandas
!pip3 install numpy
!pip3 install kneed
!pip3 install plotly



In [114]:
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from scipy.io import arff

from kneed import KneeLocator

import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

## Utils: Visualização

In [115]:
def reduce_dimensions(df, n_dimensions):
    pca = PCA(n_components=n_dimensions)
    df = pca.fit_transform(df[df.columns].values)
    return df

def plot_clusters(df, kmeans):
    fig = go.Figure(
        data=[go.Bar(x=[1, 2, 3], y=[1, 3, 2])],
        layout=dict(title=dict(text="A Figure Specified By A Graph Object"))
    )
    fig.data = []
    fig.layout = {}
        
    label = kmeans.fit_predict(df)
    u_labels = np.unique(label)
    
    for i in u_labels:
        fig.add_trace(
            go.Scatter(
                x=df[label == i , 0],
                y=df[label == i , 1],
                mode="markers",
                text=u_labels))
    fig.show()

## pré-processamento

In [116]:
def group_by_class(X):
    lines_to_remove = []
    for index, row in X.iterrows():
        if row['defects'] == b'true':
            lines_to_remove.append(index)
    defects_true = X.iloc[lines_to_remove]
    defects_false = X.drop(lines_to_remove)
    return defects_false, defects_true

def normalize_data(X):
    arr = X.values
    scaler = preprocessing.MinMaxScaler()
    X_scaled = scaler.fit_transform(arr)
    return pd.DataFrame(X_scaled)

def randomize_dataframe(df):
    df = df.sample(frac=1, random_state=20)
    df.reset_index(drop=True, inplace=True)
    return df

In [117]:
data = arff.loadarff('./datasets/CM1.arff')
df = pd.DataFrame(data[0])
df_false, df_true = group_by_class(df)

df_false = df_false.drop(columns=['defects'])
df_true = df_true.drop(columns=['defects'])

df_false = normalize_data(df_false)
df_true = normalize_data(df_true)

## treinamento

In [118]:
def get_sse(df, k_range, args):
    sse = []
    for k in k_range:
        kmeans = KMeans(n_clusters=k, **args)
        kmeans.fit(df)
        sse.append(kmeans.inertia_)
        
    return sse

def get_best_k(k_range, sse):
    kl = KneeLocator(k_range, sse, curve="convex", direction="decreasing")
    return kl.elbow

def elbow_plot(k_range, sse):
    fig = px.line(x=k_range, y=sse)
    fig.show()
    
def generate_metaclasses(df, args):
    kmeans = KMeans(**args)
    metaclass = kmeans.fit_predict(df)
    df['metaclass'] = metaclass
    return df

def nb_fit(df):
    gnb = GaussianNB()
    
    y = df['metaclass']
    X = df.drop(columns=['metaclass'])
    
    return gnb.fit(X, y)

def nb_predict(gnb, test):
    test = test.drop(columns=['metaclass'])
    predicted_metaclasses = gnb.predict(test)
    return predicted_metaclasses

def nb_score(gnb, test_samples, true_labels):
    return gnb.score(test_samples, true_labels)


## Clusterização por fold

In [119]:
kf = KFold(n_splits=5)
k_range = [2,3,4,5,6]
args = {
    "init": "random",
    "n_init": 10,
    "max_iter": 300,
    "random_state": 42,
}
results = []
sets = []
fold = 0
resultados = {
    "fold": [],
    "std_cluster_hit": 0,
    "std_label_hit": 0,
    "mean_cluster_hit": 0,
    "mean_label_hit": 0
}
for train, test in kf.split(df):
    fold_data = {}
    fold_data['fold_number'] = fold
    test_set = df.iloc[test]
    real_classes = test_set['defects']
    real_classes = real_classes.apply(str).str.replace("b|'", '')
    real_classes = real_classes.apply(lambda x: 1 if x == 'true' else 0)

    result = {}
    
    #get training set
    training_set = df.iloc[train]
    training_set.reset_index(drop=True, inplace=True)
    
    #split true and false
    label_false, label_true = group_by_class(training_set)
    label_false = label_false.drop(columns=['defects'])
    label_true = label_true.drop(columns=['defects'])
    
    true_instances = label_true.shape[0]
    fold_data["defects_true"] =  {"number_instances": true_instances}
    
    false_instances = label_false.shape[0]
    fold_data["defects_false"] =  {"number_instances": false_instances}

    #train kmeans with label_false
    if false_instances > 1:
        sse = get_sse(label_false, k_range, args)
        best_k = get_best_k(k_range, sse)
        fold_data['defects_false']['best_k'] = best_k
        #elbow_plot(k_range, sse)

        args['n_clusters'] = best_k
        label_false = generate_metaclasses(label_false, args)
        args.pop('n_clusters')
    elif false_instances == 1:
        label_false['metaclass'] = best_k
    
    #train kmeans with label_true
    if true_instances > 1:
        sse = get_sse(label_true, k_range, args)
        best_k = get_best_k(k_range, sse)
        #elbow_plot(k_range, sse)
        fold_data['defects_true']['best_k'] = best_k
        args['n_clusters'] = best_k
        label_true = generate_metaclasses(label_true, args)
        args.pop('n_clusters')
    elif true_instances == 1:
        label_true['metaclass'] = 0
        fold_data['defects_true']['best_k'] = 1
    
    label_true["metaclass"] = label_true["metaclass"].apply(lambda x: x + best_k)

    
    df_final = pd.concat([label_false, label_true])
    metaclasses = np.unique(df_final['metaclass'])
    fold_data["n_total_clusters"] = len(metaclasses)
    args['n_clusters'] = len(metaclasses)
    to_plot = reduce_dimensions(df_final, 2)
    plot_clusters(to_plot, KMeans(**args))
    args.pop('n_clusters')
    
    # Naive Bayes
    gnb = GaussianNB()
    target_values = df_final['metaclass']
    df_final.drop(columns=['metaclass'])
    gnb.fit(df_final, target_values)
    
    test_set['defects'] = test_set['defects'].apply(str).str.replace("b|'", '')
    test_set['defects'] = test_set['defects'].apply(lambda x: 1 if x == 'true' else 0)
    test_set.drop(columns=['defects'])
    predicted_classes = gnb.predict(test_set)
    fold_data["test_instances"] = len(predicted_classes)
    label_hit = 0
    cluster_hit = 0
    for i, j in zip(predicted_classes, real_classes):       
        if i == j:
            cluster_hit += 1
        
        if ((i in [0,1,2]) and j == 0):
            label_hit += 1
        elif i > 2 and j == 1:
            label_hit += 1
            
    fold_data["naive_bayes"] = {"label_hit": label_hit/len(predicted_classes), "cluster_hit": cluster_hit/len(predicted_classes)}
    acc = label_hit/len(real_classes)
    
    acc = 0
    fold += 1
    resultados["fold"].append(fold_data)



The default value of regex will change from True to False in a future version.




The default value of regex will change from True to False in a future version.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version.




The default value of regex will change from True to False in a future version.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version.




The default value of regex will change from True to False in a future version.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version.




The default value of regex will change from True to False in a future version.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


The default value of regex will change from True to False in a future version.




The default value of regex will change from True to False in a future version.



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [120]:
cluster_hit = []
label_hit = []
for i in range(len(resultados['fold'])):
    label_hit.append(resultados['fold'][i]['naive_bayes']['label_hit'])
    cluster_hit.append(resultados['fold'][i]['naive_bayes']['cluster_hit'])
    

resultados['std_cluster_hit'] = np.std(cluster_hit)
resultados['std_label_hit'] = np.std(label_hit)
resultados['mean_cluster_hit'] = np.mean(cluster_hit)
resultados['mean_label_hit'] = np.mean(label_hit)


s = str(resultados)
s = s.replace("'", "\"")
s

'{"fold": [{"fold_number": 0, "defects_true": {"number_instances": 48, "best_k": 3}, "defects_false": {"number_instances": 350, "best_k": 3}, "n_total_clusters": 6, "test_instances": 100, "naive_bayes": {"label_hit": 0.82, "cluster_hit": 0.03}}, {"fold_number": 1, "defects_true": {"number_instances": 49, "best_k": 3}, "defects_false": {"number_instances": 349, "best_k": 3}, "n_total_clusters": 6, "test_instances": 100, "naive_bayes": {"label_hit": 0.89, "cluster_hit": 0.82}}, {"fold_number": 2, "defects_true": {"number_instances": 49, "best_k": 3}, "defects_false": {"number_instances": 349, "best_k": 3}, "n_total_clusters": 6, "test_instances": 100, "naive_bayes": {"label_hit": 0.86, "cluster_hit": 0.0}}, {"fold_number": 3, "defects_true": {"number_instances": 49, "best_k": 3}, "defects_false": {"number_instances": 350, "best_k": 3}, "n_total_clusters": 6, "test_instances": 99, "naive_bayes": {"label_hit": 0.8888888888888888, "cluster_hit": 0.0}}, {"fold_number": 4, "defects_true": {"n

-------------------------------------------

## Naïve Bayes

In [121]:
data = arff.loadarff('./datasets/CM1.arff')
df = pd.DataFrame(data[0])

In [122]:
defects = df['defects']# = df['defects'].apply(lambda x: 0 if x == "b'false'" else 1)
df = df.drop(columns='defects')
df = normalize_data(df)
df['defects'] = defects
df['defects'] = df['defects'].apply(str).str.replace("b|'", '')
df['defects'] = df['defects'].apply(lambda x: 1 if x == 'true' else 0)
df = randomize_dataframe(df)


The default value of regex will change from True to False in a future version.



In [123]:
kf = KFold(n_splits=5)
resultados = {
    "fold": [],
    "std_hit_rate": 0,
    "mean_hit_rate": 0
}
results = []
i = 0
for train, test in kf.split(df):
    resultado = {}
    
    resultado["fold_number"] = i
    
    training_set = df.iloc[train]
    target_values = training_set['defects']
    training_set = training_set.drop(columns=['defects'])
    
    gnb = GaussianNB()
    gnb.fit(training_set, target_values)

    test_set = df.iloc[test]
    real_labels = test_set['defects']
    result["real_labels"] = real_labels
    
    test_set = test_set.drop(columns=['defects'])
    predicted_classes = gnb.predict(test_set)
    result["predicted_classes"] = predicted_classes
    

    acc = gnb.score(test_set, real_labels)
    resultado["hit_rate"] = acc
    resultados["fold"].append(resultado)
    results.append(result)
    i+=1

hits = []
for i in range(len(resultados['fold'])):
    hits.append(resultados['fold'][i]['hit_rate'])
    

resultados['std_hit_rate'] = np.std(hits)
resultados['mean_hit_rate'] = np.mean(hits)

s = str(resultados)
s = s.replace("'", "\"")
s

'{"fold": [{"fold_number": 0, "hit_rate": 0.87}, {"fold_number": 1, "hit_rate": 0.89}, {"fold_number": 2, "hit_rate": 0.68}, {"fold_number": 3, "hit_rate": 0.8787878787878788}, {"fold_number": 4, "hit_rate": 0.797979797979798}], "std_hit_rate": 0.07859347681907419, "mean_hit_rate": 0.8233535353535354}'

## 1NN

In [124]:

kf = KFold(n_splits=5)

results = []
i = 0
resultados = {
    "fold": [],
    "std_hit_rate": 0,
    "mean_hit_rate": 0
}
for train, test in kf.split(df):
    resultado = {}
    result = {}
    result["fold"] = i
    resultado["fold_number"] = i
    
    
    training_set = df.iloc[train]
    target_values = training_set['defects']
    training_set = training_set.drop(columns=['defects'])
    
    knn = KNeighborsClassifier(n_neighbors=1)
    knn.fit(training_set, target_values)
    
    test_set = df.iloc[test]
    real_labels = test_set['defects']
    result["real_labels"] = real_labels
    
    test_set = test_set.drop(columns=['defects'])
    predicted_classes = knn.predict(test_set)
    result["predicted_classes"] = predicted_classes
    
    acc = 0
    for j, k in  zip(predicted_classes, real_labels):
        if j == k:
            acc += 1
    
    
    acc = acc/len(predicted_classes)
    resultado["hit_rate"] = acc
    resultados["fold"].append(resultado)
    i+=1


hits = []
for i in range(len(resultados['fold'])):
    hits.append(resultados['fold'][i]['hit_rate'])
    

resultados['std_hit_rate'] = np.std(hits)
resultados['mean_hit_rate'] = np.mean(hits)

s = str(resultados)
s = s.replace("'", "\"")
s

'{"fold": [{"fold_number": 0, "hit_rate": 0.84}, {"fold_number": 1, "hit_rate": 0.9}, {"fold_number": 2, "hit_rate": 0.82}, {"fold_number": 3, "hit_rate": 0.8888888888888888}, {"fold_number": 4, "hit_rate": 0.8383838383838383}], "std_hit_rate": 0.031207145495978052, "mean_hit_rate": 0.8574545454545455}'