In [61]:
# aplicar k-means = {2,3,4,5,6} para UMA classe DONE
# selecionar melhor k com cv DONE
# gerar k grupos para a classe DONE
# usar meta-labels como entrada para naive bayes
# usar cross-validation no naive bayes
# rodar naive bayes sem agrupamento 
# rodar 1-NN 

## Imports

In [62]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score
from sklearn.cluster import KMeans
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import make_scorer, f1_score

## Funções

In [63]:
def generate_meta_classes(X, y):
  # para cada k rodar k-means com cross-validation
  k_list = [2,3,4,5,6]
  k_scores = []
  for k in k_list:
    kmeans = KMeans(n_clusters=k, random_state=0)
    # usa o numero default de folds = 5
    scores = cross_val_score(estimator=kmeans, X=X, y=y)
    # salva a média dos scores de cada fold no cv
    k_scores.append(np.mean(scores))

  # escolhe k de acordo com o maior score do k-means
  best_score = max(k_scores)
  best_k_ind = k_scores.index(best_score)
  best_k = k_list[best_k_ind]

  # roda novamente o k-means com o k ideal
  kmeans = KMeans(n_clusters = best_k, random_state=0)
  kmeans.fit(X)
  return best_k, kmeans.labels_

In [64]:
def gnb_clf(X, y):
  gnb = GaussianNB()
  gnb_scores = cross_val_score(estimator=gnb, X=X, y=y, scoring='f1_weighted')
  gnb_score = np.mean(gnb_scores)
  return gnb_score

## Base 01

In [65]:
df01 = pd.read_csv('/content/base01.csv')
df01.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,2,2,2,2,1.2,1.2,1.2,1.2,1.4,no
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1,1,1.0,1.0,1.0,1.0,1.0,yes
2,2,415.0,59.0,50.0,51.0,1159.0,8411.31,0.01,103.53,81.24,870848.58,2.8,48380.48,359,35,9,10,47.0,106.0,692.0,467.0,106.0,yes
3,3,230.0,33.0,10.0,16.0,575.0,3732.82,0.03,39.82,93.74,148644.06,1.24,8258.0,174,15,34,5,23.0,67.0,343.0,232.0,65.0,yes
4,4,175.0,26.0,12.0,13.0,500.0,3123.96,0.03,29.48,105.96,92103.07,1.04,5116.84,142,7,19,4,18.0,58.0,310.0,190.0,51.0,yes


In [66]:
# transformar label em numerico
df01['21'] = df01['21'].astype('category')
df01['21'] = df01['21'].cat.codes

In [67]:
# separar classe 0
class0_df1 = df01[df01['21'] == 0]

# separar classe 1
class1_df1 = df01[df01['21'] == 1]

print('Classe 0: ', class0_df1.shape, '\nClasse 1: ', class1_df1.shape)

Classe 0:  (415, 23) 
Classe 1:  (107, 23)


In [68]:
# separar labels classe 0
X0_b1 = class0_df1.iloc[:, :-1].values
y0_b1 = class0_df1.iloc[:, -1].values

# separar labels classe 1
X1_b1 = class1_df1.iloc[:, :-1].values
y1_b1 = class1_df1.iloc[:, -1].values

# gerar meta-classes a partir de função definida anteriormente
k0_b1, y0_b1_new = generate_meta_classes(X0_b1, y0_b1)
k1_b1, y1_b1_n = generate_meta_classes(X1_b1, y1_b1)

In [69]:
# normalizar meta-classes da classe 1
y1_b1_new = list(map(lambda x: x + k0_b1, y1_b1_n))

# unir meta-classes
X_b1 = np.concatenate((X0_b1, X1_b1),axis=0)
y_b1 = np.concatenate((y0_b1_new, y1_b1_new), axis=0)

# rodar gnb para meta-classes
gnb_mc_score = gnb_clf(X_b1, y_b1)
print(gnb_mc_score)

0.7688024967130704




In [70]:
X_01 = df01.iloc[:, :-1].values
y_01 = df01.iloc[:, -1].values

# rodar gnb sem agrupamento
gnb_score = gnb_clf(X_01, y_01)
print(gnb_score)

# rodar 1-nn sem agrupamento
knn = KNeighborsClassifier(n_neighbors=1)
knn_scores = cross_val_score(estimator=knn, X=X_01, y=y_01, scoring='f1_weighted')
knn_score = np.mean(knn_scores)
print(knn_score)

0.8177962828374727
0.8250515966543702


## Base 02

In [71]:
df02 = pd.read_csv('/content/base02.csv')
df02.head()

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,1.3,1.3,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1,1,1,1,1.0,1.0,1.0,1.0,1.0,True
2,2,83.0,11.0,1.0,11.0,171.0,927.89,0.04,23.04,40.27,21378.61,0.31,1187.7,65,10,6,0,18.0,25.0,107.0,64.0,21.0,True
3,3,46.0,8.0,6.0,8.0,141.0,769.78,0.07,14.86,51.81,11436.73,0.26,635.37,37,2,5,0,16.0,28.0,89.0,52.0,15.0,True
4,4,25.0,3.0,1.0,3.0,58.0,254.75,0.11,9.35,27.25,2381.95,0.08,132.33,21,0,2,0,11.0,10.0,41.0,17.0,5.0,True


In [72]:
# transformar label em numerico
df02['21'] = df02['21'].astype('category')
df02['21'] = df02['21'].cat.codes

In [73]:
# separar classe 0
class0_df2 = df02[df02['21'] == 0]

# separar classe 1
class1_df2 = df02[df02['21'] == 1]

print('Classe 0: ', class0_df2.shape, '\nClasse 1: ', class1_df2.shape)

Classe 0:  (1783, 23) 
Classe 1:  (326, 23)


In [74]:
# separar labels classe 0
X0_b2 = class0_df2.iloc[:, :-1].values
y0_b2 = class0_df2.iloc[:, -1].values

# separar labels classe 1
X1_b2 = class1_df2.iloc[:, :-1].values
y1_b2 = class1_df2.iloc[:, -1].values

# gerar meta-classes a partir de função definida anteriormente
k0_b2, y0_b2_new = generate_meta_classes(X0_b2, y0_b2)
k1_b2, y1_b2_n = generate_meta_classes(X1_b2, y1_b2)

In [75]:
# normalizar meta-classes da classe 1
y1_b2_new = list(map(lambda x: x + k0_b2, y1_b2_n))

# unir meta-classes
X_b2 = np.concatenate((X0_b2, X1_b2),axis=0)
y_b2 = np.concatenate((y0_b2_new, y1_b2_new), axis=0)

# rodar gnb para meta-classes
gnb_mc_score = gnb_clf(X_b2, y_b2)
print(gnb_mc_score)

0.8541886218915137




In [76]:
X_02 = df02.iloc[:, :-1].values
y_02 = df02.iloc[:, -1].values

# rodar gnb sem agrupamento
gnb_score = gnb_clf(X_02, y_02)
print(gnb_score)

# rodar 1-nn sem agrupamento
knn = KNeighborsClassifier(n_neighbors=1)
knn_scores = cross_val_score(estimator=knn, X=X_02, y=y_02, scoring='f1_weighted')
knn_score = np.mean(knn_scores)
print(knn_score)

0.8620728241826983
0.8875468139657918
