<h2 style='font-size:30px'>Loading and Scaling the Dataset </h2>

In [21]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Fetch the breast-cancer features/labels and split them into train and test partitions.
# NOTE(review): test_size=.8 holds out 80% of the rows for testing, so only about 20% of
# them are used for training -- confirm that this ratio is intended.
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.8,
    random_state=42,
)

# Fit the scaler on the training split only, then scale that same split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [22]:
# Inspecting the shape of the scaled training set shows that it has a significant number of features.
# Perhaps not all of them are essential for building the models.
X_train_scaled.shape

(113, 30)

In [18]:
# XGBoost was chosen as the feature-selection classifier because it usually performs well in most projects and also
# because it exposes the convenient 'feature_importances_' attribute.

from keras_tuner import SklearnTuner, HyperParameters
from keras_tuner.oracles import BayesianOptimization
from xgboost import XGBClassifier


# Constructing the algorithm.
def build_model(hp):
    """Build an XGBClassifier whose hyperparameters are drawn from ``hp``.

    Args:
        hp: Hyperparameter container exposing ``Int`` and ``Float`` methods
            (presumably a keras_tuner ``HyperParameters`` instance -- confirm
            against the tuner that calls this function).

    Returns:
        A new, unfitted ``XGBClassifier`` configured with the values provided by ``hp``.
    """
    # Each entry registers one tunable hyperparameter together with its range and step.
    search_space = {
        'n_estimators': hp.Int('n_estimators', min_value=30, max_value=100, step=10),
        'max_depth': hp.Int('max_depth', min_value=2, max_value=4, step=1),
        'gamma': hp.Float('gamma', min_value=.05, max_value=.5, step=.05),
        'colsample_bytree': hp.Float('colsample_bytree', min_value=.1, max_value=.5, step=.1),
    }
    return XGBClassifier(**search_space)

In [2]:
# Now, let's use K-Means
import numpy as np
from sklearn.metrics import silhouette_score

# Sweep the candidate cluster counts (2 through 10). 'n_clusters' records every value tried and
# 'silhouetes' stores the Silhouette Score obtained for the matching entry of 'n_clusters'.
n_clusters, silhouetes = [], []
for i in range(2, 11):
    clusterer = KMeans(n_clusters=i, random_state=42)
    predictions = clusterer.fit_predict(X_train_scaled)
    silhouette = silhouette_score(X_train_scaled, predictions)
    n_clusters.append(i)
    silhouetes.append(silhouette)

In [5]:
# Which 'n_clusters' value achieved the highest Silhouette Score?
idx = np.asarray(silhouetes).argmax()
best_n_clusters = n_clusters[idx]

best_n_clusters

2

<p style='color:red'> TODO: Proceed with setting up the Bayesian Optimization. After that, it will be easier to analyze the data and remove possible outliers.</p>
<p> https://keras.io/api/keras_tuner/tuners/sklearn/</p>
<p>https://keras.io/guides/keras_tuner/tailor_the_search_space/ </p>