<h2 style='font-size:30px'>Loading and Scaling the Dataset </h2>

In [68]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Load the breast-cancer dataset and gather the features and the label in a single DataFrame.
breast_cancer = load_breast_cancer()
df = pd.DataFrame(
    np.column_stack((breast_cancer['data'], breast_cancer['target'])),
    columns=np.append(breast_cancer['feature_names'], 'target'),
)

# Feature matrix: every column except the last one. Label: the last column, cast to integer.
X = df.iloc[:, :-1]
y = df.iloc[:, -1].astype('int')

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse its statistics for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h2 style='font-size:30px'> Outlier Detection and Removal</h2>

In [70]:
import matplotlib.pyplot as plt
import seaborn as sns
# Disabled draft: stack the scaled features and the labels side by side in one array.
# NOTE(review): y_train appears to be a pandas Series at this point, so it would likely need
# `.to_numpy()` before `.reshape(-1, 1)` if this line is ever re-enabled — confirm.
#train = np.concatenate((X_train_scaled, y_train.reshape(-1,1)), axis=1)
# Display the scaled training features.
X_train_scaled

array([[-1.44075296, -0.43531947, -1.36208497, ...,  0.9320124 ,
         2.09724217,  1.88645014],
       [ 1.97409619,  1.73302577,  2.09167167, ...,  2.6989469 ,
         1.89116053,  2.49783848],
       [-1.39998202, -1.24962228, -1.34520926, ..., -0.97023893,
         0.59760192,  0.0578942 ],
       ...,
       [ 0.04880192, -0.55500086, -0.06512547, ..., -1.23903365,
        -0.70863864, -1.27145475],
       [-0.03896885,  0.10207345, -0.03137406, ...,  1.05001236,
         0.43432185,  1.21336207],
       [-0.54860557,  0.31327591, -0.60350155, ..., -0.61102866,
        -0.3345212 , -0.84628745]])

In [18]:
# XGBoost was chosen as our feature selection classifier because it usually performs well in most projects and also
# because it exposes the convenient 'feature_importances_' attribute.

from keras_tuner import SklearnTuner, HyperParameters
from keras_tuner.oracles import BayesianOptimization
from xgboost import XGBClassifier


# Constructing the algorithm.
def build_model(hp):
    """Build an XGBoost classifier whose hyperparameters are taken from `hp`.

    Parameters
    ----------
    hp
        Search-space object providing `Int` and `Float` methods
        (presumably a keras_tuner `HyperParameters` instance — confirm against the tuner setup).

    Returns
    -------
    XGBClassifier
        A new, unfitted classifier configured with the values returned by `hp`.
    """
    # Each entry declares one tunable hyperparameter together with its range and step.
    tuned_params = {
        'n_estimators': hp.Int('n_estimators', min_value=30, max_value=100, step=10),
        'max_depth': hp.Int('max_depth', min_value=2, max_value=4, step=1),
        'gamma': hp.Float('gamma', min_value=.05, max_value=.5, step=.05),
        'colsample_bytree': hp.Float('colsample_bytree', min_value=.1, max_value=.5, step=.1),
    }
    return XGBClassifier(**tuned_params)

In [71]:
# Now, let's use K-Means
import numpy as np
from sklearn.metrics import silhouette_score

# 'n_clusters' lists every candidate cluster count that is tried; 'silhouetes' stores the Silhouette Score
# obtained for each candidate, in the same order.
n_clusters = list(range(2, 11))
silhouetes = []
for i in n_clusters:
    # Cluster the scaled training data with 'i' clusters and score the resulting assignment.
    predictions = KMeans(n_clusters=i, random_state=42).fit_predict(X_train_scaled)
    silhouette = silhouette_score(X_train_scaled, predictions)
    silhouetes.append(silhouette)

In [72]:
# Which 'n_clusters' value achieved the highest Silhouette Score?
idx = np.asarray(silhouetes).argmax()
best_n_clusters = n_clusters[idx]

best_n_clusters

2

<p style='color:red'> TODO: remove the outliers first, then set up the Bayesian Optimization.</p>
<p> https://keras.io/api/keras_tuner/tuners/sklearn/</p>
<p>https://keras.io/guides/keras_tuner/tailor_the_search_space/ </p>