In [15]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed. The option key is spelled out
# in full: the short pattern 'max_columns' can match more than one pandas option
# (e.g. 'styler.render.max_columns'), in which case pandas raises an OptionError.
pd.set_option('display.max_columns', None)

# Loading the data and storing it in a pandas DF.
# The target vector is reshaped into a single column and glued to the right of
# the feature matrix, so 'target' ends up as the last column of the frame.
breast_cancer = load_breast_cancer()
features_and_target = np.concatenate(
    (breast_cancer['data'], breast_cancer['target'].reshape(-1, 1)),
    axis=1,
)
df = pd.DataFrame(
    features_and_target,
    columns=np.append(breast_cancer['feature_names'], 'target'),
)
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,radius error,texture error,perimeter error,area error,smoothness error,compactness error,concavity error,concave points error,symmetry error,fractal dimension error,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [16]:
from sklearn.model_selection import StratifiedShuffleSplit

# Separating independent and dependent variables (the target is the last column).
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Creating the train and test sets according to the target variable's proportions.
# With n_splits=1 the splitter yields exactly one (train, test) pair, so it is
# taken directly with next() instead of looping.
split = StratifiedShuffleSplit(n_splits=1, test_size=.2, random_state=42)
train_index, test_index = next(split.split(X, y))

# The splitter returns *positional* indices, hence .iloc rather than .loc
# (.loc only gave the same rows because df has a default 0..n-1 index).
# .copy() makes both subsets independent frames that can be modified freely.
train = df.iloc[train_index].copy()
test = df.iloc[test_index].copy()

In [17]:
# Dropping out the referred columns.
to_drop = ['mean smoothness', 'mean symmetry', 'mean fractal dimension', 'texture error',
          'smoothness error', 'symmetry error', 'fractal dimension error', 'worst symmetry',
          'worst fractal dimension']

# Rebinding instead of dropping in place keeps this cell re-runnable:
# errors='ignore' skips columns that a previous execution already removed,
# where an in-place drop would raise KeyError on the second run.
train = train.drop(columns=to_drop, errors='ignore')

In [18]:
# Keep in mind that the categories have different distribution shapes for each feature. Therefore, the outlier removal needs to be 
# carried out according to the instance's target variable.
from scipy.stats import zscore

# We'll avail the occasion to separate the independent and dependent variables.
# Rows whose absolute z-score reaches this value in any feature are treated as outliers.
Z_SCORE_LIMIT = 3

# Every column except the last one ('target') is a feature.
feature_columns = train.columns[:-1]

# One filtered frame per class; z-scores are computed within each class only.
kept_per_class = []
for target in train['target'].unique():
    class_rows = train.loc[train['target'] == target, feature_columns]
    # zscore is fed a plain NumPy array so that the result type does not depend
    # on how the installed scipy version treats DataFrame input.
    abs_z = np.abs(zscore(class_rows.to_numpy(dtype=float), axis=0))
    # A row is kept only if ALL of its z-scores are below the limit. NaN z-scores
    # compare as False, so such rows are discarded as well.
    inliers = (abs_z < Z_SCORE_LIMIT).all(axis=1)
    kept_per_class.append(class_rows.loc[inliers])

# A single concat replaces the repeated DataFrame.append calls (DataFrame.append
# was removed in pandas 2.0). Rows stay grouped by class, in the order in which
# the classes were visited above.
if kept_per_class:
    X_train = pd.concat(kept_per_class)
else:
    X_train = pd.DataFrame(columns=feature_columns)

# Creating 'y_train' with the indices from 'X_train', so both stay aligned row by row.
y_train = train.loc[X_train.index, 'target']

In [19]:
# 'FeatureFilter' is a class that drops out the features mentioned in the 'to_drop' list.
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureFilter(BaseEstimator, TransformerMixin):
    """Transformer that discards the columns named in ``to_drop``.

    Inheriting from ``BaseEstimator`` (not ``BaseException``) is what provides
    ``get_params`` / ``set_params``, which scikit-learn relies on for cloning
    and parameter searches.

    Parameters
    ----------
    to_drop : list of str
        Names of the features that must be disregarded.
    """

    def __init__(self, to_drop):
        # The user needs to pass a list of feature names that need to be disregarded.
        self.to_drop = to_drop

    def fit(self, X, y=None):
        """Learn nothing; present only to satisfy the transformer interface.

        Returns
        -------
        self
        """
        return self

    def transform(self, X):
        """Return ``X`` without the columns listed in ``to_drop``.

        Columns that are not present are skipped rather than treated as an
        error: the training set reaches this step with the features already
        trimmed off, while other data may still contain them.

        Inputs that have no ``drop`` method (e.g. NumPy arrays), or a
        ``to_drop`` of ``None``, are returned unchanged.
        """
        if self.to_drop is None or not hasattr(X, 'drop'):
            return X
        # errors='ignore' replaces the former bare `except:`; only the
        # "column is already missing" case is tolerated, nothing else is hidden.
        return X.drop(columns=self.to_drop, errors='ignore')

In [20]:
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer 
from sklearn.preprocessing import StandardScaler

# Preprocessing chain: trim the unwanted features, fill missing values from the
# 7 nearest neighbours, then standardise every remaining feature.
preprocessing_steps = [
    ('filter', FeatureFilter(to_drop)),
    ('knn_imputer', KNNImputer(n_neighbors=7)),
    ('std_scaler', StandardScaler()),
]
pipe = Pipeline(steps=preprocessing_steps)

# fit() hands back the pipeline itself, so 'pipe_transformer' refers to the
# same (now fitted) object as 'pipe'.
pipe_transformer = pipe.fit(X_train)
X_train_transformed = pipe_transformer.transform(X_train)

In [23]:
class KeepMostImportant(BaseEstimator, TransformerMixin):
    """Keep only the columns whose average importance exceeds a threshold.

    Parameters
    ----------
    minimum_importance : float, default=.01
        A column is kept when its mean importance is strictly greater than
        this value.
    feature_importances_ : list of array-like, default=None
        One importance array per model, all of the same length (one entry per
        input column). A single 1-D array is accepted as well. Must be provided
        before ``fit`` is called.
    """

    def __repr__(self):
        # Fixed short representation, independent of the stored parameters.
        return 'KeepMostImportant()'

    def __init__(self, minimum_importance:float = .01, feature_importances_:list = None):
        # 'minimum_importance' is the lowest average importance a feature may
        # have (exclusive) in order to be kept.
        self.minimum_importance = minimum_importance
        # list of feature importances arrays.
        self.feature_importances_ = feature_importances_

    def fit(self, X, y=None):
        """Work out which column positions to keep.

        The importances are averaged position-wise across the supplied arrays;
        positions whose average is greater than ``minimum_importance`` are
        retained. ``X`` and ``y`` are not used.

        Raises
        ------
        ValueError
            If ``feature_importances_`` was not provided.
        """
        if self.feature_importances_ is None:
            raise ValueError(
                "KeepMostImportant needs 'feature_importances_' (a list of "
                "importance arrays) to decide which columns to keep."
            )
        # atleast_2d lets a single importance vector behave like a one-item list.
        stacked = np.atleast_2d(np.asarray(self.feature_importances_, dtype=float))
        self.__feature_importances_mean = stacked.mean(axis=0)
        self.__keep = np.flatnonzero(self.__feature_importances_mean > self.minimum_importance)
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Objects exposing ``.iloc`` (pandas DataFrames) are sliced positionally;
        anything else is indexed as ``X[:, columns]``.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        try:
            keep = self.__keep
        except AttributeError:
            raise AttributeError(
                "KeepMostImportant must be fitted before calling transform()."
            ) from None
        if hasattr(X, 'iloc'):
            return X.iloc[:, keep]
        return X[:, keep]

In [31]:
from joblib import load
# Bring back the two models persisted under 'models/'.
# NOTE: joblib.load unpickles the file, so it must only be used on trusted files.
xgb = load('models/xgboost.joblib')
extra = load('models/extra_trees.joblib')

# Per-feature importance scores exposed by each loaded model.
xgb_importances, extra_importances = xgb.feature_importances_, extra.feature_importances_

In [32]:
# Second pipeline: the preprocessing pipeline ('pipe') followed by the step that
# keeps only the features both models consider important on average.
importance_filter = KeepMostImportant(feature_importances_=[xgb_importances, extra_importances])
pipe2 = Pipeline(steps=[
    ('pipe', pipe),
    ('keep_most_important', importance_filter),
])

# Fitting the whole chain on the training data and removing the irrelevant columns.
X_train_transformed2 = pipe2.fit_transform(X_train)

In [34]:
# Load the clustering model stored at 'models/kmeans.joblib'. Presumably a fitted
# scikit-learn KMeans (its labels_ and cluster_centers_ are read further down) —
# verify against the notebook that trained it.
# NOTE: joblib.load unpickles the file, so it must only be used on trusted files.
kmeans = load('models/kmeans.joblib')

In [36]:
from sklearn.cluster import KMeans
def _to_positional(data):
    """Return `data` in a form that is indexed by position (pandas -> NumPy, list -> array)."""
    if hasattr(data, 'to_numpy'):
        return data.to_numpy()
    if isinstance(data, (list, tuple)):
        return np.asarray(data)
    return data


def get_nearest_centroids_neighbors(X:'array'=None, y:'array'=None, kmeans:'KMeans'=None, n:int=1,
                                    verbose:bool=True):
    """Select the ``n`` instances closest to each k-means centroid.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Instances, in the same feature space as ``kmeans.cluster_centers_``.
    y : array-like of shape (n_samples,)
        Labels aligned row by row with ``X``.
    kmeans : fitted clustering model
        Must expose ``labels_`` and ``cluster_centers_``.
    n : int, default=1
        Number of neighbours to pick per centroid.
    verbose : bool, default=True
        When True, print the positions of the selected instances.

    Returns
    -------
    (X_selected, y_selected)
        Rows of ``X`` and entries of ``y`` for the selected instances, ordered
        centroid by centroid. An instance appears more than once if it is among
        the nearest neighbours of several centroids.

    Raises
    ------
    ValueError
        If ``X``, ``y`` or ``kmeans`` is missing, or if ``n`` is smaller than 1.
    """
    if X is None or y is None or kmeans is None:
        raise ValueError("'X', 'y' and 'kmeans' must all be provided.")
    if n < 1:
        raise ValueError("'n' must be a positive integer.")

    from sklearn.neighbors import NearestNeighbors

    # Positional containers, so that the final X[...] / y[...] lookups select by
    # row position even when pandas objects were passed in.
    X = _to_positional(X)
    y = _to_positional(y)

    # Only clusters that actually own at least one label are considered.
    populated_clusters = np.unique(kmeans.labels_).astype('int')
    centroids = kmeans.cluster_centers_[populated_clusters]

    # An unsupervised neighbour search is all that is needed here; the labels
    # play no part in locating the instances nearest to each centroid.
    searcher = NearestNeighbors().fit(X)
    # One batched query: row i holds the n nearest instances of centroid i.
    instances = searcher.kneighbors(centroids, n_neighbors=n, return_distance=False).ravel()

    if verbose:
        print(instances.tolist())

    # Returning the X and y arrays with only the data of the centroids' n-nearest neighbors.
    return X[instances], y[instances]

In [37]:
# Pick, for every centroid, the single training instance that lies closest to it
# (top-1), together with its label.
X_nn, y_nn = get_nearest_centroids_neighbors(
    X=X_train_transformed2,
    y=y_train.values,
    kmeans=kmeans,
    n=1,
)

[115, 405]


In [38]:
# Load the classifier stored at 'models/svm.joblib' (its predict() is used for the
# final evaluation below).
# NOTE: joblib.load unpickles the file, so it must only be used on trusted files.
svm = load('models/svm.joblib')

In [39]:
# The test set has to go through the same preparation as the training data
# before it can be scored: features are every column but the last, the target
# is the last column.
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]

# Reusing the already fitted 'pipe2' (transform only, no re-fitting) on the test features.
X_test_transformed = pipe2.transform(X_test)

In [42]:
from sklearn.metrics import fbeta_score
# Predict on the transformed test set, then score the predictions.
# beta < 1 (.37 here) makes the F-beta score weight precision more than recall.
y_test_pred = svm.predict(X_test_transformed)
fbeta_score(y_test, y_test_pred, beta=.37)

0.9490345446046117