## Ejercicio breast cancer de sklearn

1. Carga el dataset [breast_cancer de `sklearn`](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html)
2. Prueba todos los métodos de clasificación vistos hasta ahora mediante GridSearchCV. Utiliza pipeline si es necesario.

In [7]:
from sklearn import datasets
cancer = datasets.load_breast_cancer()

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

df = pd.DataFrame(data=np.c_[cancer.data, cancer.target],
                 columns = list(cancer.feature_names) + ['target'])

df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0.0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0.0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0.0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0.0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0.0


In [6]:
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, :-1],
                                                   df.iloc[:, -1],
                                                   test_size = 0.2,
                                                   random_state=42)

In [10]:
reg_log = Pipeline([
    ("scaler", StandardScaler()),
    ("reglog", LogisticRegression())
])

rand_forest = RandomForestClassifier()

svm = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC())
])

gbc = GradientBoostingClassifier()

knn = KNeighborsClassifier()

knn_scal = Pipeline([
    ("scaler", StandardScaler()),
    ("knn", KNeighborsClassifier())
])

reg_log_param = {
    "reglog__penalty": ['l1', 'l2'],
    "reglog__C": np.arange(0.1, 1, 0.1)
}

rand_forest_param = {
    'n_estimators': [10, 100, 300], # numero alto, no tiene por que dar overfitting
    'max_features': [2, 4, 6]
}

svm_param = {
    'svm__C': np.arange(0.2, 1, 0.1),
    'svm__kernel': ['linear', 'poly', 'rbf']
}

gbc_param = {
    'learning_rate': [0.05, 0.1, 0.5],
    'n_estimators': [20, 50, 100],
    'max_depth': [1,2,3,4,5]
}

knn_param = {
    'n_neighbors': [1,2,3,4,5,6]
}


knn_param_scal = {
    'knn__n_neighbors': [1,2,3,4,5,6]
}

In [12]:
gs_reg_log = GridSearchCV(reg_log,
                         reg_log_param,
                         cv=10,
                         verbose=1,
                         n_jobs=-1)

gs_rand_forest = GridSearchCV(rand_forest,
                         rand_forest_param,
                         cv=10,
                         verbose=1,
                         n_jobs=-1)

gs_svm = GridSearchCV(svm,
                         svm_param,
                         cv=10,
                         verbose=1,
                         n_jobs=-1)

gs_gbc = GridSearchCV(gbc,
                         gbc_param,
                         cv=10,
                         verbose=1,
                         n_jobs=-1)

gs_knn = GridSearchCV(knn,
                         knn_param,
                         cv=10,
                         verbose=1,
                         n_jobs=-1)

gs_knn_scal = GridSearchCV(knn_scal,
                         knn_param_scal,
                         cv=10,
                         verbose=1,
                         n_jobs=-1)

grids = {
    "gs_reg_log": gs_reg_log,
    "gs_rand_forest": gs_rand_forest,
    "gs_svm": gs_svm,
    "gs_gbc": gs_gbc,
    "gs_knn": gs_knn,
    "gs_knn_scal": gs_knn_scal    
}

In [13]:
%%time

for i, j in grids.items():
    j.fit(X_train, y_train)

Fitting 10 folds for each of 18 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:    3.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 9 candidates, totalling 90 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:    6.0s finished


Fitting 10 folds for each of 24 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:    0.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 45 candidates, totalling 450 fits


[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.9s
[Parallel(n_jobs=-1)]: Done 248 tasks      | elapsed:   13.2s
[Parallel(n_jobs=-1)]: Done 450 out of 450 | elapsed:   27.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 6 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 10 folds for each of 6 candidates, totalling 60 fits
Wall time: 38.8 s


[Parallel(n_jobs=-1)]: Done  45 out of  60 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:    0.2s finished


In [17]:
best_grids = [(i, j.best_score_) for i, j in grids.items()]

best_grids = pd.DataFrame(best_grids,
                          columns=["Grid", "Best Score"]).sort_values("Best Score", ascending=False)
best_grids

Unnamed: 0,Grid,Best Score
2,gs_svm,0.975894
0,gs_reg_log,0.975749
3,gs_gbc,0.971449
1,gs_rand_forest,0.969227
5,gs_knn_scal,0.966957
4,gs_knn,0.927391


In [18]:
len(X_train)

455

In [26]:
feat_names = df.columns[:-1].values

In [27]:
feat_imp = gs_rand_forest.best_estimator_.feature_importances_

In [30]:
pd.DataFrame(data= {"Feature": feat_names,
                   "importance": feat_imp}).sort_values('importance', ascending=False).head(10)

Unnamed: 0,Feature,importance
27,worst concave points,0.124818
7,mean concave points,0.108438
23,worst area,0.101916
20,worst radius,0.100262
22,worst perimeter,0.097223
0,mean radius,0.054882
3,mean area,0.049157
6,mean concavity,0.044256
26,worst concavity,0.043539
13,area error,0.041617


In [36]:
pd.DataFrame(data= {"Feature": feat_names,
                   "importance": gs_reg_log.best_estimator_['reglog'].coef_[0]}).sort_values('importance', ascending=False).head(10)

Unnamed: 0,Feature,importance
15,compactness error,0.531269
19,fractal dimension error,0.451476
18,symmetry error,0.346359
5,mean compactness,0.249127
9,mean fractal dimension,0.191711
11,texture error,0.100954
8,mean symmetry,0.091117
16,concavity error,0.081277
25,worst compactness,-0.075787
4,mean smoothness,-0.121565


In [34]:
gs_reg_log.best_estimator_['reglog'].coef_

array([[-0.43453449, -0.47142266, -0.41131678, -0.44289311, -0.12156454,
         0.24912718, -0.59282194, -0.76323043,  0.09111667,  0.19171054,
        -0.86944793,  0.10095371, -0.51866682, -0.62168892, -0.18664788,
         0.53126902,  0.08127681, -0.21749327,  0.34635948,  0.45147565,
        -0.70021058, -0.97167689, -0.55280111, -0.64847191, -0.51529052,
        -0.07578654, -0.68610128, -0.644949  , -0.85758871, -0.13862776]])