# Dicionário de variáveis:
- ID number
- Diagnosis (M = malignant, B = benign)
- radius (mean of distances from center to points on the perimeter)
- texture (standard deviation of gray-scale values)
- perimeter
- area
- smoothness (local variation in radius lengths)
- compactness (perimeter^2 / area - 1.0)
- concavity (severity of concave portions of the contour)
- concave points (number of concave portions of the contour)
- symmetry
- fractal dimension ("coastline approximation" - 1)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import uniform, randint
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

In [3]:
!pip install catboost



In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [5]:
df = pd.read_csv("/content/drive/MyDrive/Udemy/ML com Python/Datasets/data_cancer2.csv")

In [6]:
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [7]:
df.dtypes

Unnamed: 0,0
id,int64
diagnosis,object
radius_mean,float64
texture_mean,float64
perimeter_mean,float64
area_mean,float64
smoothness_mean,float64
compactness_mean,float64
concavity_mean,float64
concave points_mean,float64


In [8]:
df.shape

(569, 33)

In [9]:
df.isnull().sum()

Unnamed: 0,0
id,0
diagnosis,0
radius_mean,0
texture_mean,0
perimeter_mean,0
area_mean,0
smoothness_mean,0
compactness_mean,0
concavity_mean,0
concave points_mean,0


In [10]:
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)

In [11]:
df.head(1)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189


In [12]:
df.describe()

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,0.405172,1.216853,2.866059,40.337079,0.007041,0.025478,0.031894,0.011796,0.020542,0.003795,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,0.277313,0.551648,2.021855,45.491006,0.003003,0.017908,0.030186,0.00617,0.008266,0.002646,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,0.1115,0.3602,0.757,6.802,0.001713,0.002252,0.0,0.0,0.007882,0.000895,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,0.2324,0.8339,1.606,17.85,0.005169,0.01308,0.01509,0.007638,0.01516,0.002248,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,0.3242,1.108,2.287,24.53,0.00638,0.02045,0.02589,0.01093,0.01873,0.003187,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,0.4789,1.474,3.357,45.19,0.008146,0.03245,0.04205,0.01471,0.02348,0.004558,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,2.873,4.885,21.98,542.2,0.03113,0.1354,0.396,0.05279,0.07895,0.02984,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


In [13]:
# Transformar a variavel "diagnosis" em numerica
df['diagnosis'].replace({'M': 1, 'B': 0}, inplace=True)

In [14]:
df.head(1)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189


In [15]:
df.describe()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,0.372583,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,0.405172,1.216853,2.866059,40.337079,0.007041,0.025478,0.031894,0.011796,0.020542,0.003795,16.26919,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946
std,0.483918,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,0.277313,0.551648,2.021855,45.491006,0.003003,0.017908,0.030186,0.00617,0.008266,0.002646,4.833242,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061
min,0.0,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,0.1115,0.3602,0.757,6.802,0.001713,0.002252,0.0,0.0,0.007882,0.000895,7.93,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504
25%,0.0,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,0.2324,0.8339,1.606,17.85,0.005169,0.01308,0.01509,0.007638,0.01516,0.002248,13.01,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146
50%,0.0,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,0.3242,1.108,2.287,24.53,0.00638,0.02045,0.02589,0.01093,0.01873,0.003187,14.97,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004
75%,1.0,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,0.4789,1.474,3.357,45.19,0.008146,0.03245,0.04205,0.01471,0.02348,0.004558,18.79,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208
max,1.0,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,2.873,4.885,21.98,542.2,0.03113,0.1354,0.396,0.05279,0.07895,0.02984,36.04,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075


## Separacao das Variaveis

In [16]:
X = df.drop(columns=["diagnosis"])
y = df["diagnosis"]

## Divisao entre treino e teste

In [17]:
X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=.2, random_state=42)

## Parametros que vou testar

In [22]:
param_grids = {
    "Logistic Regression": {
        "model__C": uniform(0.01, 10),  # Regularização
        "model__penalty": ["l2", "none"]
    },
    "Naive Bayes": {
    },
    "Decision Tree": {
        "model__max_depth": randint(1, 20),
        "model__min_samples_split": randint(2, 10)
    },
    "Random Forest": {
        "model__n_estimators": randint(10, 200),
        "model__max_depth": randint(1, 20),
        "model__min_samples_split": randint(2, 10)
    },
    "Bagging": {
        "model__n_estimators": randint(10, 200),
        "model__max_samples": uniform(0.5, 1.0)
    },
    "Extra Tree": {
        "model__max_depth": randint(1, 20),
        "model__min_samples_split": randint(2, 10),
        "model__criterion": ["gini", "entropy"]
    },
    "AdaBoost": {
        "model__n_estimators": randint(10, 200),
        "model__learning_rate": uniform(0.01, 2)
    },
    "Gradient Boosting": {
        "model__n_estimators": randint(10, 200),
        "model__learning_rate": uniform(0.01, 2),
        "model__max_depth": randint(1, 10)
    },
    "XGBoost": {
        "model__n_estimators": randint(10, 200),
        "model__learning_rate": uniform(0.01, 2),
        "model__max_depth": randint(1, 10)
    },
    "LightGBM": {
        "model__n_estimators": randint(10, 200),
        "model__learning_rate": uniform(0.01, 2),
        "model__num_leaves": randint(10, 50)
    },
    "CatBoost": {
        "model__iterations": randint(10, 200),
        "model__learning_rate": uniform(0.01, 2),
        "model__depth": randint(1, 10)
    }
}

## Preprocessamento com Pipeline

In [23]:
# Padronizacao
standard_scaler = StandardScaler()

# Nenhum preprocessamento
sem_preprocess = 'passthrough'

In [24]:
# Criacao dos pipelines

pipes = {
    'Logistic Regression': Pipeline([
        ('scaler', standard_scaler),
        ('model', LogisticRegression())
    ]),
    'Naive Bayes': Pipeline([
        ('scaler', standard_scaler),
        ('model', GaussianNB())
    ]),
    'Decision Tree': Pipeline([
        ('scaler', sem_preprocess),
        ('model', DecisionTreeClassifier())
    ]),
    'Random Forest': Pipeline([
        ('scaler', sem_preprocess),
        ('model', RandomForestClassifier())
    ]),
    'Bagging': Pipeline([
        ('scaler', sem_preprocess),
        ('model', BaggingClassifier())
    ]),
    'Extra Tree': Pipeline([
        ('scaler', sem_preprocess),
        ('model', ExtraTreesClassifier())
    ]),
    'AdaBoost': Pipeline([
        ('scaler', standard_scaler),
        ('model', AdaBoostClassifier())
    ]),
    'Gradient Boosting': Pipeline([
        ('scaler', standard_scaler),
        ('model', GradientBoostingClassifier())
    ]),
    'XGBoost': Pipeline([
        ('scaler', standard_scaler),
        ('model', XGBClassifier(eval_metric='logloss', use_label_encoder=False))
    ]),
    'LightGBM': Pipeline([
        ('scaler', standard_scaler),
        ('model', LGBMClassifier())
    ]),
    'CatBoost': Pipeline([
        ('scaler', standard_scaler),
        ('model', CatBoostClassifier(verbose=0))
    ])
}

## Aplicacao da RandomizedSearchCV

In [28]:
best_models = {}

for model_name, pipe in pipes.items():
  print(f"Treinando {model_name}")
  param_grid = param_grids.get(model_name, {})
  search = RandomizedSearchCV(
      pipe,
      param_distributions=param_grid,
      n_iter=50,
      scoring='accuracy',
      cv=5,
      random_state=42,
      n_jobs=-1
  )
  search.fit(X_tr, y_tr)
  best_models[model_name] = search
  print(f"Melhor acuracia: {search.best_score_}")
  print(f"Melhor hiperparametros: {search.best_params_}")

Treinando Logistic Regression
Melhor acuracia: 0.9780219780219781
Melhor hiperparametros: {'model__C': 3.7554011884736247, 'model__penalty': 'l2'}
Treinando Naive Bayes
Melhor acuracia: 0.9340659340659341
Melhor hiperparametros: {}
Treinando Decision Tree
Melhor acuracia: 0.9318681318681319
Melhor hiperparametros: {'model__max_depth': 17, 'model__min_samples_split': 5}
Treinando Random Forest
Melhor acuracia: 0.9626373626373625
Melhor hiperparametros: {'model__max_depth': 7, 'model__min_samples_split': 3, 'model__n_estimators': 84}
Treinando Bagging
Melhor acuracia: 0.956043956043956
Melhor hiperparametros: {'model__max_samples': 0.9458327528535911, 'model__n_estimators': 84}
Treinando Extra Tree
Melhor acuracia: 0.9692307692307693
Melhor hiperparametros: {'model__criterion': 'entropy', 'model__max_depth': 11, 'model__min_samples_split': 4}
Treinando AdaBoost
Melhor acuracia: 0.9802197802197803
Melhor hiperparametros: {'model__learning_rate': 1.520722820635305, 'model__n_estimators': 1

## Avaliacao de Resultados

In [30]:
for model_name, search in best_models.items():
    print(f"\nModelo: {model_name}")
    print(f"Melhores hiperparâmetros: {search.best_params_}")
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_ts)
    acc = accuracy_score(y_ts, y_pred)
    print(f"Acurácia: {acc:.2f}")


Modelo: Logistic Regression
Melhores hiperparâmetros: {'model__C': 3.7554011884736247, 'model__penalty': 'l2'}
Acurácia: 0.97

Modelo: Naive Bayes
Melhores hiperparâmetros: {}
Acurácia: 0.96

Modelo: Decision Tree
Melhores hiperparâmetros: {'model__max_depth': 17, 'model__min_samples_split': 5}
Acurácia: 0.93

Modelo: Random Forest
Melhores hiperparâmetros: {'model__max_depth': 7, 'model__min_samples_split': 3, 'model__n_estimators': 84}
Acurácia: 0.96

Modelo: Bagging
Melhores hiperparâmetros: {'model__max_samples': 0.9458327528535911, 'model__n_estimators': 84}
Acurácia: 0.96

Modelo: Extra Tree
Melhores hiperparâmetros: {'model__criterion': 'entropy', 'model__max_depth': 11, 'model__min_samples_split': 4}
Acurácia: 0.97

Modelo: AdaBoost
Melhores hiperparâmetros: {'model__learning_rate': 1.520722820635305, 'model__n_estimators': 143}
Acurácia: 0.96

Modelo: Gradient Boosting
Melhores hiperparâmetros: {'model__learning_rate': 0.5924582803960838, 'model__max_depth': 3, 'model__n_esti