## Análise, tratamento e pré-processamento dos dados

Link do Kaggle: https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
!pip install catboost
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
# Load the raw CSV into a DataFrame (comma-separated, ISO-8859-1 encoded).
# NOTE(review): the path looks like a Google Drive mount inside Colab —
# confirm the drive is mounted before this cell runs.
dados = pd.read_csv('/content/drive/MyDrive/CursoML/BreastCancer/data.csv',sep=',', encoding='iso-8859-1')

In [None]:
# A notebook cell only renders its last expression, so the head has to be
# displayed explicitly; otherwise its output is silently discarded.
# `display` is the built-in injected by IPython/Jupyter.
display(dados.head())
dados.tail()

In [None]:
dados['diagnosis'].value_counts()

In [None]:
# Distribution of the "concavity_worst" column, split into 60 bins.
px.histogram(
    dados,
    x="concavity_worst",
    nbins=60,
)

In [None]:
dados.dtypes

In [None]:
dados.isnull().sum()

In [None]:
# Remove the identifier column and the 'Unnamed: 32' column from the frame.
colunas_descartadas = ['id', 'Unnamed: 32']
dados = dados.drop(columns=colunas_descartadas)

In [None]:
dados.isnull().sum()

In [None]:
dados.head()

In [None]:
dados.to_csv('bc_tratado.csv', sep=';', encoding='utf-8', index = False)

In [None]:
# Work on an independent copy so the cleaned `dados` frame stays untouched.
df = dados.copy()

In [None]:
df

In [None]:
# Encode the target column as integers: 'B' -> 0, 'M' -> 1.
# The result is assigned back rather than modified in place: an in-place
# call on `df['diagnosis']` acts on a temporary selection, which leaves `df`
# unchanged under pandas Copy-on-Write. `map` also yields an integer column
# directly (any label other than 'B'/'M' would become NaN).
df['diagnosis'] = df['diagnosis'].map({'B': 0, 'M': 1})

In [None]:
df

In [None]:
df['diagnosis'].value_counts()

In [None]:
# Feature matrix: every column after the first one (the first is the target).
colunas_previsoras = df.iloc[:, 1:]
previsores = colunas_previsoras.values

In [None]:
alvo = df.iloc[:, 0].values

In [None]:
alvo

Escalonando:



In [None]:
previsores_esc = StandardScaler().fit_transform(previsores)

In [None]:
previsores_esc

In [None]:
previsoresdf = pd.DataFrame(previsores_esc)

Teste da redução de dimensionalidade:

In [None]:
pca = PCA(n_components=3)

In [None]:
# PCA keeps the directions of largest variance, so it is fit on the
# standardized features; on the raw values the columns with the largest
# magnitudes would dominate the components and the explained-variance ratio.
previsores_pca = pca.fit_transform(previsores_esc)

In [None]:
previsores_pca.shape

In [None]:
pca.explained_variance_ratio_

Treino e teste:

In [None]:
# Hold out 30% of the samples for testing, with a fixed seed so the split is
# reproducible.
# NOTE(review): `previsores_esc` comes from a StandardScaler fitted on the
# full dataset, so statistics of the test rows influence the scaling —
# consider fitting the scaler on the training split only.
x_treino, x_teste, y_treino, y_teste = train_test_split(previsores_esc, alvo, test_size = 0.3, random_state = 0)

## Naive Bayes

In [None]:
naive = GaussianNB()

In [None]:
naive.fit(x_treino, y_treino)

In [None]:
previsoes_naive = naive.predict(x_teste)

In [None]:
previsoes_naive

In [None]:
y_teste

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_naive) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_naive)

In [None]:
kfold = KFold(n_splits = 30, shuffle=True, random_state = 5)

In [None]:
modelo = GaussianNB()
resultado = cross_val_score(modelo, previsores, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia média do previsores: 93.82%

Acurácia média do previsores_esc: 93.47%

## Máquina de Suporte de Vetores (SVM)

In [None]:
svm = SVC(kernel='rbf', random_state=1, C = 2)

In [None]:
svm.fit(x_treino, y_treino)

In [None]:
previsoes_svm = svm.predict(x_teste)

In [None]:
previsoes_svm

In [None]:
y_teste

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_svm) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_svm)

In [None]:
print(classification_report(y_teste, previsoes_svm))

In [None]:
previsoes_treino = svm.predict(x_treino)

In [None]:
previsoes_treino

In [None]:
accuracy_score(y_treino, previsoes_treino)

In [None]:
confusion_matrix(y_treino, previsoes_treino)

In [None]:
modelo = SVC(kernel='rbf', random_state=1, C = 2)
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia média do previsores: 91.72%

Acurácia média do previsores_esc: 97.88%

## Regressão Logística

In [None]:
logistica = LogisticRegression(random_state=1, max_iter=500, penalty="l2",tol=0.0001, C=1,solver="lbfgs")

In [None]:
logistica.fit(x_treino, y_treino)

In [None]:
previsoes_logistica = logistica.predict(x_teste)
previsoes_logistica

In [None]:
y_teste

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_logistica) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_logistica)

In [None]:
print(classification_report(y_teste, previsoes_logistica))

In [None]:
previsoes_treino = logistica.predict(x_treino)
previsoes_treino

In [None]:
accuracy_score(y_treino, previsoes_treino)

In [None]:
confusion_matrix(y_treino, previsoes_treino)

In [None]:
modelo = LogisticRegression(random_state=1, max_iter=600, penalty="l2",tol=0.0001, C=1,solver="lbfgs")

In [None]:
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia média do previsores: 94.71%

Acurácia média do previsores_esc: 98.06%

## Aprendizagem Baseada em Instâncias (KNN)

In [None]:
knn = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p=1)
knn.fit(x_treino, y_treino)

In [None]:
previsoes_knn = knn.predict(x_teste)
previsoes_knn

In [None]:
y_teste

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_knn) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_knn)

In [None]:
print(classification_report(y_teste, previsoes_knn))

In [None]:
previsoes_knn = knn.predict(x_treino)
previsoes_knn

In [None]:
accuracy_score(y_treino, previsoes_knn)

In [None]:
confusion_matrix(y_treino, previsoes_knn)

In [None]:
modelo = KNeighborsClassifier(n_neighbors=7, metric='minkowski', p = 1)

In [None]:
# KNN is distance-based, so cross-validate on the standardized features;
# this is also the configuration the reported KNN result refers to.
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia Média do previsores: 93.13%

Acurácia Média do previsores_esc: 96.65%

## Árvore de Decisão

In [None]:
arvore = DecisionTreeClassifier(criterion='entropy', random_state = 0, max_depth=3)

In [None]:
arvore.fit(x_treino, y_treino)

In [None]:
previsoes_arvore = arvore.predict(x_teste)

In [None]:
previsoes_arvore

In [None]:
y_teste

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_arvore) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_arvore)

In [None]:
print(classification_report(y_teste, previsoes_arvore))

In [None]:
previsoes_treino = arvore.predict(x_treino)

In [None]:
print("Acurácia do treino: %.2f%%" % (accuracy_score(y_treino, previsoes_treino) * 100.0))

In [None]:
confusion_matrix(y_treino, previsoes_treino)

In [None]:
modelo = DecisionTreeClassifier(criterion='entropy', random_state = 0, max_depth=3)

In [None]:
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia Média do previsores: 93.67%

Acurácia Média do previsores_esc: 93.67% (Mesmo resultado)

## Random Forest

In [None]:
random = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state = 0, max_depth=4)
random.fit(x_treino, y_treino)

In [None]:
previsoes_random = random.predict(x_teste)
previsoes_random

In [None]:
y_teste

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_random) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_random)

In [None]:
print(classification_report(y_teste, previsoes_random))

In [None]:
# Store the Random Forest training predictions in `previsoes_treino`, the
# name read by the accuracy and confusion-matrix cells that follow. The
# previous spelling (`previsores_treino`) left those cells scoring stale
# predictions from an earlier model.
previsoes_treino = random.predict(x_treino)

In [None]:
print("Acurácia do treino: %.2f%%" % (accuracy_score(y_treino, previsoes_treino) * 100.0))

In [None]:
confusion_matrix(y_treino, previsoes_treino)

In [None]:
modelo = RandomForestClassifier(n_estimators=150, criterion='entropy', random_state = 0, max_depth=4)

In [None]:
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia Média de previsores: 95.76%

Acurácia Média de previsores_esc: 95.76%


## XGBoost

In [None]:
xg = XGBClassifier(max_depth=2, learning_rate=0.05, n_estimators=250, objective='binary:logistic', random_state=3)

In [None]:
xg.fit(x_treino,y_treino)

In [None]:
previsoes_xg = xg.predict(x_teste)

In [None]:
previsoes_xg

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_xg) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_xg)

In [None]:
print(classification_report(y_teste, previsoes_xg))

In [None]:
previsoes_treino = xg.predict(x_treino)
previsoes_treino

In [None]:
y_treino

In [None]:
print("Acurácia do treino: %.2f%%" % (accuracy_score(y_treino, previsoes_treino) * 100.0))

In [None]:
modelo = XGBClassifier(max_depth=2, learning_rate=0.05, n_estimators=250, objective='binary:logistic', random_state=3)

In [None]:
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia Média do previsores: 96.65%

Acurácia Média do previsores_esc: 96.65%

## LightGBM

In [None]:
!pip install lightgbm

In [None]:
dataset = lgb.Dataset(x_treino,label=y_treino)

In [None]:
parametros = {'num_leaves':30,'objective':'binary','max_depth':3,'learning_rate':.1,'max_bin':50}

In [None]:
lgbm=lgb.train(parametros,dataset,num_boost_round=200)

In [None]:
previsoes_lgbm = lgbm.predict(x_teste)
previsoes_lgbm

In [None]:
# Turn the scores returned by `lgbm.predict` into 0/1 labels using a 0.5
# threshold. Vectorized so it follows the actual size of the test split
# instead of a hard-coded row count.
previsoes_lgbm = (previsoes_lgbm >= .5).astype(int)

In [None]:
previsoes_lgbm

In [None]:
y_teste

In [None]:
print("Acurácia: %.2f%%" % (accuracy_score(y_teste, previsoes_lgbm) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_lgbm)

In [None]:
previsoes_treino = lgbm.predict(x_treino)
previsoes_treino

In [None]:
# Turn the training-split scores into 0/1 labels using a 0.5 threshold.
# Vectorized so it follows the actual size of the training split instead of
# a hard-coded row count.
previsoes_treino = (previsoes_treino >= .5).astype(int)

In [None]:
previsoes_treino

In [None]:
print("Acurácia do treino: %.2f%%" % (accuracy_score(y_treino, previsoes_treino) * 100.0))

In [None]:
confusion_matrix(y_treino, previsoes_treino)

In [None]:
# Scikit-learn style LightGBM estimator used for cross-validation.
# NOTE(review): this configuration is not the one trained with `lgb.train`
# earlier in this section (`parametros` uses num_leaves=30 and the booster
# ran 200 rounds) — confirm whether the cross-validated model is meant to
# match it.
modelo = lgb.LGBMClassifier(num_leaves = 100, objective = 'binary',max_depth = 3, learning_rate = .1, max_bin =50)

In [None]:
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

Acurácia Média do previsores: 96.11%

Acurácia Média do previsores_esc: 97.52%

## Catboost

In [None]:
df

In [None]:
previsores2 = df.iloc[:, 1:32]

In [None]:
alvo2 = df.iloc[:, 0]

In [None]:
x_treino, x_teste, y_treino, y_teste = train_test_split(previsores2, alvo2, test_size = 0.3, random_state = 0)

In [None]:
catboost = CatBoostClassifier(task_type='CPU', iterations=100, learning_rate=0.1, depth = 8, random_state = 5,eval_metric="Accuracy")

In [None]:
catboost.fit( x_treino, y_treino, plot=True, eval_set=(x_teste, y_teste))

In [None]:
previsoes_cat = catboost.predict(x_teste)
previsoes_cat

In [None]:
y_teste

In [None]:
print("Acurácia do teste: %.2f%%" % (accuracy_score(y_teste, previsoes_cat) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes_cat)

In [None]:
previsoes_treino = catboost.predict(x_treino)
previsoes_treino

In [None]:
accuracy_score(y_treino, previsoes_treino)

In [None]:
confusion_matrix(y_treino, previsoes_treino)

In [None]:
modelo = CatBoostClassifier(task_type='CPU', iterations= 200, learning_rate=0.1, depth = 8, random_state = 5,eval_metric="Accuracy")

In [None]:
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)

In [None]:
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

## Redes Neurais

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Multi-layer perceptron with a single hidden layer of 7 units.
# `hidden_layer_sizes` is written as a one-element tuple: `(7)` is just the
# integer 7, not a tuple describing the layers.
redes = MLPClassifier(hidden_layer_sizes=(7,), activation='relu', solver='lbfgs', max_iter =800,
tol=0.0001, random_state = 3, verbose = True)

In [None]:
# The CatBoost section rebinds x_treino/x_teste/y_treino/y_teste to a split
# of the unscaled DataFrame. Rebuild the split from the standardized
# features (same test size and seed as the original split) so the network
# is trained and evaluated on scaled inputs, matching its cross-validation.
x_treino, x_teste, y_treino, y_teste = train_test_split(previsores_esc, alvo, test_size = 0.3, random_state = 0)
redes.fit(x_treino, y_treino)

In [None]:
previsoes = redes.predict(x_teste)
previsoes

In [None]:
print("Acurácia: %.2f%%" % (accuracy_score(y_teste, previsoes) * 100.0))

In [None]:
confusion_matrix(y_teste, previsoes)

In [None]:
print(classification_report(y_teste, previsoes))

In [None]:
previsoes_treino = redes.predict(x_treino)
previsoes_treino

In [None]:
accuracy_score(y_treino, previsoes_treino)

In [None]:
confusion_matrix(y_treino, previsoes_treino)

In [None]:
kfold = KFold(n_splits = 30, shuffle=True, random_state = 5)

In [None]:
# Cross-validate a single-hidden-layer MLP (7 units) on the standardized
# features and report the mean accuracy across the folds.
# `hidden_layer_sizes` is written as a one-element tuple: `(7)` is just the
# integer 7, not a tuple describing the layers.
modelo = MLPClassifier(hidden_layer_sizes=(7,), activation='relu', solver='adam', max_iter =8000,
                              tol=0.0001, random_state = 3, verbose = True)
resultado = cross_val_score(modelo, previsores_esc, alvo, cv = kfold)
print("Acurácia Média: %.2f%%" % (resultado.mean() * 100.0))

## Resultados

Naive Bayes: 93.82% (Sem escalonamento)

SVM: 97.88% (Escalonado)

**Regressão Logística: 98.06% (Escalonado)**

KNN: 96.65% (Escalonado)

Árvore de decisão: 93.67%

Random Forest: 95.76%

XGBoost: 96.65%

LightGBM: 97.52% (Escalonado)

Catboost: 97.16%

Redes Neurais: 97.17%
