In [1]:
import os
import pandas as pd
import kagglehub

# Download the dataset and report where the files were placed
path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")
print("Path para os arquivos:", path)

# List the CSV files. os.listdir returns entries in arbitrary order, so sort
# them: this fixes both the load order and which file acts as "base" below.
files = sorted(f for f in os.listdir(path) if f.endswith('.csv'))
print("Arquivos CSV encontrados:", files)
if not files:
    # Fail here with a clear message instead of an IndexError/TypeError further down
    raise FileNotFoundError(f"Nenhum arquivo CSV encontrado em: {path}")

# Load each CSV into a dict keyed by file name, normalising the target column
dfs = {}
for f in files:
    full_path = os.path.join(path, f)
    df_temp = pd.read_csv(full_path)

    if 'Diabetes_012' in df_temp.columns:
        # Collapse the 3-level target into a binary one: 0 stays 0, anything
        # else (1 and 2) becomes 1; then drop the original column.
        df_temp = (
            df_temp
            .assign(Diabetes_binary=(df_temp['Diabetes_012'] != 0).astype('int64'))
            .drop(columns=['Diabetes_012'])
        )
    elif 'Diabetes_binary' in df_temp.columns:
        # Already binary: just force clean 0/1 integers
        df_temp['Diabetes_binary'] = (df_temp['Diabetes_binary'] != 0).astype('int64')

    dfs[f] = df_temp

# Report how many columns each file has
print("\nComparando colunas dos arquivos:")
all_columns = {name: set(frame.columns) for name, frame in dfs.items()}
for name, columns in all_columns.items():
    print(f"- {name}: {len(columns)} colunas\n")

# Compare every file's columns against the first file (the base)
base = all_columns[files[0]]
for name, columns in all_columns.items():
    diff = base.symmetric_difference(columns)
    if diff:
        print(f"⚠️ Diferenças entre base e {name}: {diff}")
    else:
        print(f"✅ {name} tem as mesmas colunas da base.")

# Columns present in every file
common_columns = set.intersection(*all_columns.values())
if 'Diabetes_binary' not in common_columns:
    raise KeyError("A coluna 'Diabetes_binary' não está presente em todos os arquivos")

# A set of strings iterates in a different order on each interpreter run, so
# take the column order from the base file to keep the combined frame stable.
ordered_columns = [c for c in dfs[files[0]].columns if c in common_columns]

# NOTE(review): presumably these CSVs are different views of the same survey
# (a 3-level file and binary files) - TODO confirm. If so, stacking them
# repeats respondents, and the later drop_duplicates cannot tell cross-file
# copies from genuinely identical answers.
df = pd.concat([frame[ordered_columns] for frame in dfs.values()], ignore_index=True)

# Show the final result
print("\n✅ Shape do DataFrame combinado:", df.shape)
print(df["Diabetes_binary"].value_counts())
pd.set_option('display.max_columns', None)
df.head()


ModuleNotFoundError: No module named 'kagglehub'

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error , mean_absolute_percentage_error , mean_squared_error , accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
%matplotlib inline

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices from the libraries used in later cells.
# Consider narrowing it to specific categories or modules.
warnings.filterwarnings("ignore")

In [None]:
# Preview the first 20 rows of the combined DataFrame
df.head(20)

In [None]:
# Row and column counts of the combined DataFrame
rows = df.shape[0]
col = df.shape[1]
print(f"Número de Linhas : {rows} \nNúmero de Colunas : {col}")

In [None]:
# Per-column dtype and non-null count summary of the combined DataFrame
df.info()

In [None]:
import missingno as msno
# Bar chart from missingno showing how complete each column of df is
msno.bar(df, color='skyblue')
plt.show()


In [None]:
# Lift the row limit so the transposed summary (one row per column) is shown in full.
# This is a global display option and stays in effect for the cells that follow.
pd.options.display.max_rows = None
df.describe().transpose()

In [None]:
# Number of distinct (non-null) values in each column.
# DataFrame.nunique replaces the manual loop, which also reused the name `col`
# and so overwrote the column count stored under that name by the shape cell.
unique_values = df.nunique().to_dict()

pd.DataFrame(unique_values, index=['Valores Unicos nas colunas']).transpose()

In [None]:
# Rows of df that exactly repeat an earlier row (the first occurrence is not flagged)

duplicates = df.loc[df.duplicated()]
print("Duplicate Rows : ", duplicates.shape[0])
duplicates.head()

In [None]:
# De-duplicated copy of the data; `df` itself is left untouched
df_clean = df.drop_duplicates().copy()

# Size of the original DataFrame
rows_orig = df.shape[0]
col_orig = df.shape[1]
print(f"Número de Linhas (original): {rows_orig} \nNúmero de Colunas: {col_orig}")

# Size after removing duplicates
rows_clean = df_clean.shape[0]
col_clean = df_clean.shape[1]
print(f"Número de Linhas (sem duplicatas): {rows_clean} \nNúmero de Colunas: {col_clean}")

# How many rows were removed
print(f"\nTotal de duplicatas removidas: {rows_orig - rows_clean}")

---
# **Visualização dos Dados**



In [None]:
# Histogram grid of the de-duplicated data, drawn on a 20x20 inch figure
df_clean.hist(figsize = (20,20))
plt.show()

In [None]:
# Correlation matrix over the numeric columns
correlation = df_clean.corr(numeric_only=True)

# Keep only the target's column, drop its self-correlation, strongest positive first
cor_target = (
    correlation['Diabetes_binary']
    .drop('Diabetes_binary')
    .sort_values(ascending=False)
)

# Print the ranking
print("📊 Correlação com Diabetes_binary:")
print(cor_target)

# Single-column heatmap of the same ranking
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cor_target.to_frame(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlação com Diabetes_binary")
plt.show()


In [None]:
# Pairwise correlation between the numeric columns only
corr = df_clean.corr(numeric_only=True)

# Heatmap appearance kept in one place: annotated cells with two decimals,
# diverging palette centred on zero, thin grey cell borders, shrunken colour bar
heatmap_style = dict(
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    linewidths=0.5,
    linecolor='gray',
    cbar_kws={"shrink": 0.75},
)

fig, ax = plt.subplots(figsize=(18, 9))
sns.heatmap(corr, ax=ax, **heatmap_style)
ax.set_title("Correlação entre as Variáveis", fontsize=16, fontweight='bold', pad=20)

fig.tight_layout()
plt.show()


In [None]:
# Numeric columns only
numeric_columns = df_clean.select_dtypes(include='number').columns

# Grid with 3 panels per row; the row count is the column count divided by 3, rounded up
n_cols = 3
n_rows = (len(numeric_columns) + n_cols - 1) // n_cols
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))
axes = axes.ravel()  # 1-D view so panels can be paired with columns directly

# One boxplot per numeric column
for panel, column in zip(axes, numeric_columns):
    sns.boxplot(data=df_clean, x=column, ax=panel)
    panel.set_title(f'Boxplot de {column}')

# Remove the panels left over at the end of the grid, if any
for spare_panel in axes[len(numeric_columns):]:
    fig.delaxes(spare_panel)

plt.tight_layout()
plt.show()


In [None]:
# Distinct values present in the target column of the de-duplicated data
df_clean['Diabetes_binary'].unique()

In [None]:
# Five selected features plus the target, which is used to colour the points
cols = ['GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 'Diabetes_binary']

# NOTE(review): this plots every row of df_clean; if the frame is large the
# scatter/KDE grid may be slow to render - consider plotting a seeded sample.
sns.pairplot(
    df_clean.loc[:, cols],
    hue='Diabetes_binary',
    diag_kind='kde',
    markers=["o", "s"],  # one marker per hue level, so two levels are expected
    palette="Set1",
)
plt.show()


In [None]:
# Human-readable labels for the two target values
diabetes_map = {0: "Não Diabético", 1: "Diabético"}

# Count the rows per class, then swap the 0/1 index for the labels above
tabela_diabetes = df_clean["Diabetes_binary"].value_counts()
tabela_diabetes = tabela_diabetes.rename(index=diabetes_map)

# Show the table
print(tabela_diabetes)


In [None]:
# NOTE(review): this cell repeats the correlation-with-target cell from the
# visualisation section word for word; consider deleting one of the two.

# Correlation matrix over the numeric columns
correlation = df_clean.corr(numeric_only=True)

# Keep only the target's column, drop its self-correlation, strongest positive first
cor_target = (
    correlation['Diabetes_binary']
    .drop('Diabetes_binary')
    .sort_values(ascending=False)
)

# Print the ranking
print("📊 Correlação com Diabetes_binary:")
print(cor_target)

# Single-column heatmap of the same ranking
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cor_target.to_frame(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title("Correlação com Diabetes_binary")
plt.show()


In [None]:
# 1. Separate features (X) and target (y)
target = 'Diabetes_binary'
X_full = df_clean.drop(columns=[target])
y = df_clean[target]

# 2. One stratified 70/30 train/test split, shared by both models below
X_train_full, X_test_full, y_train, y_test = train_test_split(
    X_full, y, test_size=0.3, random_state=42, stratify=y
)

# 3. Train a model on ALL features
model_full = RandomForestClassifier(random_state=42)
model_full.fit(X_train_full, y_train)
y_pred_full = model_full.predict(X_test_full)

print("\n📊 Modelo completo (todas as features):")
print(classification_report(y_test, y_pred_full))

# 4. Keep only the TOP 5 features.
# NOTE(review): presumably this list comes from the correlation ranking computed
# on all of df_clean - TODO confirm. If so, the selection has already seen the
# test rows, which makes the comparison slightly optimistic.
top5_features = ['GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol']
X_top5 = df_clean[top5_features]

# Take the columns from the split made in step 2 rather than splitting again.
# The rows then match y_train / y_test by construction, instead of relying on a
# second train_test_split call with identical arguments. It also avoids
# assigning to `_`, which IPython uses for the last cell output.
X_train_top5 = X_train_full[top5_features]
X_test_top5 = X_test_full[top5_features]

# 5. Train a model on the top 5 only
model_top5 = RandomForestClassifier(random_state=42)
model_top5.fit(X_train_top5, y_train)
y_pred_top5 = model_top5.predict(X_test_top5)

print("\n📊 Modelo com TOP 5 features:")
print(classification_report(y_test, y_pred_top5))


In [None]:
from xgboost import XGBClassifier

# Target column
target = 'Diabetes_binary'

# The 10 features ranked highest by the correlation analysis done earlier
top_10_features = ['GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 
                   'Age', 'HeartDiseaseorAttack', 'PhysHlth', 'Stroke', 'MentHlth']

# Top 5 is a prefix of the top 10
top_5_features = top_10_features[:5]

# Separate X and y. Both come from the de-duplicated frame: the original built
# X_full from `df`, leaving it with more rows than `y`.
X_full = df_clean.drop(columns=[target])
y = df_clean[target]

# Feature sets to evaluate (the "21" in the first label is hard-coded text)
feature_sets = {
    'Todas (21)': X_full.columns.tolist(),
    'Top 10': top_10_features,
    'Top 5': top_5_features
}

# Models to evaluate.
# NOTE(review): `use_label_encoder` looks deprecated in recent XGBoost releases;
# confirm against the installed version.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

# NOTE(review): this list is created but nothing in this cell appends to it;
# either collect the metrics here or remove it.
results = []

# Train and evaluate every model on every feature set
for feat_name, feat_list in feature_sets.items():
    X = df_clean[feat_list]
    # Same seed and stratification for each feature set, so all of them are
    # scored on the same train/test rows
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42
    )

    print(f"\n🔷 Conjunto de features: {feat_name}")
    for model_name, model in models.items():
        # The same estimator object is re-fitted for each feature set
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f"\n📌 Modelo: {model_name} | Features: {feat_name}")
        print(classification_report(y_test, y_pred, digits=4))


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

# Configurar seaborn
sns.set(style='whitegrid')

# Target
target = 'Diabetes_binary'

# Features
top_10_features = ['GenHlth', 'HighBP', 'BMI', 'DiffWalk', 'HighChol', 
                   'Age', 'HeartDiseaseorAttack', 'PhysHlth', 'Stroke', 'MentHlth']
top_5_features = top_10_features[:5]

# Conjuntos de features
feature_sets = {
    'Todas (21)': df_clean.drop(columns=[target]).columns.tolist(),
    'Top 10': top_10_features,
    'Top 5': top_5_features
}

# Modelos
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'KNN': KNeighborsClassifier(),
    'DecisionTree': DecisionTreeClassifier(random_state=42),
    'SVC': SVC(probability=True)
}

# Lista para resultados
all_results = []

# Loop pelos conjuntos de features e modelos
for feat_name, feat_cols in feature_sets.items():
    X = df_clean[feat_cols]
    y = df_clean[target]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y
