In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, recall_score
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
# Load the cleaned dataset.
# Later cells strip an 'Unnamed: 0' entry from the feature-importance table, which
# suggests the CSV may carry a leftover row-index column (TODO confirm against the
# file). A row counter is not a real predictor, so drop it here before it can reach
# any model; errors="ignore" makes this a no-op when the column is absent.
df = pd.read_csv("Cleaned_Lung_Cancer.csv").drop(columns=["Unnamed: 0"], errors="ignore")

In [None]:
# Separate the features from the target column.
X = df.drop(columns=['survived'])
y = df['survived']

print("Class distribution in the target variable:")
print(y.value_counts(normalize=True))

# Hold out the test set BEFORE any resampling. Fitting ADASYN on the full data and
# splitting afterwards puts synthetic rows into the test set (and lets test rows
# shape the synthetic training rows), so the scores are optimistic and are measured
# on an artificial class mix rather than the real one.
# test_size / random_state match the logistic-regression split, so both models are
# evaluated on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Weight the positive class by the majority/minority ratio seen in the training
# labels instead of hard-coding proportions copied from an earlier run.
# Assumes the target is coded as 0 / 1.
train_share = y_train.value_counts(normalize=True)
class_weights = {0: 1, 1: float(train_share.loc[0] / train_share.loc[1])}

# Oversample the minority class in the training portion only.
adasyn = ADASYN(random_state=42)
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

# NOTE(review): class_weight is applied on top of data that ADASYN has already
# balanced, so the positive class is compensated twice. Kept as in the original;
# confirm this is intended.
rf = RandomForestClassifier(n_estimators=100, random_state=42, class_weight=class_weights)
rf.fit(X_train_balanced, y_train_balanced)

# Score on the untouched hold-out set.
y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")

print("\nDetailed Classification Report:")
print(classification_report(y_test, y_pred))

Class distribution in the target variable:
survived
0    0.779771
1    0.220229
Name: proportion, dtype: float64


In [4]:
# Logistic-regression baseline on the original (non-resampled) data.
X_train_lr, X_test_lr, y_train_lr, y_test_lr = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# `class_weights` comes from the Random Forest cell above; it must have been run first.
lr_model = LogisticRegression(
    random_state=42,
    class_weight=class_weights,
    max_iter=1000,
).fit(X_train_lr, y_train_lr)

# Per-class precision / recall / F1 on the held-out split.
y_lr_pred = lr_model.predict(X_test_lr)
lr_cr = classification_report(y_test_lr, y_lr_pred)
print(lr_cr)

              precision    recall  f1-score   support

           0       0.78      0.75      0.77    173292
           1       0.22      0.24      0.23     49208

    accuracy                           0.64    222500
   macro avg       0.50      0.50      0.50    222500
weighted avg       0.65      0.64      0.65    222500



# Gerando os gráficos e a análise dos dados

In [None]:
# Class counts of the target before and after ADASYN resampling, one figure each.
distribution_plots = (
    (y, "Distribuição das classes antes do balanceamento"),
    (y_train_balanced, "Distribuição das classes após ADASYN"),
)
for class_labels, plot_title in distribution_plots:
    count_ax = sns.countplot(x=class_labels)
    count_ax.set_title(plot_title)
    count_ax.set_xlabel("Classe (Survived)")
    count_ax.set_ylabel("Quantidade")
    plt.show()

In [None]:
def evaluate_model(model, X_test, y_test):
    """Print a classification report and draw the confusion matrix for ``model``.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``; its class name is used in the
        printed header and in the plot title.
    X_test
        Features passed to ``model.predict``.
    y_test
        True labels the predictions are compared against.

    Returns
    -------
    None
        Everything is printed or plotted.
    """
    class_names = ["Não Sobreviveu", "Sobreviveu"]
    model_name = type(model).__name__
    predictions = model.predict(X_test)

    # Text summary.
    print(f"\n🔍 {model_name}")
    print("=" * 50)
    print(classification_report(y_test, predictions, target_names=class_names))

    # Confusion-matrix heatmap, rows = true class, columns = predicted class.
    plt.figure(figsize=(6, 4))
    heatmap_ax = sns.heatmap(
        confusion_matrix(y_test, predictions),
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    heatmap_ax.set_title(f"Matriz de Confusão - {model_name}")
    heatmap_ax.set_ylabel('Verdadeiro')
    heatmap_ax.set_xlabel('Previsto')
    print("Mostrando o gráfico...")
    plt.show()
# Report each fitted model against its own held-out split.
for fitted_model, eval_features, eval_labels in (
    (rf, X_test, y_test),
    (lr_model, X_test_lr, y_test_lr),
):
    evaluate_model(fitted_model, eval_features, eval_labels)

In [None]:
from sklearn.metrics import RocCurveDisplay

# Overlay both ROC curves on a single Axes so the models can be compared directly.
fig, ax = plt.subplots(figsize=(8, 6))

# (legend name, fitted estimator, features, true labels); each model is scored
# on its own test split.
roc_inputs = (
    ("RandomForestClassifier", rf, X_test, y_test),
    ("LogisticRegression", lr_model, X_test_lr, y_test_lr),
)
for curve_name, estimator, eval_features, eval_labels in roc_inputs:
    RocCurveDisplay.from_estimator(estimator, eval_features, eval_labels, ax=ax, name=curve_name)

# Chance diagonal (AUC = 0.5) as a visual baseline.
ax.plot([0, 1], [0, 1], 'k--', label="Aleatório")

ax.set_title("Curva ROC - Comparação de Modelos")
ax.legend()
plt.show()

In [None]:
# Rank the logistic-regression coefficients by absolute magnitude.
import numpy as np

# First row of the coefficient matrix, one value per feature column.
lr_coefs = lr_model.coef_[0]

coefficients = (
    pd.DataFrame({'Feature': X_train_lr.columns, 'Coef': lr_coefs})
    .assign(Abs_Coef=np.abs(lr_coefs))
    .sort_values(by='Abs_Coef', ascending=False)
)
top_coefficients = coefficients.head(10)

# Bar chart of the ten largest coefficients (signed value on the x axis).
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=top_coefficients, x="Coef", y="Feature", palette="coolwarm", ax=ax)
ax.set_title("Regressão Logística — Top 10 Features")
ax.set_xlabel("Coeficiente")
ax.set_ylabel("Variável")
plt.show()

# The same ten rows as a markdown table.
print(top_coefficients.to_markdown(index=False))

In [None]:
# Random Forest feature importances, largest first.
importances = pd.DataFrame(
    {'Feature': X_train.columns, 'Importance': rf.feature_importances_}
).sort_values(by='Importance', ascending=False)

# Leave the 'Unnamed: 0' column out of the ranking if it is present.
importances = importances.loc[importances['Feature'].ne('Unnamed: 0')]
top_importances = importances.head(10)

# Bar chart of the ten most important features.
fig, ax = plt.subplots(figsize=(8, 4))
sns.barplot(data=top_importances, x="Importance", y="Feature", palette="viridis", ax=ax)
ax.set_title("Random Forest — Top 10 Features")
ax.set_xlabel("Importância")
ax.set_ylabel("Variável")
plt.show()

# The same ten rows as a markdown table.
print(top_importances.to_markdown(index=False))

In [None]:

def _score_row(y_true, y_hat, y_score):
    """Return one row of the comparison table for a single model.

    y_true: true labels; y_hat: predicted labels; y_score: predicted probability
    of the positive class (used for the AUC only).
    """
    precision, recall, f1, _support = precision_recall_fscore_support(
        y_true, y_hat, average='binary'
    )
    return {
        'Acurácia': accuracy_score(y_true, y_hat),
        'Recall': recall,
        'Precisão': precision,
        'F1-Score': f1,
        'AUC': roc_auc_score(y_true, y_score),
    }


# Hard predictions for each model on its own test split.
y_pred_rf = rf.predict(X_test)
y_pred_lr = lr_model.predict(X_test_lr)

# One entry per model; column 1 of predict_proba is the positive-class probability.
metricas = {
    'Random Forest': _score_row(y_test, y_pred_rf, rf.predict_proba(X_test)[:, 1]),
    'Regressão Logística': _score_row(
        y_test_lr, y_pred_lr, lr_model.predict_proba(X_test_lr)[:, 1]
    ),
}

# Comparison table: models as rows, metrics as columns, shaded by value.
pd.DataFrame(metricas).T.style.background_gradient(cmap='Blues')