In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
from sklearn.utils import resample
from joblib import dump
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load dataset
df = pd.read_csv("dataset.csv")
df.columns = df.columns.str.lower()

# Step 2: Preprocess
df["gender"] = df["gender"].str.lower().map({"m": 1, "f": 0})
df = df.dropna()

# Step 3: Balance dataset
df_majority = df[df.lung_cancer == 0]
df_minority = df[df.lung_cancer == 1]

df_minority_upsampled = resample(df_minority,
                                 replace=True,
                                 n_samples=len(df_majority),
                                 random_state=42)

df_balanced = pd.concat([df_majority, df_minority_upsampled])

# Step 4: Features and labels
X = df_balanced.drop(columns=["lung_cancer"])
y = df_balanced["lung_cancer"]

# Step 5: Scale features
scaler = StandardScaler()
X_scaled_array = scaler.fit_transform(X)

# ❗️ Re-wrap into DataFrame with original column names
X_scaled = pd.DataFrame(X_scaled_array, columns=X.columns)

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.25, random_state=42, stratify=y
)

# Step 7: Train Random Forest
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_split=10,
    class_weight='balanced',
    random_state=42
)
rf_model.fit(X_train, y_train)

# Step 8: Evaluate
y_pred_rf = rf_model.predict(X_test)
y_proba_rf = rf_model.predict_proba(X_test)[:, 1]

print("\n=== Random Forest Classification Report ===")
print(classification_report(y_test, y_pred_rf))
print("Random Forest ROC AUC:", roc_auc_score(y_test, y_proba_rf))
print("Random Forest Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

# Step 9: Save model
dump(rf_model, "rf_lung_model.joblib")




=== Random Forest Classification Report ===
              precision    recall  f1-score   support

           0       0.58      0.57      0.57       371
           1       0.58      0.59      0.58       370

    accuracy                           0.58       741
   macro avg       0.58      0.58      0.58       741
weighted avg       0.58      0.58      0.58       741

Random Forest ROC AUC: 0.5986158665403949
Random Forest Confusion Matrix:
 [[210 161]
 [151 219]]


['rf_lung_model.joblib']