In [4]:
import pandas as pd

bcsc_paths = [
    "../data/bcsc_risk_factors_summarized1_092020.csv",
    "../data/bcsc_risk_factors_summarized2_092020.csv",
    "../data/bcsc_risk_factors_summarized3_092020.csv"
]

df_list = []
for path in bcsc_paths:
    df = pd.read_csv(bcsc_paths)
    print(f"Loaded {path}, shape: {df.shape}")
    df_list.append(df)

bcsc_df = pd.concat(df_list, ignore_index=True)
bcsc_df_clean = bcsc_df[bcsc_df['breast_cancer_history'] != 9].reset_index(drop=True)
print(f"DataFrame shape after dropping: {bcsc_df_clean.shape}")

# Count how many rows dropped
dropped_count = bcsc_df.shape[0] - bcsc_df_clean.shape[0]
print(f"Dropped rows: {dropped_count}")

# 3. Save cleaned concatenated DataFrame
df_clean_path = "../data/bcsc_concatenated_no_hist9.csv"
bcsc_df_clean.to_csv(df_clean_path, index=False)


ValueError: Invalid file path or buffer object type: <class 'list'>

In [None]:
df = pd.read_csv('../data/bcsc_concatenated_no_hist9.csv')
print(risk_factors.head(2))

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, roc_auc_score
import xgboost as xgb
import joblib

for c in ["count", "year"]:
    if c in df.columns:
        df.drop(columns=[c], inplace=True)
        print(f"Dropped column: {c}")

# 4) Filter out 'Unknown' target values -> breast_cancer_history == 9
if 'breast_cancer_history' not in df.columns:
    raise KeyError("Target column 'breast_cancer_history' not found.")
initial_count = df.shape[0]
df = df[df['breast_cancer_history'] != 9].reset_index(drop=True)
filtered_count = df.shape[0]
print(f"Filtered out {initial_count - filtered_count} rows with breast_cancer_history == 9.")

# 5) Separate features (X) and target (y)
X = df.drop(columns=["breast_cancer_history"])

y = df["breast_cancer_history"]

# 6) Handle missing values
for col in X.columns:
    if X[col].isnull().any():
        if X[col].dtype in [np.float64, np.int64]:
            median_val = X[col].median()
            X[col] = X[col].fillna(median_val)
            print(f"Filled missing numeric column '{col}' with median = {median_val}")
        else:
            mode_vals = X[col].mode()
            fill_val = mode_vals[0] if not mode_vals.empty else "Unknown"
            X[col] = X[col].fillna(fill_val)
            print(f"Filled missing categorical column '{col}' with mode = {fill_val}")

# 7) Encode categorical features
feature_encoders = {}
for col in X.select_dtypes(include=["object", "category"]).columns:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    feature_encoders[col] = le
    print(f"Encoded '{col}' with classes: {list(le.classes_)}")

# 8) Encode target if needed (likely numeric 0 or 1, so skip if numeric)
if y.dtype == 'object' or str(y.dtype).startswith('category'):
    target_le = LabelEncoder()
    y = target_le.fit_transform(y.astype(str))
    print(f"Encoded target 'breast_cancer_history' with classes: {list(target_le.classes_)}")
else:
    target_le = None

# 9) Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y if len(np.unique(y))>1 else None
)
print(f"Training shape: {X_train.shape}, Test shape: {X_test.shape}")

# 10) Train XGBoost classifier
model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    use_label_encoder=False,
    eval_metric="logloss",
    random_state=42
)
model.fit(X_train, y_train)
print("Model training finished.")

# 11) Evaluate
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
try:
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred_prob)
    print(f"ROC AUC: {auc:.4f}")
except:
    print("Could not compute ROC AUC (possible single-class).")

# 12) Save artifacts
MODEL_PATH = os.path.join(DATA_DIR, "bcsc_xgb_model.pkl")
ENCODERS_PATH = os.path.join(DATA_DIR, "bcsc_feature_encoders.pkl")
joblib.dump(model, MODEL_PATH)
print(f"Saved model to {MODEL_PATH}")
joblib.dump(feature_encoders, ENCODERS_PATH)
print(f"Saved feature encoders to {ENCODERS_PATH}")

if target_le is not None:
    TARGET_PATH = os.path.join(DATA_DIR, "bcsc_target_encoder.pkl")
    joblib.dump(target_le, TARGET_PATH)
    print(f"Saved target encoder to {TARGET_PATH}")

