# Cancer Patient Survival Prediction using Neural Networks

This notebook implements a feedforward neural network to predict patient survival status based on clinical features from the China Cancer Patient Records dataset.

In [1]:
import os
import time
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow import keras
from tensorflow.keras import layers
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Report the TensorFlow build and any GPUs it can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("TensorFlow version:", tf.__version__)
print("GPU available:", gpu_devices)

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# Load the dataset.
# The CSV location can be overridden with the CANCER_DATA_PATH environment
# variable. When it is unset, the path falls back to the kagglehub cache layout
# under the *current* user's home directory rather than one specific user's
# absolute path, so the notebook runs unchanged on other machines.
DEFAULT_DATA_PATH = (
    Path.home()
    / ".cache" / "kagglehub" / "datasets" / "ak0212"
    / "china-cancer-patient-records" / "versions" / "1"
    / "china_cancer_patients_synthetic.csv"
)
data_path = Path(os.environ.get("CANCER_DATA_PATH", DEFAULT_DATA_PATH))

# Fail early with an actionable message instead of a bare pandas traceback.
if not data_path.is_file():
    raise FileNotFoundError(
        f"Dataset not found at {data_path}. Download it first or set the "
        "CANCER_DATA_PATH environment variable to the CSV location."
    )

df = pd.read_csv(data_path)

# Size of the raw table and class balance of the target column.
survival_counts = df['SurvivalStatus'].value_counts()
print(f"Dataset shape: {df.shape}")
print(f"Target distribution:\n{survival_counts}")
print(f"Target distribution (%):\n{df['SurvivalStatus'].value_counts(normalize=True) * 100}")

## Data Preprocessing

In [None]:
# Work on a copy so the raw frame `df` is left untouched by preprocessing
data = df.copy()

# Null count per column, shown first as absolute numbers and then as a
# percentage of all rows
missing_counts = data.isnull().sum()
print("Missing values per column:")
print(missing_counts)
print("\nMissing values percentage:")
print((missing_counts / len(data)) * 100)

In [None]:
# Columns used as model inputs (ID, date and target columns are left out)
feature_columns = [
    'Gender', 'Age', 'Province', 'Ethnicity',
    'TumorType', 'CancerStage', 'TumorSize', 'Metastasis',
    'TreatmentType', 'ChemotherapySessions', 'RadiationSessions', 'FollowUpMonths',
    'SmokingStatus', 'AlcoholUse', 'GeneticMutation', 'Comorbidities',
]

# Independent copies of the inputs and of the target column
X = data.loc[:, feature_columns].copy()
y = data.loc[:, 'SurvivalStatus'].copy()

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

In [None]:
# Handle missing values and encode categorical variables.
# (LabelEncoder is already imported in the notebook's import cell.)
# NOTE(review): the encoders and medians below are fitted on the full dataset,
# i.e. before the train/test split done later in the notebook. Fit them on the
# training rows only if strict leakage control is required.

# Fitted encoder for every categorical column, keyed by column name
label_encoders = {}

# Encode target variable. LabelEncoder assigns integer codes in sorted class
# order, so the code of each class equals its position in `classes_`.
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)
target_mapping = {str(label): code for code, label in enumerate(le_target.classes_)}
print(f"Target encoding: {target_mapping}")

# Handle categorical features
categorical_features = ['Gender', 'Province', 'Ethnicity', 'TumorType', 'CancerStage',
                       'Metastasis', 'TreatmentType', 'SmokingStatus', 'AlcoholUse',
                       'GeneticMutation', 'Comorbidities']

# Handle numerical features
numerical_features = ['Age', 'TumorSize', 'ChemotherapySessions', 'RadiationSessions', 'FollowUpMonths']

X_processed = X.copy()

# Every selected feature must be covered by exactly one of the two lists,
# otherwise it would reach the model un-imputed / un-encoded.
unassigned = set(X_processed.columns) - set(categorical_features) - set(numerical_features)
assert not unassigned, f"Columns without a preprocessing rule: {sorted(unassigned)}"

for col in categorical_features:
    # Fill missing values with 'Unknown', then cast to str so the column has a
    # single type: LabelEncoder raises on columns mixing strings and numbers
    # (e.g. a numeric/boolean column that received the 'Unknown' filler).
    filled = X_processed[col].fillna('Unknown').astype(str)

    # Label encode and keep the fitted encoder
    le = LabelEncoder()
    X_processed[col] = le.fit_transform(filled)
    label_encoders[col] = le

    print(f"{col}: {len(le.classes_)} unique values")

# Fill missing numerical values with each column's own median
column_medians = X_processed[numerical_features].median()
X_processed[numerical_features] = X_processed[numerical_features].fillna(column_medians)

print(f"\nProcessed feature matrix shape: {X_processed.shape}")
print(f"Missing values after preprocessing: {X_processed.isnull().sum().sum()}")

## Train-Test Split and Feature Scaling

In [None]:
# Hold out 20% of the rows for testing; stratifying on the target keeps the
# class ratio the same in both partitions
split_parts = train_test_split(
    X_processed,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_parts

for split_name, split_features in (("Training", X_train), ("Test", X_test)):
    print(f"{split_name} set shape: {split_features.shape}")
for split_name, split_target in (("Training", y_train), ("Test", y_test)):
    print(f"{split_name} target distribution: {np.bincount(split_target)}")

# Standardise features: statistics are learned from the training rows only and
# then applied unchanged to the test rows
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("Feature scaling completed")
print(f"Training data mean: {X_train_scaled.mean():.6f}")
print(f"Training data std: {X_train_scaled.std():.6f}")

## Neural Network Model Definition

In [None]:
# Define the neural network architecture
def create_model(input_dim, hidden_layers=(128, 64, 32), dropout_rate=0.3, learning_rate=0.001):
    """Build and compile a feedforward binary classifier.

    Each hidden stage is Dense(relu) -> BatchNormalization -> Dropout, followed
    by a single sigmoid output unit.

    Args:
        input_dim: Number of input features per sample.
        hidden_layers: Sequence with the unit count of each hidden Dense layer,
            in order. May be empty, in which case the model is just the
            sigmoid output layer on top of the inputs.
        dropout_rate: Dropout rate applied after every hidden stage.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``keras.Sequential`` model using binary cross-entropy loss
        and reporting accuracy, precision and recall.
    """
    model = keras.Sequential()

    # Declare the input shape with an explicit Input object instead of the
    # `input_dim=` keyword on the first Dense layer.
    model.add(keras.Input(shape=(input_dim,)))

    # Hidden stages (all handled uniformly, so an empty sequence is valid)
    for units in hidden_layers:
        model.add(layers.Dense(units, activation='relu'))
        model.add(layers.BatchNormalization())
        model.add(layers.Dropout(dropout_rate))

    # Output layer (binary classification)
    model.add(layers.Dense(1, activation='sigmoid'))

    # Compile model. Precision/Recall are passed as explicitly named metric
    # objects so the history keys are always 'precision' / 'recall' (and
    # 'val_precision' / 'val_recall'), even when the model is rebuilt in the
    # same session, where auto-generated names may receive numeric suffixes.
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=[
            'accuracy',
            keras.metrics.Precision(name='precision'),
            keras.metrics.Recall(name='recall'),
        ],
    )

    return model

# Seed Python, NumPy and the Keras backend before any weights are created so
# that weight initialisation, dropout masks and batch shuffling are repeatable
# across "Restart & Run All" executions. The value matches the random_state
# used for the train/test split.
SEED = 42
keras.utils.set_random_seed(SEED)

# Create the model; the input width is the number of scaled feature columns
input_dim = X_train_scaled.shape[1]
model = create_model(input_dim)

# Display model architecture
model.summary()

## Model Training with Tracking

In [None]:
# Callbacks that supervise the training run
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# All three callbacks watch the validation loss and log what they do
watch_val_loss = dict(monitor='val_loss', verbose=1)

# Stop after 15 epochs without improvement and restore the best weights
early_stopping = EarlyStopping(patience=15, restore_best_weights=True, **watch_val_loss)

# Halve the learning rate after 10 stagnant epochs, never going below 1e-7
reduce_lr = ReduceLROnPlateau(factor=0.5, patience=10, min_lr=1e-7, **watch_val_loss)

# Write the model to disk whenever the validation loss reaches a new best
model_checkpoint = ModelCheckpoint('best_survival_model.h5', save_best_only=True, **watch_val_loss)

callbacks = [early_stopping, reduce_lr, model_checkpoint]

In [None]:
# Fit the network and measure the wall-clock duration of the whole run.
# The last 20% of the training rows are held back by Keras for validation.
fit_options = dict(
    validation_split=0.2,
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1,
)

print("Starting model training...")
start_time = time.time()
history = model.fit(X_train_scaled, y_train, **fit_options)
end_time = time.time()

training_time = end_time - start_time
epochs_run = len(history.history['loss'])

print("\nTraining completed!")
print(f"Total training time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")
print(f"Number of epochs completed: {epochs_run}")

## Training Progress Visualization

In [None]:
# Training vs. validation curves, one panel per tracked quantity
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# (history key, display label) for each panel, laid out row by row
panels = [
    ('loss', 'Loss'),
    ('accuracy', 'Accuracy'),
    ('precision', 'Precision'),
    ('recall', 'Recall'),
]

for ax, (metric_key, metric_label) in zip(axes.flat, panels):
    ax.plot(history.history[metric_key], label=f'Training {metric_label}')
    ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    ax.set_title(f'Model {metric_label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

## Model Evaluation

In [None]:
# Make predictions
print("Evaluating model on test set...")
# The network ends in a single sigmoid unit, so predict() yields one column;
# flatten it once so every metric below receives a 1-D score vector.
y_pred_proba = model.predict(X_test_scaled).ravel()
# Hard class labels at the 0.5 probability threshold
y_pred = (y_pred_proba > 0.5).astype(int)

# Calculate metrics (order follows the loss + metrics given at compile time)
test_loss, test_accuracy, test_precision, test_recall = model.evaluate(X_test_scaled, y_test, verbose=0)
auc_score = roc_auc_score(y_test, y_pred_proba)

print(f"\n=== Model Performance ===")
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall: {test_recall:.4f}")
print(f"AUC Score: {auc_score:.4f}")

# Classification report
print(f"\n=== Classification Report ===")
# Take the display names from the fitted target encoder: position i of
# `classes_` is the class encoded as integer i. A hand-written list such as
# ['Deceased', 'Alive'] is in the opposite order to the encoder's sorted
# classes and would attach each row of the report to the wrong class.
target_names = [str(label) for label in le_target.classes_]
print(classification_report(y_test, y_pred, target_names=target_names))

In [None]:
# Confusion Matrix and ROC Curve
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Class names in encoded order: position i of `classes_` is the class encoded
# as integer i. Hard-coding ['Deceased', 'Alive'] would swap the tick labels
# relative to the encoder's sorted class order.
class_labels = [str(label) for label in le_target.classes_]
class_codes = list(range(len(class_labels)))

# Confusion Matrix (explicit `labels` keeps one row/column per known class,
# so the matrix always lines up with the tick labels)
cm = confusion_matrix(y_test, y_pred, labels=class_codes)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=class_labels, yticklabels=class_labels)
axes[0].set_title('Confusion Matrix')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

# ROC Curve (scores flattened to 1-D in case they are still a single column)
fpr, tpr, _ = roc_curve(y_test, np.ravel(y_pred_proba))
axes[1].plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {auc_score:.2f})')
# Diagonal reference line of a random classifier
axes[1].plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
axes[1].set_xlim([0.0, 1.0])
axes[1].set_ylim([0.0, 1.05])
axes[1].set_xlabel('False Positive Rate')
axes[1].set_ylabel('True Positive Rate')
axes[1].set_title('Receiver Operating Characteristic (ROC) Curve')
axes[1].legend(loc="lower right")
axes[1].grid(True)

plt.tight_layout()
plt.show()

## Training Summary

In [None]:
# Collect the headline numbers first, then print the recap
run_log = history.history
epochs_completed = len(run_log['loss'])
best_val_loss = min(run_log['val_loss'])
best_val_accuracy = max(run_log['val_accuracy'])

print("=== TRAINING SUMMARY ===")
print(f"Dataset size: {len(df):,} patients")
print(f"Training set: {len(X_train):,} patients")
print(f"Test set: {len(X_test):,} patients")
print(f"Number of features: {input_dim}")
print(f"Training time: {training_time:.2f} seconds ({training_time/60:.2f} minutes)")
print(f"Epochs completed: {epochs_completed}")
print(f"Best validation loss: {best_val_loss:.4f}")
print(f"Best validation accuracy: {best_val_accuracy:.4f}")
print(f"Final test accuracy: {test_accuracy:.4f}")
print(f"Final AUC score: {auc_score:.4f}")

# Size of the network: parameter count and number of layers
total_params = model.count_params()
layer_count = len(model.layers)
print("\n=== MODEL ARCHITECTURE ===")
print(f"Total parameters: {total_params:,}")
print(f"Model layers: {layer_count}")