In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import tensorflow as tf
import sklearn
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras import layers

os.environ['KERAS_BACKEND'] = 'tensorflow'

In [None]:
import keras

print("Keras version:", keras.__version__)

In [None]:
from typing import Literal


def get_running_environment() -> Literal['local', 'colab', 'kaggle', 'marimo', 'jupyter']:
    """
    Returns the current running environment.
    """
    if 'COLAB_GPU' in os.environ:
        return 'colab'
    elif 'KAGGLE_URL_BASE' in os.environ:
        return 'kaggle'
    elif 'MARIMO' in os.environ:
        return 'marimo'
    elif 'JUPYTERHUB_USER' in os.environ:
        return 'jupyter'
    else:
        return 'local'
current_env = get_running_environment()

def get_data_path(running_env: str) -> str:
    """
    Returns the path to the data directory based on the running environment.
    """
    if running_env == 'colab':
        return '/content/data'
    elif running_env == 'kaggle':
        return '/kaggle/input/histopathologic-cancer-detection'
    elif running_env == 'marimo':
        return '/data'
    else:
        return './data/histopathologic-cancer-detection'
    
def set_random_seed(seed: int = 42):
    """
    Sets the random seed for reproducibility.
    """
    np.random.seed(seed)
    tf.random.set_seed(seed)
    print(f"Random seed set to {seed}")

data_dir = get_data_path(current_env)
print(f"Data path: {data_dir} (for {current_env} environment)")

RANDOM_SEED = 42

set_random_seed(RANDOM_SEED)

In [None]:
IMAGE_SIZE = 96
IMAGE_CHANNEL = 3
SAMPLE_SIZE = 1000
BATCH_SIZE = 32
TRAIN_IMAGE_DIR = os.path.join(data_dir, 'train')
TEST_IMAGE_DIR = os.path.join(data_dir, 'test')

In [None]:
df = pd.read_csv(os.path.join(data_dir, 'train_labels.csv'))

In [None]:
negative_samples = df[df['label'] == 0].sample(SAMPLE_SIZE, random_state=RANDOM_SEED)
positive_samples = df[df['label'] == 1].sample(SAMPLE_SIZE, random_state=RANDOM_SEED)

In [None]:
train_processed_dir = os.path.join(data_dir, "train_processed")
test_processed_dir = os.path.join(data_dir, "test_processed")

In [None]:
train_ds = image_dataset_from_directory(
    train_processed_dir,
    validation_split=0.2,
    subset="training",
    seed=123,
    image_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    label_mode='binary'  # Binary labels (0 or 1)
)

In [None]:
val_ds = image_dataset_from_directory(
    train_processed_dir,
    validation_split=0.2,
    subset="validation",
    seed=123,
    image_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    label_mode='binary'
)

In [None]:
test_ds = image_dataset_from_directory(
    test_processed_dir,
    image_size=(IMAGE_SIZE, IMAGE_SIZE),
    batch_size=BATCH_SIZE,
    label_mode=None,  # No labels for test
    shuffle=False  # Preserve order for submission
)

In [None]:
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        plt.imshow(np.array(images[i]).astype("uint8"))
        plt.title(int(labels[i]))
        plt.axis("off")

In [None]:
def build_custom_cnn():
    model = tf.keras.Sequential([
        layers.Conv2D(32, (3,3), activation='relu', input_shape=(96,96,3)),
        layers.MaxPooling2D((2,2)),
        layers.Conv2D(64, (3,3), activation='relu'),
        layers.MaxPooling2D((2,2)),
        layers.Conv2D(128, (3,3), activation='relu'),
        layers.MaxPooling2D((2,2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.5),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC()])
    return model

custom_model = build_custom_cnn()
custom_model.summary()

In [19]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping

# Class weights
labels = df['label'].values
class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train custom CNN
history_custom = custom_model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=50,
    class_weight=class_weight_dict,
    callbacks=[early_stop]
)

Epoch 1/50


2025-07-22 06:48:39.687094: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 21ms/step - accuracy: 0.5251 - auc: 0.5452 - loss: 4.0432 - val_accuracy: 0.7808 - val_auc: 0.8588 - val_loss: 0.5035
Epoch 2/50
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 23ms/step - accuracy: 0.7388 - auc: 0.7993 - loss: 0.6293 - val_accuracy: 0.8055 - val_auc: 0.8706 - val_loss: 0.5116
Epoch 3/50
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m129s[0m 23ms/step - accuracy: 0.7347 - auc: 0.7908 - loss: 0.7947 - val_accuracy: 0.8147 - val_auc: 0.8785 - val_loss: 0.4788
Epoch 4/50
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 21ms/step - accuracy: 0.7422 - auc: 0.7937 - loss: 1.1749 - val_accuracy: 0.7828 - val_auc: 0.8747 - val_loss: 1.2748
Epoch 5/50
[1m5501/5501[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 22ms/step - accuracy: 0.7463 - auc: 0.7977 - loss: 1.6053 - val_accuracy: 0.7177 - val_auc: 0.8719 - val_loss: 1.3798
Epoch 6/50
[1m5

In [21]:
# Predict
test_ids = [f[:-4] for f in os.listdir(test_processed_dir) if f.endswith('.jpg')]
predictions = custom_model.predict(test_ds).flatten()
submission = pd.DataFrame({'id': test_ids, 'label': (predictions > 0.5).astype(int)})
submission.to_csv('submission.csv', index=False)

[1m1796/1796[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 4ms/step
