In [11]:
# Cell 1: Setup and Imports

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import plotly.express as px
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.decomposition import PCA

from pathlib import Path
import os
import glob
from collections import defaultdict, Counter

import warnings
# Install a blanket "ignore" filter so no warnings are shown by later cells.
# NOTE(review): this also hides deprecation and numerical warnings; consider
# narrowing the filter to specific categories/modules.
warnings.filterwarnings('ignore')

# Plot styling: matplotlib's 'seaborn-v0_8' style sheet plus seaborn's "husl" palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Printed only if every statement above ran without raising.
print(" All packages imported successfully!")

 All packages imported successfully!


In [12]:

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator
    and PyTorch (``manual_seed``, ``cuda.manual_seed`` and
    ``cuda.manual_seed_all``), then switches cuDNN to deterministic mode with
    benchmark auto-tuning turned off.

    Args:
        seed: Integer seed handed to each generator. Defaults to 42.

    Returns:
        None.
    """
    import random as py_random

    # Host-side generators.
    py_random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators; each call receives the same seed value.
    for seed_fn in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    # cuDNN flags: deterministic kernels on, benchmark auto-tuning off.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.benchmark = False
    cudnn_backend.deterministic = True

# Seed all RNGs up front so the cells that follow start from a fixed random state.
set_seed(42)

In [13]:
def explore_he_dataset(base_path="data/external/data"):
    """
    Explore the H&E dataset structure and catalog all available files.

    Expected layout: ``<base_path>/ROI*/input/he/*.tif[f]``. Inside each
    ``he`` directory, files whose name contains "mask" (case-insensitive) are
    treated as masks and every other TIFF is treated as an H&E image.

    Args:
        base_path: Root directory (str or Path) holding the ``ROI*`` folders.

    Returns:
        A ``pandas.DataFrame`` with one row per H&E image and the columns
        ``roi_folder``, ``he_image_path``, ``mask_path`` (``None`` when no
        mask could be paired), ``tissue_type``, ``cancer_status`` and
        ``file_size_mb``; or ``None`` when ``base_path`` does not exist or no
        H&E image was found.
    """
    base_path = Path(base_path)

    if not base_path.exists():
        print(f" Path {base_path} does not exist!")
        return None

    # Find all ROI folders (sorted so the catalog order is reproducible).
    roi_folders = sorted(
        f for f in base_path.iterdir() if f.is_dir() and f.name.startswith('ROI')
    )

    print(f"📁 Found {len(roi_folders)} ROI folders")

    dataset_info = []
    he_files_found = 0
    mask_files_found = 0

    for roi_folder in roi_folders:
        he_path = roi_folder / "input" / "he"
        if not he_path.is_dir():
            continue

        # Match the extension case-insensitively (.tif/.TIF/.tiff/.TIFF) and
        # sort: directory listing order is filesystem-dependent.
        all_tiff_files = sorted(
            p for p in he_path.iterdir()
            if p.is_file() and p.suffix.lower() in (".tif", ".tiff")
        )

        # Separate H&E images from masks.
        mask_images = [p for p in all_tiff_files if "mask" in p.name.lower()]
        he_images = [p for p in all_tiff_files if "mask" not in p.name.lower()]

        print(f"  📁 {roi_folder.name}: {len(he_images)} H&E images, {len(mask_images)} masks")

        # Labels depend only on the folder name, so derive them once per folder.
        roi_name = roi_folder.name
        tissue_type, cancer_status = _classify_roi(roi_name)

        for he_img in he_images:
            corresponding_mask = _find_matching_mask(he_img.stem, mask_images)

            dataset_info.append({
                'roi_folder': roi_name,
                'he_image_path': str(he_img),
                'mask_path': str(corresponding_mask) if corresponding_mask else None,
                'tissue_type': tissue_type,
                'cancer_status': cancer_status,
                'file_size_mb': he_img.stat().st_size / (1024*1024)
            })

            he_files_found += 1
            if corresponding_mask:
                mask_files_found += 1

    df = pd.DataFrame(dataset_info)

    if len(df) == 0:
        print("No H&E images found!")
        print("Check if the folder structure matches: ROI*/input/he/*.tiff")
        return None

    print(f"  Total H&E images found: {he_files_found}")
    print(f" Total masks found: {mask_files_found}")
    print(f" Average file size: {df['file_size_mb'].mean():.1f} MB")
    print(f" Tissue types: {df['tissue_type'].value_counts().to_dict()}")
    print(f" Cancer status distribution: {df['cancer_status'].value_counts().to_dict()}")

    # Show sample files
    print(f"\n First 5 samples:")
    # `display` is injected by IPython/Jupyter; fall back to print so the
    # function also works when called from a plain Python process.
    try:
        show = display
    except NameError:
        show = print
    show(df.head())

    return df


def _classify_roi(roi_name):
    """Map an ROI folder name to ``(tissue_type, cancer_status)`` by keyword."""
    upper_name = roi_name.upper()
    if "BENIGN" in upper_name:
        # NOTE(review): every BENIGN folder is labelled prostate regardless of
        # other keywords -- confirm no non-prostate benign ROIs exist.
        return "prostate", "benign"
    if "PROSTATE" in upper_name:
        return "prostate", ("malignant" if "TMA" in upper_name else "unknown")
    if "KIDNEY" in upper_name:
        return "kidney", "kidney"
    if "LIVER" in upper_name:
        return "liver", "liver"
    return "unknown", "unknown"


def _find_matching_mask(he_stem, mask_images):
    """Return the mask path that best matches an H&E image stem, or None."""
    he_stem_lower = he_stem.lower()

    # Tier 1: the mask stem with its "mask" marker removed equals the image
    # stem. Checked first so each image gets its own mask when several
    # images and masks share one folder.
    for mask in mask_images:
        stripped = mask.stem.lower().replace("_mask", "").replace("mask", "")
        if stripped == he_stem_lower:
            return mask

    # Tier 2: the full image stem appears somewhere inside the mask stem.
    for mask in mask_images:
        if he_stem in mask.stem:
            return mask

    # Tier 3 (last resort): one of the first two "_"-separated tokens of the
    # image stem appears in the mask stem. Empty tokens are dropped because
    # "" is a substring of every string and would match any mask.
    leading_tokens = [token for token in he_stem.split("_")[:2] if token]
    for mask in mask_images:
        if any(token in mask.stem for token in leading_tokens):
            return mask

    return None

# Catalog every H&E image under the default data directory. `dataset_df` is
# None when that directory is missing or no H&E images were found.
dataset_df = explore_he_dataset()

📁 Found 63 ROI folders
  📁 ROI009_LIVER_B12: 1 H&E images, 0 masks
  📁 ROI010_PROSTATE_TMA001: 1 H&E images, 1 masks
  📁 ROI011_PROSTATE_TMA002: 1 H&E images, 1 masks
  📁 ROI012_PROSTATE_TMA003: 1 H&E images, 1 masks
  📁 ROI013_PROSTATE_TMA004: 1 H&E images, 1 masks
  📁 ROI014_PROSTATE_TMA005: 1 H&E images, 1 masks
  📁 ROI015_PROSTATE_TMA006: 1 H&E images, 1 masks
  📁 ROI016_PROSTATE_TMA007: 1 H&E images, 1 masks
  📁 ROI017_PROSTATE_TMA008: 1 H&E images, 1 masks
  📁 ROI018_PROSTATE_TMA009: 1 H&E images, 1 masks
  📁 ROI019_PROSTATE_Benign_TMA008N: 1 H&E images, 1 masks
  📁 ROI020_PROSTATE_Benign_TMA011N: 1 H&E images, 1 masks
  📁 ROI021_PROSTATE_TMA012: 1 H&E images, 1 masks
  📁 ROI022_PROSTATE_TMA013: 1 H&E images, 1 masks
  📁 ROI023_PROSTATE_TMA015: 1 H&E images, 1 masks
  📁 ROI024_PROSTATE_TMA016: 1 H&E images, 1 masks
  📁 ROI025_PROSTATE_TMA017: 1 H&E images, 1 masks
  📁 ROI026_PROSTATE_TMA018: 1 H&E images, 1 masks
  📁 ROI027_PROSTATE_TMA019: 1 H&E images, 1 masks
  📁 ROI028_PROSTA

Unnamed: 0,roi_folder,he_image_path,mask_path,tissue_type,cancer_status,file_size_mb
0,ROI009_LIVER_B12,data/external/data/ROI009_LIVER_B12/input/he/R...,,liver,liver,21.594986
1,ROI010_PROSTATE_TMA001,data/external/data/ROI010_PROSTATE_TMA001/inpu...,data/external/data/ROI010_PROSTATE_TMA001/inpu...,prostate,malignant,22.366226
2,ROI011_PROSTATE_TMA002,data/external/data/ROI011_PROSTATE_TMA002/inpu...,data/external/data/ROI011_PROSTATE_TMA002/inpu...,prostate,malignant,21.028091
3,ROI012_PROSTATE_TMA003,data/external/data/ROI012_PROSTATE_TMA003/inpu...,data/external/data/ROI012_PROSTATE_TMA003/inpu...,prostate,malignant,20.263443
4,ROI013_PROSTATE_TMA004,data/external/data/ROI013_PROSTATE_TMA004/inpu...,data/external/data/ROI013_PROSTATE_TMA004/inpu...,prostate,malignant,19.11647
