In [3]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import glob

In [4]:
# --- CONFIGURATION ---
# All dataset locations hang off one project data folder; the three public
# constants below are built from it so the prefix is written only once.
_DATA_ROOT = r"E:\NLP\Assignment\1&2\Esophageal_Cancer_Project\Data"

# CSV holding the clinical records.
CLINICAL_PATH = _DATA_ROOT + r"\Clinical\Esophageal_Dataset.csv"
# Folder containing the endoscopy image sub-folders.
IMAGE_ROOT = _DATA_ROOT + r"\Imaging\Endoscopy-esophagus"
# Base folder for generated files.
OUTPUT_DIR = _DATA_ROOT

In [6]:
# --- 1. LOAD AND CLEAN CLINICAL DATA ---
# Reads the clinical CSV, removes identifier columns, label-encodes the
# outcome column into `target`, and imputes the remaining missing values.
print("Loading Clinical Data...")
df_clinical = pd.read_csv(CLINICAL_PATH)

# Identifier / bookkeeping columns that are dropped before modelling.
cols_to_drop = [
    'Unnamed: 0', 'patient_id', 'bcr_patient_uuid',
    'patient_barcode', 'tissue_source_site', 'icd_10',
    'bcr_patient_barcode', 'identifier'
]
# errors='ignore' already skips names that are absent, so no pre-filtering is needed.
df_clinical = df_clinical.drop(columns=cols_to_drop, errors='ignore')

# Rows without a label cannot be used for supervised learning.
target_col = 'person_neoplasm_cancer_status'
df_clinical = df_clinical.dropna(subset=[target_col])

# Encode the label strings as integers in a new `target` column.
le = LabelEncoder()
df_clinical['target'] = le.fit_transform(df_clinical[target_col])
print(f"Clinical Data Loaded. Classes: {le.classes_}")

# NOTE(review): `target_col` itself is still present in the frame after this
# cell, so the raw label travels alongside `target` into the saved CSVs.
# Downstream code must exclude it from the feature set - confirm it does.

# --- MISSING VALUE HANDLING ---
# Remove columns that are entirely NaN. After this every remaining column has
# at least one observed value, so a median / mode is always defined below.
df_clinical = df_clinical.dropna(axis=1, how='all')

# NOTE(review): fill values are computed on the full dataset, i.e. before the
# train/val/test split performed later - consider fitting them on train only.
for col in df_clinical.columns:
    # The label and the (later added) image path are never imputed.
    if col in ['target', 'image_path']:
        continue

    column = df_clinical[col]
    if not column.isna().any():
        continue  # nothing to fill

    # Numeric columns get the median; everything else (object / string /
    # categorical) gets the most frequent value. Testing for "numeric" rather
    # than for dtype == 'object' keeps text columns out of the median branch
    # regardless of which string dtype pandas inferred.
    if pd.api.types.is_numeric_dtype(column):
        fill_value = column.median()
    else:
        fill_value = column.mode()[0]
    df_clinical[col] = column.fillna(fill_value)

print(f"Cleaned Clinical Data Shape: {df_clinical.shape}")

Loading Clinical Data...
Clinical Data Loaded. Classes: ['TUMOR FREE' 'WITH TUMOR']
Cleaned Clinical Data Shape: (3650, 74)


In [7]:
# --- 2. GATHER IMAGE PATHS ---
# Collects the .jpg files of the two image folders under IMAGE_ROOT.
print("\nGathering Image Paths...")
esophagus_dir = os.path.join(IMAGE_ROOT, 'esophagus')
no_esophagus_dir = os.path.join(IMAGE_ROOT, 'no-esophagus')

# NOTE(review): the folders are named 'esophagus' / 'no-esophagus' but are
# used as the "Cancer" / "Normal" image pools below - confirm this mapping.

# Get all .jpg images. glob returns matches in arbitrary (filesystem) order,
# so the lists are sorted: the seeded train/val/test split performed later
# only yields the same partition on every machine if the input order is stable.
cancer_images = sorted(glob.glob(os.path.join(esophagus_dir, "*.jpg")))
normal_images = sorted(glob.glob(os.path.join(no_esophagus_dir, "*.jpg")))

print(f"Found {len(cancer_images)} Cancer images")
print(f"Found {len(normal_images)} Normal images")


Gathering Image Paths...
Found 1689 Cancer images
Found 8973 Normal images


In [8]:
# --- 3. SPLIT DATA (CRITICAL STEP) ---
# Clinical rows and image paths are partitioned independently, each into
# 70% train / 15% validation / 15% test, so that no image and no clinical
# row is shared between partitions.
#
# Every partition is produced by two consecutive cuts with the same seed:
#   1. hold out 30% of the data,
#   2. halve that hold-out into validation and test.
_HOLDOUT_CUT = dict(test_size=0.3, random_state=42)
_VAL_TEST_CUT = dict(test_size=0.5, random_state=42)

# Clinical records: stratified on the encoded label at both cuts.
clin_train, clin_temp = train_test_split(
    df_clinical, stratify=df_clinical['target'], **_HOLDOUT_CUT
)
clin_val, clin_test = train_test_split(
    clin_temp, stratify=clin_temp['target'], **_VAL_TEST_CUT
)

# Images from the cancer pool.
img_can_train, img_can_temp = train_test_split(cancer_images, **_HOLDOUT_CUT)
img_can_val, img_can_test = train_test_split(img_can_temp, **_VAL_TEST_CUT)

# Images from the normal pool.
img_norm_train, img_norm_temp = train_test_split(normal_images, **_HOLDOUT_CUT)
img_norm_val, img_norm_test = train_test_split(img_norm_temp, **_VAL_TEST_CUT)

In [9]:
# --- 4. PAIRING FUNCTION ---
def pair_data(clinical_df, cancer_imgs, normal_imgs, random_state=None):
    """
    Attach a randomly chosen image path to every clinical row.

    Rows whose ``target`` equals 1 receive a path drawn from ``cancer_imgs``;
    all other rows receive a path drawn from ``normal_imgs``. Paths are drawn
    uniformly *with replacement*, so the same image may be assigned to several
    rows even when the pool is larger than the number of rows.

    Parameters
    ----------
    clinical_df : pandas.DataFrame
        Must contain a ``target`` column. The frame is not modified.
    cancer_imgs, normal_imgs : sequence of str
        Image path pools for ``target == 1`` rows and for all other rows.
    random_state : int or None, default None
        Seed for a private ``numpy.random.RandomState``. With ``None`` the
        global NumPy RNG is used, so ``np.random.seed`` is still honoured.

    Returns
    -------
    pandas.DataFrame
        Copy of ``clinical_df`` with an added ``image_path`` column.

    Raises
    ------
    ValueError
        If a pool is empty while at least one row needs an image from it.
    """
    paired = clinical_df.copy()
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Boolean mask of the rows that need a cancer image; everything else
    # (including missing labels) is treated as "normal".
    is_cancer = paired['target'].eq(1).fillna(False).to_numpy(dtype=bool)
    n_cancer = int(is_cancer.sum())
    n_normal = len(paired) - n_cancer

    if n_cancer and len(cancer_imgs) == 0:
        raise ValueError("cancer_imgs is empty but clinical_df contains target == 1 rows")
    if n_normal and len(normal_imgs) == 0:
        raise ValueError("normal_imgs is empty but clinical_df contains target != 1 rows")

    # One vectorised draw per class instead of one draw per row. Object arrays
    # keep the sampled values as plain Python strings.
    image_paths = np.empty(len(paired), dtype=object)
    if n_cancer:
        cancer_pool = np.asarray(cancer_imgs, dtype=object)
        image_paths[is_cancer] = rng.choice(cancer_pool, size=n_cancer)
    if n_normal:
        normal_pool = np.asarray(normal_imgs, dtype=object)
        image_paths[~is_cancer] = rng.choice(normal_pool, size=n_normal)

    paired['image_path'] = image_paths
    return paired

print("\nPairing Data...")
# pair_data draws image paths from NumPy's global random generator. Seed it
# (same seed as the splits) so the image assignment, and therefore the saved
# CSVs, are identical on every Restart & Run All.
np.random.seed(42)

# Each clinical partition is paired only with images from the matching image
# partition, so no image is shared between train, validation and test.
# Pair Training Data
train_df = pair_data(clin_train, img_can_train, img_norm_train)
# Pair Validation Data
val_df = pair_data(clin_val, img_can_val, img_norm_val)
# Pair Test Data
test_df = pair_data(clin_test, img_can_test, img_norm_test)

# Sanity checks: pairing must neither add nor drop rows, and every row must
# have received an image path.
assert len(train_df) == len(clin_train)
assert len(val_df) == len(clin_val)
assert len(test_df) == len(clin_test)
assert train_df['image_path'].notna().all()
assert val_df['image_path'].notna().all()
assert test_df['image_path'].notna().all()


Pairing Data...


In [10]:
# --- 5. SAVE FILES (UPDATED DIRECTORY) ---
# Writes the three paired partitions as CSV files into the Multimodal folder
# and reports how many rows each one holds.
# NOTE(review): this re-binds OUTPUT_DIR, replacing the value assigned in the
# configuration cell for everything executed after this point.
OUTPUT_DIR = r"E:\NLP\Assignment\1&2\Esophageal_Cancer_Project\Data\Multimodal"

# Make sure the destination exists; an already existing folder is fine.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"\nSaving CSVs to: {OUTPUT_DIR}...")

# (file name, frame) pairs, written in train -> val -> test order without the index.
_partitions = (
    ("train_data.csv", train_df),
    ("val_data.csv", val_df),
    ("test_data.csv", test_df),
)
for _file_name, _frame in _partitions:
    _frame.to_csv(os.path.join(OUTPUT_DIR, _file_name), index=False)

print("✅ Data Preparation Complete. Files saved.")
print(f"Train Size: {len(train_df)}")
print(f"Val Size: {len(val_df)}")
print(f"Test Size: {len(test_df)}")


Saving CSVs to: E:\NLP\Assignment\1&2\Esophageal_Cancer_Project\Data\Multimodal...
✅ Data Preparation Complete. Files saved.
Train Size: 2555
Val Size: 547
Test Size: 548
