In [13]:
from PIL import Image
import numpy as np

def load_image(path):
    img = Image.open(path).convert("RGB").resize((96, 96))
    return np.array(img).flatten() / 255.0  # Normalize

In [16]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from sklearn.decomposition import IncrementalPCA
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Config
BASE_DIR = r"C:\Users\iveyc\Desktop\Week3Kaggle"
TRAIN_DIR = os.path.join(BASE_DIR, "train")
LABEL_CSV = os.path.join(BASE_DIR, "train_labels.csv")
BATCH_SIZE = 1000
N_COMPONENTS = 150

# Image loader
def load_image(path):
    img = Image.open(path).convert("RGB").resize((96, 96))
    return np.array(img).flatten() / 255.0

# Load labels
df = pd.read_csv(LABEL_CSV)
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# Fit PCA incrementally
pca = IncrementalPCA(n_components=N_COMPONENTS)
for i in range(0, len(train_df), BATCH_SIZE):
    batch = train_df.iloc[i:i+BATCH_SIZE]
    X_batch = [load_image(os.path.join(TRAIN_DIR, row['id'] + ".tif")) for _, row in batch.iterrows()]
    pca.partial_fit(X_batch)

# Train classifier incrementally
clf = SGDClassifier(loss="log_loss", max_iter=1000)
for i in range(0, len(train_df), BATCH_SIZE):
    batch = train_df.iloc[i:i+BATCH_SIZE]
    X_batch = [load_image(os.path.join(TRAIN_DIR, row['id'] + ".tif")) for _, row in batch.iterrows()]
    X_pca = pca.transform(X_batch)
    y_batch = batch['label'].values
    clf.partial_fit(X_pca, y_batch, classes=[0, 1])

# Evaluate
X_val = [load_image(os.path.join(TRAIN_DIR, row['id'] + ".tif")) for _, row in val_df.iterrows()]
X_val_pca = pca.transform(X_val)
y_val = val_df['label'].values
preds = clf.predict_proba(X_val_pca)[:, 1]
auc = roc_auc_score(y_val, preds)
print(f"AUC: {auc:.4f}")

AUC: 0.7006
