In [None]:
!pip uninstall torch torchvision torchaudio -y

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu

In [None]:
import os
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
from PIL import Image
import matplotlib.pyplot as plt

# Fixed seed so the train/validation split (NumPy) and the random samplers / weight
# initialisation (torch) are the same on every "Restart & Run All".
SEED = 42
torch.manual_seed(SEED)

# Paths to your dataset
train_dir = r"/Users/julietteliao/Desktop/face-detector/deep/data/train_images"    # folder containing training images
test_dir  = r"/Users/julietteliao/Desktop/face-detector/deep/data/test_images"     # folder containing test images

# Preprocessing applied to every image: gray-scale -> fixed size -> tensor -> normalisation
transform = transforms.Compose([
    transforms.Grayscale(),                           # 1 input channel
    transforms.Resize((92, 112)),                     # (height, width) expected by the network's first linear layer
    transforms.ToTensor(),                            # PIL image -> float tensor in [0, 1]
    transforms.Normalize(mean=(0.5,), std=(0.5,)),    # (x - 0.5) / 0.5 -> values in [-1, +1]
])

# Two ImageFolder datasets (train/test); class labels come from the sub-folder names
train_data = torchvision.datasets.ImageFolder(train_dir, transform=transform)
test_data  = torchvision.datasets.ImageFolder(test_dir, transform=transform)

valid_size = 0.2     # proportion of the training folder held out for validation
batch_size = 32
subset_size = None   # set to e.g. 500 to train on a small subset for a quick test

# Shuffle the example indices with a seeded generator, then cut them into validation / training parts
num_train = len(train_data)
indices_train = list(range(num_train))
np.random.default_rng(SEED).shuffle(indices_train)
split_tv = int(np.floor(valid_size * num_train))
train_new_idx, valid_idx = indices_train[split_tv:], indices_train[:split_tv]

if subset_size is not None:
    train_new_idx = train_new_idx[:subset_size]
    valid_idx = valid_idx[:int(subset_size * 0.2)]  # keep 20% for validation

# Samplers that randomly pick examples from the training and validation index lists
train_sampler = SubsetRandomSampler(train_new_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

# Dataloaders (load the data from disk, batch by batch). The validation loader reads the
# same ImageFolder as the training loader; only the sampled indices differ.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=train_sampler, num_workers=1)
valid_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, sampler=valid_sampler, num_workers=1)
test_loader  = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=1)

classes = ('noface','face')  # indicates that "1" means "face" and "0" non-face (only used for display)

# Quick check: see number of examples
print(f"Training samples: {len(train_new_idx)}")
print(f"Validation samples: {len(valid_idx)}")
print(f"Test samples: {len(test_data)}")
print(f"Train + validation samples (total): {len(train_data)}")


# Visualize a few images from the training dataset
def show_images(loader, num_images=8):
    """Display the first images of one batch drawn from ``loader`` in a single row.

    Args:
        loader: iterable yielding ``(images, labels)`` batches; each image is
            squeezed before plotting, so single-channel images are expected.
        num_images: maximum number of images to show. Fewer are shown when the
            batch is smaller than this.

    The title of each panel is looked up in the module-level ``classes`` tuple.
    """
    images, labels = next(iter(loader))

    # Never index past the end of the batch.
    count = min(num_images, len(images))
    if count <= 0:
        return

    # squeeze=False keeps `axes` two-dimensional even when count == 1.
    fig, axes = plt.subplots(1, count, figsize=(15, 3), squeeze=False)
    for ax, img, label in zip(axes[0], images[:count], labels[:count]):
        ax.imshow(img.squeeze().numpy(), cmap='gray')  # remove channel dimension
        ax.set_title(classes[int(label)])
        ax.axis('off')
    plt.show()

show_images(train_loader)


In [None]:
!pip install matplotlib
!pip install tqdm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Small convolutional classifier producing two logits per image.

    Two 3x3 convolution + ReLU + 2x2 max-pool stages are followed by two
    fully connected layers. The first linear layer expects 32 * 23 * 28
    features, i.e. a 1-channel input of height 92 and width 112 (each
    pooling stage halves both dimensions).
    """

    def __init__(self):
        super().__init__()
        # Attribute names double as state_dict keys; keep them stable for saved checkpoints.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(in_features=32 * 23 * 28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=2)

    def forward(self, x):
        """Return raw class scores (no softmax) of shape ``(batch, 2)``."""
        for conv in (self.conv1, self.conv2):
            x = self.pool(F.relu(conv(x)))
        flattened = x.view(x.size(0), -1)  # one feature vector per example
        hidden = F.relu(self.fc1(flattened))
        return self.fc2(hidden)

In [None]:
# Loss function and optimizer
import torch.optim as optim

# Fresh, untrained network (weights are randomly initialised here)
model = SimpleCNN()
# Cross-entropy is applied to the raw scores returned by SimpleCNN.forward
criterion = nn.CrossEntropyLoss()  # for 2-class classification
# Adam over all model parameters with a learning rate of 0.001
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [None]:
from tqdm import tqdm
import torch

N_EPOCHS = 5
LEARNING_RATE = 0.001
SAVE_BEST = True          # True: keep only the best-validation checkpoint; False: overwrite every epoch
MODEL_PATH = "model_best.pth"

# The optimizer was created with a literal learning rate; push LEARNING_RATE into it so
# that editing the constant above really changes training.
for param_group in optimizer.param_groups:
    param_group["lr"] = LEARNING_RATE

best_accuracy = 0.0

for epoch in range(N_EPOCHS):
    # ---- training pass ----
    model.train()
    running_loss = 0.0

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{N_EPOCHS}"):
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean of the per-batch losses; max(..., 1) avoids dividing by zero on an empty loader.
    avg_train_loss = running_loss / max(len(train_loader), 1)

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    valid_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in valid_loader:
            outputs = model(images)
            loss = criterion(outputs, labels)
            valid_loss += loss.item()
            _, predictions = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predictions == labels).sum().item()

    avg_valid_loss = valid_loss / max(len(valid_loader), 1)
    accuracy = correct / total * 100 if total > 0 else 0.0

    print(f"\nEpoch {epoch+1}/{N_EPOCHS}")
    print(f"  Train Loss: {avg_train_loss:.4f}")
    print(f"  Valid Loss: {avg_valid_loss:.4f}")
    print(f"  Valid Accuracy: {accuracy:.2f}%")

    # The first epoch always counts as "best" so a checkpoint exists even when the
    # accuracy never rises above 0; the best accuracy is tracked in both save modes.
    is_best = epoch == 0 or accuracy > best_accuracy
    best_accuracy = max(best_accuracy, accuracy)

    if SAVE_BEST:
        if is_best:
            torch.save(model.state_dict(), MODEL_PATH)
            print(f"  New best model saved! (Acc: {accuracy:.2f}%)")
    else:
        torch.save(model.state_dict(), MODEL_PATH)

print(f"\nTraining complete! Best accuracy: {best_accuracy:.2f}%")


In [None]:
# Restore the weights saved by the training loop (same file name as its MODEL_PATH)
# into the existing `model` instance.
model.load_state_dict(torch.load("model_best.pth"))

In [None]:
import torch
from tqdm import tqdm

# Classification accuracy of the current model over everything `test_loader` yields.
model.eval()
correct, total = 0, 0

with torch.no_grad():  # inference only, no gradients needed
    for images, labels in tqdm(test_loader, desc="Testing"):
        outputs = model(images)
        predictions = torch.max(outputs, 1)[1]   # index of the highest score per example
        total += labels.size(0)
        correct += (predictions == labels).sum().item()

accuracy = correct / total * 100
print(f"Test Accuracy: {accuracy:.2f}%")


In [None]:
import torch
import torch.nn.functional as F
from torchvision import transforms, ops
from PIL import Image
import numpy as np

class DetectionConfig:
    """Parameters of the sliding-window / image-pyramid face detector."""

    # Stride, in pixels, between consecutive window positions (both axes).
    STEP_SIZE = 40
    # Window as (width, height); sliding_window unpacks it in that order.
    WINDOW_SIZE = (112, 92)
    # Minimum "face" probability for a window to be kept as a detection.
    SCORE_THRESHOLD = 0.8
    # IoU threshold handed to torchvision's NMS in detect_faces.
    NMS_THRESHOLD = 0.15
    # Each pyramid level divides the previous level's size by this factor.
    PYRAMID_SCALE = 1.2
    # Smallest (width, height) a pyramid level may have.
    MIN_PYRAMID_SIZE = (112, 92)
    # Device used for patch tensors and for the NMS inputs.
    DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # Per-patch preprocessing; note Resize takes (height, width).
    TRANSFORM = transforms.Compose([
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize((92, 112)),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5,), std=(0.5,)),
    ])

def sliding_window(image, model, config):
    """Score every window position of ``image`` and return those classified as faces.

    Args:
        image: PIL-style image exposing ``size`` as ``(width, height)`` and ``crop(box)``.
        model: callable mapping a ``(1, C, H, W)`` tensor to scores of shape ``(1, 2)``;
            column 1 is treated as the "face" class.
        config: object providing ``WINDOW_SIZE`` as ``(width, height)``, ``STEP_SIZE``,
            ``SCORE_THRESHOLD``, ``TRANSFORM`` and ``DEVICE``.

    Returns:
        List of ``(x1, y1, x2, y2, score)`` tuples in the coordinates of ``image``,
        one per window whose face probability is at least ``SCORE_THRESHOLD``.
        Empty when the image is smaller than the window.
    """
    window_width, window_height = config.WINDOW_SIZE
    # PIL reports (width, height); no need to copy the pixels into NumPy just for the size.
    image_width, image_height = image.size

    # Run each patch on the device the model actually lives on. Falling back to
    # config.DEVICE covers plain callables and parameter-free modules.
    try:
        device = next(model.parameters()).device
    except (AttributeError, StopIteration):
        device = config.DEVICE

    detections = []
    with torch.no_grad():
        for y in range(0, image_height - window_height + 1, config.STEP_SIZE):
            for x in range(0, image_width - window_width + 1, config.STEP_SIZE):
                box = (x, y, x + window_width, y + window_height)
                patch_tensor = config.TRANSFORM(image.crop(box)).unsqueeze(0).to(device)
                score = torch.softmax(model(patch_tensor), dim=1)[0, 1].item()
                if score >= config.SCORE_THRESHOLD:
                    detections.append((*box, score))
    return detections

def image_pyramid(image, config):
    """Build a list of progressively smaller copies of ``image``.

    Args:
        image: PIL-style image exposing ``size`` as ``(width, height)`` and ``resize``.
        config: object providing ``PYRAMID_SCALE`` (shrink factor per level, must be
            greater than 1) and ``MIN_PYRAMID_SIZE`` as ``(width, height)``.

    Returns:
        ``[image, level1, level2, ...]`` where every level is the previous one
        divided by ``PYRAMID_SCALE``; generation stops before a level would be
        narrower or shorter than ``MIN_PYRAMID_SIZE``. The original image is
        always the first element.

    Raises:
        ValueError: if ``PYRAMID_SCALE`` is not greater than 1 (levels would never
            shrink, so the loop could not terminate).
    """
    if config.PYRAMID_SCALE <= 1:
        raise ValueError(
            f"PYRAMID_SCALE must be greater than 1, got {config.PYRAMID_SCALE}"
        )

    min_width, min_height = config.MIN_PYRAMID_SIZE
    pyramid = [image]
    current = image
    while True:
        w = int(current.size[0] / config.PYRAMID_SCALE)
        h = int(current.size[1] / config.PYRAMID_SCALE)
        if w < min_width or h < min_height:
            break
        # Each level is resized from the previous level, not from the original.
        current = current.resize((w, h), Image.BILINEAR)
        pyramid.append(current)
    return pyramid

def detect_faces(image, model, config=None, verbose=True):
    """Detect faces in ``image`` with a multi-scale sliding window followed by NMS.

    Args:
        image: PIL image; converted to mode ``'L'`` when it is not already gray-scale.
        model: classifier passed on to ``sliding_window``; switched to eval mode here.
        config: detection parameters; a fresh ``DetectionConfig`` is used when ``None``.
        verbose: accepted for interface compatibility; not read by this function.

    Returns:
        List of ``(x1, y1, x2, y2, score)`` tuples in the coordinates of the input
        image, after non-maximum suppression. Empty when nothing passes the
        score threshold.
    """
    config = DetectionConfig() if config is None else config
    model.eval()
    gray = image if image.mode == 'L' else image.convert('L')

    # Collect detections from every pyramid level, mapped back to full-resolution coordinates.
    candidates = []
    for level in image_pyramid(gray, config):
        factor_x = gray.width / level.width
        factor_y = gray.height / level.height
        candidates.extend(
            (int(x1 * factor_x), int(y1 * factor_y), int(x2 * factor_x), int(y2 * factor_y), score)
            for x1, y1, x2, y2, score in sliding_window(level, model, config)
        )

    if not candidates:
        return []

    boxes = torch.tensor([cand[:4] for cand in candidates],
                         dtype=torch.float, device=config.DEVICE)
    scores = torch.tensor([cand[4] for cand in candidates],
                          dtype=torch.float, device=config.DEVICE)
    keep = ops.nms(boxes, scores, iou_threshold=config.NMS_THRESHOLD)

    kept_boxes = boxes[keep].tolist()
    kept_scores = scores[keep].tolist()
    return [(int(x1), int(y1), int(x2), int(y2), score)
            for (x1, y1, x2, y2), score in zip(kept_boxes, kept_scores)]

In [None]:
import os
from PIL import Image
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms

# Locations of the detector evaluation data: FDDB images, their label file, and the
# folder also used as the classifier's test set.
fddb_root = r"/Users/julietteliao/Desktop/face-detector/deep/data/detector_test_images/Dataset_FDDB/Dataset_FDDB/images"
fddb_labels = r"/Users/julietteliao/Desktop/face-detector/deep/data/detector_test_images/Dataset_FDDB/Dataset_FDDB/label.txt"
nonface_root = r"/Users/julietteliao/Desktop/face-detector/deep/data/test_images"

# NOTE: this rebinds the name `transform` used for the classifier datasets above.
# Full images are only converted to gray-scale tensors here (no resize, no normalisation).
transform = transforms.Compose([
    transforms.Grayscale(),
    transforms.ToTensor(),
])

def parse_fddb_labels(label_file):
    """Parse a label file of ``#``-prefixed image paths followed by box lines.

    Expected layout::

        # relative/path/to/image.jpg
        x1 y1 x2 y2
        x1 y1 x2 y2
        # next/image.jpg
        ...

    Args:
        label_file: path of the text file to read.

    Returns:
        Dict mapping each image path to a list of ``(x1, y1, x2, y2)`` integer
        tuples. Images without any valid box line are omitted; when a path
        appears more than once, the last entry with boxes wins.

    Raises:
        ValueError: if one of the first four tokens of a box line is not numeric.

    Header lines are recognised after stripping surrounding whitespace, with or
    without a space after ``#``. Box lines with fewer than four tokens are
    skipped; extra tokens are ignored. Coordinates written as decimals are
    rounded to the nearest integer.
    """
    gt_boxes = {}
    img_path = None
    boxes = []

    with open(label_file, 'r') as f:
        for raw_line in f:
            line = raw_line.strip()
            if line.startswith('#'):
                # A new header closes the previous image's box list.
                if img_path is not None and boxes:
                    gt_boxes[img_path] = boxes
                img_path = line[1:].strip()
                boxes = []
            elif img_path is not None:
                coords = line.split()
                if len(coords) >= 4:
                    boxes.append(tuple(int(round(float(tok))) for tok in coords[:4]))

    # Flush the last image of the file.
    if img_path is not None and boxes:
        gt_boxes[img_path] = boxes
    return gt_boxes

print("Loading ground truth bounding boxes...")
ground_truth = parse_fddb_labels(fddb_labels)
print(f"Loaded {len(ground_truth)} images with ground truth boxes")

class FaceDetectionDataset(Dataset):
    """Images under ``face_root/<year>/<month>/<day>/big/`` that have ground-truth boxes.

    Every sample carries label ``1``. ``nonface_root`` is accepted for interface
    compatibility but is not read, so no label-0 samples are produced.
    """

    # File extensions (lower-case) treated as images.
    IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.pgm')

    def __init__(self, face_root, nonface_root, ground_truth, transform=None):
        """Index the image files whose relative path is a key of ``ground_truth``.

        Args:
            face_root: root folder laid out as ``year/month/day/big/<image>``.
            nonface_root: unused.
            ground_truth: dict mapping ``"year/month/day/big/<fname>"`` to a list of boxes.
            transform: optional callable applied to the gray-scale PIL image.
        """
        self.transform = transform
        self.samples = []
        self.ground_truth = ground_truth

        # Directory listings are sorted so the sample order is the same on every run.
        for year, year_path in self._subdirs(face_root):
            for month, month_path in self._subdirs(year_path):
                for day, day_path in self._subdirs(month_path):
                    big_path = os.path.join(day_path, "big")
                    if not os.path.isdir(big_path):
                        continue
                    for fname in sorted(os.listdir(big_path)):
                        if not fname.lower().endswith(self.IMAGE_EXTENSIONS):
                            continue
                        rel_path = f"{year}/{month}/{day}/big/{fname}"
                        if rel_path in self.ground_truth:
                            self.samples.append((os.path.join(big_path, fname), 1, rel_path))

    @staticmethod
    def _subdirs(parent):
        """Return ``(name, path)`` for each sub-directory of ``parent``, sorted by name."""
        entries = []
        for name in sorted(os.listdir(parent)):
            path = os.path.join(parent, name)
            if os.path.isdir(path):
                entries.append((name, path))
        return entries

    def __len__(self):
        """Number of indexed images."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(image, label, gt_boxes, fpath)`` for sample ``idx``.

        ``image`` is the gray-scale image (transformed when a transform was given),
        ``gt_boxes`` is the list stored in ``ground_truth`` for this image, or an
        empty list when there is none.
        """
        fpath, label, rel_path = self.samples[idx]
        # Close the file handle as soon as the pixel data has been converted.
        with Image.open(fpath) as opened:
            image = opened.convert("L")

        gt_boxes = []
        if rel_path and rel_path in self.ground_truth:
            gt_boxes = self.ground_truth[rel_path]

        if self.transform:
            image = self.transform(image)

        return image, label, gt_boxes, fpath

# NOTE: `test_loader` is rebound here; from this point on it yields FDDB detection
# samples (image, label, gt_boxes, fpath) instead of classifier test batches.
test_dataset = FaceDetectionDataset(fddb_root, nonface_root, ground_truth, transform=transform)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=0)

print(f"Total test samples: {len(test_dataset)}")
# Count only samples actually labelled as faces, so the non-face figure is meaningful.
faces = sum(1 for _, label, _ in test_dataset.samples if label == 1)
nonfaces = len(test_dataset) - faces
print(f"Face images: {faces}, Non-face images: {nonfaces}")

In [None]:
def compute_iou(box1, box2):
    """Intersection-over-union of two axis-aligned boxes.

    Args:
        box1, box2: ``(x1, y1, x2, y2)`` sequences of exactly four numbers.

    Returns:
        ``intersection / union`` as a float, or ``0.0`` when the boxes do not
        overlap or the union area is not positive.
    """
    ax1, ay1, ax2, ay2 = box1
    bx1, by1, bx2, by2 = box2

    # Extent of the overlap along each axis; negative means the boxes are apart.
    overlap_w = min(ax2, bx2) - max(ax1, bx1)
    overlap_h = min(ay2, by2) - max(ay1, by1)
    if overlap_w < 0 or overlap_h < 0:
        return 0.0

    intersection = overlap_w * overlap_h
    area_a = (ax2 - ax1) * (ay2 - ay1)
    area_b = (bx2 - bx1) * (by2 - by1)
    union = area_a + area_b - intersection

    if union > 0:
        return intersection / union
    return 0.0


def match_detections_to_gt(detections, gt_boxes, iou_threshold=0.3):
    """Match detections to ground-truth boxes one-to-one and count TP / FP / FN.

    Args:
        detections: sequence of ``(x1, y1, x2, y2[, score])`` tuples.
        gt_boxes: sequence of ``(x1, y1, x2, y2)`` ground-truth boxes.
        iou_threshold: minimum IoU for a detection to match a ground-truth box.

    Returns:
        ``(true_positives, false_positives, false_negatives)``.

    Detections are processed from highest to lowest score. Each one is assigned
    to the not-yet-matched ground-truth box with the highest IoU at or above the
    threshold. A ground-truth box can be matched once only, so duplicate
    detections of the same face count as false positives. Consequently
    ``TP + FP == len(detections)`` and ``TP + FN == len(gt_boxes)``.
    """
    if len(gt_boxes) == 0:
        return 0, len(detections), 0
    if len(detections) == 0:
        return 0, 0, len(gt_boxes)

    gt_matched = [False] * len(gt_boxes)
    true_positives = 0
    false_positives = 0

    # Most confident detections get first pick; entries without a score keep their order.
    ranked = sorted(detections,
                    key=lambda det: det[4] if len(det) > 4 else 0.0,
                    reverse=True)

    for det in ranked:
        det_box = (det[0], det[1], det[2], det[3])
        best_idx = -1
        best_iou = -1.0
        for gt_idx, gt_box in enumerate(gt_boxes):
            if gt_matched[gt_idx]:
                continue
            iou = compute_iou(det_box, gt_box)
            if iou >= iou_threshold and iou > best_iou:
                best_idx, best_iou = gt_idx, iou

        if best_idx >= 0:
            gt_matched[best_idx] = True
            true_positives += 1
        else:
            false_positives += 1

    false_negatives = gt_matched.count(False)
    return true_positives, false_positives, false_negatives

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from tqdm import tqdm

num_images = 3000     # upper bound on the number of images evaluated (and plotted)
iou_threshold = 0.3   # IoU needed for a detection to count as matching a ground-truth box

total_tp = 0
total_fp = 0
total_fn = 0
total_nonface_correct = 0
total_nonface_wrong = 0
processed = 0

to_pil = transforms.ToPILImage()

for images, labels, gt_boxes_list, fpaths in tqdm(test_loader, desc="Evaluating"):
    if processed >= num_images:
        break

    image_tensor = images[0]
    label = labels[0].item()

    # With batch_size=1 the default collate function turns the dataset's list of
    # (x1, y1, x2, y2) tuples into one entry PER BOX, each holding four 1-element
    # tensors. Decode every entry; reading only gt_boxes_list[0] would keep just
    # the first face of the image.
    gt_boxes = [tuple(int(coord) for coord in box) for box in gt_boxes_list]

    fpath = fpaths[0]
    image_pil = to_pil(image_tensor.squeeze())
    detections = detect_faces(image_pil, model)

    if label == 1:
        tp, fp, fn = match_detections_to_gt(detections, gt_boxes, iou_threshold)
        total_tp += tp
        total_fp += fp
        total_fn += fn
    else:
        # Image-level bookkeeping for images without faces: any detection is a mistake.
        if len(detections) == 0:
            total_nonface_correct += 1
        else:
            total_nonface_wrong += 1

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: ground-truth boxes.
    axes[0].imshow(image_pil, cmap='gray')
    axes[0].set_title("Ground Truth")
    axes[0].axis('off')
    for x1, y1, x2, y2 in gt_boxes:
        axes[0].add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                            linewidth=2, edgecolor='green', facecolor='none'))

    # Right panel: detections, green when they overlap some ground-truth box, red otherwise.
    axes[1].imshow(image_pil, cmap='gray')
    axes[1].set_title(f"Detections: {len(detections)}")
    axes[1].axis('off')
    for x1, y1, x2, y2, score in detections:
        overlaps_gt = label == 1 and any(
            compute_iou((x1, y1, x2, y2), gt_box) >= iou_threshold for gt_box in gt_boxes
        )
        axes[1].add_patch(patches.Rectangle((x1, y1), x2 - x1, y2 - y1,
                                            linewidth=2,
                                            edgecolor='green' if overlaps_gt else 'red',
                                            facecolor='none'))
        axes[1].text(x1, y1 - 5, f"{score:.2f}", color='yellow', fontsize=8)

    plt.tight_layout()
    plt.show()
    plt.close(fig)  # release the figure; thousands may be created in this loop

    processed += 1

# Precision / recall are expressed in percent, so the F1 value below is in percent too.
if (total_tp + total_fp) > 0:
    precision = total_tp / (total_tp + total_fp) * 100
else:
    precision = 0

if (total_tp + total_fn) > 0:
    recall = total_tp / (total_tp + total_fn) * 100
else:
    recall = 0

if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0

banner = "=" * 60
print("\n" + banner)
print("DETECTION EVALUATION RESULTS")
print(banner)
for caption, count in (("True Positives", total_tp),
                       ("False Positives", total_fp),
                       ("False Negatives", total_fn)):
    print(f"{caption}: {count}")
for caption, percent in (("Precision", precision),
                         ("Recall", recall),
                         ("F1-Score", f1_score)):
    print(f"{caption}: {percent:.2f}%")
print("\nNon-face images:")
print(f"  Correct: {total_nonface_correct}")
print(f"  Wrong: {total_nonface_wrong}")
print(banner)