In [94]:
import pandas as pd
from pathlib import Path
import numpy as np


In [95]:
DATA_ROOT = Path(r"C:\Users\Charm\Downloads\images") 

In [96]:
#we need to confirm data presence, even though we know it exists

list(DATA_ROOT.iterdir())


[WindowsPath('C:/Users/Charm/Downloads/images/testing'),
 WindowsPath('C:/Users/Charm/Downloads/images/training')]

In [97]:
#we need to comfirm whats containing inside training
#the folders define the labels that will be needed 
list((DATA_ROOT / "training").iterdir())


[WindowsPath('C:/Users/Charm/Downloads/images/training/flip'),
 WindowsPath('C:/Users/Charm/Downloads/images/training/notflip')]

In [98]:
#we RESTRICT TO .JPG files because we verified the dataset during inspection
#so we are going to go through them and count how many are there 
len(list((DATA_ROOT / "training" / "flip").glob("*.jpg")))


1162

In [99]:
#the next count should be for not flip
len(list((DATA_ROOT /"training" / "notflip").glob("*.jpg")))

1230

## Sofar we have training counts :
 1. flip: 1162
 2. notflip: 1230
    This is almost 50/50
## This is close to balanced, and it means :
 1. We likely don't need class weights... this helps in punishing mistakes on rare classes heavily 
 2. F1 balances between precision: "when the model saysflip, is it right? and Recall: does it catch most flips? so won't really be distorted by extreme imbalance... because classes are similar in size
 3. A basic baseline model(the simplest reasonable model) is valid

In [100]:
#we are going to store every .jpg filepath in the flip and store it
#we are going to use this in our metadata table 
#(train_flip_path) is a list of path objects... where eachpath points to one 
#image file in the training/flip directory

train_flip_paths = list((DATA_ROOT / "training" / "flip").glob("*.jpg"))

#WE are gonna do the same thing for the other side, which is not flip 
#this is important because we need both classes to build a metadata . _ 
train_notflip_paths = (list((DATA_ROOT /"testing" / "flip").glob("*.jpg")))

In [101]:
#next, we need to build a metadata table where each image path is paired with a label
# metadata should contain the full filepath and a numeric label
#because models operate on data loaded from a path and expect it in numeric form for loss(function) computation

flip_rows = [(p, 1) for p in train_flip_paths] # now each flip is labelled.. contains (path, label)

notflip_rows = [(p, 0) for p in train_notflip_paths]
 # now each notflip is labelled.. contains (path, label)

# We need to mix these up so that training doesn't see all flips first and all not-flips.
#we don't want bias .. we don't train on class A and then class B, we train on mixed batches
train_rows = flip_rows + notflip_rows

In [102]:
#at this point, our data is no longer images, its a metadata about images
#we have (path, label), this is tabular information
#we switch to pandas dataframe, to add new columns, groupby, split train vs validation by video
#one row will represent one image frame, identified by its filepath, with its corresponding class label

df = pd.DataFrame(train_rows, columns=["filepath", "label"]) #it adds colums(filename, label)

#we then extract video_id from file name ... because file path has folders and slashes, and splitting, this can cause a mix 
#for every row we extract just the (image) filename and store it in filename
df["filename"] = df["filepath"].apply(lambda p: Path(p).name)

#now we create one for video, we taking the file, split on _ and keep the first part as video 
#if we split  by frames it create data leakage.. same frames can be in train, val or test 
# we want all frames from the same videos to stay in the same split (VideoID_FrameNumber.jpg)

df["video_id"] = df["filename"].apply(lambda x: x.split("_")[0])



In [103]:
df["video_id"].nunique() #how many unique videos exist in training set

65

In [104]:
#we need to shuffle videos first before setting validation, to avoid bias, and make sure validation is random
#bias would be: videos recorded same day, same lighting, same document type, etc

video_ids = df["video_id"].unique() #we store all videos in numpy array
rng = np.random.default_rng(42) #we need a fixed seed so that sampling is the same all times
val_video_ids = rng.choice(video_ids, size=13, replace=False) #we randomly select 13 unique vidoes for validation
#we create a boolean mask for validation rows
df["is_val"] = df["video_id"].isin(val_video_ids) #is_val will assign true or false.. shows which vid are for val and train
val_frames = df[df["is_val"]].shape[0] #gets validation frame count
val_frames #gives number of frame within validation





322

In [105]:
#to check class balance we will groupby label
df[df["is_val"]]["label"].value_counts()


label
1    255
0     67
Name: count, dtype: int64

## What we are aiming for: 
1. the model takes a single image frame as input and outputs a prediction of whether the page is flipping or not
2.  A small CNN is the simplest model that can capture spatial patterns in images.
3.  it exists to validate the data and task, not maximize performance
So the model will learn by making predictions, measuring how wrong they are, and gradually adjusting itself to make fewer mistakes.
## Since we are focused on bent page edges, curved lines, diagonal boundaries, and local distortion. 
* using small filters, let the network learn simple visual patterns first and combine them across layers to recognize larger structures, like flipping a page...
* The idea of pooling will be useful to help the network generalize what matters(eg, what side of the page was bent happen... pooling (it doesn't matter) was the page bent?)
F1 measures how good our flip/not-flip decisions are across videos. How often did we correctly predict
## FINAL BIG IDEA :
1. Convolution:
   What visual cues exist --------> edges, bends, motion artifacts
2. pooling
   Do these cues exist anywhere -------> rebust to location
3. Fully Connected------> Given all the cues, is it a flip?

In [106]:

from torch.utils.data import Dataset, DataLoader   #LETS you feed images to models in small batches
from torchvision import transforms   # provided standard image processing/augmentation functions
from PIL import Image   #reads/open image from disk into memory
from pathlib import Path   #cleaner way to work with file paths
import torch   #main engine that runs the neural network math (tensors, GPU/CPU ops, training, etc)

# DATASET CLASS (the menu for my data ) you don't give everything at once, one at a time
class FrameDataset(Dataset):
    def __init__(self, df, transform=None):
        self.df = df.reset_index(drop=True)   # avoids index issues after filtering so every row has a simple index
        self.transform = transform               #saves the rules for images, which should be modified(resize/rotate..)

    #how many items do you have .. number of rows
    def __len__(self):
        return len(self.df)

    #get me item say #57
    def __getitem__(self, idx):
        row = self.df.iloc[idx]               #go to the row number x in my table
        filepath = row["filepath"]            # column name must be "filepath"
        label = row["label"]                  # column name must be "label"

        image = Image.open(filepath).convert("RGB")     #open the image from disk and force it to be a normal colour 

        if self.transform is not None:
            image = self.transform(image)            #if we have transform rules, apply them to the image 

        # BCEWithLogitsLoss expects float labels (0.0/1.0) form modeal to understand
        label = torch.tensor(label, dtype=torch.float32)

        return image, label


#TRANSFORMS ... chains of them in order
train_transform = transforms.Compose([
    transforms.RandomRotation(degrees=10),               #slightly rotate the image so the model  dont panic
    transforms.Resize((224, 224)),                       #make every image the exact same size so the model sees consistent input
    transforms.ToTensor(),                               #convert images into numbers like[3,224, 224]
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])                                                     #these are ImageNet normalization stats , important cause resnets was trainined on pretrained imagenet

val_transform = transforms.Compose([
    transforms.Resize((224, 224)),                    #for val we don't do random changes. We test the model on a stable version of the image
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])


# break DATAFRAME INTO TRAIN / VAL
train_df = df[~df["is_val"]].reset_index(drop=True)  #one the model learns from
val_df   = df[df["is_val"]].reset_index(drop=True)   #checks if it truly learns 

print("Train samples:", len(train_df), "Val samples:", len(val_df))  #quick count to confirm we actually have data in both groups


#  DATASETS objects attach transform: its likecreating two menus
# training has augmentation, validation does not 
train_dataset = FrameDataset(train_df, transform=train_transform)
val_dataset   = FrameDataset(val_df, transform=val_transform)


#DATALOADERS.. training takes 32 images at a time, and shuffle so the models don't learn in a predictable order
# validation checks data in stable order-no shuffling needed (num_workers=0) avoids windows multipprocssing crushes
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True,  num_workers=0)
val_loader   = DataLoader(val_dataset,   batch_size=32, shuffle=False, num_workers=0)


#Now let's pull one batch to confirm everything works before training
images, labels = next(iter(train_loader))
print("images:", images.shape)   # expected: (B, 3, 224, 224)
print("labels:", labels.shape)   # expected: (B,)


Train samples: 1130 Val samples: 322
images: torch.Size([32, 3, 224, 224])
labels: torch.Size([32])


In [107]:
#MODEL GIVES PRETRAINED cnns(LIKE RESNET)... .nn contains neural network layers and loses 
from torchvision import models
import torch.nn as nn

# 1) if we have GPU(makes training faster ) , we can use it otherwise use cpu
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

# 2) im staring with a model that already learned useful patterns from millions of images
model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
in_features = model.fc.in_features                 #checks how many features the model produces before the final decision
model.fc = nn.Linear(in_features, 1)   # I'm replacing the original 100-class with a simple yes/no decision head
model = model.to(device)               #moves the model into the choses machine(cpu/gpu)

# 3) loss function... the judge that punishes wrong yes/no predictions
criterion = nn.BCEWithLogitsLoss()

# 4) optimizer .. makes small improvements after each batch
#lr=1e-4 is a safe starting learning rate 
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)        #Adam updates parameters using gradients. lr=1e-4 is a cautious learning rate.

# 5) one-batch forward pass sanity check (NO training yet)
model.train()                                               #here model is intraining model
images, labels = next(iter(train_loader))                    #grab one batch of images(tensr + labels) and answers
images = images.to(device)
labels = labels.to(device).float().unsqueeze(1)   # shape (B,1(reshape to ,atch model output)) to match logits( we move images and labels to same device as model)

logits = model(images)                   # produce one raw score per image
loss = criterion(logits, labels)         #BCE(the judge) compares model score to the trut and produce a penatlty number

print("logits shape:", logits.shape)
print("labels shape:", labels.shape)
print("loss:", loss.item())


device: cpu
logits shape: torch.Size([32, 1])
labels shape: torch.Size([32, 1])
loss: 0.836327314376831


## WHAT BCEWithLogitsLoss Actually Does
So it's like two steps combined :
## 1.  Turns the model's raw score into a probability
   * Big positive number --> close to 1( very confdent flip)
   * Big negative number --> close to 0 ( very confident not-flip)
   * Near 0 --> i'm unsure
   * These steps are what sigmoid normally does
## 2.  Compares that probability to the real answer
   * If true labels is 1(flip):
      * High prob --> smal penalty
      * Low prob --> big penanlty
   * if the true label is 0(not flip):
      * Low prob --> small penalty
      * High prob --> big penalty
## Loss does both steps together more safely than doing them separately  
* The loss is a score that tells us how wrong the model is-- a high loss means its guessing, a lower means its learning

In [109]:
#this cell will teach the model once, then check whether it actually learned.
# We will runone epoch... Show every training image to the model once, in small batches
# after we test the model on validation images, it has never learned before

import torch
from torchmetrics.classification import BinaryF1Score # a fair grading rubric for yes/no questions

# --- Metric helper --- this defines how we judge answers .. anything above fifty counts as yes
f1_metric = BinaryF1Score(threshold=0.5).to(device)


#an epoch = one full pass through all trainingimages, like studying one entire book once
def train_one_epoch(model, loader, criterion, optimizer, device):
    model.train()               #training is on
    running_loss = 0.0          

    f1_metric.reset()            #we start fresh for the training session

    #we study 32 batches at once instead of everything
    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device).unsqueeze(1)  # (B,1) e move everything to the same cpu and align the answer so they match format prediction

        # 1) forward
        logits = model(images)                  # (B,1) we look and say I think it's a flip, like raw opinions, not answers
        loss = criterion(logits, labels)        # this creates a penalty score after compare answers

        # 2) backward + update i.e learning from mistakes 
        optimizer.zero_grad()  # errase old result from training
        loss.backward()        #figure out what went wrong
        optimizer.step()        #adjust understanding slightly

        running_loss += loss.item() * images.size(0)   # we add up how bad the mistakes were, for all images
                                                       # I think big batches would count more cuz they have more images

        # 3) F1 needs predictions (0/1). logits -> prob -> pred
        probs = torch.sigmoid(logits)           # (B,1) values in [0,1]
        preds = (probs >= 0.5).float()          # (B,1) 0 or 1
        f1_metric.update(preds, labels)         # update the report, like did we catch flips?

    avg_loss = running_loss / len(loader.dataset)  #average mistakes level accross all study questions
    avg_f1 = f1_metric.compute().item()            #overall F1 score for the session
    return avg_loss, avg_f1

#everything in validation , looks the same as training
#just no backprogration, no optimizer and no learning , cant learn during exam
@torch.no_grad() #this is exam mode now
def validate_one_epoch(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0

    f1_metric.reset()

    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device).unsqueeze(1)  # (B,1)

        logits = model(images)
        loss = criterion(logits, labels)

        running_loss += loss.item() * images.size(0)

        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).float()
        f1_metric.update(preds, labels)

    avg_loss = running_loss / len(loader.dataset)
    avg_f1 = f1_metric.compute().item()
    return avg_loss, avg_f1




In [110]:
# We're going to run the training
num_epochs = 5    #how many val test we taking 


#each loop train, validate and report results
# e.g., like how did u do while studying / and how did u do on the exam 
for epoch in range(1, num_epochs + 1):
    train_loss, train_f1 = train_one_epoch(model, train_loader, criterion, optimizer, device)
    val_loss, val_f1     = validate_one_epoch(model, val_loader, criterion, device)

    #we announce results on the screenboard
    print(
        f"Epoch {epoch}/{num_epochs} | "
        f"Train Loss: {train_loss:.4f}  Train F1: {train_f1:.4f} | "
        f"Val Loss: {val_loss:.4f}  Val F1: {val_f1:.4f}"
    )


Epoch 1/5 | Train Loss: 0.6251  Train F1: 0.7870 | Val Loss: 0.5231  Val F1: 0.8839
Epoch 2/5 | Train Loss: 0.4940  Train F1: 0.8871 | Val Loss: 0.5155  Val F1: 0.8839
Epoch 3/5 | Train Loss: 0.4720  Train F1: 0.8863 | Val Loss: 0.5663  Val F1: 0.8839
Epoch 4/5 | Train Loss: 0.4478  Train F1: 0.8866 | Val Loss: 0.5628  Val F1: 0.8761
Epoch 5/5 | Train Loss: 0.4178  Train F1: 0.8919 | Val Loss: 0.5958  Val F1: 0.8246


## The model learned quickly, reached a strong performance, and generalized well to new images.
## Training longer did not improve real-world performance, so the best model was selected early