In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F #helps call activation functions
import torch.optim as optim #runs the optimizer
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms #helps prepare the data into tensors and normalizes pixel values
from PIL import Image
import os
from sklearn.model_selection import train_test_split


In [3]:
# Mount Google Drive at /content/drive so the files referenced under
# "/content/drive/My Drive/..." in later cells can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import zipfile

# Archive with the training images, stored on the mounted Drive (opened with zipfile in the next cell).
zip_path = "/content/drive/My Drive/Colab Notebooks/Copy of train_data"
# Directory the archive is extracted into.
extract_to = "/content/train_data"


In [5]:
# Unpack every member of the archive into `extract_to`.
# The archive is opened read-only ('r'): its contents are only read, never modified.
with zipfile.ZipFile(zip_path, mode='r') as archive:
    archive.extractall(path=extract_to)

In [6]:
import pandas as pd

# Location of the label file on the mounted Drive.
csv_path = '/content/drive/My Drive/Colab Notebooks/Copy of train.csv'

# Load the table that lists each image's file name and label.
train_df = pd.read_csv(csv_path)

# Preview the first rows; as the last expression it is rendered as the cell output.
train_df.head()

Unnamed: 0.1,Unnamed: 0,file_name,label
0,0,train_data/a6dcb93f596a43249135678dfcfc17ea.jpg,1
1,1,train_data/041be3153810433ab146bc97d5af505c.jpg,0
2,2,train_data/615df26ce9494e5db2f70e57ce7a3a4f.jpg,1
3,3,train_data/8542fe161d9147be8e835e50c0de39cd.jpg,0
4,4,train_data/5d81fa12bc3b4cea8c94a6700a477cf2.jpg,1


In [7]:
# Share of each label as a percentage of all rows (normalize=True gives fractions, * 100 gives percent);
# used to check whether the classes are balanced.
label_counts = train_df['label'].value_counts(normalize=True) * 100
print(label_counts)

label
1    50.0
0    50.0
Name: proportion, dtype: float64


In [None]:
#sampled_df = train_df.sample(n=1000)
#sampled_df.iloc[1]["label"]
#label_1_df = train_df[train_df['label'] == 1]


In [7]:

# Preprocessing applied to every image before it reaches the network.
_preprocessing_steps = [
    transforms.Resize((224, 224)),  # bring every image to a common 224x224 size
    transforms.ToTensor(),  # PIL image -> PyTorch tensor
    # Normalize each RGB channel with mean 0.5 and std 0.5 so values are centered
    # around 0, i.e. mapped to the [-1, 1] range.
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
]
transform = transforms.Compose(_preprocessing_steps)

# Custom Dataset class
class ImageDataset(Dataset):
    """Map-style dataset that yields ``(image, label)`` pairs described by a DataFrame.

    Subclasses ``torch.utils.data.Dataset`` so it can be wrapped in a ``DataLoader``.

    Args:
        df: Table with a ``"file_name"`` column (only the base name of each entry is
            used) and a ``"label"`` column holding the class id.
        image_dir: Root folder where the image files are stored.
        transform: Optional callable applied to the RGB PIL image before it is returned.
    """

    def __init__(self, df, image_dir, transform=None):
        self.df = df
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Return the number of samples (rows of ``df``); ``DataLoader`` relies on this."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx``.

        Returns:
            tuple: ``(image, label)`` where ``image`` is the RGB image (passed through
            ``transform`` when one was given) and ``label`` is a ``torch.long`` tensor,
            the dtype ``nn.CrossEntropyLoss`` expects for class-index targets.
        """
        # Materialise the row once instead of doing a separate .iloc lookup per column.
        row = self.df.iloc[idx]

        # Drop any directory prefix stored in the CSV and resolve against image_dir.
        img_name = os.path.basename(row["file_name"])
        img_path = os.path.join(self.image_dir, img_name)

        # The context manager closes the file deterministically; convert() returns a
        # fully loaded copy, so the image stays usable after the handle is closed.
        # Converting to RGB keeps the number of channels consistent across files.
        with Image.open(img_path) as img:
            image = img.convert("RGB")

        # Apply preprocessing only if a transform was provided.
        if self.transform:
            image = self.transform(image)

        return image, torch.tensor(row["label"], dtype=torch.long)

# Reload the label table from Drive.
csv_path = '/content/drive/My Drive/Colab Notebooks/Copy of train.csv'
train_df = pd.read_csv(csv_path)

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
train_data, val_data = train_test_split(train_df, random_state=42, test_size=0.2)

# Wrap each split in an ImageDataset (same image folder and preprocessing for both).
_image_root = "/content/train_data"
train_dataset, val_dataset = (
    ImageDataset(split, _image_root, transform=transform)
    for split in (train_data, val_data)
)

# Batch the datasets: training batches are reshuffled, validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=32)

# CNN model definition
class CNNModel(nn.Module):
    """Three-stage convolutional classifier for square RGB images.

    Each stage is a 3x3 convolution (stride 1, padding 1, so height/width are kept)
    followed by ReLU and a 2x2 max-pool that halves height and width. After the three
    stages the feature map has 128 channels and ``input_size // 8`` pixels per side;
    it is flattened and passed through two fully connected layers that return raw
    class scores (logits), as expected by ``nn.CrossEntropyLoss``.

    Args:
        num_classes: Number of output scores. Defaults to 2 (labels 0 and 1).
        input_size: Height and width of the square input images. Defaults to 224,
            which gives the original ``128 * 28 * 28`` flattened features.

    Raises:
        ValueError: If ``num_classes`` is below 1 or ``input_size`` is too small to
            survive three 2x2 poolings (below 8).
    """

    def __init__(self, num_classes=2, input_size=224):
        super().__init__()

        if num_classes < 1:
            raise ValueError(f"num_classes must be >= 1, got {num_classes}")
        # Three stride-2 poolings floor-divide each spatial side by 2 three times (== // 8).
        pooled_side = input_size // 8
        if pooled_side < 1:
            raise ValueError(f"input_size must be >= 8, got {input_size}")

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=1, padding=1)

        # One shared (parameter-free) pooling layer, applied after each conv layer.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)

        self.fc1 = nn.Linear(128 * pooled_side * pooled_side, 512)
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for a batch of images ``x``."""
        # ReLU (max(0, x)) after every convolution helps avoid vanishing gradients.
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = self.pool(F.relu(self.conv3(x)))

        # Flatten everything except the batch dimension before the fully connected layers.
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)

        return x

# Seed PyTorch's global RNG before the model is built so the initial weights are the
# same on every "Restart & Run All" (same value as the random_state used for the split).
SEED = 42
torch.manual_seed(SEED)

# Initialize model
model = CNNModel()

# Define loss function (CrossEntropyLoss for classification: takes raw scores and integer class labels)
criterion = nn.CrossEntropyLoss()

# Use Adam optimizer over all model parameters
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [6]:
import os
import pandas as pd

# Folder that holds the image files
image_dir = "/content/train_data"

# One metadata record (file name + size in KB) for every .jpg file directly inside image_dir.
image_data = [
    {
        "file_name": entry_name,
        # os.path.getsize reports bytes; dividing by 1024 converts to KB
        "file_size_kb": os.path.getsize(os.path.join(image_dir, entry_name)) / 1024,
    }
    for entry_name in os.listdir(image_dir)
    if entry_name.endswith(".jpg")
]


In [None]:


# Run on the GPU when one is available (faster than the CPU), otherwise on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
model.to(device)

# Each epoch: one full pass over the training batches, then one pass over the validation batches.
num_epochs = 2
for epoch in range(num_epochs):
    # ---- training phase ----
    model.train()
    running_loss = 0.0  # sum of the per-batch losses seen in this epoch
    for images, labels in train_loader:  # one iteration == one batch
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous batch
        loss = criterion(model(images), labels)  # forward pass + loss
        loss.backward()  # backpropagation
        optimizer.step()  # weight update

        running_loss += loss.item()

    # ---- validation phase ----
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():  # gradients are not needed for evaluation
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)
            # max over the class dimension yields (values, indices); the index is the predicted class
            predicted = model(images).max(dim=1).indices
            total += labels.size(0)  # number of samples in this batch
            correct += (predicted == labels).sum().item()

    # Accuracy is a reasonable metric here because the two classes are evenly split.
    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader):.4f}, Validation Accuracy: {accuracy:.2f}%")

print("Training Finished!")


Epoch 1/2, Loss: 0.2793, Validation Accuracy: 94.05%
Epoch 2/2, Loss: 0.1292, Validation Accuracy: 97.30%
Training Finished!
