In [None]:
# Imports (the trailing comment on each line says what it is used for)
import numpy as np # Images stored as array values
import pandas as pd # Tabular data
import matplotlib.pyplot as plt # Plotting graphs
from PIL import Image # Helps open and manipulate images
from glob import glob # Find files in directories

from sklearn.model_selection import train_test_split # Splits data into training and testing
from sklearn import metrics # Evaluation

from zipfile import ZipFile # Unzips dataset if zipped
import cv2 # Open CV for image processing and computer vision
import gc # Garbage Collection
import os # File and Folder Operations

import tensorflow as tf # Main deep learning framework
from tensorflow import keras # Used to build neural networks
from keras import layers 

import warnings # ignore unnecessary warnings
warnings.filterwarnings('ignore')

In [None]:
# Dataset Exploration
# Lists the class folders of the lung image set and prints how many
# .jpeg files each one contains.

lung_path = "data/lung_colon_image_set/lung_image_sets"

# List the category folders inside lung_image_sets.
# os.listdir returns entries in arbitrary order and also returns plain files
# (e.g. .DS_Store), so keep only sub-directories and sort them: the listing
# is then identical on every run and stray files are not counted as classes.
categories = sorted(
    entry
    for entry in os.listdir(lung_path)
    if os.path.isdir(os.path.join(lung_path, entry))
)
print("Categories found:", categories)

# Count the number of images in each category
for category in categories:
    # Folder path of this category
    folder = os.path.join(lung_path, category)

    # glob returns the list of file paths matching the pattern *.jpeg
    images = glob(os.path.join(folder, "*.jpeg"))
    print(f"{category}: {len(images)} images")

Categories found: ['lung_aca', 'lung_n', 'lung_scc']
lung_aca: 5000 images
lung_n: 5000 images
lung_scc: 5000 images


In [None]:
# Load and Preprocess images
# Reads up to MAX_IMAGES_PER_CLASS images per category, resizes them to
# IMG_SIZE x IMG_SIZE, scales pixel values to [0, 1] and builds the
# `data` (images) and `labels` (class index) arrays.

IMG_SIZE = 128  # Resize all images to IMG_SIZE x IMG_SIZE pixels
MAX_IMAGES_PER_CLASS = 500  # Per-class cap for quick testing; set to None to load every image
data = []  # Store image arrays
labels = []  # Store numeric labels
skipped_paths = []  # Files OpenCV could not decode

# Define categories; the position in this list is the numeric label
categories = ['lung_aca', 'lung_n', 'lung_scc']

# Loop through each category
for i, category in enumerate(categories):
    # Full path to category folder
    folder = os.path.join(lung_path, category)

    # Get all .jpeg images in this folder. glob returns paths in arbitrary
    # order, so sort them: the capped subset below is then the same on every run.
    images = sorted(glob(os.path.join(folder, "*.jpeg")))

    # Apply the per-class cap (a slice end of None keeps the whole list)
    images = images[:MAX_IMAGES_PER_CLASS]

    # Loop through all image paths in this category
    for image_path in images:
        # Read the image using OpenCV.
        # NOTE: cv2.imread returns the channels in BGR order, not RGB.
        img = cv2.imread(image_path)

        # cv2.imread does not raise on a missing/corrupt file, it returns None;
        # skip such files here instead of crashing inside cv2.resize.
        if img is None:
            skipped_paths.append(image_path)
            continue

        # Resize the image to IMG_SIZE x IMG_SIZE
        img = cv2.resize(img, (IMG_SIZE, IMG_SIZE))

        # Append the image and its label (index i) in lockstep
        data.append(img)
        labels.append(i)

# Report unreadable files with print (warnings are silenced in this notebook)
if skipped_paths:
    print(f"Skipped {len(skipped_paths)} unreadable image(s), e.g. {skipped_paths[0]}")

# Convert lists to NumPy arrays

# Normalize pixels to the range [0, 1] (white [255,255,255] becomes [1,1,1]).
# float32 is the dtype Keras computes in by default and needs half the
# memory of NumPy's default float64.
data = np.array(data, dtype=np.float32) / 255.0
labels = np.array(labels)  # Convert labels to NumPy array

# Print shapes to confirm
print("Data shape:", data.shape)  # (images loaded, IMG_SIZE, IMG_SIZE, 3)
print("Labels shape:", labels.shape)  # (images loaded,)

Data shape: (1500, 128, 128, 3)
Labels shape: (1500,)


In [None]:
# Train / test split

# Hold out 20% of the samples for testing and train on the remaining 80%.
#   data            -> all image arrays
#   labels          -> numeric class id of each image
#   test_size=0.2   -> fraction of the samples reserved for the test set
#   random_state=42 -> fixed seed, so the shuffled split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    random_state=42,
    test_size=0.2,
)

# Sanity check: report the shape of every split
for caption, array in (
    ("Training data shape:", X_train),
    ("Testing data shape:", X_test),
    ("Training labels shape:", y_train),
    ("Testing labels shape:", y_test),
):
    print(caption, array.shape)

Training data shape: (1200, 128, 128, 3)
Testing data shape: (300, 128, 128, 3)
Training labels shape: (1200,)
Testing labels shape: (300,)


In [None]:
# Building the CNN Model

# Sequential: build the model layer by layer, in order
from tensorflow.keras import Sequential


# Conv2D: finds patterns in the image
# MaxPooling2D: makes the image smaller while keeping the important parts
# Flatten: turns the image data into a simple list for the next layers
# Dense: decides between the categories using the learned patterns
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Initialize a Sequential model
model = Sequential()

# First convolutional layer:
#   32 filters,
#   3x3 kernel (looks at 3x3 pixels at a time),
#   ReLU activation (keeps positive values, zeroes out negatives),
#   input_shape = (height, width, color channels).
# input_shape belongs to the layer, not to model.add(): Sequential.add() has
# no input_shape parameter, so passing it there raises a TypeError.
# IMG_SIZE is the size the images were resized to during preprocessing, so the
# model input always matches the data.
model.add(Conv2D(32, (3, 3), activation="relu", input_shape=(IMG_SIZE, IMG_SIZE, 3)))



