# In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import matplotlib.pyplot as plt
import numpy as np
import cv2
import os

# -----------------------------
# Paths and Config
# -----------------------------
DATASET_PATH = "path_to_dataset"  # folder containing class subfolders
IMG_SIZE = (224, 224)             # used as generator target_size and as the model's input height/width
BATCH_SIZE = 32
EPOCHS = 10

# Auto-detect number of classes from folders.
# os.walk() ignores directory-listing errors by default, so a missing path
# would only surface as a bare StopIteration from next(); fail with a clear
# message instead.
if not os.path.isdir(DATASET_PATH):
    raise FileNotFoundError(
        f"DATASET_PATH {DATASET_PATH!r} is not a directory; "
        "point it at a folder containing one subfolder per class."
    )
NUM_CLASSES = len(next(os.walk(DATASET_PATH))[1])
if NUM_CLASSES == 0:
    # A zero-unit output layer cannot be trained; stop before building the model.
    raise ValueError(
        f"No class subfolders found in {DATASET_PATH!r}; "
        "expected one subfolder per class."
    )

# -----------------------------
# 1. Image Data Generators
# -----------------------------
# Training generator: rescales pixels by 1/255 and applies random rotation,
# width/height shifts and horizontal flips.
train_datagen = ImageDataGenerator(
    horizontal_flip=True,
    height_shift_range=0.1,
    width_shift_range=0.1,
    rotation_range=15,
    validation_split=0.2,
    rescale=1./255,
)

# Validation generator: rescaling only (no augmentation), declared with the
# same validation_split as the training generator.
test_datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Options shared by both directory iterators.
_FLOW_OPTIONS = dict(
    target_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    class_mode='categorical',
)

train_data = train_datagen.flow_from_directory(
    DATASET_PATH, subset='training', **_FLOW_OPTIONS
)
test_data = test_datagen.flow_from_directory(
    DATASET_PATH, subset='validation', **_FLOW_OPTIONS
)

"""
Note: 

=========================================================================
IMAGE CLASSIFICATION LOSS FUNCTIONS EXPLAINED
=========================================================================

1. Multi-Class Classification (NUM_CLASSES > 2)

   - Problem type:
     You have more than 2 classes, e.g., CIFAR-10 (10 classes), CUB-200 (200 classes), flowers dataset (5 classes), etc.

   - Labels:
     One-hot encoded vectors:
       Example: with 3 classes, the second class (index 1) -> [0,1,0]

   - Model Output Layer:
       Dense(NUM_CLASSES, activation='softmax')
       - Softmax ensures all output probabilities sum to 1
       - Each neuron corresponds to probability of one class

   - Loss Function:
       'categorical_crossentropy'
       Formula:
           Loss = -Σ (y_i * log(p_i))
           Where y_i = actual label (0 or 1)
                 p_i = predicted probability for class i
                 Σ = sum over all classes
       Behavior:
           - Initial loss depends on number of classes: ~ ln(NUM_CLASSES)
           - For 10 classes: ~2.3, for 200 classes: ~5.3
           - Loss decreases as model predictions become closer to actual labels

   - When to use 'sparse_categorical_crossentropy':
       - If labels are integers (0,1,2,...N-1) instead of one-hot
       - Model output still uses softmax

=========================================================================
2. Binary Classification (NUM_CLASSES = 2)

   - Problem type:
     Only 2 classes, e.g., dogs vs cats, tumor vs normal, yes/no predictions

   - Labels:
       Single scalar per sample:
       Example: dog=1, cat=0

   - Model Output Layer:
       Dense(1, activation='sigmoid')
       - Sigmoid outputs probability of class 1
       - Range: [0,1]

   - Loss Function:
       'binary_crossentropy'
       Formula:
           Loss = - [ y * log(p) + (1-y) * log(1-p) ]
           Where y = actual label (0 or 1)
                 p = predicted probability of class 1
       Behavior:
           - Initial loss ≈ 0.693 (~ -ln(0.5)) for random predictions
           - Loss decreases as model predicts correct class

   - Prediction:
       - pred_class = 1 if pred > 0.5 else 0
       - Use threshold 0.5 for sigmoid output

=========================================================================
3. Notes on Loss and Accuracy

   - Loss measures "how wrong the model is", accuracy measures "how many predictions are correct"
   - High loss = poor predictions, Low loss = predictions closer to actual
   - For multi-class, initial loss depends on number of classes
   - For binary, initial loss starts near 0.693
   - Always monitor both loss and accuracy during training
   - Data augmentation, batch size, learning rate, and network complexity affect loss trends

=========================================================================
4. Summary Table

   Problem Type        | Output Layer      | Loss Function
   ------------------- | ----------------- | -------------------------------
   Multi-class (N>2)   | Dense(N, softmax) | categorical_crossentropy
   Multi-class int lbl | Dense(N, softmax) | sparse_categorical_crossentropy
   Binary (2 classes)  | Dense(1, sigmoid) | binary_crossentropy

=========================================================================
5. Practical Tips

   - For binary classification with ImageDataGenerator:
       class_mode='binary'
   - For multi-class classification with ImageDataGenerator:
       class_mode='categorical'
   - Start with a simple CNN architecture and gradually increase depth
   - Use Dropout for regularization to reduce overfitting
   - Use appropriate batch size and epochs depending on dataset size
   - Always normalize pixel values (0-255 → 0-1)
"""

"""
=========================================================================
COMMON CNN ARCHITECTURES AND WHEN TO USE THEM
=========================================================================

1. SIMPLE CNN (Custom Small CNN)
---------------------------------
- Structure:
    - 2-3 Conv2D layers with ReLU activation
    - MaxPooling2D after each Conv2D
    - Flatten → Dense layers → Output
- Use case:
    - Small datasets or beginner projects (MNIST, CIFAR-10)
    - Quick prototyping
    - Less computational resources
- Pros:
    - Simple, easy to understand
    - Fast to train
- Cons:
    - Limited accuracy on complex datasets
    - Cannot capture very deep features

------------------------------------------------------------------------
2. VGG16 / VGG19
-----------------
- Structure:
    - Deep stack of 3x3 Conv layers (16 or 19 weight layers)
    - MaxPooling after each block
    - Fully connected layers at the end
- Use case:
    - Medium-sized datasets
    - When high accuracy is desired and GPU resources are available
    - Transfer learning: pretrained weights on ImageNet
- Pros:
    - Simple uniform architecture
    - Works well with transfer learning
- Cons:
    - Very heavy (large number of parameters)
    - Slow training

------------------------------------------------------------------------
3. ResNet (Residual Networks)
-----------------------------
- Structure:
    - Residual blocks with skip connections
    - Allows very deep networks (50, 101, 152 layers)
- Use case:
    - Large datasets (ImageNet, CIFAR-100)
    - Complex image classification problems
    - Avoid vanishing gradient problem in deep networks
- Pros:
    - Deep architectures without degradation
    - High accuracy
- Cons:
    - More complex to implement from scratch
    - Slower inference for very deep versions

------------------------------------------------------------------------
4. Inception (GoogLeNet / InceptionV3)
---------------------------------------
- Structure:
    - Parallel convolution paths (1x1, 3x3, 5x5)
    - Concatenation of outputs
- Use case:
    - Complex datasets where multi-scale feature extraction is beneficial
    - Large datasets with high variability
- Pros:
    - Efficient feature extraction at multiple scales
    - Good accuracy with relatively smaller model size
- Cons:
    - Complex architecture
    - Harder to modify for custom datasets

------------------------------------------------------------------------
5. MobileNet / EfficientNet
---------------------------
- Structure:
    - Depthwise separable convolutions (MobileNet)
    - Efficient scaling of width, depth, and resolution (EfficientNet)
- Use case:
    - Edge devices, mobile applications, embedded systems
    - When computation and memory are limited
- Pros:
    - Lightweight, fast inference
    - Good trade-off between accuracy and efficiency
- Cons:
    - Slightly lower accuracy than heavy models on very complex datasets

------------------------------------------------------------------------
6. DenseNet
-----------
- Structure:
    - Dense blocks: each layer connects to all previous layers
- Use case:
    - When you want feature reuse and efficient gradient flow
    - Medium to large datasets
- Pros:
    - Fewer parameters than ResNet for similar depth
    - Excellent feature propagation
- Cons:
    - More memory usage during training
    - Slightly slower than simple CNNs

------------------------------------------------------------------------
7. When to Choose What
-----------------------
- Small dataset & simple task → Small custom CNN
- Limited resources → MobileNet / EfficientNet
- Medium dataset & transfer learning → VGG / ResNet
- Complex dataset & very deep model → ResNet / DenseNet / Inception
- Multi-scale feature importance → Inception
- Real-time inference needed → MobileNet / EfficientNet

------------------------------------------------------------------------
8. Transfer Learning Tips
-------------------------
- Use pretrained weights on ImageNet for medium/large datasets
- Freeze base layers and train top layers first
- Fine-tune later if dataset is large enough
- Benefits:
    - Faster convergence
    - Higher accuracy with less data

=========================================================================
SUMMARY
- CNN choice depends on:
    - Dataset size
    - Number of classes
    - Computational resources
    - Required inference speed
- For general image classification projects, you can start with:
    - Small CNN for experimentation
    - VGG/ResNet for robust solutions
    - MobileNet/EfficientNet for deployment on low-resource devices
"""


# -----------------------------
# 2. Build CNN Model
# -----------------------------
# Small custom CNN assembled layer by layer: three Conv2D + MaxPooling2D
# stages (32, 64, 128 filters), then a dense head with dropout and a softmax
# output of NUM_CLASSES units.
model = Sequential()

model.add(Conv2D(32, (3, 3), activation='relu',
                 input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3)))
model.add(MaxPooling2D(2, 2))

model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(2, 2))

model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D(2, 2))

model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(NUM_CLASSES, activation='softmax'))

# categorical_crossentropy pairs with the generators' class_mode='categorical'.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

# -----------------------------
# 3. Train Model
# -----------------------------
# Fit on the training generator for EPOCHS epochs with test_data as the
# validation set; the returned History object feeds the plots further down.
history = model.fit(x=train_data, epochs=EPOCHS, validation_data=test_data)

# -----------------------------
# 4. Plot Accuracy and Loss
# -----------------------------
def _plot_metric(curves, position, metric, title):
    """Draw the train and validation curves of one metric in a 1x2 grid.

    curves is the per-epoch metrics dict (history.history), position is the
    1-based subplot slot, metric is the key of the training curve (its
    validation counterpart is 'val_' + metric) and title is used for the
    panel title, y-axis label and legend entries.
    """
    plt.subplot(1, 2, position)
    plt.plot(curves[metric], label='Train ' + title)
    plt.plot(curves['val_' + metric], label='Validation ' + title)
    plt.title(title)
    plt.xlabel("Epoch")
    plt.ylabel(title)
    plt.legend()


plt.figure(figsize=(12, 5))
_plot_metric(history.history, 1, 'accuracy', "Accuracy")
_plot_metric(history.history, 2, 'loss', "Loss")
plt.show()

# -----------------------------
# 5. Predict a Single Image
# -----------------------------
# Class names in the iteration order of train_data.class_indices; they are
# looked up by predicted index below (assumes that order follows the class
# indices — confirm against the Keras iterator).
class_names = list(train_data.class_indices)

def preprocess_image(img_path, img_size=IMG_SIZE):
    """Load an image from disk and prepare it as a single-item model batch.

    Args:
        img_path: Path of the image file to read.
        img_size: Target size as ``(height, width)``, the same convention as
            the generators' ``target_size`` and the model's ``input_shape``.

    Returns:
        Float array of shape ``(1, height, width, 3)`` in RGB channel order
        with values scaled to ``[0, 1]``.

    Raises:
        FileNotFoundError: If ``img_path`` does not point to a file.
        ValueError: If the file exists but OpenCV cannot decode it.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None rather than raising,
        # which would otherwise surface as an opaque cv2.error in cvtColor.
        if not os.path.isfile(img_path):
            raise FileNotFoundError(f"Image file not found: {img_path!r}")
        raise ValueError(f"Could not decode image: {img_path!r}")
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    # cv2.resize takes dsize as (width, height), the reverse of img_size;
    # swapping keeps non-square sizes consistent with the model input.
    height, width = img_size
    img = cv2.resize(img, (width, height))
    img = img / 255.0
    return np.expand_dims(img, axis=0)

# Example: pick first image from first class folder
# File extensions treated as images when searching for the example file
# (intended to mirror the formats Keras' directory iterator accepts — confirm).
_IMAGE_SUFFIXES = ('.png', '.jpg', '.jpeg', '.bmp', '.ppm', '.tif', '.tiff')


def _find_example_image(dataset_path):
    """Return the path of the first image in the first class folder that has one.

    Entries are visited in sorted order so the choice is deterministic.
    Non-directory entries at the top level (e.g. ``.DS_Store``) and non-image
    files inside class folders are skipped instead of being picked blindly.

    Raises:
        FileNotFoundError: If no image file is found under ``dataset_path``.
    """
    for class_dir in sorted(os.listdir(dataset_path)):
        class_path = os.path.join(dataset_path, class_dir)
        if not os.path.isdir(class_path):
            continue
        for file_name in sorted(os.listdir(class_path)):
            if file_name.lower().endswith(_IMAGE_SUFFIXES):
                return os.path.join(class_path, file_name)
    raise FileNotFoundError(f"No image files found under {dataset_path!r}")


test_image_path = _find_example_image(DATASET_PATH)
# Kept for backward compatibility: name of the folder the example came from.
first_class = os.path.basename(os.path.dirname(test_image_path))

img_input = preprocess_image(test_image_path)
pred_probs = model.predict(img_input)
# pred_probs holds one row per input image; take the single row explicitly.
pred_idx = np.argmax(pred_probs[0])
pred_class = class_names[pred_idx]

print("Predicted Class:", pred_class)

# Show the original (un-resized) image, converted from OpenCV's BGR to RGB.
plt.imshow(cv2.cvtColor(cv2.imread(test_image_path), cv2.COLOR_BGR2RGB))
plt.title("Predicted: " + pred_class)
plt.axis('off')
plt.show()

