<a href="https://colab.research.google.com/github/geexe/cat-facial-expression-recognition/blob/main/Cat_Facial_Expression_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### START HERE!

1. Setup

In [4]:
!pip install tensorflow albumentations opencv-python
!pip install wandb

Collecting wandb
  Downloading wandb-0.17.5-py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting gitpython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-2.12.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython!=3.1.29,>=1.0.0->wandb)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading wandb-0.17.5-py3-none-ma

In [3]:
import cv2
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import wandb

In [6]:
# Clone the repository
!git clone https://github.com/kkengg/cat-facial-expression-recognition

Cloning into 'cat-facial-expression-recognition'...
remote: Enumerating objects: 1424, done.[K
remote: Counting objects: 100% (339/339), done.[K
remote: Compressing objects: 100% (267/267), done.[K
remote: Total 1424 (delta 260), reused 72 (delta 72), pack-reused 1085[K
Receiving objects: 100% (1424/1424), 48.67 MiB | 35.15 MiB/s, done.
Resolving deltas: 100% (450/450), done.


In [4]:
import pandas as pd

# Load the CSV file
file_path = '/content/cat-facial-expression-recognition/landmark_all.csv'
df = pd.read_csv(file_path)

# Combine landmark, vector_x, and vector_y into a single identifier
df['landmark_vector_x'] = df.apply(lambda row: f"{row['landmark']}_x", axis=1)
df['landmark_vector_y'] = df.apply(lambda row: f"{row['landmark']}_y", axis=1)

# Concatenate the two pivot tables
df_wide = df.pivot_table(index=['filename', 'class', 'image_path'],
                         columns='landmark',
                         values=['vector_x', 'vector_y'],
                         aggfunc='first')

df_wide.columns = ['{}_{}'.format(col[0], col[1]) for col in df_wide.columns.values]

# Reset the index to turn the filename back into a column
df_wide.reset_index(inplace=True)
df_wide.drop(columns=['filename'], inplace=True)

# Save the transformed dataframe to a new CSV file
#output_file_path = 'path/to/vector_data_wide.csv'
#df_wide.to_csv(output_file_path, index=False)

In [None]:
print(df_wide.head(20))
len(df_wide)

2. Define Augmentation Functions

Define the functions for augmenting images and landmarks.

In [5]:
import tensorflow as tf
import albumentations as A
import numpy as np
from PIL import Image

# Image augmentation function using albumentations
def augment_image(image):
    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.ColorJitter(p=0.3),
        A.RandomBrightnessContrast(p=0.3)
    ])
    augmented = transform(image=image)
    return augmented['image']

# Landmark augmentation function
def augment_landmarks(landmarks, image_shape, transform):
    h, w = image_shape[:2]
    # Reshape landmarks to (num_landmarks, 2)
    landmarks = landmarks.reshape(-1, 2)
    # Apply the same transformations as image
    augmented_landmarks = transform(image=np.zeros(image_shape), keypoints=landmarks)['keypoints']
    # Flatten the landmarks back to original shape
    return np.array(augmented_landmarks).flatten()

# Combined augmentation function
def augment_image_and_landmarks(image, landmarks):
    # Define transformation with both image and keypoints
    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.ColorJitter(p=0.3),
        A.RandomBrightnessContrast(p=0.3)
    ], keypoint_params=A.KeypointParams(format='xy'))

    augmented = transform(image=image, keypoints=landmarks.reshape(-1, 2))
    augmented_image = augmented['image']
    augmented_landmarks = np.array(augmented['keypoints']).flatten()

    return augmented_image, augmented_landmarks

3. Apply Augmentations During Data Loading

Integrate the augmentations into the tf.data.Dataset pipeline.

In [6]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from PIL import Image
import os

# Load landmarks and labels
landmarks_df = df_wide
image_paths = landmarks_df['image_path'].values
landmarks = landmarks_df.drop(columns=['image_path','class']).values

# Example labels (You should replace this with your actual labels)
labels = landmarks_df['class'].values

# Create label mapping
label_to_int = {label: idx for idx, label in enumerate(np.unique(labels))}
int_to_label = {idx: label for label, idx in label_to_int.items()}

# Convert string labels to integer labels
integer_labels = np.array([label_to_int[label] for label in labels])

In [7]:
def load_and_preprocess_image(image_path, target_size=(150, 150)):
    try:
        # Open and resize the image
        img = Image.open(image_path)
        img = img.resize(target_size)

        # Convert image to numpy array and ensure it's in uint8 format
        img_array = np.array(img, dtype=np.uint8)

        # If the image is grayscale, convert it to RGB
        if len(img_array.shape) == 2 or (len(img_array.shape) == 3 and img_array.shape[2] == 1):
            img_array = cv2.cvtColor(img_array, cv2.COLOR_GRAY2RGB)

        # Normalize the image
        img_array = img_array / 255.0

        filename = image_path
        return img_array
    except Exception as e:
        print(f"Error loading image {image_path}: {e}")
        return None


    # Load and preprocess images
target_size = (224,224)
images = ([load_and_preprocess_image(img_path, target_size=target_size) for img_path in image_paths])

In [22]:
images

[array([[[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        ...,
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]],
 
        [[0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.],
         ...,
         [0., 0., 0.],
         [0., 0., 0.],
         [0., 0., 0.]]]),
 array([[[0.09411765, 0.09411765, 0.09411765],
        

In [8]:
# Check for Grayscale image
for img in images:
  if len(img.shape) == 2:
    print(img.shape)

In [9]:
# Train-test split
X_train_images, X_val_images, X_train_landmarks, X_val_landmarks, y_train, y_val = train_test_split(
    images, landmarks, integer_labels, test_size=0.3, random_state=42
)

In [38]:
X_train_landmarks.shape

(210, 96)

In [10]:
# Convert labels to categorical
num_classes = len(np.unique(labels))
print(num_classes)
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_val = tf.keras.utils.to_categorical(y_val, num_classes)

3


In [11]:
def augment_image_and_landmarks(image, landmarks):
    # Define transformation with both image and keypoints
    transform = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomRotate90(p=0.5),
        A.ColorJitter(p=0.3),
        A.RandomBrightnessContrast(p=0.3)
    ], keypoint_params=A.KeypointParams(format='xy', remove_invisible=False))

    h, w, _ = image.shape
    landmarks = landmarks.reshape(-1, 2)

    # Ensure image is in uint8 format
    image = (image * 255).astype(np.uint8)

    # Apply the transformations
    augmented = transform(image=image, keypoints=landmarks)
    augmented_image = augmented['image']
    augmented_landmarks = augmented['keypoints']

    # Clip keypoints to be within the image boundaries
    #clipped_landmarks = np.clip(augmented_landmarks, a_min=[0, 0], a_max=[w, h])

    #return augmented_image , clipped_landmarks.flatten()
    return augmented_image , np.array(augmented_landmarks).flatten()

# Function to augment data
def augment_data(images, landmarks):
    augmented_images = []
    augmented_landmarks = []
    for img, lnd in zip(images, landmarks):
        aug_img, aug_lnd = augment_image_and_landmarks(img, lnd)
        augmented_images.append(aug_img)
        augmented_landmarks.append(aug_lnd)
    return np.array(augmented_images), np.array(augmented_landmarks)

# Apply augmentations to the training data
augmented_X_train_images, augmented_X_train_landmarks = augment_data(X_train_images, X_train_landmarks)

# Create a tf.data.Dataset
def create_dataset(images, landmarks, labels, batch_size):
    dataset = tf.data.Dataset.from_tensor_slices(((images, landmarks), labels))
    dataset = dataset.shuffle(buffer_size=len(images)).batch(batch_size).prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset

batch_size = 32

train_dataset = create_dataset(augmented_X_train_images, augmented_X_train_landmarks, y_train, batch_size)
validation_dataset = create_dataset(X_val_images, X_val_landmarks, y_val, batch_size)

4. Create and Train the Multimodal Model

In [None]:
from keras.models import Model
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten, concatenate
from keras.optimizers import Adam

# Image input branch
image_input = Input(shape=(150, 150, 3))
x = Conv2D(32, (3, 3), activation='relu')(image_input)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(64, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Conv2D(128, (3, 3), activation='relu')(x)
x = MaxPooling2D((2, 2))(x)
x = Flatten()(x)

# Landmark input branch
landmark_input = Input(shape=(landmarks.shape[1],))
y = Dense(64, activation='relu')(landmark_input)
y = Dense(32, activation='relu')(y)

# Combine the outputs from the two branches
combined = concatenate([x, y])

# Add a few more layers
z = Dense(128, activation='relu')(combined)
z = Dense(num_classes, activation='softmax')(z)

# Create the model
model = Model(inputs=[image_input, landmark_input], outputs=z)

# Compile the model
model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Train the model
model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=10
)

# Evaluate the model
loss, accuracy = model.evaluate(validation_dataset)
print(f'Validation Loss: {loss}')
print(f'Validation Accuracy: {accuracy}')

Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 2s/step - accuracy: 0.3600 - loss: 378.6329 - val_accuracy: 0.3333 - val_loss: 1.1838
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2s/step - accuracy: 0.3521 - loss: 11.0843 - val_accuracy: 0.3333 - val_loss: 1.1859
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2s/step - accuracy: 0.4555 - loss: 2.7739 - val_accuracy: 0.3111 - val_loss: 1.1628
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 2s/step - accuracy: 0.4705 - loss: 1.1151 - val_accuracy: 0.3444 - val_loss: 1.1736
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 2s/step - accuracy: 0.5316 - loss: 0.8827 - val_accuracy: 0.3111 - val_loss: 1.1692
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 2s/step - accuracy: 0.7009 - loss: 0.6289 - val_accuracy: 0.3000 - val_loss: 1.1643
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━

In [22]:
# Choose Base Model and other parameters
image_input = Input(shape=(224, 224, 3))
base_model = tf.keras.applications.DenseNet121(weights='imagenet', include_top=False, input_tensor=image_input)
num_epochs = 20
learning_rate = 1e-3

In [29]:
from tensorflow.keras import layers, models
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Flatten, concatenate, GlobalAveragePooling2D, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50V2
from tensorflow.keras.optimizers import Adam

# Add custom layers
# num_custom_classes = 3
x = base_model.output
x = GlobalAveragePooling2D()(x)

# Landmark input branch
landmark_input = Input(shape=(X_train_landmarks.shape[1],))
y = Dense(128, activation='relu')(landmark_input)
y = Dense(128, activation='relu')(y)
y = Dense(512, activation='relu')(y)

# Combine the outputs from the two branches
combined = concatenate([x, y])

# Add a few more layers
z = Dense(256, activation='relu')(combined)
z = Dense(num_classes, activation='softmax')(z)

# Create the model
current_model = Model(inputs=[image_input, landmark_input], outputs=z)

# If trainable is True, we can set train the layers after last_layer_to_freeze
freeze_model = True
last_layer_to_freeze = 426

if freeze_model == True:
  base_model.trainable = False

elif freeze_model == True:
  for layer in base_model.layers[:last_layer_to_freeze]:
    layer.trainable = False
  for layer in base_model.layers[last_layer_to_freeze:]:
    layer.trainable = True

# Compile the model
current_model.compile(
                      optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss='categorical_crossentropy',
                      metrics=['accuracy']
                      )

# Callback to save models and weights
save_weights_only = False
save_dir = '/content/cat-facial-image-recognition/saved_model/'

if save_weights_only:
  checkpoint_filepath = save_dir + base_model.name + "bestmodel_epoch{epoch:03d}_valloss{val_loss:.2f}.weights.h5"
else:
  checkpoint_filepath = save_dir + base_model.name + "bestmodel_epoch{epoch:03d}_valloss{val_loss:.2f}.keras"
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( filepath=checkpoint_filepath,
                                                                save_weights_only=save_weights_only,
                                                                monitor='val_loss',
                                                                mode='min',
                                                                verbose = 1,
                                                                save_best_only=True )

In [30]:
current_model.summary()
tf.keras.utils.plot_model(multimodal_model, to_file='cnn1_nonsequential1.png', show_shapes=True, show_dtype=False, show_layer_names=True, dpi=96)

In [31]:
history = current_model.fit(
    train_dataset,
    #steps_per_epoch=train_dataset.samples // train_dataset.batch_size,
    validation_data=validation_dataset,
    #validation_steps=validation_dataset.samples // validation_dataset.batch_size,
    epochs=num_epochs,
    #callbacks=[model_checkpoint_callback, wandb_callback]
)

Epoch 1/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 9s/step - accuracy: 0.3555 - loss: 16.0666 - val_accuracy: 0.3000 - val_loss: 1.9788
Epoch 2/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 9s/step - accuracy: 0.3991 - loss: 9.1553 - val_accuracy: 0.3667 - val_loss: 2.4463
Epoch 3/20
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 7s/step - accuracy: 0.4285 - loss: 5.1373 - val_accuracy: 0.3667 - val_loss: 2.5290
Epoch 4/20


KeyboardInterrupt: 

In [None]:
# Set up wandb

from wandb.integration.keras import WandbCallback

!wandb login 872265a615a7284a38f47af24b872eaee0dfcfa6

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="Cat Facial Expression Recognition",

    # track hyperparameters and run metadata
    config={
    "learning_rate": learning_rate,
    "architecture": base_model.name,
    "dataset": "custom",
    "epochs": num_epochs,
    }
)

# Set wandb callback
wandb_callback = WandbCallback(
    monitor="val_loss", verbose=0, mode="auto", save_weights_only=(False),
    log_weights=(False), log_gradients=(False), save_model=(False),
    training_data=None, validation_data=None, labels=None, predictions=36,
    generator=None, input_type=None, output_type=None, log_evaluation=(False),
    validation_steps=None, class_colors=None, log_batch_frequency=None,
    log_best_prefix="best_", save_graph=(True), validation_indexes=None,
    validation_row_processor=None, prediction_row_processor=None,
    infer_missing_processors=(True), log_evaluation_frequency=0,
    compute_flops=(False)
)