# Guardian Angel - Activity Recognition

1. Kid Level Activity Recognition

+ Objective: 
    - Develop a system that identifies `specific activities` performed by individual children in video frames, 
    - categorizing them into predefined activities such as `Playing`, `Learning`, and `Violence`.

+ Technologies and Methods:

    - Convolutional Neural Networks (CNNs): 
        - Use CNNs to analyze visual data from video frames for robust feature extraction specific to various activities.

    - Action Recognition Models:
        - Implement action recognition models that can distinguish between different types of activities `based on movement patterns and context`.

    - Temporal Segmentation Networks (TSNs): 
        - `Utilize TSNs` to capture long-range temporal structures in the video data, enhancing the ability to recognize prolonged activities.

    - Transfer Learning: 
        - `Employ pre-trained` models on large datasets and `fine-tune them` to the specific task of recognizing child activities to `improve accuracy and reduce training time`.

    - Pose Estimation: 
        - `Apply pose estimation` techniques to `understand body positions and movements` that are indicative of different activities.

2. Scene Level Activity Recognition for All Kids

+ Objective: 
    - Create a system capable of `recognizing and analyzing activities` involving `multiple children within a scene`, providing an overview of the collective activity in the environment.

+ Technologies and Methods:
    
    - Scene Recognition Algorithms: 
        - Use scene recognition techniques to understand the broader context of the environment which helps in interpreting group activities.

    - Graph Neural Networks (GNNs): 
        - Implement GNNs to model interactions between multiple individuals in a scene, which is vital for understanding collective activities.

    - Multiple Object Tracking (MOT): 
        - Employ MOT systems to track multiple children simultaneously, ensuring accurate activity recognition even in dynamic scenes.

    - Deep Learning for Video Classification: 
        - Utilize deep learning models designed for video classification to analyze and classify complex activities involving multiple participants.

my supervior said we not have many images for training but we have same videos for training but we make testing we only will have one image for kid and caregiver so how handel this problem?

# Data Acquisition

### Step 1: Extract Frames from the Video

In [5]:
import cv2
import os

In [6]:
def extract_frames(video_path, output_folder, frame_interval):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)  # Get the frame rate of the video
    frame_count = 0
    extracted_count = 0
    success, frame = cap.read()
    
    while success:
        if frame_count % (fps * frame_interval) < 1: # one frame per second is extracted accurately
            frame_filename = os.path.join(output_folder, f"frame_{frame_count}.jpg")
            cv2.imwrite(frame_filename, frame)
            extracted_count += 1
        success, frame = cap.read()
        frame_count += 1
    
    cap.release()
    print(f"Extracted {extracted_count} frames from the video: {video_path}")


In [7]:
video_learning = r'..\datasets\videos\Learning.mp4'
video_playing = r'..\datasets\videos\Playing.mp4'

output_folder_learning = r'..\datasets\videos\Learning_frames'
output_folder_playing = r'..\datasets\videos\Playing_frames'

frame_interval = 1  # Extract one frame every 1 second

extract_frames(video_learning, output_folder_learning, frame_interval)
extract_frames(video_playing, output_folder_playing, frame_interval)

Extracted 0 frames from the video: ..\datasets\videos\Learning.mp4
Extracted 0 frames from the video: ..\datasets\videos\Playing.mp4


Step 3: Face Detection and Annotation


In [8]:
def detect_faces(image, face_detector):
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    faces = face_detector.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30))
    return faces

face_detector = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_default.xml')

In [9]:
face_detector

< cv2.CascadeClassifier 00000285655A9D50>

In [None]:
def save_detected_faces(frame_folder, output_kids, output_caregivers):
    if not os.path.exists(output_kids):
        os.makedirs(output_kids)
    if not os.path.exists(output_caregivers):
        os.makedirs(output_caregivers)

    frame_files = [os.path.join(frame_folder, f) for f in os.listdir(frame_folder) if f.endswith('.jpg')]
    for frame_file in frame_files:
        image = cv2.imread(frame_file)
        faces = detect_faces(image, face_detector)
        for i, (x, y, w, h) in enumerate(faces):
            face = image[y:y+h, x:x+w]
            face_filename = os.path.join(output_kids, f"{os.path.basename(frame_file).split('.')[0]}_face_{i}.jpg")
            cv2.imwrite(face_filename, face)
    print(f"Saved detected faces to {output_kids}")

# Define paths
kids_frame_folder = r'Learning_frames'
caregiver_frame_folder = r'Playing_frames'

output_kids = r'\Kids_faces'
output_caregivers = r'\Caregiver_faces'

# Extract faces
save_detected_faces(kids_frame_folder, output_kids, output_caregivers, 'kid')
save_detected_faces(caregiver_frame_folder, output_kids, output_caregivers, 'caregiver')

### Preprocess Frames

In [None]:
from tensorflow.keras.preprocessing.image import img_to_array, load_img
import numpy as np
import os

In [None]:
def preprocess_image(image_path, target_size):
    image = load_img(image_path, target_size=target_size)
    image = img_to_array(image)
    image = image / 255.0  # Normalize to [0, 1]
    return image

def preprocess_frames(frame_folder, target_size):
    processed_images = []
    frame_files = [os.path.join(frame_folder, f) for f in os.listdir(frame_folder) if f.endswith('.jpg')]
    for frame_file in frame_files:
        processed_image = preprocess_image(frame_file, target_size)
        processed_images.append(processed_image)
    return np.array(processed_images)

target_size = (224, 224)  # Example target size

In [None]:
# Preprocess Learning frames
learning_frame_folder = r'..\datasets\videos\Learning_frames'
learning_images = preprocess_frames(learning_frame_folder, target_size)
print(f"Preprocessed {len(learning_images)} Learning frames.")

Preprocessed 85 Learning frames.


In [None]:
# Preprocess Playing frames
playing_frame_folder = r'..\datasets\videos\Playing_frames'
playing_images = preprocess_frames(playing_frame_folder, target_size)
print(f"Preprocessed {len(playing_images)} Playing frames.")


Preprocessed 733 Playing frames.


### Combine Labels with Preprocessed Images

Assume labels for learning and playing are 0 and 1 respectively. We create the combined dataset:

In [None]:
learning_labels = np.zeros(len(learning_images))  # Label for Learning is 0
playing_labels = np.ones(len(playing_images))    # Label for Playing is 1

# Combine images and labels
all_images = np.concatenate((learning_images, playing_images), axis=0)
all_labels = np.concatenate((learning_labels, playing_labels), axis=0)

print(f"Total images: {all_images.shape[0]}")
print(f"Total labels: {all_labels.shape[0]}")

Total images: 818
Total labels: 818


### Train/Test Split


In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(all_images, all_labels, test_size=0.2, random_state=42)

print(f"Training set: {X_train.shape[0]} samples")
print(f"Testing set: {X_test.shape[0]} samples")

Training set: 654 samples
Testing set: 164 samples


### Build and Fine-Tune the Model


In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical

In [None]:
def build_fine_tuned_model(base_model, num_classes):
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    predictions = Dense(num_classes, activation='softmax')(x)
    model = Model(inputs=base_model.input, outputs=predictions)
    return model

# Convert labels to categorical one-hot encoding
y_train_cat = to_categorical(y_train, num_classes=2)
y_test_cat = to_categorical(y_test, num_classes=2)

base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
model = build_fine_tuned_model(base_model, num_classes=2)  # 2 classes: Playing, Learning

model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train_cat, epochs=10, batch_size=32, validation_data=(X_test, y_test_cat))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1e8979174d0>

### Evaluate the Model

In [None]:
loss, accuracy = model.evaluate(X_test, y_test_cat)
print(f"Test accuracy: {accuracy * 100:.2f}%")

Test accuracy: 86.59%


### Single Frame Prediction

In [None]:
def single_frame_prediction(image_path, model, target_size):
    image = preprocess_image(image_path, target_size)
    image = np.expand_dims(image, axis=0)  # Add batch dimension
    prediction = model.predict(image)
    return prediction

# Example usage:
single_frame = r'..\datasets\videos\Learning_frames\frame_0.jpg'
prediction = single_frame_prediction(single_frame, model, target_size)
print(f"Prediction: {np.argmax(prediction)}")  # 0 for Learning, 1 for Playing

Prediction: 1
