# In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# feature extraction
def extract_features_from_video(video_path, min_area=100):
    """Extract frame-to-frame movement features of moving blobs in a video.

    Each frame is converted to grayscale and passed through a MOG2
    background subtractor. Every external contour of the foreground mask
    whose area exceeds ``min_area`` is reduced to the center of its bounding
    box. Centers of the current frame are then paired *by index* with the
    centers of the previous frame, and one feature row is emitted per pair.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        min_area: Contours with an area less than or equal to this value
            (as reported by ``cv2.contourArea``) are discarded as noise.
            Defaults to 100, the value that was previously hard-coded.

    Returns:
        A ``pandas.DataFrame`` with the columns ``dx``, ``dy``, ``speed``
        and ``direction`` (one row per paired detection). ``speed`` is the
        Euclidean length of ``(dx, dy)`` and ``direction`` is
        ``arctan2(dy, dx)`` in radians. If the video cannot be opened or no
        pair is ever formed, the frame is empty but still carries these
        columns.
    """
    feature_columns = ["dx", "dy", "speed", "direction"]
    features = []
    prev_positions = []

    cap = cv2.VideoCapture(video_path)
    backSub = cv2.createBackgroundSubtractorMOG2(detectShadows=True)

    # try/finally guarantees the capture handle is released even when an
    # OpenCV call raises part-way through the video.
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # convert to grayscale
            gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)

            # apply background subtraction to detect flies
            # NOTE(review): with detectShadows=True the mask also contains
            # non-zero "shadow" pixels, which findContours treats as
            # foreground -- confirm whether shadows should be thresholded out.
            fgMask = backSub.apply(gray)

            # [-2] selects the contour list for both the 2-tuple
            # (OpenCV 2.x/4.x) and 3-tuple (OpenCV 3.x) return shapes.
            contours = cv2.findContours(
                fgMask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
            )[-2]

            # center of every sufficiently large blob in this frame
            positions = []
            for contour in contours:
                if cv2.contourArea(contour) > min_area:  # filter small noise
                    x, y, w, h = cv2.boundingRect(contour)
                    positions.append((x + w // 2, y + h // 2))

            # calculate movement features
            # NOTE(review): detections are paired purely by list index, so a
            # pair is only the same object if the contour order is stable
            # between frames. zip() stops at the shorter list, which also
            # covers the first frame (empty prev_positions).
            for (cur_x, cur_y), (prev_x, prev_y) in zip(positions, prev_positions):
                dx = cur_x - prev_x
                dy = cur_y - prev_y
                features.append({
                    "dx": dx,
                    "dy": dy,
                    "speed": np.sqrt(dx**2 + dy**2),
                    "direction": np.arctan2(dy, dx),
                })
            prev_positions = positions
    finally:
        cap.release()

    # Pin the columns so an empty result has the same schema as a populated one.
    return pd.DataFrame(features, columns=feature_columns)

# combine features & labels
def prepare_dataset(video_folder, labels_csv, output_csv):
    """Build a labelled feature table from a folder of videos and save it.

    Reads ``labels_csv``, which must provide the columns ``video_name`` and
    ``label``. For every row, features are extracted from the video
    ``video_folder/video_name`` and tagged with that row's label and video
    name. The per-video tables are concatenated and written to
    ``output_csv`` without the index.

    Args:
        video_folder: Directory that contains the video files.
        labels_csv: Path of the CSV listing ``video_name`` and ``label``.
        output_csv: Destination path for the combined feature CSV.

    Returns:
        The combined ``pandas.DataFrame``. When ``labels_csv`` has no rows
        an empty frame with the expected columns is returned (and written)
        instead of raising from ``pd.concat``.
    """
    labels = pd.read_csv(labels_csv)
    dataset = []

    for _, row in labels.iterrows():
        video_path = os.path.join(video_folder, row["video_name"])
        print(f"Processing {video_path}...")
        features = extract_features_from_video(video_path)
        features["label"] = row["label"]
        features["video_name"] = row["video_name"]  # add video name for context
        dataset.append(features)

    if dataset:
        combined_dataset = pd.concat(dataset, ignore_index=True)
    else:
        # pd.concat raises ValueError on an empty list; fall back to an empty
        # table so a header-only CSV is still produced.
        combined_dataset = pd.DataFrame(
            columns=["dx", "dy", "speed", "direction", "label", "video_name"]
        )

    # save dataset to CSV
    combined_dataset.to_csv(output_csv, index=False)
    print(f"Features saved to {output_csv}")

    return combined_dataset