In [None]:
# Install the runtime dependencies. %pip (rather than !pip) installs into the
# environment of the running kernel.
%pip install -q opencv-python-headless numpy torch
# Clone YOLOv7 only when it is not already present, so re-running this cell
# does not fail with "destination path 'yolov7' already exists".
!test -d yolov7 || git clone https://github.com/WongKinYiu/yolov7.git

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
# Work from inside the cloned repository so `models` and `utils` are importable.
%cd yolov7
# -nc (no-clobber) skips the download when weights/yolov7.pt already exists,
# instead of fetching it again and saving a duplicate as yolov7.pt.1.
!wget -nc https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt -P weights/


/content/yolov7
--2024-08-05 11:57:54--  https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7.pt
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/511187726/b0243edf-9fb0-4337-95e1-42555f1b37cf?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240805%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240805T115755Z&X-Amz-Expires=300&X-Amz-Signature=fe54657276e9d08112e6710cbd94f1f05ee1224be9b5bad33ea5240cd327969d&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=511187726&response-content-disposition=attachment%3B%20filename%3Dyolov7.pt&response-content-type=application%2Foctet-stream [following]
--2024-08-05 11:57:55--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/511187726/b0243edf-9fb0-4337-95e1-42555f1b37cf

## Bounding boxes version (successful)

In [None]:
import cv2
import torch
import numpy as np
import os
import time
from google.colab.patches import cv2_imshow
from models.experimental import attempt_load
from utils.general import non_max_suppression, scale_coords
from utils.plots import plot_one_box

# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)

# Load the YOLOv7 checkpoint onto that device and put it in evaluation mode.
model = attempt_load('weights/yolov7.pt', map_location=device)
model.eval()

# Function to preprocess image
def preprocess_image(image, input_size=640, pad_value=114):
    """Letterbox a BGR frame and convert it into a YOLOv7 input tensor.

    The frame is resized with its aspect ratio preserved and then padded to a
    square of ``input_size`` pixels, with the content centred. This matters
    because the caller maps detections back with ``scale_coords``, which
    (when called without ``ratio_pad``) reconstructs the gain and padding
    under exactly that centred-letterbox assumption. A plain stretch to
    640x640 would therefore misplace boxes on any non-square frame.

    Args:
        image: H x W x 3 uint8 array in BGR channel order (OpenCV convention).
        input_size: Side length of the square network input, in pixels.
        pad_value: Gray level used for the letterbox border.

    Returns:
        Float32 tensor of shape (1, 3, input_size, input_size) on ``device``,
        RGB channel order, values scaled to [0, 1].
    """
    src_h, src_w = image.shape[:2]

    # Uniform scale factor: the limiting side exactly fills the square.
    gain = min(input_size / src_h, input_size / src_w)
    new_w = max(1, int(round(src_w * gain)))
    new_h = max(1, int(round(src_h * gain)))
    resized = cv2.resize(image, (new_w, new_h))

    # Split the remaining border evenly; the +/-0.1 nudges make an odd number
    # of leftover pixels round to (k, k + 1) so the total is always exact.
    pad_w = (input_size - new_w) / 2
    pad_h = (input_size - new_h) / 2
    top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
    left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
    img = cv2.copyMakeBorder(
        resized, top, bottom, left, right,
        cv2.BORDER_CONSTANT, value=(pad_value, pad_value, pad_value),
    )

    img = img[..., ::-1]  # Convert BGR to RGB
    img = img / 255.0  # Normalize to [0, 1] (allocates a fresh float array)
    img = np.transpose(img, (2, 0, 1))  # Change to (C, H, W)
    img = torch.tensor(img).float().unsqueeze(0).to(device)  # Add batch dimension and move to device
    return img

# Load video file
# NOTE(review): this path assumes Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook.
video_path = '/content/drive/MyDrive/competition archive/spin dataset 1 (360p)/2019COC_AG_SP_1.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of silently writing an empty output video.
    raise IOError(f"Could not open video: {video_path}")

# Prepare for saving the output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_path = '/content/output_video1.mp4'
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
# Reuse the source frame rate so playback speed is preserved; fall back to the
# previous fixed 20 fps when the container does not report one (get() -> 0).
fps = cap.get(cv2.CAP_PROP_FPS) or 20.0
out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)

# Start timer
start_time = time.time()

try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Preprocess the frame
        img = preprocess_image(frame)

        # Run inference
        with torch.no_grad():
            preds = model(img)[0]
            preds = non_max_suppression(preds, conf_thres=0.5, iou_thres=0.4)

        # Process and visualize predictions
        for det in preds:
            if len(det):
                # Map boxes from network-input coordinates back to the original frame
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], frame.shape).round()
                for *xyxy, conf, cls in det:
                    label = f'Class: {int(cls)} Conf: {conf:.2f}'
                    plot_one_box(xyxy, frame, label=label, color=(0, 255, 0), line_thickness=2)  # Draw bounding box

        # Save frame to output video
        out.write(frame)
        # No cv2.waitKey()/cv2.destroyAllWindows() here: this cell never opens
        # an OpenCV window, so there is no key press to poll for and nothing
        # to destroy.
finally:
    # Always release the capture and finalize the writer, including when the
    # cell is interrupted, so the output file is closed properly.
    cap.release()
    out.release()

# End timer and print total time
end_time = time.time()
print(f"Total processing time: {end_time - start_time} seconds")

# Display the output video
from IPython.display import Video
# embed=True inlines the file contents; a bare local path is not reachable
# from the browser in Colab.
# NOTE(review): browsers may not decode the 'mp4v' codec; re-encode to H.264
# if the player stays blank.
Video(output_path, embed=True)


KeyboardInterrupt: 

## Keypoints version (successful)

In [None]:
# %pip installs into the running kernel's environment; the version is pinned to
# the one this notebook was last run with (see the captured install log).
%pip install -q mediapipe==0.10.14

Collecting mediapipe
  Downloading mediapipe-0.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe)
  Downloading protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe)
  Downloading sounddevice-0.4.7-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protobuf-4.25.4-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.4.7-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.

In [None]:
import cv2
import mediapipe as mp
import numpy as np
import torch
from models.experimental import attempt_load
from utils.general import non_max_suppression, scale_coords
from utils.plots import plot_one_box

# Initialize MediaPipe Pose
mp_pose = mp.solutions.pose
# static_image_mode=True: process_frame feeds independent crops (one per
# detection, with a different origin and size each time), not a continuous
# video stream, so landmark tracking carried over from the previous call would
# refer to a different image.
pose = mp_pose.Pose(static_image_mode=True)

# Load YOLOv7 model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = attempt_load('weights/yolov7.pt', map_location=device)
model.eval()

# Load video file
# NOTE(review): this path assumes Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook.
video_path = '/content/drive/MyDrive/competition archive/spin dataset 1 (360p)/2019COC_AG_SP_1.mp4'
cap = cv2.VideoCapture(video_path)
if not cap.isOpened():
    # Fail loudly instead of silently writing an empty output video.
    raise IOError(f"Could not open video: {video_path}")

# Prepare for saving the output video
fourcc = cv2.VideoWriter_fourcc(*'mp4v')
output_path = '/content/output_video_with_pose.mp4'
frame_size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)), int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))
# Reuse the source frame rate so playback speed is preserved; fall back to the
# previous fixed 20 fps when the container does not report one (get() -> 0).
fps = cap.get(cv2.CAP_PROP_FPS) or 20.0
out = cv2.VideoWriter(output_path, fourcc, fps, frame_size)

# COCO class index of "person" in the stock yolov7.pt checkpoint. Pose
# estimation is only meaningful on people, so other detections are skipped.
_PERSON_CLASS_ID = 0

# Pairs of MediaPipe Pose landmark indices to join with a line.
# Index key: 11/12 shoulders, 13/14 elbows, 15/16 wrists, 23/24 hips,
# 25/26 knees, 27/28 ankles, 29/30 heels, 31/32 foot index (odd = left,
# even = right).
_POSE_CONNECTIONS = (
    (11, 13), (13, 15),                       # Left arm
    (12, 14), (14, 16),                       # Right arm
    (11, 12), (11, 23), (12, 24), (23, 24),   # Torso
    (23, 25), (25, 27),                       # Left leg
    (24, 26), (26, 28),                       # Right leg
    (27, 29), (29, 31), (27, 31),             # Left foot
    (28, 30), (30, 32), (28, 32),             # Right foot
)


def _letterbox_for_yolo(frame, input_size=640, pad_value=114):
    """Turn a BGR frame into a letterboxed (1, 3, S, S) float32 RGB tensor on ``device``."""
    src_h, src_w = frame.shape[:2]

    # Aspect-preserving resize: the limiting side exactly fills the square.
    gain = min(input_size / src_h, input_size / src_w)
    new_w = max(1, int(round(src_w * gain)))
    new_h = max(1, int(round(src_h * gain)))
    resized = cv2.resize(frame, (new_w, new_h))

    # Centre the content; the +/-0.1 nudges split an odd remainder as (k, k + 1).
    pad_w = (input_size - new_w) / 2
    pad_h = (input_size - new_h) / 2
    top, bottom = int(round(pad_h - 0.1)), int(round(pad_h + 0.1))
    left, right = int(round(pad_w - 0.1)), int(round(pad_w + 0.1))
    boxed = cv2.copyMakeBorder(
        resized, top, bottom, left, right,
        cv2.BORDER_CONSTANT, value=(pad_value, pad_value, pad_value),
    )

    rgb = boxed[..., ::-1] / 255.0  # BGR -> RGB, normalize to [0, 1]
    chw = np.transpose(rgb, (2, 0, 1))  # Change to (C, H, W)
    batch = np.expand_dims(chw, 0)  # Add batch dimension
    return torch.tensor(batch, dtype=torch.float32).to(device)


# Function to process each frame
def process_frame(frame):
    """Detect people with YOLOv7 and draw MediaPipe pose skeletons on them.

    Args:
        frame: H x W x 3 uint8 BGR image (as returned by ``cap.read()``).
            It is annotated in place.

    Returns:
        The same ``frame`` array, with keypoints (red dots) and skeleton lines
        (green) drawn inside every person detection that yielded landmarks.
    """
    # Preprocess the frame for YOLOv7. The input is letterboxed (not stretched)
    # because scale_coords below undoes a centred letterbox when mapping boxes
    # back to the original frame.
    img = _letterbox_for_yolo(frame)

    # Run YOLOv7 inference
    with torch.no_grad():
        preds = model(img)[0]
        preds = non_max_suppression(preds, conf_thres=0.5, iou_thres=0.4)

    frame_h, frame_w = frame.shape[:2]

    # Process YOLOv7 detections
    for det in preds:
        if not len(det):
            continue

        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], frame.shape).round()  # Rescale coordinates
        for *xyxy, conf, cls in det:
            if int(cls) != _PERSON_CLASS_ID:
                continue

            x1, y1, x2, y2 = map(int, xyxy)

            # Ensure coordinates are within the frame boundaries
            x1, y1 = max(0, x1), max(0, y1)
            x2, y2 = min(frame_w, x2), min(frame_h, y2)

            # Crop the detected person for pose estimation (a view into frame)
            cropped = frame[y1:y2, x1:x2]

            # Skip degenerate boxes
            if cropped.size == 0:
                continue

            # MediaPipe expects RGB input
            cropped_rgb = cv2.cvtColor(cropped, cv2.COLOR_BGR2RGB)

            # Run Pose Estimation on the cropped image
            results = pose.process(cropped_rgb)

            # Overlay pose landmarks if detected
            if results.pose_landmarks:
                landmarks = results.pose_landmarks.landmark
                h, w, _ = cropped.shape
                # Landmark x/y are normalized to the crop; convert to crop pixels
                keypoints = [(int(landmark.x * w), int(landmark.y * h)) for landmark in landmarks]

                def _inside(point):
                    return 0 <= point[0] < w and 0 <= point[1] < h

                # Draw keypoints that fall inside the crop
                for point in keypoints:
                    if _inside(point):
                        cv2.circle(cropped, point, 5, (0, 0, 255), -1)

                # Draw skeleton connections whose two ends are both inside the crop
                for start_idx, end_idx in _POSE_CONNECTIONS:
                    if start_idx < len(keypoints) and end_idx < len(keypoints):
                        start_point = keypoints[start_idx]
                        end_point = keypoints[end_idx]
                        if _inside(start_point) and _inside(end_point):
                            cv2.line(cropped, start_point, end_point, (0, 255, 0), 2)

            # Write the annotated crop back into the frame
            frame[y1:y2, x1:x2] = cropped

    return frame



try:
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        # Process the frame
        frame_with_pose = process_frame(frame)

        # Save frame to output video
        out.write(frame_with_pose)
finally:
    # Release resources even if processing raises or the cell is interrupted,
    # so the output file is closed properly.
    cap.release()
    out.release()

Fusing layers... 
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block
RepConv.fuse_repvgg_block


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
