In [1]:
!pip install ultralytics
!pip install tensorflow

Collecting ultralytics
  Downloading ultralytics-8.3.170-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8.0->ultralytics)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8.0->ultralytics)
  Downloading n

In [None]:
# ================================================
# 🥊 Player Distance Estimator (with shot scaling)
# ================================================

import cv2
import numpy as np
from ultralytics import YOLO
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array
import os

# Load YOLOv8 Player Detection model
player_model = YOLO("playerIdentificationModel.pt")

# Load Camera Shot Type Classification model
shot_model = load_model("final_camera_model_finetuned.keras")
label_map = ['closeup', 'long', 'medium']  # Make sure this matches your model

# Scaling factors based on camera shot type
scaling_factors = {
    "closeup": 0.05,   # feet per pixel
    "medium": 0.15,
    "long": 0.30
}

# Helper: Predict shot type
def predict_shot_type(frame, img_size=224):
    img = cv2.resize(frame, (img_size, img_size))
    img = img_to_array(img) / 255.0
    img = np.expand_dims(img, axis=0)
    pred = shot_model.predict(img, verbose=0)[0]
    label = label_map[np.argmax(pred)]
    return label

# Helper: Estimate distance between 2 players
def estimate_player_distance(frame):
    results = player_model(frame)
    boxes = results[0].boxes.xyxy.cpu().numpy()

    if len(boxes) != 2:
        return None, "❌ Frame skipped (need exactly 2 players)"

    # Get centroids and average height
    centroids = []
    heights = []

    for box in boxes:
        x1, y1, x2, y2 = box
        cx = (x1 + x2) / 2
        cy = (y1 + y2) / 2
        h = y2 - y1
        centroids.append((cx, cy))
        heights.append(h)

    # Pixel distance between centroids
    d_pixels = np.linalg.norm(np.array(centroids[0]) - np.array(centroids[1]))

    # Average height (to optionally normalize)
    avg_height = np.mean(heights)

    # Predict shot type and apply scaling
    shot_type = predict_shot_type(frame)
    scale = scaling_factors.get(shot_type, 0.15)  # default to medium

    # Estimate real-world distance in feet
    distance_feet = d_pixels * scale

    return round(distance_feet, 2), shot_type


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


  saveable.load_own_variables(weights_store.get(inner_path))


In [None]:
import cv2
import numpy as np
from ultralytics import YOLO
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array
import os

# Load models
player_model = YOLO("playerIdentificationModel.pt")
shot_model = load_model("final_camera_model_finetuned.keras")
label_map = ['closeup', 'long', 'medium']  # match your model output
scaling_factors = {
    "closeup": 0.05,
    "medium": 0.15,
    "long": 0.30
}

# Predict shot type
def predict_shot_type(frame, img_size=224):
    img = cv2.resize(frame, (img_size, img_size))
    img = img_to_array(img) / 255.0
    img = np.expand_dims(img, axis=0)
    pred = shot_model.predict(img, verbose=0)[0]
    return label_map[np.argmax(pred)]

# Draw boxes and estimate distance
def annotate_frame(frame):
    results = player_model(frame)
    boxes = results[0].boxes.xyxy.cpu().numpy()

    if len(boxes) != 2:
        return frame  # Skip frame if not exactly 2 players

    centroids = []
    heights = []

    for box in boxes:
        x1, y1, x2, y2 = map(int, box)
        cx = (x1 + x2) // 2
        cy = (y1 + y2) // 2
        centroids.append((cx, cy))
        heights.append(y2 - y1)

        # Draw player bounding box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Compute pixel distance
    d_pixels = int(np.linalg.norm(np.array(centroids[0]) - np.array(centroids[1])))
    avg_height = np.mean(heights)

    # Get camera shot type and scale
    shot_type = predict_shot_type(frame)
    scale = scaling_factors.get(shot_type, 0.15)
    real_distance = round(d_pixels * scale, 2)

    # Draw line and distance
    cv2.line(frame, centroids[0], centroids[1], (255, 0, 0), 2)
    mid_point = ((centroids[0][0] + centroids[1][0]) // 2,
                 (centroids[0][1] + centroids[1][1]) // 2)
    cv2.putText(frame, f"{real_distance} ft ({shot_type})", mid_point,
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    return frame

# 🎞️ Analyze a full video
def analyze_video_with_distance(input_path, output_path="output_with_distance.mp4"):
    cap = cv2.VideoCapture(input_path)
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out    = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"📹 Processing {frame_count} frames...")

    frame_num = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_num += 1
        annotated_frame = annotate_frame(frame)
        out.write(annotated_frame)

        if frame_num % 20 == 0:
            print(f"✅ Processed {frame_num}/{frame_count} frames")

    cap.release()
    out.release()
    print(f"🎉 Done! Output saved to: {output_path}")


In [None]:
analyze_video_with_distance("test_video.mp4")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

0: 384x640 2 fighters, 122.3ms
Speed: 2.0ms preprocess, 122.3ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 127.7ms
Speed: 2.0ms preprocess, 127.7ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)
✅ Processed 640/2277 frames

0: 384x640 2 fighters, 134.7ms
Speed: 2.2ms preprocess, 134.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 123.6ms
Speed: 3.6ms preprocess, 123.6ms inference, 0.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 128.5ms
Speed: 2.6ms preprocess, 128.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 136.8ms
Speed: 2.3ms preprocess, 136.8ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 202.0ms
Speed: 3.1ms preprocess, 202.0ms inference, 3.8ms postprocess per image at shape (1, 3, 384, 64

# With reference Scale

In [4]:
import cv2
import numpy as np
from ultralytics import YOLO
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.image import img_to_array
import os

# Load models
player_model = YOLO("playerIdentificationModel.pt")
shot_model = load_model("final_camera_model_finetuned.keras")
label_map = ['closeup', 'long', 'medium']  # match your model output
scaling_factors = {
    "closeup":  0.595,
    "medium": 1.1,
    "long": 1.8
}

# Predict shot type
def predict_shot_type(frame, img_size=224):
    img = cv2.resize(frame, (img_size, img_size))
    img = img_to_array(img) / 255.0
    img = np.expand_dims(img, axis=0)
    pred = shot_model.predict(img, verbose=0)[0]
    return label_map[np.argmax(pred)]

# Draw boxes and estimate distance with revised logic
def annotate_frame(frame):
    results = player_model(frame)
    boxes = results[0].boxes.xyxy.cpu().numpy()

    if len(boxes) != 2:
        return frame  # Skip frame if not exactly 2 players

    centroids = []
    heights = []

    for box in boxes:
        x1, y1, x2, y2 = map(int, box)
        cx = (x1 + x2) // 2
        cy = (y1 + y2) // 2
        centroids.append((cx, cy))
        heights.append(y2 - y1)

        # Draw player bounding box
        cv2.rectangle(frame, (x1, y1), (x2, y2), (0, 255, 0), 2)

    # Step 1: Calculate pixel distance between centroids
    d_pixels = np.linalg.norm(np.array(centroids[0]) - np.array(centroids[1]))

    # Step 2: Calculate average bounding box height
    avg_height = np.mean(heights)

    # Step 3: Predict camera shot type
    shot_type = predict_shot_type(frame)

    # Step 4: Get scaling factor for the shot type
    scale = scaling_factors.get(shot_type, 0.15)  # default scale if missing

    # Step 5: Convert pixel distance to real-world distance (feet)
    KNOWN_PLAYER_HEIGHT_FEET = 5.83   # Adjust if needed

    real_distance = (d_pixels / avg_height) * KNOWN_PLAYER_HEIGHT_FEET * scale
    real_distance = round(real_distance, 2)

    # Draw line and distance text on the frame
    cv2.line(frame, centroids[0], centroids[1], (255, 0, 0), 2)
    mid_point = ((centroids[0][0] + centroids[1][0]) // 2,
                 (centroids[0][1] + centroids[1][1]) // 2)
    cv2.putText(frame, f"{real_distance} ft ({shot_type})", mid_point,
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0, 255, 255), 2)

    return frame

# 🎞️ Analyze a full video
def analyze_video_with_distance(input_path, output_path="output_with_distance.mp4"):
    cap = cv2.VideoCapture(input_path)
    width  = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fps    = cap.get(cv2.CAP_PROP_FPS)

    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    out    = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    print(f"📹 Processing {frame_count} frames...")

    frame_num = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_num += 1
        annotated_frame = annotate_frame(frame)
        out.write(annotated_frame)

        if frame_num % 20 == 0:
            print(f"✅ Processed {frame_num}/{frame_count} frames")

    cap.release()
    out.release()
    print(f"🎉 Finished processing. Saved to {output_path}")

# Example usage:
analyze_video_with_distance("test_video.mp4", "op3333.mp4")


📹 Processing 307 frames...

0: 384x640 1 fighter, 208.5ms
Speed: 6.5ms preprocess, 208.5ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 186.5ms
Speed: 3.8ms preprocess, 186.5ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 207.5ms
Speed: 3.7ms preprocess, 207.5ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 191.4ms
Speed: 4.0ms preprocess, 191.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 195.2ms
Speed: 4.7ms preprocess, 195.2ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 195.8ms
Speed: 3.9ms preprocess, 195.8ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 143.9ms
Speed: 12.5ms preprocess, 143.9ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 124.6ms
Speed: 4.0ms preprocess, 124.6ms in

# Json OP

In [7]:
def extract_frame_distances(video_path, distance_threshold=3.5):
    cap = cv2.VideoCapture(video_path)
    frame_data = []
    frame_num = 0

    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frame_num += 1

        results = player_model(frame)
        boxes = results[0].boxes.xyxy.cpu().numpy()

        if len(boxes) != 2:
            continue  # Skip if not exactly 2 players

        centroids = []
        heights = []

        for box in boxes:
            x1, y1, x2, y2 = map(int, box)
            cx = (x1 + x2) // 2
            cy = (y1 + y2) // 2
            centroids.append((cx, cy))
            heights.append(y2 - y1)

        d_pixels = np.linalg.norm(np.array(centroids[0]) - np.array(centroids[1]))
        avg_height = np.mean(heights)
        shot_type = predict_shot_type(frame)
        scale = scaling_factors.get(shot_type, 0.15)
        KNOWN_PLAYER_HEIGHT_FEET = 5.83
        real_distance = (d_pixels / avg_height) * KNOWN_PLAYER_HEIGHT_FEET * scale
        real_distance = round(real_distance, 2)

        frame_data.append({
            "frame": int(frame_num),
            "distance": float(real_distance),
            "intense_exchange": bool(real_distance <= distance_threshold)
        })


    cap.release()
    return frame_data


In [8]:
import json

data = extract_frame_distances("test_video.mp4")
print(json.dumps(data[:5], indent=2))  # Print first 5 entries for preview
with open("frame_distances.json", "w") as f:
    json.dump(data, f, indent=2)


0: 384x640 1 fighter, 144.4ms
Speed: 5.9ms preprocess, 144.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 139.9ms
Speed: 4.0ms preprocess, 139.9ms inference, 1.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 134.3ms
Speed: 3.1ms preprocess, 134.3ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 140.0ms
Speed: 6.7ms preprocess, 140.0ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 fighter, 132.9ms
Speed: 4.1ms preprocess, 132.9ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 130.8ms
Speed: 3.8ms preprocess, 130.8ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 131.4ms
Speed: 3.9ms preprocess, 131.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 2 fighters, 134.8ms
Speed: 7.1ms preprocess, 134.8ms inference, 1.0ms postprocess p