In [1]:
import os
# Tell Google's Application Default Credentials lookup which key file to use.
# The path is relative, so it resolves against the kernel's working directory.
os.environ.update({"GOOGLE_APPLICATION_CREDENTIALS": "key.json"})

In [2]:
import google.auth
# Resolve the default credentials and project, then report which identity is
# active: the service-account e-mail when the credentials object exposes one,
# otherwise a fixed fallback message.
credentials, project = google.auth.default()
try:
    active_identity = credentials.service_account_email
except AttributeError:
    active_identity = "Using user credentials"
print(active_identity)

semantic-retriever@gcp-x-mlb-hackathon.iam.gserviceaccount.com


In [3]:
# !pip install google-cloud-videointelligence

In [13]:
from google.cloud import videointelligence

def _offset_seconds(offset, fractional=False):
    """Return a time offset in seconds: whole seconds, or fractional on request."""
    # NOTE(review): `total_seconds()` assumes the client hands back
    # datetime.timedelta offsets -- confirm against the installed client version.
    # The default path only reads `.seconds`, exactly as the code did before.
    return offset.total_seconds() if fractional else offset.seconds


def analyze_video(video_uri, timeout=300, fractional_seconds=False):
    """Run shot-change and label detection on a video and flatten the results.

    Args:
        video_uri: Location of the video, passed to the client as `input_uri`
            (the notebook uses a `gs://` URI).
        timeout: Seconds to wait for the annotation operation to finish.
            Defaults to 300, the value that used to be hard-coded.
        fractional_seconds: When False (default) every offset is reported as
            its whole-second component, dropping any sub-second part. When
            True offsets are reported as floats that keep the sub-second part,
            so adjacent shots no longer appear to have gaps between them.

    Returns:
        A 4-tuple of lists:
            scenes:       (start, end) per detected shot.
            scene_labels: (start, end, description) per segment-level label.
            shot_labels:  (start, end, description, confidence) per shot-level label.
            frame_labels: (time, description, confidence) per frame-level label.
        All four lists are empty when the response carries no annotation results.

    Exceptions raised by the client or while waiting on the operation
    propagate unchanged.
    """
    client = videointelligence.VideoIntelligenceServiceClient()
    features = [
        videointelligence.Feature.SHOT_CHANGE_DETECTION,  # Scene boundaries
        videointelligence.Feature.LABEL_DETECTION,         # Segment-, shot- and frame-level labels
    ]

    operation = client.annotate_video(input_uri=video_uri, features=features)
    result = operation.result(timeout=timeout)

    # Guard against an empty response instead of failing with IndexError on [0].
    if not result.annotation_results:
        return [], [], [], []
    annotations = result.annotation_results[0]

    def to_seconds(offset):
        return _offset_seconds(offset, fractional_seconds)

    # Scene boundaries (shot annotations).
    scenes = [
        (to_seconds(shot.start_time_offset), to_seconds(shot.end_time_offset))
        for shot in annotations.shot_annotations
    ]

    # Labels attached to whole segments (objects/actions).
    scene_labels = [
        (
            to_seconds(segment.segment.start_time_offset),
            to_seconds(segment.segment.end_time_offset),
            label.entity.description,
        )
        for label in annotations.segment_label_annotations
        for segment in label.segments
    ]

    # Shot-level labels (more precise per scene); confidence is the detection
    # confidence reported for that label on that shot.
    shot_labels = [
        (
            to_seconds(shot.segment.start_time_offset),
            to_seconds(shot.segment.end_time_offset),
            label.entity.description,
            shot.confidence,
        )
        for label in annotations.shot_label_annotations
        for shot in label.segments
    ]

    # Frame-level labels for the finest granularity.
    frame_labels = [
        (to_seconds(frame.time_offset), label.entity.description, frame.confidence)
        for label in annotations.frame_label_annotations
        for frame in label.frames
    ]

    return scenes, scene_labels, shot_labels, frame_labels

# Example Usage
# Example usage: annotate one clip stored in Cloud Storage and dump every
# result list, one per line.
video_url = "gs://mlb_hackathon_bucket/videos/video.mp4"
scenes, scene_labels, shot_labels, frame_labels = analyze_video(video_url)

print(
    f"Scenes: {scenes}",
    f"Scene labels: {scene_labels}",
    f"Shot-level Labels: {shot_labels}",
    f"Frame-level Labels: {frame_labels}",
    sep="\n",
)


Scenes: [(0, 4), (5, 12), (12, 15), (15, 18)]
Scene labels: [(0, 18, 'pitcher'), (0, 18, 'stadium'), (0, 18, 'player'), (0, 18, 'crowd'), (0, 18, 'audience'), (0, 18, 'baseball'), (0, 18, 'baseball player'), (0, 18, 'arena'), (0, 18, 'baseball positions'), (0, 18, 'home run'), (0, 18, 'pitch'), (0, 18, 'ball game'), (0, 18, 'bat and ball games'), (0, 18, 'games'), (0, 18, 'team sport'), (0, 18, 'baseball park'), (0, 18, 'baseball field'), (0, 18, 'sports'), (0, 18, 'sport venue')]
Shot-level Labels: [(0, 4, 'pitcher', 0.9644664525985718), (12, 15, 'pitcher', 0.46593257784843445), (0, 4, 'player', 0.9449904561042786), (5, 12, 'player', 0.9447277784347534), (12, 15, 'player', 0.8705179691314697), (15, 18, 'player', 0.873670220375061), (5, 12, 'basketball', 0.5874577164649963), (0, 4, 'baseball umpire', 0.5114926099777222), (0, 4, 'baseball field', 0.9325483441352844), (12, 15, 'baseball field', 0.4231873154640198), (15, 18, 'baseball field', 0.7510219812393188), (0, 4, 'referee', 0.31523