In [2]:
import os
from pinecone import Pinecone
import pinecone
from typing import Dict, List
from dotenv import load_dotenv
from backend.video_processing import process_video

load_dotenv()

# Initialize Pinecone
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")  # Replace with your Pinecone API key

pc = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = "groq-video-analyzer"

if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        spec=pinecone.IndexSpec(
            dimension=768,  # Adjust dimension as per embedding
            metric='cosine'  # or 'euclidean' or 'dotproduct' as per your requirement
        )
    )

pinecone_index = pc.Index(INDEX_NAME)


In [3]:
import shutil

# Delete all images in the 'frames' folder
frames_folder = "frames"
if os.path.exists(frames_folder):
    shutil.rmtree(frames_folder)
    print(f"All images in '{frames_folder}' folder have been deleted.")
else:
    print(f"'{frames_folder}' folder does not exist.")

# Recreate the empty 'frames' folder
os.makedirs(frames_folder, exist_ok=True)
print(f"Empty '{frames_folder}' folder has been created.")


pinecone_index.delete(delete_all=True)

All images in 'frames' folder have been deleted.
Empty 'frames' folder has been created.


{}

In [28]:
import cv2
import base64
from groq import Groq
from transformers import AutoModel

video_path = "../videos/paris_short.mp4"
task_id = "123"

cap = cv2.VideoCapture(video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
frame_interval = int(fps * 2)  # every 2 seconds

frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
total_frames = frame_count // frame_interval

current_frame = 0
frame_number = 0

client = Groq()

while True:
    ret, frame = cap.read()
    # Calculate timestamp based on frame number and frame interval
    timestamp = frame_number * frame_interval / fps
    print(f"Timestamp for frame {frame_number}: {timestamp:.2f} seconds")
    if not ret:
        break

    if current_frame % frame_interval == 0:
        # Check if the frame is mainly full same color (like full black or full white)
        frame_number += 1
        if not (cv2.countNonZero(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)) == 0 or cv2.countNonZero(cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)) == frame.size // 3):
            # Resize frame proportionally to 1120 width before saving
            target_width = 1120
            height, width, _ = frame.shape
            aspect_ratio = height / width
            target_height = int(target_width * aspect_ratio)
            resized_frame = cv2.resize(frame, (target_width, target_height))

            # Save resized frame to disk
            frame_filename = f"frames/{task_id}_frame_{frame_number}.jpg"
            os.makedirs(os.path.dirname(frame_filename), exist_ok=True)
            cv2.imwrite(frame_filename, resized_frame)
        else:
            print(f"Skipped frame {frame_number} as it is mainly full same color.")
            continue

        # Read image and encode to base64 and binary
        with open(frame_filename, "rb") as image_file:
            image_data = image_file.read()
            encoded_string = base64.b64encode(image_data).decode('utf-8')
            binary_data = image_data

        # Print encoded string
        print(f"Encoded string for frame {frame_number}:")
        print(encoded_string[:100] + "..." if len(encoded_string) > 100 else encoded_string)

        # Print binary data info
        print(f"Binary data for frame {frame_number}:")
        print(f"Size: {len(binary_data)} bytes")
        print(f"First 20 bytes: {binary_data[:20]}")
        
        print()  # Add a blank line for better readability between frames

        # Call GROQ API to describe image
        completion = client.chat.completions.create(
            model="llama-3.2-11b-vision-preview",
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": "Describe this image in detail in 3-4 sentences. Provide the following details:\n\nA short summary of the scene.\nAssumed location (urban, rural, indoor, outdoor, etc.).\nAssumed time of day.\nAssumed weather conditions.\nNumber of people or animals and their description (clothing, posture, expression).\nDescription of actions being performed.\nVisible objects or buildings.\nType of ground and sky conditions.\nOverall composition of the image."
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{encoded_string}"
                            }
                        }
                    ]
                }
            ],
            temperature=1,
            max_tokens=1024,
            top_p=1,
            stream=False,
            stop=None,
        )
        frame_description = completion.choices[0].message.content
        print(frame_description)

        # generate embedding
        embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
        frame_description_embedding = embedding_model.encode(frame_description).tolist()

        # insert into pinecone
        metadata = {
            "task_id": task_id,
            "timecode": current_frame / fps,
            "timestamp": timestamp,
            "frame_number": frame_number,
            "frame_path": f'frames/{task_id}_frame_{frame_number}.jpg',
            "video_path": '../videos/paris_short.mp4',
            "description": frame_description
        }
        upsert_response = pinecone_index.upsert(
            vectors=[
                {
                    "id": f"{task_id}_frame_{frame_number}",
                    "values": frame_description_embedding,
                    "metadata": metadata
                }
            ]
        )
        print(upsert_response)

        # # Update progress
        # progress = int((frame_number / total_frames) * 100)

    current_frame += 1

cap.release()



Timestamp for frame 0: 0.00 seconds
Skipped frame 1 as it is mainly full same color.
Timestamp for frame 1: 1.97 seconds
Skipped frame 2 as it is mainly full same color.
Timestamp for frame 2: 3.94 seconds
Skipped frame 3 as it is mainly full same color.
Timestamp for frame 3: 5.91 seconds
Skipped frame 4 as it is mainly full same color.
Timestamp for frame 4: 7.87 seconds
Encoded string for frame 5:
/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoK...
Binary data for frame 5:
Size: 33034 bytes
First 20 bytes: b'\xff\xd8\xff\xe0\x00\x10JFIF\x00\x01\x01\x00\x00\x01\x00\x01\x00\x00'

Here is a detailed description of the given image, highlighting the specified characteristics:



**Summary of the Scene**
The image appears to be a silhouette of a person standing in front of various buildings and other structures in what seems to be a park or garden.



**Location**
The location appears to be an urban setting, similar to a park or garden.


In [4]:
from groq import Groq

client = Groq()

completion = client.chat.completions.create(
    model="llama-3.2-11b-vision-preview",
    messages=[
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "Hello"
                }
            ]
        }
    ],
    temperature=1,
    max_tokens=1024,
    top_p=1,
    stream=False,
    stop=None,
    )
frame_description = completion.choices[0].message.content
print(frame_description)

# ... rest of your code ...

How can I assist you today?


In [21]:
from transformers import AutoModel

embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
frame_description = 'The image is an aerial view of a cityscape with the word "PARIS" written in large white letters over the top of it.\n\nIt appears to show a city in the early morning, with a white sun rising just before the cityscape, casting a warm glow on the scene. This suggests an early morning moment.\n\nThe scene appears to be outdoors, showcasing the cityscape, and probably in an urban setting, possibly in Paris.\n\nIn the image, there are no people depicted. The bright glow from the sun suggests an early morning. There is no rain or precipitation. The ground is an urban area, and I suspect this may be some city with a river or riverfront pictured with rows of buildings. The sky is a warm light brown/dark yellow similar to a sunrise with a few clouds in the top corner of the frame.\n\nBackground objects visible include buildings, a river, possibly a bridge, white boats on the water, cars, trees, and flags and a bright golden sun. The sky is either reddish brown due to morning sun or is a clear, sunny sky with the sun peeking above the top right corner of the frame.\n\nWithout taking into account the subject of the word "Paris", the composition appears to be a cityscape with the sun rising over the buildings in the near future. The sun is not in a prominent part of the image or the middle, but still a bright light that highlights the city to be the main subject of the composition.'
frame_description_embedding = embedding_model.encode(frame_description).tolist()

task_id = '123'
frame_number = 1

metadata = {
    "task_id": task_id,
    "timecode": '00:00:00',
    "frame_path": f'frames/{task_id}_frame_{frame_number}.jpg',
    "video_path": '../videos/paris_short.mp4',
    "description": frame_description
}
upsert_response = pinecone_index.upsert(
    vectors=[
        {
            "id": f"{task_id}_frame_{frame_number}",
            "values": frame_description_embedding,
            "metadata": metadata
        }
    ]
)
print(upsert_response)


{'upserted_count': 1}


In [25]:
from transformers import AutoModel

embedding_model = AutoModel.from_pretrained('jinaai/jina-embeddings-v2-base-en', trust_remote_code=True)
user_query = 'eiffel tower'
user_query_embedding = embedding_model.encode(user_query).tolist()

result = pinecone_index.query(
vector=user_query_embedding,
top_k=5, #this is the number of results that are returned
include_values=False,
    include_metadata=True
)
print(result)

{'matches': [{'id': '123_frame_12',
              'metadata': {'description': 'A close-up view of the Eiffel Tower '
                                          'in Paris, France shows its '
                                          'intricate iron latticework, exposed '
                                          'through many open sections, '
                                          'indicating work having been done to '
                                          'repair or maintain it.. At the base '
                                          'of the tower, through trees to the '
                                          'left and the streets of Paris at '
                                          'the bottom of the image, are many '
                                          'tall steel structural elements and '
                                          'curved supports, and a center arch '
                                          'spans the bottom of the tower. In '
                   

In [1]:
!uvicorn main:app --host 0.0.0.0 --port 8000 --reload


In [None]:
uvicorn backend.main:app --host 0.0.0.0 --port 8000 --reload