In [1]:
import os
from pinecone import Pinecone
import pinecone
from typing import Dict, List
from dotenv import load_dotenv
from video_processing import process_video

load_dotenv()

# Initialize Pinecone.
# The API key is read from the PINECONE_API_KEY environment variable (which
# load_dotenv() above can populate from a local .env file); do not paste the
# key into the notebook.
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

pc = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = "groq-video-analyzer"

# Create the index on first use only; later runs reuse the existing one.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        # dimension and metric are arguments of create_index itself; `spec`
        # only describes where/how the index is hosted.
        dimension=768,  # Adjust dimension as per embedding
        metric='cosine',  # or 'euclidean' or 'dotproduct' as per your requirement
        # Hosting location is overridable through the environment; the
        # defaults below are an assumption - adjust to your Pinecone project.
        spec=pinecone.ServerlessSpec(
            cloud=os.getenv("PINECONE_CLOUD", "aws"),
            region=os.getenv("PINECONE_REGION", "us-east-1"),
        ),
    )

index = pc.Index(INDEX_NAME)


  from tqdm.autonotebook import tqdm


In [11]:
import shutil

# Reset the 'frames' working directory: remove any existing copy together
# with its contents, then recreate it empty.
frames_folder = "frames"

if not os.path.exists(frames_folder):
    print(f"'{frames_folder}' folder does not exist.")
else:
    shutil.rmtree(frames_folder)
    print(f"All images in '{frames_folder}' folder have been deleted.")

# exist_ok=True so this call does not raise if the directory is already there.
os.makedirs(frames_folder, exist_ok=True)
print(f"Empty '{frames_folder}' folder has been created.")

All images in 'frames' folder have been deleted.
Empty 'frames' folder has been created.


In [12]:
import cv2
import base64
from groq import Groq

# Sample one frame every ~2 seconds from the video, save it under frames/,
# and ask the Groq vision model to describe it. Embedding + Pinecone upsert
# are still stubbed out (see the commented TODO section inside the loop).
video_path = "../videos/paris_short.mp4"
task_id = "123"

cap = cv2.VideoCapture(video_path)
try:
    # VideoCapture does not raise on a missing/unreadable file; it just
    # reports "not opened", so check explicitly instead of failing later
    # with an unrelated ZeroDivisionError.
    if not cap.isOpened():
        raise OSError(f"Could not open video: {video_path}")

    fps = cap.get(cv2.CAP_PROP_FPS)
    # `not fps > 0` also rejects NaN, which some containers report.
    if not fps > 0:
        raise ValueError(f"Video reports an invalid frame rate ({fps}): {video_path}")

    # Every 2 seconds; clamp to 1 so very low frame rates (< 0.5 fps) do not
    # yield an interval of 0 and a modulo/division by zero.
    frame_interval = max(1, int(fps * 2))

    frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    total_frames = frame_count // frame_interval  # used by the progress TODO below

    current_frame = 0
    frame_number = 0

    client = Groq()

    # Loop-invariant: make sure the output directory exists once, up front.
    os.makedirs("frames", exist_ok=True)

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if current_frame % frame_interval == 0:
            frame_number += 1
            # Save frame to disk
            frame_filename = f"frames/{task_id}_frame_{frame_number}.jpg"
            # imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(frame_filename, frame):
                raise OSError(f"Could not write frame to {frame_filename}")

            # Read the saved JPEG back and encode it to base64
            with open(frame_filename, "rb") as image_file:
                binary_data = image_file.read()
            encoded_string = base64.b64encode(binary_data).decode('utf-8')

            # Print encoded string (truncated to the first 100 characters)
            print(f"Encoded string for frame {frame_number}:")
            print(encoded_string[:100] + "..." if len(encoded_string) > 100 else encoded_string)

            # Print binary data info
            print(f"Binary data for frame {frame_number}:")
            print(f"Size: {len(binary_data)} bytes")
            print(f"First 20 bytes: {binary_data[:20]}")

            print()  # Add a blank line for better readability between frames

            # Call GROQ API to describe image
            completion = client.chat.completions.create(
                model="llama-3.2-11b-vision-preview",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "text",
                                "text": "Describe this image in detail in 3-4 sentences. Provide the following details:\n\nA short summary of the scene.\nAssumed location (urban, rural, indoor, outdoor, etc.).\nAssumed time of day.\nAssumed weather conditions.\nNumber of people or animals and their description (clothing, posture, expression).\nDescription of actions being performed.\nVisible objects or buildings.\nType of ground and sky conditions.\nOverall composition of the image."
                            },
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{encoded_string}"
                                }
                            }
                        ]
                    }
                ],
                temperature=1,
                max_tokens=1024,
                top_p=1,
                stream=False,
                stop=None,
            )

            print(completion.choices[0].message)

            # TODO: the steps below are not wired up yet.
            # # Call GROQ API to generate embedding
            # embedding = await generate_embedding(session, description)

            # # Insert into Pinecone
            # metadata = {
            #     "task_id": task_id,
            #     "timecode": current_frame / fps,
            #     "frame_path": frame_filename,
            #     "video_path": video_path
            # }
            # upsert_response = index.upsert(
            #     vectors=[
            #         {
            #             "id": f"{task_id}_frame_{frame_number}",
            #             "values": embedding,
            #             "metadata": metadata
            #         }
            #     ]
            # )

            # # Update progress
            # progress = int((frame_number / total_frames) * 100)

        current_frame += 1
finally:
    # Always free the capture handle, even if the API call or file I/O fails.
    cap.release()



Encoded string for frame 1:
/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoK...

ChatCompletionMessage(content='Unfortunately, there is no image to analyze in the provided text. Without an image, it is impossible to provide a detailed description of the scene, location, time of day, weather conditions, number of people or animals, their description, any actions being performed, visible objects or buildings, the type of ground and sky conditions, or the overall composition of the image. If you can provide the image, I would be happy to help you with a detailed analysis.', role='assistant', function_call=None, tool_calls=None)
Encoded string for frame 2:
/9j/4AAQSkZJRgABAQAAAQABAAD/2wBDAAIBAQEBAQIBAQECAgICAgQDAgICAgUEBAMEBgUGBgYFBgYGBwkIBgcJBwYGCAsICQoK...

ChatCompletionMessage(content="Unfortunately, I'm a large language model, I don't have the capability to visually access or process images, so I cannot provide a detailed description

In [1]:
# Start uvicorn from the notebook, serving the `app` object of module `main`
# on 0.0.0.0 (all network interfaces), port 8000, with --reload enabled.
# NOTE(review): a server launched in the foreground via `!` presumably keeps
# this cell busy until it is interrupted - confirm, or run it from a terminal.
!uvicorn main:app --host 0.0.0.0 --port 8000 --reload
