In [2]:
pip install opencv-python pytesseract numpy transformers torch


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [4]:

import cv2

def extract_frames(video_path, interval=1):
    """Read a video and return every ``interval``-th frame.

    Args:
        video_path: Source handed unchanged to ``cv2.VideoCapture``.
        interval: Sampling step counted in frames. The first frame is always
            kept, then every ``interval``-th frame after it. Must be positive.

    Returns:
        List of frames exactly as returned by ``VideoCapture.read``. The list
        is empty when the capture is not opened or yields no frames.

    Raises:
        ValueError: If ``interval`` is zero or negative.
    """
    # Reject a bad step up front; ``count % 0`` would otherwise only surface
    # as a ZeroDivisionError after the first frame had already been decoded.
    if interval <= 0:
        raise ValueError(f"interval must be positive, got {interval!r}")

    video = cv2.VideoCapture(video_path)
    frames = []
    try:
        count = 0
        while video.isOpened():
            ret, frame = video.read()
            if not ret:
                # No more frames could be read.
                break
            if count % interval == 0:
                frames.append(frame)
            count += 1
    finally:
        # Free the capture handle even if reading raises part-way through.
        video.release()
    return frames

ImportError: libGL.so.1: cannot open shared object file: No such file or directory

In [None]:
import pytesseract
import cv2
from PIL import Image

def ocr_frame(frame):
    """Run Tesseract OCR on one video frame and return the recognised text.

    The frame goes through a BGR-to-RGB colour conversion and is wrapped in a
    PIL image before being passed to ``pytesseract.image_to_string``.
    """
    rgb_pixels = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return pytesseract.image_to_string(Image.fromarray(rgb_pixels))

In [None]:
import cv2
from PIL import Image

def describe_image(frame, model, feature_extractor, tokenizer):
    """Generate a text caption for one video frame.

    Args:
        frame: Frame to caption; it goes through a BGR-to-RGB conversion
            before being wrapped in a PIL image.
        model: Object whose ``generate`` method receives the pixel values.
        feature_extractor: Callable invoked with ``images=`` and
            ``return_tensors="pt"``; its ``pixel_values`` attribute is used.
        tokenizer: Object whose ``batch_decode`` turns generated ids into text.

    Returns:
        The first decoded string, with special tokens skipped.
    """
    rgb_pixels = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    encoded = feature_extractor(images=Image.fromarray(rgb_pixels), return_tensors="pt")
    token_ids = model.generate(encoded.pixel_values)
    captions = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    return captions[0]

In [None]:

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

def process_video(video_path, interval=1,
                  model_name="nlpconnect/vit-gpt2-image-captioning"):
    """Extract frames from a video and produce OCR text plus a caption for each.

    Args:
        video_path: Video source forwarded to ``extract_frames``.
        interval: Frame sampling step forwarded to ``extract_frames``. The
            default of 1 processes every frame, matching the previous
            behaviour; larger values cut the OCR/captioning work accordingly.
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            the captioning model, image processor and tokenizer.

    Returns:
        A list with one dict per extracted frame, each holding the keys
        ``"ocr_text"`` and ``"description"``.
    """
    frames = extract_frames(video_path, interval=interval)
    if not frames:
        # Nothing to process, so skip loading the captioning model entirely.
        return []

    # Load the captioning model and its pre/post-processing from one checkpoint.
    model = VisionEncoderDecoderModel.from_pretrained(model_name)
    feature_extractor = ViTImageProcessor.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return [
        {
            "ocr_text": ocr_frame(frame),
            "description": describe_image(frame, model, feature_extractor, tokenizer),
        }
        for frame in frames
    ]

In [None]:

def save_to_file(results, output_file):
    """Write OCR text and descriptions to a text file, one entry per result.

    Args:
        results: Iterable of mappings with ``"ocr_text"`` and
            ``"description"`` keys.
        output_file: Path of the file to create or overwrite.

    Each result is written as an ``OCR Text:`` line and a ``Description:``
    line followed by a blank line.
    """
    # Write UTF-8 explicitly: recognised text may contain non-ASCII characters
    # that the platform default encoding (e.g. cp1252) cannot represent.
    with open(output_file, 'w', encoding='utf-8') as f:
        for result in results:
            f.write(f"OCR Text: {result['ocr_text']}\n")
            f.write(f"Description: {result['description']}\n\n")

In [None]:
# Example usage: run the whole pipeline on a sample video and persist the output.
output_file = "video_content.txt"
video_path = "coding_video_sample.mp4"

# Collect OCR text and a caption for each extracted frame, then write them out.
results = process_video(video_path)
save_to_file(results, output_file)

print(f"Video content has been processed and saved to {output_file}")