In [13]:
# Check if the GPU is Detected

import torch
print(torch.cuda.is_available())

False


In [14]:
import os
import cv2
import glob
import json
import torch
import requests
import numpy as np
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms

from PIL import Image
from dotenv import load_dotenv
from google.colab import files
from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline

load_dotenv()

True

In [7]:
video_path = r"input/VID.mp4"
output_dir = r"output/frames"

os.makedirs(output_dir, exist_ok=True)

vidcap = cv2.VideoCapture(video_path)
success, image = vidcap.read()
count = 0
frame_skip = 30                                 # Adjust accordingly to control frame extraction rate

while success:
    if count % frame_skip == 0:
        cv2.imwrite(os.path.join(output_dir, f"frame_{count}.jpg"), image)
    success, image = vidcap.read()
    count += 1

print(f"Extracted {len(os.listdir(output_dir))} frames to '{output_dir}'")

Extracted 0 frames to '/content/drive/MyDrive/Video-to-Text Summarization/frames'


In [16]:
# Load pretrained ResNet (remove last fully-connected layer)
resnet = models.resnet50(weights=True)              # You can use resnet18 for faster/slimmer model
resnet.eval()
feature_extractor = torch.nn.Sequential(*list(resnet.children())[:-1])

# Image preprocessing pipeline for ResNet
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225])
])

In [19]:
frame_files = sorted(glob.glob("/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_*.jpg"))
features = []

for path in frame_files:
    image = Image.open(path).convert("RGB")
    input_tensor = preprocess(image).unsqueeze(0)           # Add batch dimension

    with torch.no_grad():
        output = feature_extractor(input_tensor)
        vector = output.squeeze().cpu().numpy()

    features.append(vector)

features = np.array(features)
print(f"Extracted feature vectors for {len(features)} frames, vector shape: {features[0].shape}")

Extracted feature vectors for 200 frames, vector shape: (2048,)


In [23]:
class FrameImportanceLSTM(nn.Module):
    def __init__(self, input_size=2048, hidden_size=256, num_layers=2):
        super(FrameImportanceLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size*2, 1)

    def forward(self, x):
        h, _ = self.lstm(x)
        scores = torch.sigmoid(self.fc(h)).squeeze(-1)
        return scores

# Instantiate the model
model = FrameImportanceLSTM()
model.eval()

features_tensor = torch.tensor(features).unsqueeze(0).float()
with torch.no_grad():
    importance_scores = model(features_tensor).squeeze().cpu().numpy()

# Select key frames above a threshold
import numpy as np
threshold = 0.5
key_indices = np.where(importance_scores > threshold)[0]
key_frames = [frame_files[i] for i in key_indices]
print("Selected key frames for summary:", key_frames)

Selected key frames for summary: ['/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_0.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1020.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1050.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1080.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1110.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1140.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1170.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_120.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1200.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1230.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1260.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames/frame_1290.jpg', '/content/drive/MyDrive/Video-to-Text Summarization/frames

In [27]:
# Download pretrained BLIP model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
caption_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")


In [28]:
def caption_frame(image_path, processor, model):
    raw_image = Image.open(image_path).convert('RGB')
    inputs = processor(raw_image, return_tensors="pt")
    with torch.no_grad():
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)
    return caption

# Generate and print summaries
video_summary = []
for idx, frame_path in enumerate(key_frames):
    caption = caption_frame(frame_path, processor, caption_model)
    video_summary.append(f"Scene {idx+1}: {caption}")
    print(f"Scene {idx+1}: {caption}")


Scene 1: opening and closing of the screen
Scene 2: a plant with a stem and leaves on it
Scene 3: opening closing of stomata a large number of sponges in the stomata
Scene 4: opening and closing of stomata
Scene 5: opening and closing of smta
Scene 6: a large green leaf with a small green leaf on it
Scene 7: a large green leaf with a small green leaf with a small green leaf with a small green leaf with
Scene 8: opening and closing of stomata
Scene 9: a large, green leaf with a small leaf inside it is labeled to the center of the plant
Scene 10: a large, long - lived cell that has been to be used for the use of the cell
Scene 11: opening and closing of stomata
Scene 12: opening and closing of stomata
Scene 13: opening and closing of stomata
Scene 14: a large, long - lived cell that has been to be used for the use of the cell
Scene 15: opening clogi of smta a large number of smta cells in the leaf
Scene 16: opening closing of stomata
Scene 17: opening and closing of stomata
Scene 18: ope

In [29]:
with open("/content/drive/MyDrive/Video-to-Text Summarization/video_summary.txt", "w") as f:
    for line in video_summary:
        f.write(line + "\n")

In [31]:
summarizer = pipeline("summarization", model="t5-small")

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


In [12]:
f = open("/content/drive/MyDrive/Video-to-Text Summarization/video_summary.txt", "r")
summary_raw = f.read()

response = requests.post(
    os.getenv(("API_URL")),
    headers = {
        "Authorization":f"Bearer {os.getenv("API_KEY")}",
    },
    data = json.dumps({
        "model":"tngtech/deepseek-r1t2-chimera:free",
        "messages":[{
            "role":"user",
            "content": f'''
            Rewrite and summarize this scientific video breakdown for a general audience. Only give the summary, no other intro or outro.:

            {summary_raw}''',
      }]
    })
)

print(response.json())

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Video-to-Text Summarization/video_summary.txt'

In [9]:
print(response.json()['choices'][0]['message']['content'])

AI summary:
 
This video explains how plants regulate gas exchange and water through tiny leaf pores called stomata. Stomata continually open and close, controlled by surrounding guard cells which swell or shrink due to water pressure changes. When guard cells fill with water, the stomata open, allowing carbon dioxide in for photosynthesis while releasing oxygen and water vapor. The thick-walled guard cells prevent collapse, ensuring efficient operation. The process connects to larger plant functions, including growth, water circulation, and adaptations in both land and aquatic plants. Repeated visuals of stomata, guard cells, and diagrams emphasize this dynamic balance critical to plant survival.
