In [None]:
# import libraries
import os
import cv2
import torch
import requests
from PIL import Image
from transformers import pipeline
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from rake_nltk import Rake
import nltk

In [2]:
def extract_keyframes(video_path, output_dir, frame_interval):
    """Save every ``frame_interval``-th frame of a video as a JPEG keyframe.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        output_dir: Directory that receives the ``keyframe_<n>.jpg`` files;
            created if it does not exist.
        frame_interval: Positive number; a frame is saved when its zero-based
            index is a multiple of this value.

    Returns:
        Paths of the keyframes written by this call, in the order the frames
        appear in the video. Files already present in ``output_dir`` (for
        example from an earlier run) are not included.

    Raises:
        ValueError: If ``frame_interval`` is zero or negative.
        IOError: If the video cannot be opened.
    """
    if frame_interval <= 0:
        raise ValueError(f"frame_interval must be positive, got {frame_interval}")

    os.makedirs(output_dir, exist_ok = True)
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"could not open video: {video_path}")

    keyframe_paths = []
    frame_count = 0
    keyframe_count = 0

    try:
        while cap.isOpened():
            ret, frame = cap.read()

            if not ret:
                break

            if frame_count % frame_interval == 0:
                keyframe_path = os.path.join(output_dir, f"keyframe_{keyframe_count}.jpg")
                # imwrite signals failure through its return value, so only
                # report files that were actually written
                if cv2.imwrite(keyframe_path, frame):
                    keyframe_paths.append(keyframe_path)
                keyframe_count += 1

            frame_count += 1
    finally:
        # release the capture even if reading or writing raised
        cap.release()

    # listing the directory instead would pick up stale files and lose frame order
    return keyframe_paths

def generate_captions(image_paths, model_name = "Salesforce/blip-image-captioning-base", max_new_tokens = 30):
    """Caption each image with a BLIP image-captioning model.

    Args:
        image_paths: Iterable of image file paths.
        model_name: Identifier passed to ``from_pretrained`` for both the
            processor and the model.
        max_new_tokens: Generation length limit passed to ``model.generate``.

    Returns:
        A list with one decoded caption string per path, in input order; an
        empty list when ``image_paths`` is empty.
    """
    image_paths = list(image_paths)
    if not image_paths:
        # nothing to caption: skip loading the processor and the model
        return []

    processor = BlipProcessor.from_pretrained(model_name)
    model = BlipForConditionalGeneration.from_pretrained(model_name)

    captions = []
    for image_path in image_paths:
        # the RGB copy is all that is needed; leave the file via the context manager
        with Image.open(image_path) as source_image:
            image = source_image.convert("RGB")
        # unconditional image captioning
        inputs = processor(image, return_tensors = "pt")
        output = model.generate(**inputs, max_new_tokens = max_new_tokens)
        decoded_caption = processor.decode(output[0], skip_special_tokens = True)
        captions.append(decoded_caption)

    return captions

def generate_keywords_with_llm(captions):
    """Prompt a GPT-2 text-generation pipeline, driven through LangChain, for keywords.

    The captions are rendered with ``repr`` (a Python list literal), printed,
    and substituted into a fixed prompt. Whatever the chain produces is
    printed and returned as-is.
    """
    # the prompt receives the list-literal representation of the captions
    caption_text = repr(captions)
    print(caption_text)

    # pre-trained gpt2 weights and the matching tokenizer
    # (a larger checkpoint such as "EleutherAI/gpt-j-6B" could be loaded the same way)
    gpt2_model = AutoModelForCausalLM.from_pretrained("gpt2")
    gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # sampling text-generation pipeline, wrapped so langchain can call it
    text_generator = pipeline(
        "text-generation",
        model = gpt2_model,
        tokenizer = gpt2_tokenizer,
        max_new_tokens = 150,
        do_sample = True,
        temperature = 0.9,
    )
    wrapped_llm = HuggingFacePipeline(pipeline = text_generator)

    # fixed instruction text; {captions} is the only placeholder
    keyword_prompt_text = """
Given the following list of captions extracted from video keyframes, extract a set of meaningful and relevant keywords or short phrases from the captions. These keywords should represent the main objects, subjects, and actions described in the captions. The goal is to summarize the most important concepts in a concise list of keywords.

Here are the captions:
{captions}

Extracted Keywords:
"""
    keyword_prompt = PromptTemplate(
        input_variables=["captions"],
        template=keyword_prompt_text
    )

    # tie prompt and model together, then run the chain on the captions
    keyword_chain = LLMChain(prompt = keyword_prompt, llm = wrapped_llm)
    keywords = keyword_chain.run({"captions": caption_text})

    print("extracted keywords:", keywords)
    return keywords

def extract_keywords_tfidf(captions):
    """Rank the vocabulary of the captions by summed TF-IDF weight.

    Args:
        captions: Iterable of caption strings; each caption is one document.

    Returns:
        Vocabulary terms (English stop words removed) ordered from the highest
        to the lowest total TF-IDF score; an empty list when there are no
        captions.
    """
    captions = list(captions)
    if not captions:
        # the vectorizer cannot be fitted on zero documents; no captions, no keywords
        return []

    vectorizer = TfidfVectorizer(stop_words = 'english')
    tfidf_matrix = vectorizer.fit_transform(captions)

    feature_names = vectorizer.get_feature_names_out()
    # column sums give one aggregate score per term; .A1 flattens the result to 1-D
    scores = tfidf_matrix.sum(axis = 0).A1

    # argsort is ascending, so reverse it for best-first order
    sorted_indices = scores.argsort()[::-1]

    return [feature_names[i] for i in sorted_indices]

def extract_keywords_rake(captions):
    """Return RAKE key phrases for the captions, in the ranking order RAKE reports.

    All captions are joined with single spaces into one text before
    extraction; the scores are dropped and only the phrases are returned.
    """
    extractor = Rake()
    extractor.extract_keywords_from_text(" ".join(captions))

    # every ranked entry is a (score, phrase) pair; keep the phrase only
    scored_phrases = extractor.get_ranked_phrases_with_scores()
    return [entry[1] for entry in scored_phrases]

def process_video(video_path, frame_interval = 30, keyframes_dir = "keyframes"):
    """Run the keyframe -> caption -> keyword pipeline on one video.

    Args:
        video_path: Path of the video to analyse.
        frame_interval: Passed to ``extract_keyframes``; every
            ``frame_interval``-th frame becomes a keyframe.
        keyframes_dir: Directory the keyframe images are written to.

    Returns:
        ``(captions, keywords)``: the captions generated for the keyframes and
        the TF-IDF keywords derived from those captions. Both are empty lists
        when no keyframes were extracted.
    """
    print("extracting keyframes.....")
    keyframes = extract_keyframes(video_path, keyframes_dir, frame_interval)

    if not keyframes:
        # nothing to caption or rank, so skip the model-loading steps
        return [], []

    print("generating captions.....")
    captions = generate_captions(keyframes)

    print("generating keywords.....")
    # alternative extractors kept for experimentation:
    # generate_keywords_with_llm(captions)
    # other_keywords = extract_keywords_rake(captions)
    keywords = extract_keywords_tfidf(captions)

    return captions, keywords

# relative path of the clip to analyse
video_path = "video/mayvortexx181100248.mp4"
# runs the full pipeline with the default frame_interval of 30
captions, keywords = process_video(video_path)

# last expression of the cell: displays the (captions, keywords) tuple
captions, keywords

extracting keyframes.....
generating captions.....
generating keywords.....


(['a bunch of flowers on a table',
  'a bunch of flowers in a vase',
  'a bouquet of flowers on a table',
  'beautiful flowers in vase on table',
  'a bouquet of flowers with a blur effect',
  'a bouquet of flowers on a table'],
 ['flowers',
  'table',
  'bouquet',
  'bunch',
  'vase',
  'beautiful',
  'effect',
  'blur'])