In [1]:
!pip install torch torchvision transformers decord
!pip install numpy pandas



In [2]:
# Mount Google Drive under /content/drive (Colab only); the project folder
# entered in the next cell lives on that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Switch to the project folder on Drive; the relative paths configured below
# (CSV, video segments, output directory) are resolved against this directory.
%cd /content/drive/MyDrive/Research/YoutubePortfolio/Benchmarking

/content/drive/MyDrive/Research/YoutubePortfolio/Benchmarking


In [4]:
import pandas as pd
import os
import numpy as np
from transformers import LlavaNextVideoProcessor, LlavaNextVideoForConditionalGeneration
import torch
from decord import VideoReader, cpu
from torchvision import transforms
from decord import VideoReader, cpu
from torchvision.transforms import Resize, ToPILImage, Compose
from PIL import Image

In [7]:
# --- Input / output locations (relative to the current working directory) ---
data_path = "./video_transcriptions.csv"
video_segments_path = "./video_segments"
output_dir = "./model_outputs_video_segments"

# --- Frame sampling ---
frame_rate = 0.25     # frames sampled per second of video (i.e. one frame every 4 s)
max_frame_size = 512  # pixel length the longer side of each frame is resized to

# --- Checkpoint selection: keep exactly one `model_id` assignment active ---
# (original note here read "This works"; it is unclear which checkpoint it refers to)
# model_id = "llava-hf/LLaVA-NeXT-Video-7B-hf"
# model_id = "llava-hf/LLaVA-NeXT-Video-7B-32K-hf"
# model_id = "llava-hf/LLaVA-NeXT-Video-7B-DPO-hf"
model_id = "llava-hf/LLaVA-NeXT-Video-34B-hf"
# model_id = "llava-hf/LLaVA-NeXT-Video-34B-DPO-hf"

In [8]:
def load_llava_next_model():
    """
    Load the LLaVA-NeXT-Video processor and model for the checkpoint named by the global `model_id`.

    The model weights are loaded in float16 with `low_cpu_mem_usage=True` and then moved to device 0.

    Returns:
        tuple: `(processor, model)`.
    """
    checkpoint = model_id

    # The processor is loaded first, exactly as before, so a bad checkpoint name fails
    # before the (much larger) model weights are touched.
    processor = LlavaNextVideoProcessor.from_pretrained(checkpoint)

    load_options = {
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": True,
    }
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(checkpoint, **load_options)
    model = model.to(0)

    return processor, model

def query_llava_next(processor, model, prompt, input_frames):
    """
    Run one generation pass of the model on a prompt plus a stack of video frames.

    Args:
        processor: Callable that encodes `text`/`videos` into model inputs and provides `decode`.
        model: Object exposing `device` and `generate`.
        prompt (str): Fully formatted prompt text.
        input_frames: Array-like with a `shape` attribute; its first axis is the frame count.

    Returns:
        str: The first generated sequence, decoded with special tokens skipped.

    Raises:
        ValueError: If `input_frames` is None or contains zero frames.
    """
    has_frames = input_frames is not None and input_frames.shape[0] != 0
    if not has_frames:
        raise ValueError("No frames provided for processing.")

    encoded = processor(
        text=prompt,
        videos=input_frames,
        padding=True,
        return_tensors="pt",
    )
    encoded = encoded.to(model.device)

    generated = model.generate(**encoded, max_new_tokens=2048)
    first_sequence = generated[0]
    return processor.decode(first_sequence, skip_special_tokens=True)

In [9]:
def create_prompt(series, lm_type = "lm", whole = True, price = False):
    """
    Build the stock-recommendation analysis prompt for one row of the dataset.

    The prompt asks the model to state whether stock recommendations are present and, if so,
    to return them as JSON with an action, a justification, a conviction score ("1" to "3")
    and a ticker name.

    Args:
        series (pd.Series): Row providing 'video_title', 'transcript' and 'segment_transcript'.
                            All three keys are read unconditionally, whichever prompt variant is built.
        lm_type (str, optional): "lm" (default) for a text-only model or "vlm" for a video-language model.
                                 "vlm" adds a sentence saying the video accompanies the prompt and adds
                                 facial-expression cues to the conviction criteria.
        whole (bool, optional): When True (default) and lm_type is "lm" or "vlm", the prompt contains the
                                video title and the full transcript, plus title/transcript consistency
                                criteria. In every other case only the segment transcript is included.
        price (bool, optional): Accepted but not referenced anywhere in this function.

    Returns:
        str: The formatted prompt.
    """

    # Fields pulled from the row; all three lookups happen regardless of `lm_type` / `whole`.

    title = series['video_title']
    transcript = series['transcript']
    segment_transcript = series['segment_transcript']

    # The "video is provided along" part of the prompt is taken from HourVideo:
    # https://huggingface.co/datasets/HourVideo/HourVideo/blob/main/prompts/baseline_evaluations/gemini-1.5-pro/qa_eval.yaml
    # (one-element tuple, indexed as video_statement[0] in the template below)
    if lm_type == "vlm":
        video_statement = ("\nThe video is provided along with this prompt.",)
    else:
        video_statement = ("",)

    # Wording inserted between "YouTube " and "transcript" in the opening sentence.
    if lm_type == "vlm" and whole == True:
        yt_video_statement = "video, "
    elif lm_type == "vlm" and whole == False:
        yt_video_statement = "video and "
    else:
        yt_video_statement = ""

    # One facial-expression cue per conviction level (index 0 -> "1", 1 -> "2", 2 -> "3");
    # empty strings for non-VLM prompts so the template lines are left unchanged.
    if lm_type == "vlm":
        facial_expression = ("\n           - Facial Expressions: Neutral or doubtful (furrowed brows, pursed lips).",
                             "\n           - Facial Expressions: Moderate enthusiasm (mild smiles, slightly raised eyebrows).",
                             "\n           - Facial Expressions: Enthusiastic, energetic (wide smiles, raised eyebrows).")
    else:
        facial_expression = ("",
                             "",
                             "")

    # `whole_transcript_specific` is a 5-tuple:
    #   [0] suffix for the opening sentence, [1] the "Inputs:" block,
    #   [2], [3], [4] consistency criteria for conviction levels "1", "2", "3".
    # The "lm" variant differs from the "vlm" one only in the suffix (no leading comma) and in
    # the "Inputs:" lines being indented by two spaces instead of four.
    if whole == True and lm_type == "vlm":
        whole_transcript_specific = (", and video title",
                            f"""Inputs:
    - Video Title: {title}
    - Transcript: {transcript}""",
                            "\n           - Consistency: Low conviction if the title makes a bold claim, but the transcript lacks matching conviction.",
                            "\n           - Consistency: Medium conviction if the title makes a bold claim, followed by consistent confidence in the transcript.",
                            "\n           - Consistency: High conviction if the title and transcript are strongly aligned.")
    elif whole == True and lm_type == "lm":
      whole_transcript_specific = (" and video title",
                          f"""Inputs:
  - Video Title: {title}
  - Transcript: {transcript}""",
                          "\n           - Consistency: Low conviction if the title makes a bold claim, but the transcript lacks matching conviction.",
                          "\n           - Consistency: Medium conviction if the title makes a bold claim, followed by consistent confidence in the transcript.",
                          "\n           - Consistency: High conviction if the title and transcript are strongly aligned.")


    else:
        # Segment-level prompt (also reached when lm_type is neither "lm" nor "vlm").
        whole_transcript_specific = ("",
                            f"""Inputs:
    - Transcript: {segment_transcript}""",
                             "",
                             "",
                            "")

    # VLM part of this prompt ("video provided along with this prompt") was inspired by HourVideo:
    # https://huggingface.co/datasets/HourVideo/HourVideo/blob/main/prompts/baseline_evaluations/gemini-1.5-pro/qa_eval.yaml
    # In the template, `{{` and `}}` are escaped braces that render as literal `{` and `}`.

    prompt = f"""Analyze the YouTube {yt_video_statement}transcript{whole_transcript_specific[0]} of influencers discussing the US stock market, focusing on stock recommendations and their conviction.{video_statement[0]}

    {whole_transcript_specific[1]}

    Instructions:
    1. Does the video contain any stock recommendations:
       - Label this as `Stock Recommendations Present` with either "Yes" or "No".

    2. If `Stock Recommendations Present` is "Yes", create a list under the key `Recommendations`. Each recommendation should follow this structure:{{"Action": "Buy | Hold | Don't Buy | Sell | Short Sell | Unclear",
         "Justification": "Brief explanation for the action based on the transcript",
         "Conviction Score": "1 | 2 | 3",
         "Ticker Name": "Ticker name"}}

       Details for each field:
       - `Action`: Categorize each stock recommendation as:
         - "Buy": Purchase shares of the stock.
         - "Hold":  Retain the stock if already owned, without necessarily
    buying more.
         - "Don't Buy": Refrain from purchasing the stock.
         - "Sell": Sell shares of the stock currently owned.
         - "Short Sell": Sell shares not currently owned, intending to
    buy them back later at a lower price.
         - "Unclear": When the action is not explicitly stated.
       - `Justification`: Provide a brief explanation for the action based on the transcript.
       - `Conviction Score`: Assign a score based on the following criteria:
         - "1" (Low Conviction):
           - Tone: Hesitant or uncertain language, frequent qualifiers (e.g., “maybe,” “possibly”).{facial_expression[0]}
           - Delivery: Reserved or doubtful language.{whole_transcript_specific[2]}
         - "2" (Moderate Conviction):
           - Tone: Relatively confident language with some qualifiers.{facial_expression[1]}
           - Delivery: Balanced and moderately positive language.{whole_transcript_specific[3]}
         - "3" (High Conviction):
           - Tone: Strong, assertive language without hesitation.{facial_expression[2]}
           - Delivery: Decisive recommendations with no qualifiers.{whole_transcript_specific[4]}
       - `Ticker Name`: Specify the ticker name of the stock being discussed.

    3. If `Stock Recommendations Present` is "No", return the following structure:{{"Stock Recommendations Present": "No",
         "Recommendations": []
       }}

    Output Requirements:
    - Return only valid JSON that can be directly parsed by JSON libraries.
    - Do not include any additional text, comments, formatting indicators (e.g., `json` or backticks), or explanatory content.
    """

    return prompt

In [10]:
def load_video(video_path, sampling_fps, size=512, verbose=True):
    """
    Decode a video, sample frames at a fixed rate, and resize them.

    Every `int(native_fps / sampling_fps)`-th decoded frame is kept, starting at frame 0. The
    stride is clamped to at least 1, so asking for more frames per second than the video has
    simply returns every frame instead of failing with a zero step. Each kept frame is rescaled
    with bicubic interpolation so that its longer side equals `size` (smaller frames are
    therefore upscaled) while the aspect ratio is preserved.

    Args:
        video_path (str): Path to the video file.
        sampling_fps (float): Frames to sample per second of video; must be > 0.
        size (int): Target length in pixels of the longer side after resizing; must be > 0.
        verbose (bool): Whether to print debug information.

    Returns:
        np.ndarray: Sampled, resized RGB frames of shape (num_frames, height, width, 3).
                    An empty array of shape (0, 0, 0, 3) is returned when the video has no frames.

    Raises:
        ValueError: If `sampling_fps` or `size` is not positive.
    """
    # Imported once per call rather than once per frame.
    from PIL import Image

    if sampling_fps <= 0:
        raise ValueError(f"sampling_fps must be positive, got {sampling_fps}")
    if size <= 0:
        raise ValueError(f"size must be positive, got {size}")

    # Initialize the video reader
    video_reader = VideoReader(video_path, ctx=cpu(0))
    video_length = len(video_reader)

    # Get the frames per second (FPS)
    fps = video_reader.get_avg_fps()

    # Determine the indices of frames to sample. Without the clamp, a sampling rate above the
    # native FPS truncates the stride to 0 and np.arange raises ZeroDivisionError.
    stride = max(1, int(fps / sampling_fps))
    frame_indices = np.arange(0, video_length, stride)
    if verbose:
        print(f'> Reading video: {video_path}')
        print(f'Stats => fps: {fps}, #frames: {video_length}, sampling fps: {sampling_fps}, #sampled_frames: {len(frame_indices)}')

    # Nothing to decode: return an empty stack so callers can test `len(frames) == 0`.
    if len(frame_indices) == 0:
        return np.empty((0, 0, 0, 3), dtype=np.uint8)

    # Extract raw frames as numpy arrays
    raw_sample_frms = video_reader.get_batch(frame_indices).asnumpy()  # Shape: (Batch, Height, Width, Channels)
    if verbose:
        print(f'Raw frames shape before resizing: {raw_sample_frms.shape}')

    def resize_frame(frame, target_size):
        """Scale one frame so its longer side equals `target_size`, keeping the aspect ratio."""
        image = Image.fromarray(frame)
        width, height = image.size

        # The shorter side is clamped to 1 px so extreme aspect ratios cannot round down to 0.
        if width > height:
            new_width = target_size
            new_height = max(1, int((target_size / width) * height))
        else:
            new_height = target_size
            new_width = max(1, int((target_size / height) * width))

        resized_image = image.resize((new_width, new_height), Image.BICUBIC)
        return np.array(resized_image)

    # Resize all sampled frames
    processed_frames = np.array([resize_frame(frame, size) for frame in raw_sample_frms])
    if verbose:
        print(f'Processed frames shape after resizing: {processed_frames.shape}')

    return processed_frames

In [11]:
def save_response(video_id, start, end, response, output_dir):
    """
    Persist one model response as `<output_dir>/<video_id>__<start>__<end>.txt`.

    The output directory is created if it does not exist yet; an existing file with the
    same name is overwritten.
    """
    # Create the destination folder on first use; a pre-existing folder is fine.
    os.makedirs(output_dir, exist_ok=True)

    target = os.path.join(output_dir, f"{video_id}__{start}__{end}.txt")
    with open(target, "w") as handle:
        handle.write(response)

def load_existing_response(video_id, start, end, output_dir):
    """
    Return the previously saved response for a segment, or None when there is none.

    The file looked up is `<output_dir>/<video_id>__<start>__<end>.txt`.
    """
    cached = os.path.join(output_dir, f"{video_id}__{start}__{end}.txt")

    # Guard clause: no cached file means the segment has not been processed yet.
    if not os.path.exists(cached):
        return None

    with open(cached, "r") as handle:
        return handle.read()

In [12]:
def process_video_row(row, processor, model, frame_rate, max_frame_size, video_segments_path, output_dir,
                      max_segment_seconds=150):
    """
    Run the model on one video segment and return its response (or a status string).

    A cached response in `output_dir` is reused when present and non-empty. Otherwise the
    segment's frames are loaded, the segment-level VLM prompt is built, the model is queried,
    and the response is written to `output_dir`.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with 'video_id', 'start', 'end' and the
             fields `create_prompt` reads.
        processor: Processor used for the chat template, encoding and decoding.
        model: Generation model passed through to `query_llava_next`.
        frame_rate (float): Frames sampled per second of video.
        max_frame_size (int): Target size of the longer frame side, in pixels.
        video_segments_path (str): Directory containing `<video_id>__<start>__<end>.mp4` files.
        output_dir (str): Directory where responses are cached as text files.
        max_segment_seconds (float, optional): Segments whose `end - start` exceeds this value
            are skipped. Defaults to 150, the threshold this function has always applied.

    Returns:
        str: The model response, the cached response, or one of the status markers
             "VideoTooLong", "NoVideoFile", "NoFrames", "ErrorWhileProcessing".
    """
    video_id = row['video_id']
    start = row['start']
    end = row['end']
    segment_name = f"{video_id}__{start}__{end}"

    # Check if output already exists (an empty cached file is treated as missing).
    existing_response = load_existing_response(video_id, start, end, output_dir)
    if existing_response:
        print(f"Skipping row as output already exists: {segment_name}")
        return existing_response

    # Skip overly long segments. The message reports the threshold actually applied; it used
    # to say "4 minutes" while the check was 150 seconds.
    if (end - start) > max_segment_seconds:
        print(f"Skipping row due to segment duration exceeding {max_segment_seconds} seconds: {segment_name}")
        return "VideoTooLong"

    video_file_path = os.path.join(video_segments_path, f"{segment_name}.mp4")
    if not os.path.exists(video_file_path):
        print(f"Video file not found: {video_file_path}")
        return "NoVideoFile"

    try:
        # Extract frames
        frames = load_video(video_file_path, sampling_fps=frame_rate, size=max_frame_size, verbose=True)
        if frames is None or len(frames) == 0:
            print(f"No frames extracted for video: {video_file_path}")
            return "NoFrames"

        # Whole needs to be changed to True for Full Videos (there might be one or two videos we can run)
        base_prompt = create_prompt(row, lm_type="vlm", whole=False, price=False)

        # VLM specific: wrap the text prompt and a video placeholder in a single user turn.
        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": base_prompt},
                    {"type": "video"},
                ],
            },
        ]
        # add_generation_prompt=True appends the assistant turn marker so the model answers
        # the user turn instead of continuing it.
        prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

        # Query LLaVA-NeXT model
        response = query_llava_next(processor, model, prompt, frames)

        # Save response
        save_response(video_id, start, end, response, output_dir)
        return response

    except Exception as e:
        # Best effort: report the failure and let the caller continue with the next row.
        print(f"Error processing video {segment_name}: {e}")
        return "ErrorWhileProcessing"

In [13]:
def process_videos(df, processor, model, video_segments_path, frame_rate, max_frame_size, output_dir,
                   output_csv_path="llava_next_segment.csv"):
    """
    Run the model over every row of `df` and save the collected responses.

    Each row is handed to `process_video_row`; a row that raises is logged and recorded as
    None so the remaining rows are still processed. The responses are stored in a new
    'llava_next_segment_output' column added to `df` in place, and the frame is written to CSV.

    Args:
        df (pd.DataFrame): One row per video segment. Modified in place (a column is added).
        processor: Processor forwarded to `process_video_row`.
        model: Model forwarded to `process_video_row`.
        video_segments_path (str): Directory containing the segment video files.
        frame_rate (float): Frames sampled per second of video.
        max_frame_size (int): Target size of the longer frame side, in pixels.
        output_dir (str): Directory where per-segment responses are cached.
        output_csv_path (str, optional): Destination of the updated CSV. Defaults to
            "llava_next_segment.csv" in the current working directory, the previous fixed location.

    Returns:
        None
    """
    results = []

    # Iterate over each row
    for index, row in df.iterrows():
        try:
            response = process_video_row(row, processor, model, frame_rate, max_frame_size, video_segments_path, output_dir)
            print(f"Response for row {index}: {response}")
            results.append(response)
        except Exception as e:
            # Keep one entry per row so `results` stays aligned with the DataFrame.
            print(f"Error processing row {index}: {e}")
            results.append(None)

    # Add model outputs to DataFrame
    df['llava_next_segment_output'] = results

    # Save updated DataFrame
    df.to_csv(output_csv_path, index=False)
    print(f"Updated CSV saved to {output_csv_path}")

In [14]:
# Echo the configured transcription CSV path.
data_path

'./video_transcriptions.csv'

In [15]:
# Read the annotation table; only the first row is kept, so a single segment is processed.
df = pd.read_csv(data_path).iloc[:1]

# Instantiate the processor/model pair for the checkpoint selected by `model_id`.
processor, model = load_llava_next_model()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [16]:
# List the columns available in the loaded table.
df.columns

Index(['Unnamed: 0', 'id', 'derived_inner_id', 'video_id', 'start', 'end',
       'action', 'action_source', 'conviction_score', 'ticker_name',
       'action_date', 'price', 'quantity', 'video_title', 'annotation_id',
       'annotator', 'is_rec_present', 'original_inner_id',
       'original_video_title', 'publishedAt', 'channelId', 'channelTitle',
       'videoDescription', 'tags', 'defaultAudioLanguage', 'duration',
       'isCaptionAvailable', 'viewCount', 'likeCount', 'favoriteCount',
       'commentCount', 'comments', 'channelDescription', 'channelViewCount',
       'channelSubscriberCount', 'videoCount', 'channelCategory', 'transcript',
       'segment_transcript'],
      dtype='object')

In [17]:
# Echo the directory where per-segment responses are cached.
output_dir

'./model_outputs_video_segments'

In [18]:
# Run the pipeline over `df`: one model query per row, with responses cached in `output_dir`
# and the combined results written to CSV by `process_videos`.
process_videos(
    df=df,
    processor=processor,
    model=model,
    video_segments_path=video_segments_path,
    frame_rate=frame_rate,
    max_frame_size=max_frame_size,
    output_dir=output_dir,
)

> Reading video: ./video_segments/0CJU8R4oNFk__109.87117723384252__158.5389470152761.mp4
Stats => fps: 23.976023976023978, #frames: 1154, sampling fps: 0.25, #sampled_frames: 13
Raw frames shape before resizing: (13, 720, 1280, 3)
Processed frames shape after resizing: (13, 288, 512, 3)
Response for row 0: USER: 
Analyze the YouTube video and transcript of influencers discussing the US stock market, focusing on stock recommendations and their conviction.
The video is provided along with this prompt.
    
    Inputs:
    - Transcript:  these in a minute. First up is $225 million dollar Veritone Inc., ticker V-E-R-I, a cloud-based AI platform that structures audio and video data. And now Nation, if that sounds like a bunch of tech jargon, just understand that Veritone is in the convergence of what's going to be the three biggest trends over the next decade. Data analysis of audio and video content, cloud-based connectivity, and an AI platform that learns to become more effective. And the

In [19]:
# Snapshot the installed package versions into requirements.txt (current working directory).
!pip freeze > requirements.txt