In [1]:
!pip install openai



In [2]:
# Colab-only setup: mount Google Drive and switch into the benchmarking folder
# so the relative paths used by later cells (e.g. './videos', the CSV files)
# resolve against it.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/Research/YoutubePortfolio/Benchmarking

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Research/YoutubePortfolio/Benchmarking


In [3]:
import pandas as pd
import os
import numpy as np
from openai import OpenAI
import os, yaml
import cv2
import base64
import json
from google.colab.patches import cv2_imshow

In [4]:

# Sampling / generation settings shared by the inference cells below.
frame_rate = 0.25  # frames sampled per second of video (one frame every 4 seconds)
max_frame_size = 512  # width and height (pixels) that sampled frames are resized to
temperature = 0.0  # sampling temperature sent with every chat completion
seed = 2025  # seed sent with every chat completion
MAX_ANSWER_TOKENS = 512  # max_tokens cap applied to every model reply

# Never hard-code the key, and never overwrite an already-configured key with an
# empty string: reuse the environment value when present, otherwise prompt for it
# without echoing it into the notebook.
if not os.environ.get('OPENAI_API_KEY'):
    from getpass import getpass
    os.environ['OPENAI_API_KEY'] = getpass('OPENAI_API_KEY: ')
model_id = 'gpt-4o'

In [5]:
def load_gpt_client():
    """
    Creates the OpenAI client used for all requests.

    The client is built with no explicit arguments.
    NOTE(review): presumably it picks up OPENAI_API_KEY from the environment
    set in the configuration cell - confirm against the openai client docs.

    Returns:
        OpenAI: A freshly constructed client instance.
    """
    return OpenAI()

def query_gpt(client, prompt, base64_images, model_id):
    """
    Sends one user message (prompt text followed by the frames) to a chat model.

    Args:
        client: Object exposing `chat.completions.create` (the OpenAI client).
        prompt (str): Prompt text; placed first in the message content.
        base64_images (Iterable[str]): Base64-encoded frames, appended in order.
        model_id (str): Model name forwarded to the API.

    Returns:
        The `message.content` of the first returned choice.

    Note:
        `max_tokens`, `temperature` and `seed` are read from the module-level
        `MAX_ANSWER_TOKENS`, `temperature` and `seed` settings.
    """
    # NOTE(review): the content list mixes a bare string with {"image": ...} dicts.
    # That is not the {"type": "text"} / {"type": "image_url"} content-part schema
    # described in the Chat Completions reference - confirm the API still accepts
    # this shape before re-running the benchmark.
    image_parts = [{"image": encoded} for encoded in base64_images]
    user_message = {
        "role": "user",
        "content": [prompt] + image_parts,
    }
    completion = client.chat.completions.create(
        model=model_id,
        messages=[user_message],
        max_tokens=MAX_ANSWER_TOKENS,
        temperature=temperature,
        seed=seed,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [6]:
def full_length_create_prompt(series, lm_type = "vlm", whole = True, price = False):
    """
    Builds the whole-video prompt (title + full transcript + video) that asks the
    model for stock recommendations and a conviction score, answered as JSON.

    Args:
        series (pd.Series or Mapping): Row providing 'video_title' and 'transcript'.
        lm_type (str, optional): Ignored. Defaults to "vlm". The prompt always
            includes the video / facial-expression wording.
        whole (bool, optional): Ignored. Defaults to True. The prompt always uses
            the title and the full transcript.
        price (bool, optional): Ignored. Defaults to False.

        The three optional arguments are accepted only so this function can be
        called with the same signature as `create_prompt`.

    Returns:
        str: The formatted prompt, ready to be sent to the model.

    Raises:
        KeyError: If 'video_title' or 'transcript' is missing from `series`.
    """

    title = series['video_title']

    transcript = series['transcript']

    # Trailing comma is intentional: the template renders "YouTube video, transcript, ...".
    yt_video_statement = "video,"

    # One facial-expression criterion per conviction level (low, moderate, high).
    facial_expression = ("\n           - Facial Expressions: Neutral or doubtful (furrowed brows, pursed lips).",
                          "\n           - Facial Expressions: Moderate enthusiasm (mild smiles, slightly raised eyebrows).",
                          "\n           - Facial Expressions: Enthusiastic, energetic (wide smiles, raised eyebrows).")

    # [0] intro fragment, [1] inputs section, [2..4] consistency criterion per conviction level.
    whole_transcript_specific = ("and video title",
                            f"""Inputs:
    - Video Title: {title}
        - This is the title of the video.
    - Transcript: {transcript}
        - This is the transcript from the video.
    - Video:
        - The video is provided along with this prompt.
""",
                           "\n           - Consistency: Low conviction if the title makes a bold claim, but the supporting input lacks matching conviction.",
                            "\n           - Consistency: Moderate conviction if the title makes a bold claim, followed by consistent confidence in the supporting input.",
                            "\n           - Consistency: High conviction if the title and supporting input are strongly aligned.")


    # Doubled braces in the template are literal JSON braces in the rendered prompt.
    prompt = f"""Analyze the YouTube {yt_video_statement} transcript, {whole_transcript_specific[0]} of an influencer discussing the US stock market. Identify stock recommendations and assess their conviction level.
    {whole_transcript_specific[1]}

    Instructions:
    1. Does the video contain any stock recommendations:
       - Label this as `Stock Recommendations Present` with either "Yes" or "No".

    2. If `Stock Recommendations Present` is "Yes", create a list under the key `Recommendations`. Each recommendation should follow this structure:{{"Action": "Buy | Hold | Don't Buy | Sell | Short Sell | Unclear",
         "Justification": "Brief explanation for the action based on the inputs",
         "Conviction Score": "1 | 2 | 3",
         "Ticker Name": "Ticker name"}}

       Details for each field:
        - `Action`: Categorize each stock recommendation as:
          - "Buy": Purchase shares of the stock.
          - "Hold": Retain the stock if already owned.
          - "Don't Buy": Refrain from purchasing the stock.
          - "Sell": Sell shares currently owned.
          - "Short Sell": Sell shares not currently owned, intending to buy them back later at a lower price.
          - "Unclear": When the action is not explicitly stated.
       - `Justification`: Provide a brief explanation for the action based on the provided input.
       - `Conviction Score`: Assign a score based on the following criteria:
         - "1" (Low Conviction):
           - Tone: Hesitant or uncertain language, frequent qualifiers (e.g., “maybe,” “possibly”).{facial_expression[0]}
           - Delivery: Reserved or doubtful language.{whole_transcript_specific[2]}
         - "2" (Moderate Conviction):
           - Tone: Relatively confident language with some qualifiers.{facial_expression[1]}
           - Delivery: Balanced and moderately positive language.{whole_transcript_specific[3]}
         - "3" (High Conviction):
           - Tone: Strong, assertive language without hesitation.{facial_expression[2]}
           - Delivery: Decisive recommendations with no qualifiers.{whole_transcript_specific[4]}
       - `Ticker Name`: Specify the ticker name of the stock being discussed.

    3. If `Stock Recommendations Present` is "No", return the following structure:{{"Stock Recommendations Present": "No",
         "Recommendations": []
       }}

    Output Requirements:
    - Return only valid JSON that can be directly parsed by JSON libraries.
    - Do not include any additional text, comments, formatting indicators (e.g., `json` or backticks), or explanatory content.
    """

    return prompt

In [7]:
def save_response(video_id, start, end, response, output_dir):
    """
    Writes a model response to `<output_dir>/<video_id>.txt`, replacing any
    existing file of that name.

    Args:
        video_id: Identifier used as the file stem.
        start: Unused for full-length videos (the segment naming scheme
            `<video_id>__<start>__<end>.txt` is not applied here).
        end: Unused, see `start`.
        response (str): Text to store.
        output_dir (str): Destination directory; created if it does not exist.
    """
    # Make sure the destination exists before opening the file.
    os.makedirs(output_dir, exist_ok=True)

    # Full-length outputs are keyed by the video id only.
    destination = os.path.join(output_dir, f"{video_id}.txt")
    with open(destination, "w") as handle:
        handle.write(response)

def load_existing_response(video_id, start, end, output_dir):
    """
    Returns the previously saved response for a video, if there is one.

    Args:
        video_id: Identifier used as the file stem.
        start: Unused for full-length videos (the segment naming scheme
            `<video_id>__<start>__<end>.txt` is not applied here).
        end: Unused, see `start`.
        output_dir (str): Directory that holds the saved responses.

    Returns:
        str or None: Contents of `<output_dir>/<video_id>.txt`, or None when
        that file does not exist.
    """
    # Full-length outputs are keyed by the video id only.
    cached_path = os.path.join(output_dir, f"{video_id}.txt")
    if not os.path.exists(cached_path):
        return None
    with open(cached_path, "r") as handle:
        return handle.read()

In [None]:
def encode_frame(frame, image_format='.jpg'):
    """
    Converts an image array into a base64 text string.

    Args:
        frame (numpy.ndarray): Image to encode.
        image_format (str): File-extension style format handed to
            `cv2.imencode` (default '.jpg').

    Returns:
        str: The encoded image bytes as a base64 string.

    Raises:
        ValueError: If `cv2.imencode` reports failure.
    """
    ok, encoded = cv2.imencode(image_format, frame)
    if ok:
        return base64.b64encode(encoded).decode('utf-8')
    raise ValueError("Could not encode frame.")


def sample_video_frames(video_path, sampling_fps, target_width=512, target_height=512):
    """
    Samples frames from a video at the requested rate, resizes each sampled frame
    to `target_width` x `target_height` pixels, and encodes it as a base64 JPEG string.

    Args:
        video_path (str): The path to the video file.
        sampling_fps (float): Frames to sample per second of video (e.g., 0.25 means
            one frame every 4 seconds). Must be positive. Rates above the video's own
            FPS are capped at "every frame".
        target_width (int): Width of the resized frame (default is 512).
        target_height (int): Height of the resized frame (default is 512).

    Returns:
        List[str]: Base64-encoded strings for the sampled frames, in video order.

    Raises:
        ValueError: If `sampling_fps` is not positive or the video's FPS cannot be determined.
        IOError: If the video file cannot be opened.
    """
    if sampling_fps <= 0:
        raise ValueError(f"sampling_fps must be positive, got {sampling_fps}.")

    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            raise IOError(f"Cannot open video file: {video_path}")

        # Get the video's FPS.
        video_fps = cap.get(cv2.CAP_PROP_FPS)
        if video_fps <= 0:
            raise ValueError("Could not determine the video's FPS.")

        # Interval (in frames) between samples. Clamp to 1 so a sampling rate above
        # the video FPS cannot round to 0 and divide by zero in the modulo below.
        frame_interval = max(1, int(round(video_fps / sampling_fps)))

        encoded_frames = []
        frame_count = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break  # End of video
            if frame_count % frame_interval == 0:
                resized_frame = cv2.resize(frame, (target_width, target_height))
                encoded_frames.append(encode_frame(resized_frame, image_format='.jpg'))
            frame_count += 1

        return encoded_frames
    finally:
        # Release the capture handle on every path, including errors.
        cap.release()

In [None]:
def full_length_process_video_row(row, client, frame_rate, max_frame_size, video_segments_path, output_dir, model_id):
    """
    Runs full-length inference for one dataset row, reusing a saved response when present.

    Args:
        row (pd.Series or Mapping): Row with at least 'video_id' and 'video_title',
            plus whatever `full_length_create_prompt` reads.
        client: Client object passed through to `query_gpt`.
        frame_rate (float): Sampling rate (frames per second) for `sample_video_frames`.
        max_frame_size (int): Used as both target width and target height of sampled frames.
        video_segments_path (str): Directory containing `<video_id>.mp4`.
        output_dir (str): Directory where responses are cached.
        model_id (str): Model name passed through to `query_gpt`.

    Returns:
        str: The cached or freshly generated response, "NoVideoFile" when the video
        file is missing, or "ErrorWhileProcessing" when prompting, frame sampling,
        querying or saving raises.
    """
    video_id = row['video_id']
    # Not used further down; the lookup is kept so a row without this column
    # still fails at this point.
    title = row['video_title']
    # Full-length videos have no segment bounds; "0"/"0" are placeholders for the
    # (video_id, start, end) signature of the cache helpers.
    start, end = "0", "0"

    # Reuse a previously saved (non-empty) response instead of querying again.
    cached = load_existing_response(video_id, start, end, output_dir)
    if cached:
        print(f"Skipping row as output already exists: {video_id}")
        return cached

    video_file_path = os.path.join(video_segments_path, f"{video_id}.mp4")
    if not os.path.exists(video_file_path):
        print(f"Video file not found: {video_file_path}")
        return "NoVideoFile"

    try:
        prompt = full_length_create_prompt(row, lm_type='vlm', whole=True, price=False)
        frames = sample_video_frames(
            video_file_path,
            sampling_fps=frame_rate,
            target_width=max_frame_size,
            target_height=max_frame_size,
        )
        answer = query_gpt(client, prompt, frames, model_id)
        save_response(video_id, start, end, answer, output_dir)
    except Exception as e:
        # Best effort: report the failure and return a sentinel so the caller's loop continues.
        print(f"Error processing video {video_id}: {e}")
        return "ErrorWhileProcessing"

    return answer

In [10]:
def process_videos(data_path, video_segments_path, frame_rate, max_frame_size, output_dir,
                   model_id=None, max_rows=1, row_processor=None):
    """
    Runs inference over the rows of a CSV and returns the collected responses.

    Args:
        data_path (str): CSV file with one row per video.
        video_segments_path (str): Directory containing the video files.
        frame_rate (float): Frame sampling rate (frames per second).
        max_frame_size (int): Size sampled frames are resized to.
        output_dir (str): Directory where responses are cached.
        model_id (str, optional): Model name forwarded to the row processor.
            Defaults to the notebook-level `model_id` setting.
        max_rows (int or None, optional): Number of leading rows to process.
            Defaults to 1 (the original smoke-test behaviour); None processes all rows.
        row_processor (callable, optional): Function called once per row as
            `row_processor(row, client, frame_rate, max_frame_size,
            video_segments_path, output_dir, model_id=...)`. Defaults to
            `full_length_process_video_row`, the only row processor defined in
            this notebook.

    Returns:
        list: One entry per processed row, in order; None for rows whose
        processing raised.

    Raises:
        ValueError: If no `model_id` is passed and none is defined at notebook level.
    """
    if model_id is None:
        # Fall back to the notebook-level setting (the parameter shadows the global name).
        model_id = globals().get("model_id")
        if model_id is None:
            raise ValueError("model_id was not provided and no notebook-level model_id is defined.")
    if row_processor is None:
        row_processor = full_length_process_video_row

    data = pd.read_csv(data_path)
    selected_rows = data if max_rows is None else data[:max_rows]
    results = []

    client = load_gpt_client()
    for index, row in selected_rows.iterrows():
        try:
            response = row_processor(row, client, frame_rate, max_frame_size,
                                     video_segments_path, output_dir, model_id=model_id)
            results.append(response)
        except Exception as e:
            # Best effort: keep going, but keep the result list aligned with the rows.
            print(f"Error processing row {index}: {e}")
            results.append(None)

    return results

## Testing Full Length Prompt

In [11]:

data_path = './w_transcripts_for_inference.csv'  # Full-length dataset (title + whole transcript per video)
video_segments_path = './videos'  # Directory of full-length <video_id>.mp4 files (name shared with the segment pipeline)

data = pd.read_csv(data_path)
results = []

# Pick one sample row for the prompt preview in the next cells: the last of the
# first ten rows (the same row the previous loop-and-overwrite ended on).
sample_rows = data[:10]
assert not sample_rows.empty, f"No rows found in {data_path}"
series = sample_rows.iloc[-1]

In [12]:
# Render the full-length prompt for the sample row held in `series`.
prompt = full_length_create_prompt(series, lm_type='vlm', whole=True, price=False)

In [13]:
# Show the rendered prompt so its wording and layout can be checked by eye.
print(prompt)

Analyze the YouTube video, transcript, and video title of an influencer discussing the US stock market. Identify stock recommendations and assess their conviction level.
    Inputs:
    - Video Title: TOP 10 HIGHEST PAYING DIVIDEND STOCKS FOR 2019
        - This is the title of the video.
    - Transcript:  Top 10 highest dividend paying stocks out there, guys. That is what we are talking about here today. I went ahead and I went through the entire S&P 500 and I found the 10 highest paying dividend stocks out there in terms of the yield they will return to you as a shareholder, guys. I hope you really enjoy this. Still took a lot of work to kind of go into this and look at all these different stocks. I'm gonna share all the stocks with you in order starting from number 10 all the way down to number one. And I'm also gonna explain kind of their business model, what those businesses actually do. And hopefully this is a good starting point for some of you guys that wanna get into some div

## Full-length Inference

In [15]:
data_path = './w_transcripts_for_inference.csv'  # Full-length dataset
video_segments_path = './videos'  # Directory of full-length <video_id>.mp4 files

data = pd.read_csv(data_path)

frame_rate = 0.25  # frames per second
max_frame_size = 512  # Maximum size for frame resizing
temperature = 0.0

# Model snapshots to run. Other snapshot names noted alongside the original loop:
# o3-mini-2025-01-31, o1-2024-12-17
model_names = ['gpt-4o-2024-08-06', 'gpt-4o-mini-2024-07-18']

# The model is chosen per request, so one client serves every model.
client = load_gpt_client()

# Keep every model's responses (model name -> list aligned with the rows of `data`)
# instead of overwriting one shared list on each pass.
results_by_model = {}

for model_name in model_names:
    print(model_name)
    print('\n')

    output_dir = f'./{model_name}_outputs_video_full_length'
    results = []

    for position, (index, row) in enumerate(data.iterrows()):
        if position != 0 and position % 60 == 0:
            print(f"Processed {position} iterations so far.")
        try:
            # The plain snapshot name is the model id sent to the API.
            response = full_length_process_video_row(row,
                                                     client,
                                                     frame_rate,
                                                     max_frame_size,
                                                     video_segments_path,
                                                     output_dir,
                                                     model_id=model_name)
            results.append(response)
        except Exception as e:
            print(f"Error processing row {index}: {e}")
            results.append(None)

    results_by_model[model_name] = results


gpt-4o-2024-08-06


Processed 60 iterations so far.
Processed 120 iterations so far.
Processed 180 iterations so far.
Processed 240 iterations so far.
gpt-4o-mini-2024-07-18


Processed 60 iterations so far.
Processed 120 iterations so far.
Processed 180 iterations so far.
Processed 240 iterations so far.


# Segment Video Inference

In [None]:
# Set the segment-level inputs BEFORE reading the CSV, so this cell does not depend
# on whatever `data_path` an earlier cell (or an earlier run of this cell) left behind.
data_path = './complete_dataset.csv'  # Segment transcripts
video_segments_path = './video_segments'  # Video segments

data = pd.read_csv(data_path)

# Selected region and only if rec is present. (381 videos)
# Applying all the filters using .loc
filtered_df = data.loc[
    (data['action_source'] == "Selected region") &
    (data['is_rec_present'] == "Yes") &
    (~data['transcript'].isna()) &
    (data['transcript'].str.strip() != "") &
    (data['transcript'].str.split().str.len() >= 3)
]

data = filtered_df.copy()
print(data.shape)


frame_rate = 0.25  # frames per second
max_frame_size = 512  # Maximum size for frame resizing
temperature = 0.0

# Model snapshots to run. Other snapshot names noted alongside the original loop:
# o3-mini-2025-01-31, o1-2024-12-17
model_names = ['gpt-4o-2024-08-06', 'gpt-4o-mini-2024-07-18']

# Fail once, clearly, instead of printing one swallowed NameError per row.
if 'process_video_row' not in globals():
    raise NameError(
        "process_video_row is not defined in this notebook; define the segment-level "
        "row processor before running this cell."
    )

# The model is chosen per request, so one client serves every model.
client = load_gpt_client()

# Keep every model's responses (model name -> list aligned with the rows of `data`).
results_by_model = {}

for model_name in model_names:
    print(model_name)
    print('\n')

    output_dir = f'./{model_name}_outputs_video_segments'
    results = []

    # `data` is filtered, so its index labels are not consecutive; count positions
    # for the progress message and keep the label for error reports.
    for position, (index, row) in enumerate(data.iterrows()):
        if position != 0 and position % 60 == 0:
            print(f"Processed {position} iterations so far.")
        try:
            # Pass the plain snapshot name, as the full-length loop does
            # (not a "models/"-prefixed id).
            response = process_video_row(row,
                                         client,
                                         frame_rate,
                                         max_frame_size,
                                         video_segments_path,
                                         output_dir,
                                         model_id=model_name)
            results.append(response)
        except Exception as e:
            print(f"Error processing row {index}: {e}")
            results.append(None)

    results_by_model[model_name] = results


(381, 39)
gpt-4o-2024-08-06


Skipping row as output already exists: 0CJU8R4oNFk__197.7986688222514__267.77438718612933
Skipping row as output already exists: 0CJU8R4oNFk__515.435925412456__559.6029779831492
Skipping row as output already exists: 0CJU8R4oNFk__560.5183963259649__614.2462458777909
Skipping row as output already exists: 1Gm4A7EFYI4__39.577461060109286__642.9538446765027
Processed 10 iterations so far.
Skipping row as output already exists: 1Lx7z_x4Rc0__118.7247700983607__320.404668021858
Skipping row as output already exists: 1Lx7z_x4Rc0__329.91787075409843__432.6604602622952
Skipping row as output already exists: 1Lx7z_x4Rc0__466.90799009836064__519.4208691803278
Skipping row as output already exists: 1P65IOhAFFI__37.801578557377056__57.39668254426229
Skipping row as output already exists: 1P65IOhAFFI__61.56257079344262__85.47785518688524
Skipping row as output already exists: 1Wh-i9i4V28__231.11287449851733__248.4603299693504
Skipping row as output already exists: 1Wh-i

## Extra Code for Whole Video Prompting.

In [None]:
def create_prompt(series, lm_type = "lm", whole = True, price = False):
    """
    Generates the structured stock-recommendation prompt for a text-only ("lm") or
    video ("vlm") model, built from either the whole transcript or a single segment.

    Args:
        series (pd.Series or Mapping): Row holding the prompt inputs. Only the fields
            the selected variant needs are read: 'video_title' and 'transcript' for
            whole-video prompts, 'segment_transcript' otherwise.
        lm_type (str, optional): "lm" (default) or "vlm". "vlm" adds the video
            statement and the facial-expression criteria.
        whole (bool, optional): True (default) uses the title and the whole transcript;
            False uses the segment transcript only. Any `lm_type` other than "lm"/"vlm"
            also gets the segment-style inputs.
        price (bool, optional): Not used by the prompt; accepted so existing calls keep working.

    Returns:
        str: The formatted prompt, ready for input into a language model.

    Raises:
        KeyError: If a field required by the selected variant is missing from `series`.
    """

    # The "video is provided along" part of the prompt is from hour video
    # https://huggingface.co/datasets/HourVideo/HourVideo/blob/main/prompts/baseline_evaluations/gemini-1.5-pro/qa_eval.yaml
    if lm_type == "vlm":
        video_statement = ("\nThe video is provided along with this prompt.",)
        facial_expression = ("\n           - Facial Expressions: Neutral or doubtful (furrowed brows, pursed lips).",
                             "\n           - Facial Expressions: Moderate enthusiasm (mild smiles, slightly raised eyebrows).",
                             "\n           - Facial Expressions: Enthusiastic, energetic (wide smiles, raised eyebrows).")
    else:
        video_statement = ("",)
        facial_expression = ("",
                             "",
                             "")

    if lm_type == "vlm" and whole == True:
        yt_video_statement = "video, "
    elif lm_type == "vlm" and whole == False:
        yt_video_statement = "video and "
    else:
        yt_video_statement = ""

    # Tuple layout: [0] intro fragment, [1] inputs section, [2..4] consistency
    # criterion for conviction levels 1..3.
    if whole == True and lm_type in ("vlm", "lm"):
        # Whole-video variants: only the title and full transcript are required.
        title = series['video_title']
        transcript = series['transcript']
        consistency = ("\n           - Consistency: Low conviction if the title makes a bold claim, but the transcript lacks matching conviction.",
                       "\n           - Consistency: Medium conviction if the title makes a bold claim, followed by consistent confidence in the transcript.",
                       "\n           - Consistency: High conviction if the title and transcript are strongly aligned.")
        if lm_type == "vlm":
            whole_transcript_specific = (", and video title",
                                         f"Inputs:\n    - Video Title: {title}\n    - Transcript: {transcript}",
                                         *consistency)
        else:
            # The text-only variant indents its inputs by two spaces (kept as in the original prompt).
            whole_transcript_specific = (" and video title",
                                         f"Inputs:\n  - Video Title: {title}\n  - Transcript: {transcript}",
                                         *consistency)
    else:
        # Segment variant: only the segment transcript is required.
        segment_transcript = series['segment_transcript']
        whole_transcript_specific = ("",
                                     f"Inputs:\n    - Transcript: {segment_transcript}",
                                     "",
                                     "",
                                     "")

    # VLM part of this prompt "video provided along with this prompt" was inspired by hour video
    # https://huggingface.co/datasets/HourVideo/HourVideo/blob/main/prompts/baseline_evaluations/gemini-1.5-pro/qa_eval.yaml

    prompt = f"""Analyze the YouTube {yt_video_statement}transcript{whole_transcript_specific[0]} of influencers discussing the US stock market, focusing on stock recommendations and their conviction.{video_statement[0]}

    {whole_transcript_specific[1]}

    Instructions:
    1. Does the video contain any stock recommendations:
       - Label this as `Stock Recommendations Present` with either "Yes" or "No".

    2. If `Stock Recommendations Present` is "Yes", create a list under the key `Recommendations`. Each recommendation should follow this structure:{{"Action": "Buy | Hold | Don't Buy | Sell | Short Sell | Unclear",
         "Justification": "Brief explanation for the action based on the transcript",
         "Conviction Score": "1 | 2 | 3",
         "Ticker Name": "Ticker name"}}

       Details for each field:
       - `Action`: Categorize each stock recommendation as:
         - "Buy": Purchase shares of the stock.
         - "Hold":  Retain the stock if already owned, without necessarily
    buying more.
         - "Don't Buy": Refrain from purchasing the stock.
         - "Sell": Sell shares of the stock currently owned.
         - "Short Sell": Sell shares not currently owned, intending to
    buy them back later at a lower price.
         - "Unclear": When the action is not explicitly stated.
       - `Justification`: Provide a brief explanation for the action based on the transcript.
       - `Conviction Score`: Assign a score based on the following criteria:
         - "1" (Low Conviction):
           - Tone: Hesitant or uncertain language, frequent qualifiers (e.g., “maybe,” “possibly”).{facial_expression[0]}
           - Delivery: Reserved or doubtful language.{whole_transcript_specific[2]}
         - "2" (Moderate Conviction):
           - Tone: Relatively confident language with some qualifiers.{facial_expression[1]}
           - Delivery: Balanced and moderately positive language.{whole_transcript_specific[3]}
         - "3" (High Conviction):
           - Tone: Strong, assertive language without hesitation.{facial_expression[2]}
           - Delivery: Decisive recommendations with no qualifiers.{whole_transcript_specific[4]}
       - `Ticker Name`: Specify the ticker name of the stock being discussed.

    3. If `Stock Recommendations Present` is "No", return the following structure:{{"Stock Recommendations Present": "No",
         "Recommendations": []
       }}

    Output Requirements:
    - Return only valid JSON that can be directly parsed by JSON libraries.
    - Do not include any additional text, comments, formatting indicators (e.g., `json` or backticks), or explanatory content.
    """

    return prompt