In [21]:
import os,sys
import pandas as pd
import numpy as np
from time import sleep, time
from datetime import date
import threading
from typing import Tuple
from datasets import load_dataset
from openai import OpenAI
import google.generativeai as genai
from google.ai import generativelanguage as glm
import json
from datetime import datetime
import ast
import os

### Paths to folders with inference CSVs

In [22]:
# Each entry is [folder containing inference CSVs, label]. The label ("whole" or
# "segmentwise") becomes the key prefix in inference_dataframes_dictionary.
folders_list = [
    ["../inference/outputs/llm_prompt_w_transcript_outputs", "whole"],
    [ "../inference/outputs/llm_prompt_s_transcript_outputs", "segmentwise"],
    ["../inference/outputs/mllm_s_combined_output", "segmentwise"],
    ["../inference/outputs/mllm_w_combined_output", "whole"]
    ]

### Dictionary for raw inference results (whole-transcript and segmentwise)

In [23]:
# Maps "<prefix>_<model_name>" to the raw inference DataFrame loaded for that model.
# A `global` statement at module level has no effect, so a plain assignment is enough.
inference_dataframes_dictionary = dict()

In [24]:
def process_csv_files(whole_folder_path, prefix_s_or_w, target_dictionary=None):
    """Load every inference CSV found in a folder into a results dictionary.

    Each ``*.csv`` file in ``whole_folder_path`` is read into a DataFrame and stored
    under the key ``"<prefix_s_or_w>_<model_name>"``, where ``model_name`` is the first
    value of the file's ``model_name`` column. A ``title`` column, if present, is renamed
    to ``video_title``. For the model ``deepseek-ai/DeepSeek-R1`` the ``prompt_output``
    column is reduced to the JSON object that follows the ``</think>`` tag.

    Files that are empty or have no ``model_name`` column are skipped with a message
    instead of raising.

    Parameters
    ----------
    whole_folder_path : str
        Folder to scan for CSV files (non-recursive).
    prefix_s_or_w : str
        Prefix for the dictionary key, e.g. ``"whole"`` or ``"segmentwise"``.
    target_dictionary : dict, optional
        Dictionary to fill. Defaults to the notebook-level
        ``inference_dataframes_dictionary``.

    Returns
    -------
    None
        Results are written into ``target_dictionary``.
    """

    def extract_json(text):
        """Return the outermost ``{...}`` substring after ``</think>``, or None."""
        # Empty CSV cells are read back as NaN (a float), which has no .find().
        if not isinstance(text, str):
            return None

        # 1) Drop everything up to and including the </think> tag, when present.
        idx_after_think = text.find('</think>')
        if idx_after_think != -1:
            text = text[idx_after_think + len('</think>'):]

        # 2) Locate the first '{' and the last '}' in the remaining text.
        start_idx = text.find('{')
        end_idx = text.rfind('}') + 1

        # 3) rfind returns -1 when '}' is missing, which makes end_idx == 0.
        if start_idx == -1 or end_idx == 0:
            return None

        # 4) Substring that should hold the JSON object.
        return text[start_idx:end_idx]

    if target_dictionary is None:
        target_dictionary = inference_dataframes_dictionary

    # Loop through each file in the folder
    for filename in os.listdir(whole_folder_path):
        if not filename.endswith(".csv"):  # Process only CSV files
            continue

        file_path = os.path.join(whole_folder_path, filename)
        df = pd.read_csv(file_path)

        # Rename 'title' column to 'video_title' if it exists
        if 'title' in df.columns:
            df = df.rename(columns={'title': 'video_title'})

        # The dictionary key is derived from the first row, so the file needs
        # at least one row and a 'model_name' column.
        if 'model_name' not in df.columns or df.empty:
            print(f"Skipping {filename}: empty file or missing 'model_name' column")
            continue

        key = df['model_name'].iloc[0]

        # Inference date is only used for the progress message below.
        if 'date_of_inference' in df.columns:
            inference_date = df['date_of_inference'].iloc[0]
        else:
            inference_date = "unknown"

        if key == "deepseek-ai/DeepSeek-R1":
            # Keep only the JSON part of each DeepSeek-R1 output
            df['prompt_output'] = df['prompt_output'].apply(extract_json)
            print("Preprocessed DeepSeek-R1 prompt output")

        # Prefix the model name so whole and segmentwise results get distinct keys
        modified_key = f"{prefix_s_or_w}_{key}"

        # Add the key-value pair to the dictionary
        target_dictionary[modified_key] = df

        print(modified_key, "Appended to dictionary of inference results", f"[{inference_date}]")

In [25]:
# Load every folder's CSVs, passing the folder's label ("whole"/"segmentwise")
# so it is used as the key prefix.
for folder_path, s_or_w in folders_list:
    process_csv_files(folder_path, s_or_w)

whole_Qwen/Qwen2.5-7B-Instruct-Turbo Appended to dictionary of inference results [2025-02-24]
Preprocessed DeepSeek-R1 prompt output
whole_deepseek-ai/DeepSeek-R1 Appended to dictionary of inference results [2025-02-24]
whole_gemini-2.0-pro-exp-02-05 Appended to dictionary of inference results [2025-02-24]
whole_mistralai/Mistral-7B-Instruct-v0.2 Appended to dictionary of inference results [2025-02-24]
whole_meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo Appended to dictionary of inference results [2025-02-24]
whole_gemini-1.5-flash-002 Appended to dictionary of inference results [2025-02-24]
whole_mistralai/Mixtral-8x22B-Instruct-v0.1 Appended to dictionary of inference results [2025-02-24]
whole_Qwen/Qwen2.5-72B-Instruct-Turbo Appended to dictionary of inference results [2025-02-24]
whole_claude-3-5-sonnet-20241022 Appended to dictionary of inference results [2025-02-24]
whole_claude-3-5-haiku-20241022 Appended to dictionary of inference results [2025-02-24]
whole_gpt-4o-2024-08-06 Ap

### Create dictionary for inference results
Here the key is `<prefix>_<model_name>` (the prefix is `whole` or `segmentwise`) and the value is the DataFrame loaded from that model's CSV.

`folders_list` is a list of two-element entries:

- First element: path to a folder containing inference CSVs
- Second element: the string `"whole"` or `"segmentwise"`

We iterate over this list and, for each entry, call the CSV-processing function with the folder path and that string.


In [26]:
# Number of DataFrames loaded into the dictionary
len(inference_dataframes_dictionary)

43

In [27]:
# Peek at the first loaded DataFrame. next(..., None) yields None when the
# dictionary is empty, in which case the lookup on the last line raises KeyError.
first_key_name = next(iter(inference_dataframes_dictionary), None)
print(first_key_name)
inference_dataframes_dictionary[first_key_name].head()

whole_Qwen/Qwen2.5-7B-Instruct-Turbo


Unnamed: 0,video_id,video_title,transcript,model_source,model_name,date_of_inference,prompt_used,prompt_output,video_source
0,0CJU8R4oNFk,5 Stocks to Buy Now to Double Your Money,"Hey Bowtie Nation, Joseph Hogue here with the...",Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...,"{\n ""Stock Recommendations Present"": ""Yes"",\n...",
1,0Fg0YsbOzJA,i am selling it,So we have decided to start selling out of Wy...,Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...,"{\n ""Stock Recommendations Present"": ""Yes"",...",
2,0OJIHD_o59M,The Best Internet Stocks for 2023 You Can Buy Now,"Hey Bowtie Nation, Joseph Hogue here and a ve...",Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...,"{\n ""Stock Recommendations Present"": ""Yes"",\n...",
3,1Gm4A7EFYI4,I Just Bought The PERFECT Dividend Stock (At A...,So I've had my eye on a few different stocks ...,Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...,"{\n ""Stock Recommendations Present"": ""Yes"",...",
4,1Lx7z_x4Rc0,🔵WARNING TO EVERYONE!!!🔵 I JUST SOLD IT ALL!!!,Family we absolutely dominated it today. If y...,Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...,"{\n ""Stock Recommendations Present"": ""Yes"",\n...",


In [28]:
#inference_dataframes_dictionary

### LLM output parsing function

In [29]:
from io import StringIO

In [30]:
def clean_llm_output(df_llm_raw_output, model_name):
    """Expand raw LLM JSON outputs into one row per recommendation.

    Every ``prompt_output`` cell is parsed as JSON. For rows whose JSON holds a
    non-empty recommendations entry, that entry is converted to a DataFrame and the
    row's metadata columns are copied onto each recommendation. Rows with valid JSON
    but no recommendations contribute one row that carries only the metadata (all
    other columns are ``None``). Rows whose output is missing, blank or not valid
    JSON are left out and listed in the printed summary.

    Rows are addressed by position, so the input does not need a default index.

    Parameters
    ----------
    df_llm_raw_output : pandas.DataFrame
        Must contain ``prompt_output`` and the metadata columns ``video_id``,
        ``video_title``, ``transcript``, ``model_source``, ``model_name``,
        ``date_of_inference`` and ``prompt_used``.
    model_name : str
        Dictionary key of the model. A ``"whole_"`` prefix selects the JSON key
        ``"Recommendations"``, a ``"segmentwise_"`` prefix selects
        ``"Recommendation"``; any other prefix falls back to ``"Recommendations"``.

    Returns
    -------
    pandas.DataFrame
        The recommendation rows followed by the metadata-only rows. If no row
        yields anything, an empty frame with the metadata columns is returned.
    """
    metadata_columns = [
        "video_id", "video_title", "transcript", "model_source",
        "model_name", "date_of_inference", "prompt_used",
    ]

    # Everything printed is mirrored here; the buffer is not written to disk.
    log_stream = StringIO()

    def log_print(*args, **kwargs):
        """Helper function to log and print simultaneously."""
        print(*args, **kwargs)
        print(*args, **kwargs, file=log_stream)

    log_print(model_name)

    if model_name.startswith("whole_"):
        recommendations_key = "Recommendations"
    elif model_name.startswith("segmentwise_"):
        recommendations_key = "Recommendation"
    else:
        log_print("Unknown model name prefix. Defaulting to 'Recommendations'")
        recommendations_key = "Recommendations"

    # ---- Pass 1: parse each cell once and classify it as valid / invalid JSON ----
    parsed_outputs = {}          # row position -> parsed JSON value
    valid_json_indices = []
    invalid_json_indices = []
    no_recommendation_in_valid_indices = []

    for i, data_str in enumerate(df_llm_raw_output['prompt_output']):
        # Empty, whitespace-only or non-string cells (e.g. NaN) count as invalid JSON
        if not isinstance(data_str, str) or not data_str.strip():
            invalid_json_indices.append(i)
            continue

        try:
            parsed_outputs[i] = json.loads(data_str)
        except json.JSONDecodeError:
            invalid_json_indices.append(i)
        else:
            valid_json_indices.append(i)

    if invalid_json_indices:
        log_print("Invalid JSON strings found at the following indices:")
        for index in invalid_json_indices:
            log_print(f"Index {index}:")
    else:
        log_print("All JSON strings are valid!")

    # ---- Pass 2: one DataFrame of recommendations per valid row ----
    recommendations_with_metadata = []

    for index in valid_json_indices:
        data_dict = parsed_outputs[index]

        try:
            # A missing key and an empty list both mean "no recommendations"
            if not data_dict.get(recommendations_key, []):
                no_recommendation_in_valid_indices.append(index)
                continue

            recommendations_df = pd.DataFrame(data_dict[recommendations_key])

            # Copy the row's metadata onto every recommendation (positional lookup)
            for col in metadata_columns:
                recommendations_df[col] = df_llm_raw_output[col].iloc[index]

            recommendations_with_metadata.append(recommendations_df)

        except KeyError as e:
            log_print(f"KeyError for index {index}: {e}")
        except Exception as e:
            # Best effort: a malformed entry is reported and skipped, not fatal
            log_print(f"An unexpected error occurred for index {index}: {e}")

    # ---- Combine; pd.concat raises on an empty list, so guard that case ----
    if recommendations_with_metadata:
        df_llm_annotations = pd.concat(recommendations_with_metadata, ignore_index=True)
    else:
        df_llm_annotations = pd.DataFrame(columns=metadata_columns)

    # ---- Metadata-only rows for videos without recommendations, appended in one go ----
    if no_recommendation_in_valid_indices:
        other_columns = {
            col: None for col in df_llm_annotations.columns if col not in metadata_columns
        }
        empty_recommendation_rows = [
            {
                **{col: df_llm_raw_output[col].iloc[index] for col in metadata_columns},
                **other_columns,
            }
            for index in no_recommendation_in_valid_indices
        ]
        empty_recommendations_df = pd.DataFrame(empty_recommendation_rows)

        if recommendations_with_metadata:
            df_llm_annotations = pd.concat(
                [df_llm_annotations, empty_recommendations_df],
                ignore_index=True
            )
        else:
            df_llm_annotations = empty_recommendations_df

    # ---- Stats ----

    # Stat: Number of videos in the inference dataframe
    num_videos_inference = df_llm_raw_output['video_id'].nunique()
    log_print(f"Number of videos in the inference dataframe: {num_videos_inference}")

    # Stat: Number of videos in the cleaned dataframe
    num_videos_cleaned = df_llm_annotations['video_id'].nunique()
    log_print(f"Number of videos in the cleaned dataframe: {num_videos_cleaned}")

    # Stat: Differing video ids between the inference (raw) and parsed (cleaned) dataframe
    unique_raw_output = set(df_llm_raw_output['video_id'])
    unique_annotations = set(df_llm_annotations['video_id'])
    not_in_annotations = unique_raw_output - unique_annotations
    not_in_raw_output = unique_annotations - unique_raw_output
    log_print("video_id(s) in inference dataframe but not in final parsed dataframe:")
    log_print(not_in_annotations)
    log_print("\nvideo_id(s) in final parsed dataframe but not in inference dataframe:")
    log_print(not_in_raw_output)

    # Stat: Indices with no recommendations
    log_print(f"Indicies with no recommendations: {no_recommendation_in_valid_indices}")

    # Stat: Number of valid and invalid JSON strings
    log_print(f"Valid JSON count: {len(valid_json_indices)}")
    log_print(f"Invalid JSON count: {len(invalid_json_indices)}")

    # To persist the log, write log_stream.getvalue() to a file, e.g.
    # f"{model_name}_output_log.txt" (currently disabled).

    return df_llm_annotations

In [31]:
# Parse every raw inference DataFrame; the result is keyed by the same
# "<prefix>_<model_name>" keys as inference_dataframes_dictionary.
parsed_inference_dataframes_dictionary = {}

for model_name, df_raw in inference_dataframes_dictionary.items():
    parsed_inference_dataframes_dictionary[model_name] = clean_llm_output(df_raw, model_name)

whole_Qwen/Qwen2.5-7B-Instruct-Turbo
All JSON strings are valid!
Number of videos in the inference dataframe: 288
Number of videos in the cleaned dataframe: 288
video_id(s) in inference dataframe but not in final parsed dataframe:
set()

video_id(s) in final parsed dataframe but not in inference dataframe:
set()
Indicies with no recommendations: [48, 74, 82, 111, 122, 152, 214, 216, 217, 218, 219, 222, 223, 224, 229, 232, 240, 241, 249, 250, 251, 252, 257, 261, 264, 265, 269, 272, 275, 276, 277, 278, 279, 282]
Valid JSON count: 288
Invalid JSON count: 0
whole_deepseek-ai/DeepSeek-R1
Invalid JSON strings found at the following indices:
Index 10:
Index 13:
Index 44:
Index 73:
Index 156:
Index 195:
Index 209:
Index 212:
Index 265:
Number of videos in the inference dataframe: 288
Number of videos in the cleaned dataframe: 279
video_id(s) in inference dataframe but not in final parsed dataframe:
{'wpOISCIl4Cw', 'sQRH-lQljQw', '3eSDDUgC5Mo', 'ZeyBx-ItMwQ', '4J0-56UXX1M', 'aV-vPmnZBs8', 'Ql_q

In [32]:
# Preview the parsed (one row per recommendation) frame for the first model
parsed_inference_dataframes_dictionary[first_key_name].head()

Unnamed: 0,Action,Justification,Conviction Score,Ticker Name,video_id,video_title,transcript,model_source,model_name,date_of_inference,prompt_used
0,Buy,The stock is expected to return 120% over the ...,3,VERI,0CJU8R4oNFk,5 Stocks to Buy Now to Double Your Money,"Hey Bowtie Nation, Joseph Hogue here with the...",Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...
1,Buy,The stock is expected to return 82% over the n...,3,ZYXI,0CJU8R4oNFk,5 Stocks to Buy Now to Double Your Money,"Hey Bowtie Nation, Joseph Hogue here with the...",Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...
2,Buy,The speaker is confident in the company's grow...,3,MITK,0CJU8R4oNFk,5 Stocks to Buy Now to Double Your Money,"Hey Bowtie Nation, Joseph Hogue here with the...",Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...
3,Buy,The speaker is confident in the company's grow...,3,KRMD,0CJU8R4oNFk,5 Stocks to Buy Now to Double Your Money,"Hey Bowtie Nation, Joseph Hogue here with the...",Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...
4,Buy,The speaker is confident in the company's grow...,3,IRMD,0CJU8R4oNFk,5 Stocks to Buy Now to Double Your Money,"Hey Bowtie Nation, Joseph Hogue here with the...",Together AI,Qwen/Qwen2.5-7B-Instruct-Turbo,2025-02-24,Analyze the YouTube video transcript and video...


In [33]:
# Sanity check: both dictionaries should hold the same number of entries
print(len(inference_dataframes_dictionary))
print(len(parsed_inference_dataframes_dictionary))

43
43


---

## Some formatting

### Renaming columns

In [34]:
# Give the LLM-produced fields an "llm_" prefix in every parsed DataFrame
llm_column_renames = {
    "Action": "llm_action",
    "Conviction Score": "llm_conviction_score",
    "Ticker Name": "llm_ticker_name",
}

for key in list(parsed_inference_dataframes_dictionary):
    df = parsed_inference_dataframes_dictionary[key]
    parsed_inference_dataframes_dictionary[key] = df.rename(columns=llm_column_renames)

### Output parsed CSVs

In [35]:
# Ensure the 'cleaned_inference_results' output directory exists
output_dir = "cleaned_inference_results"
os.makedirs(output_dir, exist_ok=True)

In [36]:
# Write every parsed DataFrame to <output_dir>/cleaned_<key>.csv.
# NOTE(review): this cell runs before the conviction-score conversion cell further
# down, so the saved CSVs hold the unconverted scores — confirm that is intended.
filename_unsafe_chars = str.maketrans({"/": "_", "\\": "_", " ": "_"})

for key, dataframe in parsed_inference_dataframes_dictionary.items():
    # Slashes, backslashes and spaces in the key become underscores
    sanitized_key = key.translate(filename_unsafe_chars)
    output_filename = os.path.join(output_dir, f"cleaned_{sanitized_key}.csv")

    try:
        dataframe.to_csv(output_filename, index=False)
        print(f"Saved {output_filename}")
    except OSError as e:
        print(f"Error saving {output_filename}: {e}")

Saved cleaned_inference_results/cleaned_whole_Qwen_Qwen2.5-7B-Instruct-Turbo.csv
Saved cleaned_inference_results/cleaned_whole_deepseek-ai_DeepSeek-R1.csv
Saved cleaned_inference_results/cleaned_whole_gemini-2.0-pro-exp-02-05.csv
Saved cleaned_inference_results/cleaned_whole_mistralai_Mistral-7B-Instruct-v0.2.csv
Saved cleaned_inference_results/cleaned_whole_meta-llama_Meta-Llama-3.1-405B-Instruct-Turbo.csv
Saved cleaned_inference_results/cleaned_whole_gemini-1.5-flash-002.csv
Saved cleaned_inference_results/cleaned_whole_mistralai_Mixtral-8x22B-Instruct-v0.1.csv
Saved cleaned_inference_results/cleaned_whole_Qwen_Qwen2.5-72B-Instruct-Turbo.csv
Saved cleaned_inference_results/cleaned_whole_claude-3-5-sonnet-20241022.csv
Saved cleaned_inference_results/cleaned_whole_claude-3-5-haiku-20241022.csv
Saved cleaned_inference_results/cleaned_whole_gpt-4o-2024-08-06.csv
Saved cleaned_inference_results/cleaned_whole_gemini-1.5-pro-002.csv
Saved cleaned_inference_results/cleaned_whole_meta-llama_M

### Conviction score type

In [37]:

# Report the Python type of the first conviction score of every model
for key, dataframe in parsed_inference_dataframes_dictionary.items():
    if dataframe.empty or 'llm_conviction_score' not in dataframe.columns:
        print(f"Key: {key}, DataFrame is either empty or does not contain 'llm_conviction_score' column.")
        continue

    # Only the first row is inspected
    first_value = dataframe['llm_conviction_score'].iloc[0]
    print(f"Key: {key}, Type: {type(first_value)}")
        

Key: whole_Qwen/Qwen2.5-7B-Instruct-Turbo, Type: <class 'str'>
Key: whole_deepseek-ai/DeepSeek-R1, Type: <class 'str'>
Key: whole_gemini-2.0-pro-exp-02-05, Type: <class 'str'>
Key: whole_mistralai/Mistral-7B-Instruct-v0.2, Type: <class 'str'>
Key: whole_meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo, Type: <class 'str'>
Key: whole_gemini-1.5-flash-002, Type: <class 'str'>
Key: whole_mistralai/Mixtral-8x22B-Instruct-v0.1, Type: <class 'str'>
Key: whole_Qwen/Qwen2.5-72B-Instruct-Turbo, Type: <class 'str'>
Key: whole_claude-3-5-sonnet-20241022, Type: <class 'str'>
Key: whole_claude-3-5-haiku-20241022, Type: <class 'int'>
Key: whole_gpt-4o-2024-08-06, Type: <class 'str'>
Key: whole_gemini-1.5-pro-002, Type: <class 'str'>
Key: whole_meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo, Type: <class 'str'>
Key: whole_meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo, Type: <class 'str'>
Key: whole_deepseek-ai/DeepSeek-V3, Type: <class 'str'>
Key: whole_gemini-2.0-flash-001, Type: <class 'str'>
Key: segment

In [38]:
# Convert string conviction scores to integers in every parsed DataFrame.
# Strings that int() cannot parse are replaced by -1 and recorded in a log file.

# Initialize log string
log_string = ""


def convert_value(x, key=None):
    """Return ``int(x)`` for string scores, -1 if that fails; other values pass through.

    ``key`` is the model key and is only used in the message that is printed and
    appended to the global ``log_string`` when a string cannot be converted.
    """
    global log_string
    if isinstance(x, str):
        try:
            return int(x)  # Try converting to integer
        except ValueError:
            current_error = f"For {key}, imputed -1 for conviction score because ValueError: {x}\n"
            print(current_error)
            log_string = log_string + current_error
            return -1  # Assign -1 if conversion fails
    return x  # Keep the value as is if not a string


# Iterate over each value (dataframe) in the dictionary
for key, dataframe in parsed_inference_dataframes_dictionary.items():
    if not dataframe.empty and 'llm_conviction_score' in dataframe.columns:
        # Take the first value of the 'llm_conviction_score' column
        first_value = dataframe['llm_conviction_score'].iloc[0]
        # Print the variable type of the first value before changing
        print(f"Key: {key}, Before Change - Type: {type(first_value)}, Value: {first_value}")

        # Bind the current key explicitly so the log message names the right model
        dataframe['llm_conviction_score'] = dataframe['llm_conviction_score'].apply(
            lambda score, model_key=key: convert_value(score, model_key)
        )
        first_value = dataframe['llm_conviction_score'].iloc[0]  # Update after conversion

        # Print the variable type of the first value after changing
        print(f"Key: {key}, After Change - Type: {type(first_value)}, Value: {first_value}")
    else:
        print(f"Key: {key}, DataFrame is either empty or does not contain 'llm_conviction_score' column.")

# Ensure the directory exists
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)  # Creates the directory if it doesn't exist

# Output the log string to a file. The messages quote raw LLM output, so the
# encoding is fixed to UTF-8 rather than left to the platform default.
log_file_path = f"{log_dir}/conviction_score_log.txt"
with open(log_file_path, "w", encoding="utf-8") as log_file:
    log_file.write(log_string)

print(f"Log file saved at: {log_file_path}")


Key: whole_Qwen/Qwen2.5-7B-Instruct-Turbo, Before Change - Type: <class 'str'>, Value: 3
Key: whole_Qwen/Qwen2.5-7B-Instruct-Turbo, After Change - Type: <class 'numpy.float64'>, Value: 3.0
Key: whole_deepseek-ai/DeepSeek-R1, Before Change - Type: <class 'str'>, Value: 3
Key: whole_deepseek-ai/DeepSeek-R1, After Change - Type: <class 'numpy.float64'>, Value: 3.0
Key: whole_gemini-2.0-pro-exp-02-05, Before Change - Type: <class 'str'>, Value: 3
Key: whole_gemini-2.0-pro-exp-02-05, After Change - Type: <class 'numpy.float64'>, Value: 3.0
Key: whole_mistralai/Mistral-7B-Instruct-v0.2, Before Change - Type: <class 'str'>, Value: 3
For whole_mistralai/Mistral-7B-Instruct-v0.2, imputed -1 for conviction score because ValueError: Unclear

Key: whole_mistralai/Mistral-7B-Instruct-v0.2, After Change - Type: <class 'numpy.float64'>, Value: 3.0
Key: whole_meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo, Before Change - Type: <class 'str'>, Value: 3
Key: whole_meta-llama/Meta-Llama-3.1-405B-Instruct-

## Formatting completed - ready to evaluate

---

## Evaluation

In [39]:
# Human annotations; passed to the evaluation functions below as the ground truth
human_dataset = pd.read_csv("../data_needed_for_inference/complete_dataset_with_refined_prices.csv")

### F1 score calculator

In [40]:
import ast

def f1_score_calculator_all(ground_truth, predicted, tuple_type="single"):
    """
    Calculate the set-based F1 score between ground truth and predictions for:
      - single:   list of strings
      - pairs:    list of 2-tuples of (string, string)
      - triplets: list of 3-tuples of (string, string, numeric)

    Strings are compared case-insensitively and duplicates are collapsed, because
    both inputs are converted to sets. Any input that does not match the expected
    format (including a string that cannot be parsed as a Python literal) scores 0.0.

    NOTE(review): two empty lists also score 0.0, since precision and recall are
    defined as 0.0 for empty sets — confirm this is the intended convention.

    :param ground_truth: list or string representation of list
    :param predicted: list or string representation of list
    :param tuple_type: one of {"single", "pairs", "triplets"}
    :return: F1 score (float)
    :raises ValueError: if tuple_type is not one of the supported values
    """
    if tuple_type not in ("single", "pairs", "triplets"):
        raise ValueError("Unsupported tuple_type. Choose from 'single', 'pairs', 'triplets'.")

    def parse(value):
        """Turn a string representation into a Python object; None if unparseable."""
        if not isinstance(value, str):
            return value
        try:
            return ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            return None  # rejected below by the list check

    def normalize(items):
        """Return the set of normalized items, or None when the format is invalid."""
        if not isinstance(items, list):
            return None

        normalized = set()
        for item in items:
            if tuple_type == "single":
                if not isinstance(item, str):
                    return None
                normalized.add(item.lower())
                continue

            expected_length = 2 if tuple_type == "pairs" else 3
            if not (isinstance(item, tuple) and len(item) == expected_length):
                return None
            # The first two fields are always strings
            if not (isinstance(item[0], str) and isinstance(item[1], str)):
                return None
            # Triplets carry a numeric third field; numpy integers are accepted too
            if tuple_type == "triplets" and not isinstance(item[2], (int, float, np.integer)):
                return None

            # Lower-case the strings, keep the numeric field (if any) as-is
            normalized.add((item[0].lower(), item[1].lower()) + tuple(item[2:]))
        return normalized

    ground_truth_set = normalize(parse(ground_truth))
    predicted_set = normalize(parse(predicted))
    if ground_truth_set is None or predicted_set is None:
        return 0.0  # Invalid format

    # Precision, recall and F1 over the two sets
    true_positives = ground_truth_set & predicted_set

    precision = len(true_positives) / len(predicted_set) if predicted_set else 0.0
    recall    = len(true_positives) / len(ground_truth_set) if ground_truth_set else 0.0
    f1_score  = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0

    return f1_score

# ------------------------------------------------------------------------------
# EXAMPLE USAGE
# ------------------------------------------------------------------------------

# 1) SINGLE
ground_truth_single = ["AAPL", "MTA"]
predicted_single = ["AAPL", "META"]  # "MTA" vs "META" differ, so only AAPL matches -> F1 = 0.5
f1_single = f1_score_calculator_all(ground_truth_single, predicted_single, tuple_type="single")
print(f"Single F1: {f1_single}")

# 2) PAIRS
ground_truth_pairs = [("AAPL", "BUY"), ("MSFT", "SELL")]
predicted_pairs    = [("AAPL", "BUY"), ("MSFT", "SELL")]
f1_pairs = f1_score_calculator_all(ground_truth_pairs, predicted_pairs, tuple_type="pairs")
print(f"Pairs F1: {f1_pairs}")

# 3) TRIPLETS (case differs on purpose; the MSFT score differs, so one of two triplets matches)
ground_truth_triplets = [("AAPL", "BUY", 2), ("MSFT", "SELL", 3.0)]
predicted_triplets    = [("aapl", "buy", 2), ("msft", "sell", 2)]
f1_triplets = f1_score_calculator_all(ground_truth_triplets, predicted_triplets, tuple_type="triplets")
print(f"Triplets F1: {f1_triplets}")


Single F1: 0.5
Pairs F1: 1.0
Triplets F1: 0.5


### L1 benchmark: `(ticker_name)` single

In [41]:
def evaluate_llm_ticker_name(human_dataset, df_llm_annotations):
    """Macro F1 over videos, comparing ticker names only.

    Both inputs are collapsed to one list of tickers per ``video_id``
    (``llm_ticker_name`` on the LLM side, ``ticker_name`` on the human side),
    inner-joined on ``video_id``, scored per video with
    ``f1_score_calculator_all(..., "single")`` and averaged.

    Videos present in only one of the two inputs are dropped by the inner join
    and therefore do not contribute to the mean.
    """

    def tickers_per_video(frame, ticker_column, list_column):
        # One row per video_id holding the list of its tickers
        grouped = frame.groupby('video_id', group_keys=False)[ticker_column]
        return grouped.apply(list).reset_index(name=list_column)

    df_llm_condensed = tickers_per_video(df_llm_annotations, 'llm_ticker_name', 'llm_ticker_names')
    df_human_condensed = tickers_per_video(human_dataset, 'ticker_name', 'human_ticker_names')

    # Inner join on 'video_id'
    df_condensed = pd.merge(
        df_llm_condensed,
        df_human_condensed,
        on='video_id',
        suffixes=('_llm', '_human'),
    )

    def score_row(row):
        # Human list is the ground truth, LLM list the prediction
        return f1_score_calculator_all(row['human_ticker_names'], row['llm_ticker_names'], "single")

    df_condensed['f1_score'] = df_condensed.apply(score_row, axis=1)

    # Macro F1 = unweighted mean of the per-video scores
    return df_condensed['f1_score'].mean()


In [42]:
# L1 macro F1 for the first loaded model
evaluate_llm_ticker_name(human_dataset, parsed_inference_dataframes_dictionary[first_key_name])

0.5605905207580932

### L2 benchmark: `(ticker_name, action)` pair

In [43]:
def evaluate_llm_ticker_name_action(human_dataset, df_llm_annotations):
    """Macro F1 over videos, comparing (ticker, action) pairs.

    Both inputs are collapsed to one list of pairs per ``video_id``, inner-joined
    on ``video_id``, scored per video with
    ``f1_score_calculator_all(..., tuple_type="pairs")`` and averaged.

    Videos present in only one of the two inputs are dropped by the inner join
    and therefore do not contribute to the mean.
    """

    def pairs_per_video(frame, ticker_column, action_column, list_column):
        # One row per video_id holding its list of (ticker, action) tuples
        per_video = frame.groupby('video_id', group_keys=False)[[ticker_column, action_column]]
        zipped = per_video.apply(
            lambda group: list(zip(group[ticker_column], group[action_column]))
        )
        return zipped.reset_index(name=list_column)

    df_llm_condensed = pairs_per_video(df_llm_annotations, 'llm_ticker_name', 'llm_action', 'llm_pairs')
    df_human_condensed = pairs_per_video(human_dataset, 'ticker_name', 'action', 'human_pairs')

    # Inner join on 'video_id'
    df_condensed = pd.merge(
        df_llm_condensed,
        df_human_condensed,
        on='video_id',
        suffixes=('_llm', '_human'),
    )

    def score_row(row):
        # Human pairs are the ground truth, LLM pairs the prediction
        return f1_score_calculator_all(row['human_pairs'], row['llm_pairs'], tuple_type="pairs")

    df_condensed['f1_score'] = df_condensed.apply(score_row, axis=1)

    # Macro F1 = unweighted mean of the per-video scores
    return df_condensed['f1_score'].mean()

In [44]:
# L2 macro F1 for the first loaded model
evaluate_llm_ticker_name_action(human_dataset, parsed_inference_dataframes_dictionary[first_key_name])

0.41011549322714463

### L3 benchmark: `(ticker_name, action, conviction_score)` triplet

In [45]:
def evaluate_llm_action_ticker_name_conviction_score(human_dataset, df_llm_annotations):
    """Macro F1 over videos, comparing (ticker, action, conviction score) triplets.

    Both inputs are collapsed to one list of triplets per ``video_id``,
    inner-joined on ``video_id``, scored per video with
    ``f1_score_calculator_all(..., tuple_type="triplets")`` and averaged.

    Videos present in only one of the two inputs are dropped by the inner join
    and therefore do not contribute to the mean.
    """

    def triplets_per_video(frame, ticker_column, action_column, score_column, list_column):
        # One row per video_id holding its list of (ticker, action, score) tuples
        per_video = frame.groupby('video_id', group_keys=False)[
            [ticker_column, action_column, score_column]
        ]
        zipped = per_video.apply(
            lambda group: list(zip(group[ticker_column], group[action_column], group[score_column]))
        )
        return zipped.reset_index(name=list_column)

    df_llm_condensed = triplets_per_video(
        df_llm_annotations, 'llm_ticker_name', 'llm_action', 'llm_conviction_score', 'llm_triplets'
    )
    df_human_condensed = triplets_per_video(
        human_dataset, 'ticker_name', 'action', 'conviction_score', 'human_triplets'
    )

    # Inner join on 'video_id'
    df_condensed = pd.merge(
        df_llm_condensed,
        df_human_condensed,
        on='video_id',
        suffixes=('_llm', '_human'),
    )

    def score_row(row):
        # Human triplets are the ground truth, LLM triplets the prediction
        return f1_score_calculator_all(row['human_triplets'], row['llm_triplets'], tuple_type="triplets")

    df_condensed['f1_score'] = df_condensed.apply(score_row, axis=1)

    # Macro F1 = unweighted mean of the per-video scores
    return df_condensed['f1_score'].mean()


In [46]:
# L3 macro F1 for the first loaded model. The evaluator returns a single float,
# so despite its name `df` holds a scalar here, not a DataFrame.
df = evaluate_llm_action_ticker_name_conviction_score(human_dataset, parsed_inference_dataframes_dictionary[first_key_name])
df

0.19540343769820315

### MAE: number of recommendations

In [47]:
def evaluate_number_of_recs(human_dataset, df_llm_annotations):
    """Mean absolute error (MAE) between LLM and human recommendation counts.

    The number of rows per `video_id` is counted in each input, the two count
    tables are inner-merged on `video_id`, and the absolute difference of the
    counts is averaged over the merged videos.

    Parameters
    ----------
    human_dataset : pd.DataFrame
        Human annotations; one row per recommendation, with a `video_id` column.
    df_llm_annotations : pd.DataFrame
        LLM annotations; one row per recommendation, with a `video_id` column.

    Returns
    -------
    Mean of |llm count - human count| over videos present in both inputs.
    """
    def _rows_per_video(frame, count_column):
        # Number of rows for each video_id, as a two-column frame.
        return frame.groupby('video_id').size().reset_index(name=count_column)

    llm_counts = _rows_per_video(df_llm_annotations, 'llm_recommendation_count')
    human_counts = _rows_per_video(human_dataset, 'human_recommendation_count')

    # Default (inner) merge: videos missing from either side do not contribute.
    paired = pd.merge(llm_counts, human_counts, on='video_id', suffixes=('_llm', '_human'))

    # MAE (not RMSE): average of the absolute count differences.
    count_gap = paired['llm_recommendation_count'] - paired['human_recommendation_count']
    return np.mean(np.abs(count_gap))

    

In [48]:
# Spot-check: mean absolute recommendation-count error for the model stored under `first_key_name`.
evaluate_number_of_recs(human_dataset, parsed_inference_dataframes_dictionary[first_key_name])

0.6840277777777778

### Combining results

In [49]:
# Build one summary record per model: the three F1 benchmarks (as percentages)
# plus the recommendation-count MAE.
def _as_percent(score):
    """Scale a score by 100 and round it to two decimals."""
    return round(score * 100, 2)


results = []

for model_name, df_llm_annotations in parsed_inference_dataframes_dictionary.items():
    # F1 benchmarks, evaluated in the order ticker -> pair -> triplet.
    f1_single = _as_percent(evaluate_llm_ticker_name(human_dataset, df_llm_annotations))
    f1_pairs = _as_percent(evaluate_llm_ticker_name_action(human_dataset, df_llm_annotations))
    f1_triplets = _as_percent(
        evaluate_llm_action_ticker_name_conviction_score(human_dataset, df_llm_annotations)
    )
    # The count error is kept on its raw scale (no percentage, no rounding).
    mae_count = evaluate_number_of_recs(human_dataset, df_llm_annotations)

    record = {
        "Model": model_name,
        "T": f1_single,
        "TA": f1_pairs,
        "TAC": f1_triplets,
        "Count (MAE)" : mae_count
    }
    results.append(record)

# One row per model, columns in the order of the record keys.
df_waterfall_results = pd.DataFrame(results)


In [50]:
# Rank models by their triplet (TAC) score, best first.
# The sorted frame is rebound rather than sorted with `inplace=True`, so the
# cell does not mutate an object created by an earlier cell.
df_waterfall_results = df_waterfall_results.sort_values(by=["TAC"], ascending=False)

In [51]:
# Work on a copy so that columns added to `df` later do not alter `df_waterfall_results`.
df = df_waterfall_results.copy()

In [52]:

# Model keys have the form "<prefix>_<base_name>". Split on the first
# underscore only, because base names can themselves contain underscores.
df[['prefix', 'base_name']] = df['Model'].str.split('_', n=1, expand=True)

# One row per base model, one column per (metric, prefix) combination.
metric_columns = ['T', 'TA', 'TAC', 'Count (MAE)']
df_pivot = df.pivot(index='base_name', columns='prefix', values=metric_columns)

# Collapse the (metric, prefix) column MultiIndex into flat "<prefix>_<metric>" labels.
df_pivot.columns = [f"{prefix}_{metric}" for metric, prefix in df_pivot.columns]

# Turn 'base_name' back into an ordinary column.
df_pivot = df_pivot.reset_index()

# Fix the column order: every 'whole' metric first, then every 'segmentwise' metric.
desired_order = [
    'base_name',
    'whole_T',
    'whole_TA',
    'whole_TAC',
    'whole_Count (MAE)',
    'segmentwise_T',
    'segmentwise_TA',
    'segmentwise_TAC',
    'segmentwise_Count (MAE)',
]
df_pivot = df_pivot[desired_order]

In [53]:
# Remove the recommendation-count error columns so only the F1 columns remain.
# Rebinding with `errors="ignore"` keeps the cell safe to re-run: once the
# columns are gone, a second run is a no-op instead of raising KeyError.
df_pivot = df_pivot.drop(
    columns=['whole_Count (MAE)', 'segmentwise_Count (MAE)'],
    errors='ignore',
)

In [54]:
# Order the rows alphabetically by model name, then display the table.
# (A bare `df_pivot` before the sort had no effect: only a cell's last
# expression is displayed.)
df_pivot = df_pivot.sort_values(by=["base_name"], ascending=True)
df_pivot

Unnamed: 0,base_name,whole_T,whole_TA,whole_TAC,segmentwise_T,segmentwise_TA,segmentwise_TAC
0,Qwen/Qwen2.5-72B-Instruct-Turbo,65.65,47.42,19.65,79.2,48.36,22.77
1,Qwen/Qwen2.5-7B-Instruct-Turbo,56.06,41.01,19.54,65.57,38.16,20.76
2,claude-3-5-haiku-20241022,56.94,40.66,19.75,68.18,43.61,21.54
3,claude-3-5-sonnet-20241022,65.32,43.38,22.54,75.9,46.47,24.6
4,deepseek-ai/DeepSeek-R1,63.89,45.81,21.29,71.18,47.29,21.36
5,deepseek-ai/DeepSeek-V3,64.91,47.1,23.65,77.56,51.35,28.17
6,gemini-1.5-flash-002,56.9,38.2,18.97,72.09,44.83,21.31
7,gemini-1.5-flash-002_outputs_video_full_length,64.91,45.06,20.66,,,
8,gemini-1.5-flash-002_outputs_video_segments,,,,86.23,54.21,23.27
9,gemini-1.5-pro-002,62.78,43.25,21.26,76.56,46.38,22.87


## Metric 1: Average across all models

In [55]:

# 1. Per-model average of the 'whole' and 'segmentwise' F1 for each task.
#    `mean(axis=1)` skips NaN by default, so a model that has only one of the
#    two settings keeps that setting's score as its average.
avg_sources = {
    'T_avg': ['whole_T', 'segmentwise_T'],
    'TA_avg': ['whole_TA', 'segmentwise_TA'],
    'TAC_avg': ['whole_TAC', 'segmentwise_TAC'],
}
for avg_name, source_cols in avg_sources.items():
    df_pivot[avg_name] = df_pivot[source_cols].mean(axis=1)

# 2. Average the per-model values over all rows to get one number per task.
overall_T_avg = df_pivot.loc[:, 'T_avg'].mean()
overall_TA_avg = df_pivot.loc[:, 'TA_avg'].mean()
overall_TAC_avg = df_pivot.loc[:, 'TAC_avg'].mean()

# 3. Report the three overall averages.
print("\nOverall average F1 for T:", overall_T_avg)
print("Overall average F1 for TA:", overall_TA_avg)
print("Overall average F1 for TAC:", overall_TAC_avg)


Overall average F1 for T: 67.86185185185185
Overall average F1 for TA: 44.25981481481482
Overall average F1 for TAC: 21.014074074074074


## Metric 2: Ticker (T) performance for segmentwise (s) and whole (w) transcripts on the current directory selection (models)

66.65 for llm 


In [56]:
# Mean ticker (T) F1 over all rows, separately for each setting.
# `Series.mean()` skips NaN by default, so each average only covers the rows
# that have a value for that setting.
average_whole_T = df_pivot.loc[:, "whole_T"].mean()
average_segmentwise_T = df_pivot.loc[:, "segmentwise_T"].mean()

# Unweighted midpoint of the two setting-level averages.
midpoint_T = (average_whole_T + average_segmentwise_T) / 2

# Report all three numbers.
print(f"Average whole_T: {average_whole_T}")
print(f"Average segmentwise_T: {average_segmentwise_T}")
print(midpoint_T)

Average whole_T: 61.3247619047619
Average segmentwise_T: 73.13545454545455
67.23010822510822


In [57]:
"""for key, df in parsed_inference_dataframes_dictionary.items():
    if "llm_action" in df.columns:
        print(f"Key: {key}")
        print(df["llm_action"].value_counts())
        print("\n" + "="*50 + "\n")
    else:
        print(f"Key: {key} - 'llm_action' column not found in dataframe.\n")
"""

'for key, df in parsed_inference_dataframes_dictionary.items():\n    if "llm_action" in df.columns:\n        print(f"Key: {key}")\n        print(df["llm_action"].value_counts())\n        print("\n" + "="*50 + "\n")\n    else:\n        print(f"Key: {key} - \'llm_action\' column not found in dataframe.\n")\n'

## Check video IDs being evaluated for each model

In [58]:
# Sanity check: for each model, report how many distinct videos the model
# output covers, how many the human annotations cover, and how many are left
# after the inner merge on 'video_id'.

# The human side does not depend on the model, so it is built once here
# instead of being rebuilt on every loop iteration.
df_human_condensed = (
    human_dataset
    .groupby('video_id', group_keys=False)[['ticker_name', 'action', 'conviction_score']]
    .apply(lambda x: list(zip(x['ticker_name'], x['action'], x['conviction_score'])))
    .reset_index(name='human_triplets')
)
count_human = df_human_condensed['video_id'].nunique()

for model_name, df_llm_annotations in parsed_inference_dataframes_dictionary.items():
    print(f"\nProcessing model: {model_name}\n" + "="*40)

    # Collapse the model's rows to one list of triplets per video
    df_llm_condensed = (
        df_llm_annotations
        .groupby('video_id', group_keys=False)[['llm_ticker_name', 'llm_action', 'llm_conviction_score']]
        .apply(lambda x: list(zip(x['llm_ticker_name'], x['llm_action'], x['llm_conviction_score'])))
        .reset_index(name='llm_triplets')
    )

    # Default (inner) merge keeps only videos present on both sides
    df_condensed = pd.merge(df_llm_condensed, df_human_condensed, on='video_id', suffixes=('_llm', '_human'))

    # Count unique video IDs on the model side and after the merge
    count_llm = df_llm_condensed['video_id'].nunique()
    count_merged = df_condensed['video_id'].nunique()

    # Print the counts with clear formatting
    print(f"LLM Condensed Unique Count: {count_llm}")
    print(f"Human Condensed Unique Count: {count_human}")
    print(f"Merged Condensed Unique Count: {count_merged}")
    print("-"*50)



Processing model: whole_Qwen/Qwen2.5-7B-Instruct-Turbo
LLM Condensed Unique Count: 288
Human Condensed Unique Count: 288
Merged Condensed Unique Count: 288
--------------------------------------------------

Processing model: whole_deepseek-ai/DeepSeek-R1
LLM Condensed Unique Count: 279
Human Condensed Unique Count: 288
Merged Condensed Unique Count: 279
--------------------------------------------------

Processing model: whole_gemini-2.0-pro-exp-02-05
LLM Condensed Unique Count: 188
Human Condensed Unique Count: 288
Merged Condensed Unique Count: 188
--------------------------------------------------

Processing model: whole_mistralai/Mistral-7B-Instruct-v0.2
LLM Condensed Unique Count: 282
Human Condensed Unique Count: 288
Merged Condensed Unique Count: 282
--------------------------------------------------

Processing model: whole_meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo
LLM Condensed Unique Count: 288
Human Condensed Unique Count: 288
Merged Condensed Unique Count: 288
-----