In [38]:
import pandas as pd
import re
from pathlib import Path
import json

In [39]:
def parse_llava(prompt_text):
    # Find the JSON block enclosed in triple backticks with "json"
    pattern = r"```json\s*(\{.*?\})\s*```"
    match = re.search(pattern, prompt_text, re.DOTALL)
    if match:
        json_str = match.group(1)
        return json_str

    else:
        return None


Process Overview:

Start at the main folder. \
Iterate through each subfolder (model folder). \
For each model folder:\
List all the text files.\
For each text file:\
Read its filename to get the video_id.\
Open the file and read its content to get the prompt output.\
Create a row in the dataframe with model_name, video_id, and prompt output.\



Finally, you will have one dataframe per model folder containing all these rows.

In [40]:
input_output_folders = [
[Path('mllm_prompt_s_video_outputs'), Path('mllm_s_combined_output')],
[Path('mllm_prompt_w_video_outputs'), Path('mllm_w_combined_output')]
]

In [41]:
def txt_file_combiner(main_folder):
    # Create a dictionary to hold a DataFrame for each model folder
    model_dataframes = {}

    # Iterate over each model folder in the main folder
    for model_folder in main_folder.iterdir():
        if model_folder.is_dir():
            rows = []
            
            # Iterate over each text file in the model folder
            for txt_file in model_folder.glob("*.txt"):
                # Skip the file if it's named 'details.txt'
                if txt_file.name.lower() == "details.txt":
                    continue

                # Extract the YouTube video id from the filename (format "videoid__rest_of_the_filename")
                full_stem = txt_file.stem
                video_id = full_stem.split("__")[0] if "__" in full_stem else full_stem
                
                # Read the file content as prompt output
                with open(txt_file, 'r', encoding='utf-8') as f:
                    prompt_output = f.read().strip()
                
                # Create a row with columns in the specified order
                row = {
                    'video_id': video_id,
                    'video_title': "NA",
                    'transcript': "NA",
                    'model_source': "NA",
                    'model_name': model_folder.name,
                    'date_of_inference': "NA",
                    'prompt_used': "NA",
                    'prompt_output': prompt_output,
                    'video_source': "NA"
                }
                rows.append(row)
            
            if rows:
                # Create the DataFrame with columns in the desired order
                columns_order = ['video_id', 'video_title', 'transcript', 'model_source', 
                                 'model_name', 'date_of_inference', 'prompt_used', 'prompt_output', 'video_source']
                df = pd.DataFrame(rows, columns=columns_order)
                model_dataframes[model_folder.name] = df
    
    return model_dataframes

In [42]:
# Iterate over the input/output folder pairs
for input_folder, output_folder in input_output_folders:
    # Ensure the output folder exists
    output_folder.mkdir(parents=True, exist_ok=True)
    
    # Generate the dictionary of DataFrames from the input folder
    model_dataframes = txt_file_combiner(input_folder)
    
    # Iterate over the dictionary and save each DataFrame as a CSV
    for model_name, df in model_dataframes.items():

        # If model_name == "Llava whatever"
        if model_name == "llava-v1.6-mistral-7b-hf_outputs_video_segments":
            print("This is llava")
            # we have a nomral df with prompt_output column right
            # Go row by row for each prompt_output row and apply the llava_parse(current_prompt_output) function which returns the new prompt output
            # Replace it with that parsed version
            # Move ahead 

            df['prompt_output'] = df['prompt_output'].apply(parse_llava)
        
        else:
            print("This is not llava")

        

        
        output_csv_path = output_folder / f"{model_name}.csv"
        df.to_csv(output_csv_path, index=False)
        print(f"Saved {output_csv_path}")

This is llava
Saved mllm_s_combined_output/llava-v1.6-mistral-7b-hf_outputs_video_segments.csv
This is not llava
Saved mllm_s_combined_output/gemini-2.0-pro-exp-02-05_outputs_video_segments.csv
This is not llava
Saved mllm_s_combined_output/gemini-2.0-flash-001_outputs_video_segments.csv
This is not llava
Saved mllm_s_combined_output/gemini-1.5-pro-002_outputs_video_segments.csv
This is not llava
Saved mllm_s_combined_output/gpt-4o-2024-08-06_outputs_video_segments.csv
This is not llava
Saved mllm_s_combined_output/gemini-1.5-flash-002_outputs_video_segments.csv
This is not llava
Saved mllm_w_combined_output/gemini-1.5-pro-002_outputs_video_full_length.csv
This is not llava
Saved mllm_w_combined_output/gemini-2.0-flash-001_outputs_video_full_length.csv
This is not llava
Saved mllm_w_combined_output/gemini-2.0-pro-exp-02-05_outputs_video_full_length.csv
This is not llava
Saved mllm_w_combined_output/gpt-4o-2024-08-06_outputs_video_full_length.csv
This is not llava
Saved mllm_w_combined_