In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import ast
from itertools import zip_longest

In [None]:
def get_llm_ratings(llm_rating_sources: list[str]) -> list[float]:
    """
    Collect LLM ratings from CSV files and average them position-wise across sources.

    Every source directory is searched recursively for ``.csv`` files. For each CSV
    that has a ``response`` column, the first row of that column is parsed as a
    Python dict literal and the dict's values are taken as ratings. Ratings from all
    CSV files of one source are concatenated in sorted directory/file order.

    Args:
        llm_rating_sources: list of llm rating sources. Each entry is a directory name
                            resolved relative to ``<parent of cwd>/../outputs/``.
                            Subdirectories will be checked starting from this directory.
    Return:
        List of all llm ratings for the given source.
        If multiple sources are given, entry ``i`` is the average of the ``i``-th
        rating of every source that has one (shorter sources do not contribute to
        later positions). Sources without any ratings are ignored; an empty list is
        returned when no ratings are found at all.
    """
    outputs_root = f"{os.path.dirname(os.path.abspath(os.getcwd()))}/../outputs"
    all_source_ratings = []

    for source in llm_rating_sources:
        source_dir = f"{outputs_root}/{source}"
        source_ratings = []

        # Traverse through the directory to find CSV files
        for root, dirs, files in os.walk(source_dir):
            # Sorting in place makes os.walk descend in a deterministic order,
            # so the concatenated ratings do not depend on the filesystem.
            dirs.sort()
            for file in sorted(files):
                if not file.endswith(".csv"):
                    continue

                df = pd.read_csv(os.path.join(root, file))

                # The response column is expected to hold the dictionary of ratings as a string
                if 'response' not in df.columns or df.empty:
                    continue

                # Convert the response from string to a dictionary
                response_dict = ast.literal_eval(df['response'].iloc[0])

                # Accumulate (rather than overwrite) so every CSV of the source contributes
                source_ratings.extend(response_dict.values())

        if source_ratings:
            all_source_ratings.append(source_ratings)

    # Average position-wise; zip_longest pads shorter sources with None, which is
    # excluded from both the sum and the count.
    llm_ratings = []
    for group in zip_longest(*all_source_ratings, fillvalue=None):
        present = [rating for rating in group if rating is not None]
        llm_ratings.append(sum(present) / len(present))

    return llm_ratings

In [None]:
# Human suspense ratings to compare against the LLM ratings (presumably one value
# per passage -- confirm). While this list is empty, USE_HUMAN_RATINGS evaluates to
# False and the plotting cells draw the LLM curves only.
human_ratings = []

In [None]:
# Directory names passed to get_llm_ratings; each is searched (including
# subdirectories) for CSV files containing a 'response' column.
llm_rating_sources = ["bentz_experiment/standard/creative"]

# Ratings for the listed sources; averaged across sources when more than one is given.
llm_ratings = get_llm_ratings(llm_rating_sources)

In [None]:
# Flag checked by the plotting cells: human ratings are overlaid only when at least
# one human rating has been provided.
USE_HUMAN_RATINGS = len(human_ratings) > 0

In [None]:
# Plot the raw LLM suspense ratings, overlaying the human ratings when available.
plt.plot(llm_ratings, label='LLM Ratings')
plt.title('LLM Suspense Ratings')

if USE_HUMAN_RATINGS:
    plt.plot(human_ratings, label='Human Ratings')
    plt.title('LLM vs Human Suspense Ratings')

plt.xlabel('Passage')
plt.ylabel('Suspense Rating')
# Create the legend only after every curve has been drawn: a legend built earlier
# is not refreshed, so the human-ratings curve would be missing from it.
plt.legend()

plt.show()

In [None]:
def _min_max_normalize(ratings):
    """Linearly rescale ratings to the range [0, 1].

    An empty input is returned as an empty array, and a constant series (max == min)
    maps to all zeros instead of dividing by zero and producing NaNs.
    """
    values = np.asarray(ratings, dtype=float)
    if values.size == 0:
        return values
    lowest = np.min(values)
    value_range = np.max(values) - lowest
    if value_range == 0:
        return np.zeros_like(values)
    return (values - lowest) / value_range


# Plot min-max normalized ratings so LLM and human curves share the same [0, 1] scale.
llm_ratings_norm = _min_max_normalize(llm_ratings)
plt.plot(llm_ratings_norm, label='LLM Ratings Normalized')
plt.title('Normalized LLM Suspense Ratings')

if USE_HUMAN_RATINGS:
    human_ratings_norm = _min_max_normalize(human_ratings)
    plt.plot(human_ratings_norm, label='Human Ratings Normalized')
    plt.title('Normalized LLM vs Human Suspense Ratings')

plt.xlabel('Passage')
plt.ylabel('Suspense Rating')
# Create the legend only after every curve has been drawn: a legend built earlier
# is not refreshed, so the human-ratings curve would be missing from it.
plt.legend()

plt.show()

In [None]:
# Plot the change in rating between consecutive passages (first difference),
# overlaying the human ratings' differences when available.
llm_ratings_diff = np.diff(llm_ratings)
plt.plot(llm_ratings_diff, label='LLM Ratings Diff')
plt.title('LLM Suspense Ratings Diff')

if USE_HUMAN_RATINGS:
    human_ratings_diff = np.diff(human_ratings)
    plt.plot(human_ratings_diff, label='Human Ratings Diff')
    plt.title('LLM vs Human Suspense Ratings Diff')

plt.xlabel('Passage')
plt.ylabel('Suspense Rating')
# Create the legend only after every curve has been drawn: a legend built earlier
# is not refreshed, so the human-ratings curve would be missing from it.
plt.legend()

plt.show()