# Metrics of the models

This notebook creates a table that compares different metrics across the models that have been benchmarked inside `coco_dataset`.
To run this notebook successfully, we recommend using a `Conda` virtual environment so that it has access to the required dependencies.

### Importing results files 

In [15]:
# Directory that holds the per-model `*_results.csv` files and `captions.csv`
files_path = './coco_dataset'

In [19]:
import pandas as pd
import os

def load_results_to_dataframe(directory_path=files_path):
    """Load every ``*_results.csv`` file in a directory into one DataFrame.

    The model name is taken from the file name (the part before the
    ``_results.csv`` suffix) and stored in a ``model_name`` column. Files
    that lack any of the required columns are skipped with a printed warning.

    Args:
        directory_path: Directory to scan for results files. Defaults to the
            value ``files_path`` had when this function was defined.

    Returns:
        A DataFrame with the columns ``image_id``, ``model_name``,
        ``time_in_microseconds`` and ``prediction`` and a fresh 0..n-1 index.
        When no usable results file is found, an empty DataFrame with those
        columns is returned instead of raising.
    """
    suffix = '_results.csv'
    required_columns = ['image_id', 'time_in_microseconds', 'prediction']
    output_columns = ['image_id', 'model_name', 'time_in_microseconds', 'prediction']

    df_list = []

    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of the result is reproducible between runs and machines.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(suffix):
            continue

        # Slice off the suffix only (str.replace would also remove any
        # earlier occurrence of the same text inside the name).
        model_name = filename[:-len(suffix)]

        df = pd.read_csv(os.path.join(directory_path, filename))

        if not set(required_columns).issubset(df.columns):
            print(f"Warning: File {filename} does not contain the required columns.")
            continue

        df['model_name'] = model_name

        # Keep only the required columns in the specified order
        df_list.append(df[output_columns])

    if not df_list:
        # pd.concat raises ValueError on an empty list; return an empty frame
        # with the expected schema so downstream cells still work.
        return pd.DataFrame(columns=output_columns)

    # ignore_index=True gives the combined frame a fresh 0..n-1 index
    return pd.concat(df_list, ignore_index=True)

# Load all results found in the default directory (`files_path`) into `df`
df = load_results_to_dataframe()

### Adding information from the COCO dataset captions

In [20]:
# Read the COCO captions and expose the 'caption' column as 'original_caption'
captions_df = pd.read_csv(os.path.join(files_path, 'captions.csv')).rename(
    columns={'caption': 'original_caption'}
)

# Left-join on 'image_id' so that every row already in df is kept
df = df.merge(captions_df, on='image_id', how='left')

### Cleaning up the data
Let's convert both the predicted caption (`prediction`) and the original caption (`original_caption`) to lowercase and format them the same way, so that the metrics we measure are more reliable.

In [None]:
# Define a function to clean the text according to the specified rules
def clean_text(text):
    """Normalise a caption so predictions and references compare fairly.

    The text is lowercased, surrounding whitespace and double quotation marks
    are removed, and a single trailing period is dropped.

    Args:
        text: The caption to clean. Non-string values (for example the NaN
            that pandas uses for a missing cell) are returned unchanged.

    Returns:
        The cleaned string, or ``text`` itself when it is not a string.
    """
    # Missing values reach this function as NaN/None; leave them untouched
    # instead of failing on `.lower()`.
    if not isinstance(text, str):
        return text

    # Strip whitespace BEFORE the quotes: otherwise a quote that is preceded
    # or followed by a space would never be removed.
    text = text.lower().strip().strip('"').strip()

    # Remove the period at the end if there is one, plus any whitespace that
    # was sitting in front of it.
    if text.endswith('.'):
        text = text[:-1].rstrip()
    return text

# Normalise both caption columns with clean_text, one column at a time
for column in ('original_caption', 'prediction'):
    df[column] = df[column].apply(clean_text)