In [200]:
import pandas as pd
import re
import numpy as np
import os

In [201]:
def parse_latex_table(table_str):
    r"""Parse the body rows of a LaTeX table into a pandas DataFrame.

    Rows are separated by the LaTeX row terminator ``\\`` at the end of a
    line (or at the end of the text); cells are separated by ``&``.  The
    first cell of every row becomes the index label, the remaining cells
    become the columns (labelled 0, 1, 2, ... by pandas).

    Formatting that is removed before the cells are split:
      * a ``\multirow{...}{...}`` prefix,
      * every ``\bfseries`` switch,
      * the braces left around the first cell.

    Args:
        table_str: text containing the table rows.

    Returns:
        pandas.DataFrame indexed by the cleaned first cell of each row.  A
        column is converted to a numeric dtype when every value in it
        parses as a number; otherwise the column keeps its (stripped)
        strings, so markers such as 'NR' are preserved.
    """
    # A row ends at "\\" followed by a line break or by the end of the text.
    # Matching the end of the text as well consumes the terminator of the
    # final row, which would otherwise stay glued to its last cell.
    raw_rows = re.split(r"\\\\[ \t]*(?:\r?\n|$)", table_str.strip())

    data = []
    for raw_row in raw_rows:
        # Braces are escaped on purpose: an unescaped "{1}" after "multirow"
        # is a regex quantifier, not the literal text "{1}".
        clean_row = re.sub(r"\\multirow\s*\{[^{}]*\}\s*\{[^{}]*\}", "", raw_row)
        clean_row = re.sub(r"\\bfseries\s*", "", clean_row)

        # Skip the empty piece left after the last terminator and blank rows.
        if not clean_row.strip():
            continue

        # Strip every cell so padded markers (" NR ") compare equal to 'NR'.
        cells = [cell.strip() for cell in clean_row.split('&')]

        # Remove the braces that wrapped the row label.
        cells[0] = re.sub(r"[{}]", "", cells[0]).strip()

        data.append(cells)

    model_names = [cells[0] for cells in data]
    data_points = [cells[1:] for cells in data]

    df = pd.DataFrame(data_points, index=model_names)

    def _numeric_or_unchanged(column):
        # Same result as the deprecated pd.to_numeric(..., errors='ignore'):
        # convert the whole column, or hand it back untouched on failure.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return df.apply(_numeric_or_unchanged)


In [202]:

# Helper function to determine color based on the difference value
def color_diff(value):
    r"""Format a difference value as a LaTeX cell, coloured by its sign.

    Args:
        value: the difference as a number, or a missing marker
            ('NR', None or NaN).

    Returns:
        str: "NR" for a missing marker; the plain three-decimal number when
        its magnitude is below 0.10; otherwise the three-decimal number
        wrapped in ``\textcolor{green}{...}`` (positive) or
        ``\textcolor{red}{...}`` (negative).
    """
    # Missing markers are handled first so no arithmetic is attempted on them.
    if value == 'NR' or pd.isna(value):
        return "NR"

    # Small differences stay uncoloured.
    if abs(value) < 0.10:
        return "{:.3f}".format(value)

    # Green for an increase, red for a decrease.
    template = r"\textcolor{green}{%.3f}" if value > 0 else r"\textcolor{red}{%.3f}"
    return template % value

# Convert 'NR' and calculate the differences, store them in the DataFrame
def process_data(parsed_data):
    r"""Render a parsed results table as LaTeX rows with coloured differences.

    Columns are read by position in consecutive groups of three.  For every
    complete group the third column is overwritten with
    ``second - first``; when either operand is 'NR', missing or not
    numeric, the difference is the string 'NR'.

    Each row is then rendered as
    ``\multirow{1}{*}{\bfseries <label>} & <cell> & ... \\`` where
      * numeric cells are written with three decimals,
      * difference cells (every third column) are formatted by
        ``color_diff``, which must be defined in this file,
      * non-numeric strings are written unchanged and any other
        non-numeric value is written as "NR",
      * the numeric maximum of each column is wrapped in ``\textbf{...}``.

    Args:
        parsed_data: pandas DataFrame whose index holds the row labels.
            Its difference columns are overwritten in place.

    Returns:
        str: the formatted rows joined with newlines.
    """
    table = parsed_data
    n_cols = table.shape[1]

    # Recompute each difference column from the two columns before it.  The
    # whole column is replaced at once, so a numeric column never has to
    # accept an 'NR' string through element-wise assignment.
    for first_pos in range(0, n_cols - 2, 3):
        firsts = map(_as_float, table.iloc[:, first_pos])
        seconds = map(_as_float, table.iloc[:, first_pos + 1])
        diffs = [
            'NR' if first is None or second is None else second - first
            for first, second in zip(firsts, seconds)
        ]
        table[table.columns[first_pos + 2]] = diffs

    # Column maxima over numeric cells only, so 'NR' and other strings can
    # neither break the comparison nor be selected as the maximum.
    column_max = []
    for col_pos in range(n_cols):
        numbers = [
            number
            for number in map(_as_float, table.iloc[:, col_pos])
            if number is not None
        ]
        column_max.append(max(numbers) if numbers else None)

    formatted_rows = []
    for label, *cells in table.itertuples(index=True, name=None):
        formatted_cells = []
        for col_pos, cell in enumerate(cells):
            number = _as_float(cell)

            if col_pos % 3 == 2:
                # Third cell of each group: colour-coded difference.
                cell_value = color_diff('NR' if number is None else number)
            elif number is not None:
                cell_value = f"{number:.3f}"
            elif isinstance(cell, str):
                # Non-numeric text (e.g. 'NR') is passed through unchanged.
                cell_value = cell
            else:
                cell_value = "NR"

            # Boldface the column maximum; non-numeric cells never qualify.
            if number is not None and number == column_max[col_pos]:
                cell_value = r"\textbf{%s}" % cell_value

            formatted_cells.append(cell_value)

        formatted_rows.append(
            r"\multirow{1}{*}{\bfseries %s} & " % (label,)
            + " & ".join(formatted_cells)
            + r"\\"
        )

    return "\n".join(formatted_rows)


def _as_float(cell):
    """Return ``cell`` as a float, or None if it is 'NR', missing or not numeric."""
    if isinstance(cell, str):
        cell = cell.strip()
    try:
        number = float(cell)
    except (TypeError, ValueError):
        return None
    # NaN counts as missing rather than as a number.
    return None if pd.isna(number) else number




In [203]:
# Base names of the input tables; each is read from '<name>.txt' in the
# current working directory.
file_names = ['wikipedia', 'reddit', 'synthetic_V1', 'synthetic_V2']

# Folder for the processed tables.  exist_ok=True makes this a no-op when the
# folder already exists, with no separate check-then-create step.
output_folder = 'processed_tables'
os.makedirs(output_folder, exist_ok=True)

# Loop over each file name
for base_name in file_names:
    input_file_path = f'{base_name}.txt'
    output_file_path = os.path.join(output_folder, f'processed_{base_name}.txt')

    # Read the LaTeX table from the input file
    with open(input_file_path, 'r') as file:
        table_str = file.read()

    # Parse the LaTeX table into a DataFrame
    df_parsed_table_data = parse_latex_table(table_str)

    # Process the data and get the LaTeX-formatted table
    latex_table = process_data(df_parsed_table_data)

    # Save the LaTeX table under the corresponding name in the output folder
    with open(output_file_path, 'w') as output_file:
        output_file.write(latex_table)

    print(f"Processed LaTeX table from {input_file_path} has been saved to {output_file_path}")

Processed LaTeX table from wikipedia.txt has been saved to processed_tables\processed_wikipedia.txt
Processed LaTeX table from reddit.txt has been saved to processed_tables\processed_reddit.txt
Processed LaTeX table from synthetic_V1.txt has been saved to processed_tables\processed_synthetic_V1.txt
Processed LaTeX table from synthetic_V2.txt has been saved to processed_tables\processed_synthetic_V2.txt
