In [2]:
import pandas as pd

# MLP EVOTORCH GA

In [3]:
import pandas as pd
import re

# Path to the EvoTorch GA log file parsed by this notebook.
# The path is relative, so it resolves against the kernel's working directory.
evotorch_file_path_GA = '../../../plotting_data/raw/MLP_evotorch_10Jan25_GA_7706549.log'

def MLP_Evotorch_data(file_path):
    """
    Parse an EvoTorch log file into per-repeat and per-generation F1 tables.

    Three kinds of log lines are recognised; every other line is ignored:
    - "Starting optimization repeat <i>/<total>" opens repeat <i> and closes
      the previously open repeat, if any. Any <total> is accepted, so logs
      from runs with a different number of repeats parse as well.
    - "Repeat <k> completed - Average F1 Score: <f1>, Time: <t> seconds"
      adds <t> to the open repeat's total time and keeps the highest <f1>
      seen for it. The <k> in the line is not used: the line is attributed
      to the most recently opened repeat.
    - "Generation <g> - Best F1 Score: <f1>" records one generational row
      for the open repeat.

    Matching lines that appear before the first "Starting optimization
    repeat" line are skipped.

    Args:
        file_path (str): Path to the log file.

    Returns:
        tuple: Two DataFrames:
            - Summary, columns ["Iteration", "Best_F1_Score", "Total_Time"]:
              one row per repeat, in log order. "Best_F1_Score" is the
              highest value parsed from that repeat's "Average F1 Score"
              completion lines. A repeat with no completion line gets
              Best_F1_Score = -inf and Total_Time = 0.0.
            - Generational, columns ["Iteration", "Generation",
              "Best_F1_Score"]: one row per generation line, in log order.
    """
    # Initialize lists to store parsed data
    repeat_summary = []  # Tuples of (Iteration, Best F1 Score, Total Time)
    generational_data = []  # Tuples of (Iteration, Generation, Best F1 Score)

    # Regex patterns. The total after the slash is matched generically
    # rather than hard-coded, so the parser does not depend on the run size.
    iteration_start_pattern = re.compile(r"Starting optimization repeat (\d+)/\d+")
    repeat_completed_pattern = re.compile(
        r"Repeat \d+ completed - Average F1 Score: ([\d\.]+), Time: ([\d\.]+) seconds"
    )
    generation_pattern = re.compile(r"Generation (\d+) - Best F1 Score: ([\d\.]+)")

    # State of the repeat currently being read
    current_iteration = None
    current_total_time = 0.0
    current_best_f1 = -float('inf')  # Any parsed F1 score compares higher

    # Stream the file line by line instead of loading it all into memory
    with open(file_path, 'r') as file:
        for line in file:
            # Start of a new repeat
            iteration_start_match = iteration_start_pattern.search(line)
            if iteration_start_match:
                # Flush the repeat that was open, then reset the accumulators
                if current_iteration is not None:
                    repeat_summary.append((current_iteration, current_best_f1, current_total_time))
                    current_total_time = 0.0
                    current_best_f1 = -float('inf')

                current_iteration = int(iteration_start_match.group(1))
                continue

            # Completion line for the open repeat
            repeat_completed_match = repeat_completed_pattern.search(line)
            if repeat_completed_match and current_iteration is not None:
                f1_score = float(repeat_completed_match.group(1))
                time_taken = float(repeat_completed_match.group(2))
                current_total_time += time_taken
                if f1_score > current_best_f1:
                    current_best_f1 = f1_score
                continue

            # Per-generation line for the open repeat
            generation_match = generation_pattern.search(line)
            if generation_match and current_iteration is not None:
                generation_number = int(generation_match.group(1))
                generation_f1 = float(generation_match.group(2))
                generational_data.append((current_iteration, generation_number, generation_f1))
                continue

    # The last repeat has no following "Starting ..." line to flush it
    if current_iteration is not None:
        repeat_summary.append((current_iteration, current_best_f1, current_total_time))

    # Build the result tables
    repeat_summary_df = pd.DataFrame(repeat_summary, columns=["Iteration", "Best_F1_Score", "Total_Time"])
    generational_data_df = pd.DataFrame(generational_data, columns=["Iteration", "Generation", "Best_F1_Score"])

    return repeat_summary_df, generational_data_df

# Parse the GA log into the per-repeat summary and the per-generation table
repeat_summary, generational_data = MLP_Evotorch_data(evotorch_file_path_GA)

# Print each table under its heading (plain-text output)
print("Summary (Best F1 Score and Total Time for each Iteration):", repeat_summary, sep="\n")
print("\nGenerational Data (Best F1 Score for each Iteration + Generation):", generational_data, sep="\n")


Summary (Best F1 Score and Total Time for each Iteration):
    Iteration  Best_F1_Score  Total_Time
0           1         0.4041      214.16
1           2         0.4235      197.86
2           3         0.4111      196.20
3           4         0.4013      195.49
4           5         0.4205      196.00
..        ...            ...         ...
95         96         0.4259      194.74
96         97         0.4191      193.23
97         98         0.4257      210.09
98         99         0.4287      195.02
99        100         0.4197      195.70

[100 rows x 3 columns]

Generational Data (Best F1 Score for each Iteration + Generation):
     Iteration  Generation  Best_F1_Score
0            1           1         0.3879
1            1           2         0.3879
2            1           3         0.3937
3            1           4         0.3981
4            1           5         0.3981
..         ...         ...            ...
795        100           4         0.4169
796        100       

In [4]:
# Display the per-repeat summary table returned by MLP_Evotorch_data.
repeat_summary

Unnamed: 0,Iteration,Best_F1_Score,Total_Time
0,1,0.4041,214.16
1,2,0.4235,197.86
2,3,0.4111,196.20
3,4,0.4013,195.49
4,5,0.4205,196.00
...,...,...,...
95,96,0.4259,194.74
96,97,0.4191,193.23
97,98,0.4257,210.09
98,99,0.4287,195.02


In [5]:
# NOTE(review): only the last expression of a cell is rendered, so the bare
# `repeat_summary` line produces no output here; `generational_data` is the
# table that gets displayed.
repeat_summary
generational_data

Unnamed: 0,Iteration,Generation,Best_F1_Score
0,1,1,0.3879
1,1,2,0.3879
2,1,3,0.3937
3,1,4,0.3981
4,1,5,0.3981
...,...,...,...
795,100,4,0.4169
796,100,5,0.4169
797,100,6,0.4169
798,100,7,0.4177


In [6]:
# Rename columns to the export schema:
# Iteration -> Repeat, Best_F1_Score -> F1_Score, Total_Time -> Time.
# A single non-in-place rename keeps the cell safe to re-run: columns that
# were already renamed are ignored by rename() instead of raising.
repeat_summary = repeat_summary.rename(
    columns={'Iteration': 'Repeat', 'Best_F1_Score': 'F1_Score', 'Total_Time': 'Time'}
)
# index=False keeps the positional index out of the CSV, consistent with the
# GENERATION and FINAL_SUMMARY exports in this notebook.
repeat_summary.to_csv('MLP_evotorch_10Jan25_GA_7706549_REPEAT.csv', index=False)
repeat_summary

Unnamed: 0,Repeat,F1_Score,Time
0,1,0.4041,214.16
1,2,0.4235,197.86
2,3,0.4111,196.20
3,4,0.4013,195.49
4,5,0.4205,196.00
...,...,...,...
95,96,0.4259,194.74
96,97,0.4191,193.23
97,98,0.4257,210.09
98,99,0.4287,195.02


In [7]:
# Rename columns to the export schema:
# Iteration -> Repeat, Best_F1_Score -> F1_Score.
# The frame is built with a 'Generation' column already (there is no 'Trial'
# column), so that column needs no rename.
generational_data = generational_data.rename(
    columns={'Iteration': 'Repeat', 'Best_F1_Score': 'F1_Score'}
)
generational_data.to_csv('MLP_evotorch_10Jan25_GA_7706549_GENERATION.csv', index=False)
generational_data

Unnamed: 0,Repeat,Generation,F1_Score
0,1,1,0.3879
1,1,2,0.3879
2,1,3,0.3937
3,1,4,0.3981
4,1,5,0.3981
...,...,...,...
795,100,4,0.4169
796,100,5,0.4169
797,100,6,0.4169
798,100,7,0.4177


# Max, min, mean, and standard deviation of F1 score per generation

In [8]:
import pandas as pd

# Spread of F1_Score across all repeats, per generation.
# Input: generational_data with 'Generation' and 'F1_Score' columns.
# Output: one row per Generation with the max, min, mean and standard deviation.
generational_summary = (
    generational_data
    .groupby('Generation')['F1_Score']
    .agg(
        Max_F1_Score='max',
        Min_F1_Score='min',
        Mean_F1_Score='mean',
        Std_F1_Score='std',
    )
    .reset_index()
)

# Show the per-generation table
generational_summary



Unnamed: 0,Generation,Max_F1_Score,Min_F1_Score,Mean_F1_Score,Std_F1_Score
0,1,0.4217,0.371,0.39241,0.011499
1,2,0.4255,0.3833,0.404288,0.010861
2,3,0.4291,0.3883,0.41038,0.009265
3,4,0.4339,0.3939,0.414688,0.008831
4,5,0.4341,0.3973,0.4177,0.008568
5,6,0.4341,0.3973,0.419912,0.008148
6,7,0.4347,0.3993,0.421264,0.007595
7,8,0.4357,0.4013,0.42268,0.007598


In [9]:
# Overall statistics across repeats: extremes, mean and standard deviation of
# the per-repeat F1 score, plus mean and standard deviation of the run time.
final_summary = dict(
    Best_F1_Score=repeat_summary['F1_Score'].max(),
    Worst_F1_Score=repeat_summary['F1_Score'].min(),
    Avg_F1_Score=repeat_summary['F1_Score'].mean(),
    StdDev_F1_Score=repeat_summary['F1_Score'].std(),
    Avg_Time=repeat_summary['Time'].mean(),
    StdDev_Time=repeat_summary['Time'].std(),
)

# Wrap the statistics in a one-row DataFrame
final_summary_df = pd.DataFrame([final_summary])

# Save the one-row summary to CSV, then display it
final_summary_df.to_csv('MLP_evotorch_10Jan25_GA_7706549_FINAL_SUMMARY.csv', index=False)
final_summary_df


Unnamed: 0,Best_F1_Score,Worst_F1_Score,Avg_F1_Score,StdDev_F1_Score,Avg_Time,StdDev_Time
0,0.4357,0.4013,0.42268,0.007598,195.0121,3.85317
