# GA Parse

In [1]:
import pandas as pd
import re
# Locate the GA run's log file relative to this notebook.
filename = 'gramacy_evotorch_10Jan25_250pop_paramplot_GA_7686802'
filename_GA = f'../../../raw/{filename}'
log_file_path = f'{filename_GA}.log'

# Read the whole log into memory, one list entry per line.
with open(log_file_path, 'r') as file:
    log_content = file.readlines()

# Parser state. Both trackers start as None so that a fresh kernel never hits
# an undefined name, and no stale value from a previously executed cell can
# end up in the output rows.
data = []
current_repeat = None
generation = None

# Regular expressions for the three kinds of log line we extract.
repeat_pattern = re.compile(r'REPEAT (\d+)/100')
generation_pattern = re.compile(r'Generation (\d+)/10:')
# The number may carry an exponent (e.g. 1.5e-05). Matching only digits and
# dots would silently truncate such a value to 1.5.
mse_pattern = re.compile(r'Best MSE So Far: ((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)')

# Walk the log once, remembering the most recent repeat/generation headers and
# emitting one record per "Best MSE So Far" line.
for line in log_content:
    line = line.strip()  # Remove extra whitespace

    repeat_match = repeat_pattern.search(line)
    if repeat_match:
        current_repeat = int(repeat_match.group(1))
        # A new repeat begins: the previous repeat's generation no longer applies.
        generation = None

    # Checked independently of the repeat header so that a line carrying both
    # pieces of information updates both trackers.
    generation_match = generation_pattern.search(line)
    if generation_match:
        generation = int(generation_match.group(1))

    mse_match = mse_pattern.search(line)
    if mse_match:
        mse = float(mse_match.group(1))
        data.append({'Repeat': current_repeat, 'Generation': generation, 'MSE': mse})

# Fix the column set explicitly so that an empty log still produces a frame
# with the expected columns instead of a column-less one.
df_GA = pd.DataFrame(data, columns=['Repeat', 'Generation', 'MSE'])
df_GA.to_csv(f'{filename}_GENERATIONS.csv', index=False)
df_GA


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


Unnamed: 0,Repeat,Generation,MSE
0,1,1,7.938541
1,1,2,7.938541
2,1,3,7.938541
3,1,4,7.769546
4,1,5,6.099251
...,...,...,...
995,100,6,6.617259
996,100,7,5.257891
997,100,8,5.048333
998,100,9,5.048333


# EA Parse

In [2]:
import pandas as pd
import re
# Locate the EA run's log file relative to this notebook.
filename_E = 'gramacy_evotorch_10Jan25_250pop_k_EA_7686636'
filename_EA = f'../../../raw/{filename_E}'
log_file_path = f'{filename_EA}.log'

# Read the whole log into memory, one list entry per line.
with open(log_file_path, 'r') as file:
    log_content = file.readlines()

# Parser state. Both trackers start as None so that a fresh kernel never hits
# an undefined name, and no stale value from a previously executed cell can
# end up in the output rows.
data = []
current_repeat = None
generation = None

# Regular expressions for the three kinds of log line we extract.
repeat_pattern = re.compile(r'REPEAT (\d+)/100')
generation_pattern = re.compile(r'Generation (\d+)/10:')
# The number may carry an exponent (e.g. 1.5e-05). Matching only digits and
# dots would silently truncate such a value to 1.5.
mse_pattern = re.compile(r'Best MSE So Far: ((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)')

# Walk the log once, remembering the most recent repeat/generation headers and
# emitting one record per "Best MSE So Far" line.
for line in log_content:
    line = line.strip()  # Remove extra whitespace

    repeat_match = repeat_pattern.search(line)
    if repeat_match:
        current_repeat = int(repeat_match.group(1))
        # A new repeat begins: the previous repeat's generation no longer applies.
        generation = None

    # Checked independently of the repeat header so that a line carrying both
    # pieces of information updates both trackers.
    generation_match = generation_pattern.search(line)
    if generation_match:
        generation = int(generation_match.group(1))

    mse_match = mse_pattern.search(line)
    if mse_match:
        mse = float(mse_match.group(1))
        data.append({'Repeat': current_repeat, 'Generation': generation, 'MSE': mse})

# Fix the column set explicitly so that an empty log still produces a frame
# with the expected columns instead of a column-less one.
df_EA = pd.DataFrame(data, columns=['Repeat', 'Generation', 'MSE'])
df_EA.to_csv(f'{filename_E}_GENERATIONS.csv', index=False)
df_EA


Unnamed: 0,Repeat,Generation,MSE
0,1,1,11.326159
1,1,2,11.326159
2,1,3,10.575054
3,1,4,9.872123
4,1,5,9.872123
...,...,...,...
995,100,6,9.335157
996,100,7,9.109122
997,100,8,9.009114
998,100,9,8.278005


# EA summary

In [3]:
# Best (lowest) MSE reached within each repeat of the EA run.
min_mse_df_EA = df_EA.groupby('Repeat')['MSE'].min().reset_index()

# One row per expected repeat, 1 through 100.
all_repeats = pd.DataFrame({'Repeat': range(1, 101)})

# Left-merge so every expected repeat has a row; a repeat that is absent from
# the parsed log gets MSE = NaN.
min_mse_df_EA = all_repeats.merge(min_mse_df_EA, on='Repeat', how='left')

# Missing repeats are deliberately left as NaN. Filling them with 0 would
# record a perfect score for a run that produced no data, dragging the
# minimum to 0 and skewing the mean and standard deviation. pandas reductions
# (min/max/mean/std) skip NaN by default, so NaN keeps later statistics honest.

# Display the result
min_mse_df_EA

Unnamed: 0,Repeat,MSE
0,1,8.088330
1,2,7.638203
2,3,7.985531
3,4,6.968668
4,5,6.593210
...,...,...
95,96,6.515400
96,97,6.952879
97,98,7.388397
98,99,5.316413


# GA summary

In [4]:
# Best (lowest) MSE reached within each repeat of the GA run.
min_mse_df_GA = df_GA.groupby('Repeat')['MSE'].min().reset_index()

# One row per expected repeat, 1 through 100.
all_repeats = pd.DataFrame({'Repeat': range(1, 101)})

# Left-merge so every expected repeat has a row; a repeat that is absent from
# the parsed log gets MSE = NaN.
min_mse_df_GA = all_repeats.merge(min_mse_df_GA, on='Repeat', how='left')

# Missing repeats are deliberately left as NaN. Filling them with 0 would
# record a perfect score for a run that produced no data, dragging the
# minimum to 0 and skewing the mean and standard deviation. pandas reductions
# (min/max/mean/std) skip NaN by default, so NaN keeps later statistics honest.

# Display the result
min_mse_df_GA

Unnamed: 0,Repeat,MSE
0,1,4.290932
1,2,3.269217
2,3,3.679793
3,4,4.162817
4,5,3.853625
...,...,...
95,96,3.837444
96,97,4.077254
97,98,3.803271
98,99,3.975649


# TIME ADD EA

In [5]:
import re
import pandas as pd

def evotorch_time_parse(log_file_path):
    """
    Collect per-repeat run times from an EvoTorch log file.

    Every line is checked for a "Repeat <n> Summary" header and for a
    "Time Taken: <t> seconds" entry (first occurrence per line). The n-th
    repeat number found is paired with the n-th time found.

    Args:
        log_file_path (str): Path to the log file to be parsed.

    Returns:
        pd.DataFrame: Columns 'Repeat' (int) and 'Time' (float), one row per
        repeat/time pair, in the order they appear in the log.

    Raises:
        ValueError: If the number of repeat headers differs from the number
            of time entries.
    """
    repeat_re = re.compile(r"Repeat (\d+) Summary")
    time_re = re.compile(r"Time Taken: ([\d\.]+) seconds")

    with open(log_file_path, 'r') as handle:
        lines = handle.readlines()

    # One search per line for each pattern; lines without a hit are dropped.
    repeat_hits = (repeat_re.search(text) for text in lines)
    repeats = [int(hit.group(1)) for hit in repeat_hits if hit]

    time_hits = (time_re.search(text) for text in lines)
    times = [float(hit.group(1)) for hit in time_hits if hit]

    # Pairing is positional, so unequal counts mean the pairing is unreliable.
    if len(repeats) != len(times):
        raise ValueError("Mismatch between number of repeats and times. Check log file format.")

    return pd.DataFrame({'Repeat': repeats, 'Time': times})

# Extract per-repeat timings for the EA run.
# The file name and path are set again here, so this cell does not depend on
# the values assigned in the EA parse cell; note that `log_file_path` is a
# shared module-level name and is overwritten by this assignment.
filename_E = 'gramacy_evotorch_10Jan25_250pop_k_EA_7686636'
filename_EA = f'../../../raw/{filename_E}'
log_file_path = f'{filename_EA}.log'

# evotorch_time_parse returns a DataFrame with 'Repeat' and 'Time' columns.
df_time_EA = evotorch_time_parse(log_file_path)
df_time_EA

Unnamed: 0,Repeat,Time
0,1,25.48
1,2,25.46
2,3,25.63
3,4,25.55
4,5,25.48
...,...,...
95,96,25.53
96,97,25.55
97,98,25.63
98,99,25.59


# TIME GA

In [6]:

# Extract per-repeat timings for the GA run.
# The file name and path are set again here, so this cell does not depend on
# the values assigned in the GA parse cell; note that `log_file_path` is a
# shared module-level name and is overwritten by this assignment.
filename = 'gramacy_evotorch_10Jan25_250pop_paramplot_GA_7686802'
filename_GA = f'../../../raw/{filename}'
log_file_path = f'{filename_GA}.log'

# evotorch_time_parse returns a DataFrame with 'Repeat' and 'Time' columns.
df_time_GA = evotorch_time_parse(log_file_path)
df_time_GA

Unnamed: 0,Repeat,Time
0,1,30.16
1,2,25.20
2,3,25.20
3,4,25.20
4,5,25.21
...,...,...
95,96,25.19
96,97,25.19
97,98,25.17
98,99,25.16


# MERGE EA

In [7]:
import pandas as pd

# Requires df_time_EA (per-repeat timings) and min_mse_df_EA (per-repeat best
# MSE) to have been created by earlier cells.

# Join timings and best MSE on 'Repeat'. The default inner join keeps only
# repeats that appear in both frames.
merged_df_EA = df_time_EA.merge(min_mse_df_EA, on="Repeat")

# Persist the combined per-repeat table next to the notebook.
merged_df_EA.to_csv(f'{filename_E}_REPEAT_SUMMARY.csv', index=False)

# Show the merged table as the cell output.
merged_df_EA


Unnamed: 0,Repeat,Time,MSE
0,1,25.48,8.088330
1,2,25.46,7.638203
2,3,25.63,7.985531
3,4,25.55,6.968668
4,5,25.48,6.593210
...,...,...,...
95,96,25.53,6.515400
96,97,25.55,6.952879
97,98,25.63,7.388397
98,99,25.59,5.316413


# MERGE GA

In [8]:

# Join GA timings and best MSE on 'Repeat'. The default inner join keeps only
# repeats that appear in both frames.
merged_df_GA = df_time_GA.merge(min_mse_df_GA, on="Repeat")

# Persist the combined per-repeat table next to the notebook.
merged_df_GA.to_csv(f'{filename}_REPEAT_SUMMARY.csv', index=False)

# Show the merged table as the cell output.
merged_df_GA


Unnamed: 0,Repeat,Time,MSE
0,1,30.16,4.290932
1,2,25.20,3.269217
2,3,25.20,3.679793
3,4,25.20,4.162817
4,5,25.21,3.853625
...,...,...,...
95,96,25.19,3.837444
96,97,25.19,4.077254
97,98,25.17,3.803271
98,99,25.16,3.975649


# EA Final summary

In [9]:
# Run-level statistics for the EA experiment, computed across repeats.
mse_per_repeat_EA = merged_df_EA['MSE']
time_per_repeat_EA = merged_df_EA['Time']

# Each value is a one-element list so the dict converts to a single-row frame.
mse_summary_EA = {
    # Lower MSE is better, so the minimum is the best repeat.
    'Best MSE': [mse_per_repeat_EA.min()],
    'Worst MSE': [mse_per_repeat_EA.max()],
    'Average MSE': [mse_per_repeat_EA.mean()],
    'MSE StdDev': [mse_per_repeat_EA.std()],
    # Timing statistics across repeats.
    'Avg Time': [time_per_repeat_EA.mean()],
    'Time StdDev': [time_per_repeat_EA.std()],
}

# Build the one-row summary table, save it, and display it.
mse_summary_df_EA = pd.DataFrame(data=mse_summary_EA)
mse_summary_df_EA.to_csv(f'{filename_E}_FINAL_SUMMARY_EA.csv', index=False)
mse_summary_df_EA


Unnamed: 0,Best MSE,Worst MSE,Average MSE,MSE StdDev,Avg Time,Time StdDev
0,5.239719,8.914597,7.311312,0.811544,25.5453,0.129743


# GA final summary

In [10]:
# Run-level statistics for the GA experiment, computed across repeats.
mse_per_repeat_GA = merged_df_GA['MSE']
time_per_repeat_GA = merged_df_GA['Time']

# Each value is a one-element list so the dict converts to a single-row frame.
mse_summary_GA = {
    # Lower MSE is better, so the minimum is the best repeat.
    'Best MSE': [mse_per_repeat_GA.min()],
    'Worst MSE': [mse_per_repeat_GA.max()],
    'Average MSE': [mse_per_repeat_GA.mean()],
    'MSE StdDev': [mse_per_repeat_GA.std()],
    # Timing statistics across repeats.
    'Avg Time': [time_per_repeat_GA.mean()],
    'Time StdDev': [time_per_repeat_GA.std()],
}

# Build the one-row summary table, save it, and display it.
mse_summary_df_GA = pd.DataFrame(data=mse_summary_GA)
mse_summary_df_GA.to_csv(f'{filename}_FINAL_SUMMARY_GA.csv', index=False)
mse_summary_df_GA


Unnamed: 0,Best MSE,Worst MSE,Average MSE,MSE StdDev,Avg Time,Time StdDev
0,2.933824,4.954795,3.93919,0.443351,25.246,0.496544
