In [3]:
import pandas as pd
import re

# Small excerpt of an Ax log used to try out the parsing approach.
# Only the "MSE <value> for trial <t> of run <r>/<total>" lines are of interest.
log_content = """
MSE 0.2253 for trial 468 of run 1/100
[INFO 12-07 01:21:56] ax.service.managed_loop: Running optimization trial 469...
Mean Squared Error: 0.2444
MSE 0.2444 for trial 480 of run 3/100
[INFO 12-07 01:21:59] ax.service.managed_loop: Running optimization trial 481...
Mean Squared Error: 0.1388
MSE 0.1388 for trial 499 of run 2/100
[INFO 12-07 01:22:12] ax.service.managed_loop: Running optimization trial 500...
Mean Squared Error: 0.1942
MSE 0.1942 for trial 469 of run 1/100
[INFO 12-07 01:22:17] ax.service.managed_loop: Running optimization trial 470...
Mean Squared Error: 0.2784
MSE 0.2784 for trial 481 of run 3/100
[INFO 12-07 01:22:23] ax.service.managed_loop: Running optimization trial 482...
Mean Squared Error: 0.1236
MSE 0.1236 for trial 500 of run 2/100
"""

# Capture groups, in order: MSE value, trial number, run number
pattern = r"MSE ([0-9.]+) for trial (\d+) of run (\d+)/\d+"

# Every hit is a (mse, trial, run) tuple of strings
matches = re.findall(pattern, log_content)

# Turn the captured strings into typed records, one per matching line
data = [
    dict(Run=int(run_id), Trial=int(trial_id), MSE=float(mse_value))
    for mse_value, trial_id, run_id in matches
]

# Tabulate, order by run and then by trial within each run, and renumber rows
df = (
    pd.DataFrame(data)
    .sort_values(by=["Run", "Trial"])
    .reset_index(drop=True)
)

# Show the result
df


Unnamed: 0,Run,Trial,MSE
0,1,468,0.2253
1,1,469,0.1942
2,2,499,0.1388
3,2,500,0.1236
4,3,480,0.2444
5,3,481,0.2784


# Ax 1 run 1500 trials

In [4]:
import re
import pandas as pd

# Load the uploaded log file
log_file_path = 'ax_gramacy_06Dec24__1500trials_7528408.log'
with open(log_file_path, 'r') as file:
    log_content = file.readlines()

# Parallel lists: trials[i] is the trial number that mses[i] belongs to
trials = []
mses = []

# Regex patterns for the trial announcement and the MSE report.
# The trailing dots are escaped so that only the literal "..." is accepted;
# unescaped, each "." would match any character.
trial_pattern = r'Running optimization trial (\d+)\.\.\.'
mse_pattern = r'Mean Squared Error: ([\d.]+)'

# Pair every MSE line with the trial announced most recently before it.
# Collecting the two kinds of line independently and zipping them by position
# would misalign rows (or make the DataFrame constructor fail on unequal
# lengths) as soon as one announcement or one MSE line is missing.
pending_trial = None
for line in log_content:
    trial_match = re.search(trial_pattern, line)
    if trial_match:
        pending_trial = int(trial_match.group(1))

    mse_match = re.search(mse_pattern, line)
    if mse_match and pending_trial is not None:
        trials.append(pending_trial)
        mses.append(float(mse_match.group(1)))
        # Consume the announcement so a stray MSE line is not attributed to it
        pending_trial = None

# Create the DataFrame
df = pd.DataFrame({'Trial': trials, 'MSE': mses})
df



Unnamed: 0,Trial,MSE
0,1,19.5446
1,2,18.7856
2,3,26.2766
3,4,20.1966
4,5,16.7872
...,...,...
1495,1496,0.1024
1496,1497,0.0560
1497,1498,0.1033
1498,1499,0.0872


# Ax 100 repeats 500 trials

In [49]:
import re
import pandas as pd

# Location of the Ax log, written without its extension
filename_ax = '../../../raw/ax_gramacy_06Dec24__500trialsK_CPUparallel_7528819'
# Full path of the log to parse; replace with your actual log file path
log_file_path = filename_ax + '.log'

# Define the function to parse the Ax log file
def parse_ax_log(log_file_path):
    """
    Parses the Ax log file to extract Run, Trial, and MSE information.

    Lines containing ``MSE <value> for trial <t> of run <r>/<total>``
    (matched case-insensitively, anywhere in the line) produce one row each;
    every other line is ignored.

    Parameters:
    - log_file_path (str): Path to the Ax log file.

    Returns:
    - pandas.DataFrame: DataFrame containing Run, Trial, and MSE columns, one
      row per matching line in file order. The three columns are always
      present; the frame is empty when nothing matched or when the file could
      not be read (in which case a message is printed instead of raising).
    """
    columns = ['Run', 'Trial', 'MSE']

    # Regular expression pattern to match the MSE lines
    mse_pattern = re.compile(
        r"MSE\s+([0-9]*\.?[0-9]+)\s+for\s+trial\s+(\d+)\s+of\s+run\s+(\d+)/\d+",
        re.IGNORECASE
    )

    data = []

    try:
        with open(log_file_path, 'r') as file:
            for line in file:
                # search() scans the whole line, so no stripping is needed
                match = mse_pattern.search(line)
                if match:
                    data.append({
                        'Run': int(match.group(3)),
                        'Trial': int(match.group(2)),
                        'MSE': float(match.group(1))
                    })
    except FileNotFoundError:
        print(f"Error: The file {log_file_path} does not exist.")
        return pd.DataFrame(columns=columns)
    except Exception as e:
        # Best-effort parsing: report the problem and hand back an empty frame
        print(f"An error occurred while parsing the log file: {e}")
        return pd.DataFrame(columns=columns)

    # Passing the column list keeps the schema identical to the error paths
    # even when no line matched (an empty record list has no columns otherwise).
    return pd.DataFrame(data, columns=columns)

# Extract the (Run, Trial, MSE) rows from the log
df = parse_ax_log(log_file_path)

if not df.empty:
    # Preview the first rows, still in the order they appear in the log
    print("Parsed DataFrame:")
    print(df.head())

    # Destination of the CSV export
    output_csv = 'parsed_data.csv'
    # Order by run, then by trial within each run, before writing to disk
    df = df.sort_values(by=["Run", "Trial"]).reset_index(drop=True)
    df.to_csv(output_csv, index=False)
    print(f"\nData has been saved to {output_csv}")
else:
    print("No data was parsed from the log file.")


Parsed DataFrame:
   Run  Trial      MSE
0    4      1  16.8397
1    1      2  21.9212
2    4      2  22.6518
3    2      3  24.4388
4    4      3  27.2269

Data has been saved to parsed_data.csv


In [56]:
import pandas as pd
import re

def ax_log_time(log_file_path):
    """
    Parses a log file to extract iteration numbers and the time each one took.

    Lines containing ``Iteration <i>/<total> completed. Time taken: <t> seconds``
    produce one row each; every other line is ignored.

    Args:
        log_file_path (str): Path to the log file.

    Returns:
        pd.DataFrame: DataFrame with columns 'Run' (the iteration number) and
        'Time' (the reported seconds), sorted by 'Run' with a fresh 0..n-1
        index. When no line matches, an empty frame with the same two columns
        is returned.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError); the
        open() call is not guarded.
    """
    # Regular expression to match the desired log lines
    pattern = r"Iteration (\d+)/\d+ completed\. Time taken: ([\d.]+) seconds"

    data = []

    # Read the log file line by line
    with open(log_file_path, 'r') as file:
        for line in file:
            match = re.search(pattern, line)
            if match:
                data.append({
                    'Iteration': int(match.group(1)),
                    'Time': float(match.group(2)),
                })

    # Naming the columns up front guarantees 'Iteration' exists even when
    # nothing matched; otherwise sort_values would raise KeyError on an
    # empty, column-less frame.
    df = pd.DataFrame(data, columns=['Iteration', 'Time'])

    # Sort by iteration, renumber the rows, and expose the iteration as 'Run'
    return (
        df.sort_values(by="Iteration")
        .reset_index(drop=True)
        .rename(columns={'Iteration': 'Run'})
    )

# Base name of the Ax log (no directory, no extension)
filename = 'ax_gramacy_06Dec24__500trialsK_CPUparallel_7528819'
# Build the path to the raw log and extract the per-run timings from it
log_file_path = '../../../raw/' + filename + '.log'
time_df = ax_log_time(log_file_path)

# # Save to a CSV file if needed
# time_df.to_csv('ax_log_time_output.csv', index=False)

# Show the timing table
time_df


Unnamed: 0,Run,Time
0,1,5135.62
1,2,4290.16
2,3,4811.63
3,4,4249.87
4,5,4172.76
...,...,...
95,96,4549.10
96,97,5031.42
97,98,4160.13
98,99,4006.75


In [57]:
# df = pd.read_csv(f'{filename_ax}.csv')
# Relabel an "Iteration" column as "Run"
df = df.rename(columns=dict(Iteration="Run"))
df

Unnamed: 0,Run,Trial,MSE
0,1,2,21.9212
1,1,4,25.4510
2,1,6,21.7142
3,1,7,32.5342
4,1,8,18.8669
...,...,...,...
49940,100,496,0.2297
49941,100,497,0.2400
49942,100,498,0.7351
49943,100,499,0.2822


In [58]:
# Lowest MSE observed within each run, with 'Run' restored as a regular column
min_mse_per_run = df.groupby('Run')['MSE'].agg('min').reset_index()

# Set the column labels explicitly
min_mse_per_run.columns = ['Run', 'MSE']

# min_mse_per_run.to_csv(f'{filename_ax}_SUMMARY.csv', index=False)

min_mse_per_run


Unnamed: 0,Run,MSE
0,1,0.0939
1,2,0.1137
2,3,0.1468
3,4,0.1770
4,5,0.0786
...,...,...
95,96,0.1192
96,97,0.0698
97,98,0.0485
98,99,0.0812


In [59]:
# Lowest MSE observed within each run, with 'Run' restored as a regular column
min_mse_per_run = df.groupby('Run')['MSE'].agg('min').reset_index()

# Set the column labels explicitly
min_mse_per_run.columns = ['Run', 'MSE']

# Join with time_df on 'Run'; the inner join keeps only runs present in both tables
merged_df = pd.merge(min_mse_per_run, time_df, on='Run', how='inner')
# Write the per-run summary to CSV
merged_df.to_csv(f'{filename}_SUMMARY.csv', index=False)

# Show the merged table
merged_df


Unnamed: 0,Run,MSE,Time
0,1,0.0939,5135.62
1,2,0.1137,4290.16
2,3,0.1468,4811.63
3,4,0.1770,4249.87
4,5,0.0786,4172.76
...,...,...,...
95,96,0.1192,4549.10
96,97,0.0698,5031.42
97,98,0.0485,4160.13
98,99,0.0812,4006.75


In [61]:
# Summary statistics over merged_df's 'MSE' and 'Time' columns.
# Each entry is (output label, source column, Series method to call);
# the single-element lists make the resulting DataFrame one row wide.
mse_summary = {
    label: [getattr(merged_df[column], statistic)()]
    for label, column, statistic in (
        ('Best MSE', 'MSE', 'min'),  # lower MSE is better
        ('Worst MSE', 'MSE', 'max'),
        ('Average MSE', 'MSE', 'mean'),
        ('MSE StdDev', 'MSE', 'std'),
        ('Time Avg', 'Time', 'mean'),
        ('Time StdDev', 'Time', 'std'),
    )
}

# One-row table of the statistics
mse_summary_df = pd.DataFrame(mse_summary)

# Write the summary to a CSV file
mse_summary_df.to_csv(f'{filename}_FINAL_SUMMARY.csv', index=False)

# Show the summary table
mse_summary_df


Unnamed: 0,Best MSE,Worst MSE,Average MSE,MSE StdDev,Time Avg,Time StdDev
0,0.0194,0.2456,0.085695,0.043121,4595.754,400.691056
