In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import re

def parse_log_file(file_path):
    """
    Parse log file containing Iteration, Trial, and MSE data.

    The file is scanned line by line. The most recent ``Iteration <i>/100``
    and ``Trial <t>/1500`` markers are remembered, and every ``MSE = <value>``
    occurrence produces one row tagged with those remembered numbers.

    Parameters:
    file_path (str): Path to the log file

    Returns:
    pandas.DataFrame: DataFrame with columns ['Iteration', 'Trial', 'MSE'].
        An MSE line that appears before the first Iteration/Trial marker is
        still recorded, with None for the marker(s) not seen yet.

    Raises:
    OSError: Propagated from open() when the file cannot be read
        (for example FileNotFoundError).
    """
    # Initialize lists to store data
    iterations = []
    trials = []
    mses = []

    # Compile regex patterns
    iter_pattern = re.compile(r'Iteration (\d+)/100')
    trial_pattern = re.compile(r'Trial (\d+)/1500')
    # Accept decimals, bare integers and scientific notation. The previous
    # pattern (\d+\.\d+) captured only "1.5" from "1.5e-05" and skipped
    # integer values such as "7" altogether.
    mse_pattern = re.compile(r'MSE = ((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)')

    current_iteration = None
    current_trial = None

    with open(file_path, 'r') as file:
        for line in file:
            # A single line may carry any combination of the three markers,
            # so all three patterns are tried on every line.
            iter_match = iter_pattern.search(line)
            if iter_match:
                current_iteration = int(iter_match.group(1))

            trial_match = trial_pattern.search(line)
            if trial_match:
                current_trial = int(trial_match.group(1))

            mse_match = mse_pattern.search(line)
            if mse_match:
                # Store all values when we find an MSE
                iterations.append(current_iteration)
                trials.append(current_trial)
                mses.append(float(mse_match.group(1)))

    # Create DataFrame
    return pd.DataFrame({
        'Iteration': iterations,
        'Trial': trials,
        'MSE': mses
    })

In [4]:
# Example usage
# Log being analysed. Alternate run kept for reference (previously assigned and
# immediately overwritten): 'RANDOM_gramacy_08Dec_1500trials_STANDBY_7532865'
filename = 'RANDOM_gramacy_07Dec_1500trials_STANDBY_7532633'
# Alternate location kept for reference (previously assigned and immediately
# overwritten): f'../../../raw/{filename}.log'
file_path = f'archive/{filename}.log'
df = parse_log_file(file_path)

# View the first few rows
print(df.head())


# Basic statistics
print(df.describe())

# Rename 'Iteration' to 'Run' and export the per-trial table.
# Note: the two printouts above still show the column as 'Iteration'.
df = df.rename(columns={'Iteration': 'Run'})
df.to_csv(f'{filename}.csv', index=False)

df

   Iteration  Trial      MSE
0          1      1  23.3222
1          1      2  24.9122
2          1      3  32.0023
3          1      4  14.7156
4          1      5  23.2290
           Iteration          Trial            MSE
count  150000.000000  150000.000000  150000.000000
mean       50.500000     750.500000      23.161097
std        28.866166     433.014049       4.888197
min         1.000000       1.000000       7.688400
25%        25.750000     375.750000      19.716100
50%        50.500000     750.500000      22.918750
75%        75.250000    1125.250000      26.362225
max       100.000000    1500.000000      46.853800


Unnamed: 0,Run,Trial,MSE
0,1,1,23.3222
1,1,2,24.9122
2,1,3,32.0023
3,1,4,14.7156
4,1,5,23.2290
...,...,...,...
149995,100,1496,26.3736
149996,100,1497,27.1540
149997,100,1498,28.5878
149998,100,1499,20.7565


# Summary: best (minimum) MSE per run

In [6]:
# Best (lowest) MSE reached within each Run, as a flat two-column frame
min_mse_per_run = df.groupby('Run', as_index=False)['MSE'].min()

# Keep the column labels explicit for downstream cells
min_mse_per_run.columns = ['Run', 'MSE']

# CSV export is left disabled in this cell:
# min_mse_per_run.to_csv(f'{filename}_SUMMARY.csv', index=False)

min_mse_per_run


Unnamed: 0,Run,MSE
0,1,8.5834
1,2,8.8811
2,3,8.2475
3,4,7.7279
4,5,8.3779
...,...,...
95,96,10.6492
96,97,8.4957
97,98,9.9906
98,99,9.4689


# Time taken per run

In [7]:
import re
import pandas as pd

def random_time_parse(filename):
    """
    Extract per-iteration wall-clock times from ``archive/<filename>.log``.

    Only lines of the form
    ``Iteration <i>/100 completed. Time taken: <t> seconds`` contribute a row.

    Parameters:
    filename (str): Log file name without the ``.log`` extension; the file is
        looked up inside the ``archive/`` directory.

    Returns:
    pandas.DataFrame or None: DataFrame with columns ['Iteration', 'Time'],
        or None when the file is missing or any other error occurs while
        reading/parsing (a message is printed in both cases).
    """
    file_path = f'archive/{filename}.log'

    # One "iteration completed" line -> (iteration number, seconds)
    completed_re = re.compile(r"Iteration (\d+)/100 completed\. Time taken: ([\d.]+) seconds")

    try:
        with open(file_path, 'r') as log:
            hits = [m for m in map(completed_re.search, log) if m is not None]

        # Conversions stay inside the try so a malformed number is reported
        # through the generic handler below rather than raised to the caller.
        records = [(int(m.group(1)), float(m.group(2))) for m in hits]

        return pd.DataFrame({
            'Iteration': [number for number, _ in records],
            'Time': [seconds for _, seconds in records]
        })
    except FileNotFoundError:
        print(f"The file {file_path} does not exist.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

# Example usage
# random_time_parse returns None (after printing a message) when the log could
# not be read or parsed, so the table is only printed on success.
filename = 'RANDOM_gramacy_07Dec_1500trials_STANDBY_7532633'
df_time = random_time_parse(filename)
if df_time is not None:
    print(df_time)


    Iteration   Time
0           1  15.47
1           2   0.65
2           3   0.64
3           4   0.65
4           5   0.66
..        ...    ...
95         96   0.64
96         97   0.65
97         98   0.64
98         99   0.65
99        100   0.64

[100 rows x 2 columns]


In [10]:
# Rename 'Iteration' to 'Run' so the timing table shares the 'Run' key used by min_mse_per_run
df_time = df_time.rename(columns={'Iteration': 'Run'})
df_time

Unnamed: 0,Run,Time
0,1,15.47
1,2,0.65
2,3,0.64
3,4,0.65
4,5,0.66
...,...,...
95,96,0.64
96,97,0.65
97,98,0.64
98,99,0.65


# Merge df_time and min_mse_per_run

In [14]:
# Join the per-run best MSE with the per-run timing on the shared 'Run' key
df_merged = min_mse_per_run.merge(df_time, on='Run')
# Persist the combined per-run table
df_merged.to_csv(f'{filename}_SUMMARY.csv', index=False)
df_merged

Unnamed: 0,Run,MSE,Time
0,1,8.5834,15.47
1,2,8.8811,0.65
2,3,8.2475,0.64
3,4,7.7279,0.65
4,5,8.3779,0.66
...,...,...,...
95,96,10.6492,0.64
96,97,8.4957,0.65
97,98,9.9906,0.64
98,99,9.4689,0.65


In [13]:
# Calculate summary statistics for Min_MSE
# Each entry: (output column label, df_merged column, pandas reduction name).
mse_summary = {
    label: [df_merged[column].agg(stat)]
    for label, column, stat in [
        ('Best MSE', 'MSE', 'min'),  # Since lower MSE is better
        ('Worst MSE', 'MSE', 'max'),
        ('Average MSE', 'MSE', 'mean'),
        ('MSE StdDev', 'MSE', 'std'),
        ('Average Time (s)', 'Time', 'mean'),
        ('Std Dev Time (s)', 'Time', 'std'),
    ]
}
# Single-row frame: one column per statistic
mse_summary_df = pd.DataFrame(mse_summary)
mse_summary_df.to_csv(f'{filename}_FINAL_SUMMARY.csv', index=False)
mse_summary_df

Unnamed: 0,Best MSE,Worst MSE,Average MSE,MSE StdDev,Average Time (s),Std Dev Time (s)
0,7.6884,11.611,9.661273,1.01026,0.7963,1.482207


## Updated parser for RANDOM logs (single-line `Iteration … Trial … MSE:` format)

In [12]:
def parse_log_file(log_file_path):
    """
    Parses the log file to extract Iteration, Trial, and MSE.

    Only lines containing ``Iteration <i> Trial <t> MSE: <value>`` (with
    arbitrary whitespace between the tokens) contribute a row; every other
    line is ignored.

    NOTE: this definition rebinds the name ``parse_log_file`` and therefore
    replaces the ``MSE = ...`` parser defined earlier in this notebook once
    this cell has been executed.

    Args:
        log_file_path (str): Path to the log file.

    Returns:
        pd.DataFrame: DataFrame containing Iteration, Trial, and MSE.

    Raises:
        OSError: Propagated from open() when the file cannot be read
            (for example FileNotFoundError).
    """
    # Regular expression pattern to match the desired lines.
    # The numeric group accepts decimals, bare integers and scientific
    # notation. The previous "[\d.]+" captured only "3.5" from "3.5e-05" and
    # captured a trailing period (e.g. "23.32."), which makes float() raise.
    pattern = re.compile(
        r"Iteration\s+(\d+)\s+Trial\s+(\d+)\s+MSE:\s+"
        r"((?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)"
    )

    # Lists to store extracted data
    iterations = []
    trials = []
    mses = []

    # Open and read the log file
    with open(log_file_path, 'r') as file:
        for line in file:
            # Search for the pattern in each line
            match = pattern.search(line)
            if match:
                iteration, trial, mse = match.groups()
                iterations.append(int(iteration))
                trials.append(int(trial))
                mses.append(float(mse))

    # Create a DataFrame from the extracted data
    return pd.DataFrame({
        'Iteration': iterations,
        'Trial': trials,
        'MSE': mses
    })

# Log to feed to the redefined parse_log_file; here the path is relative to the
# working directory (no 'archive/' prefix).
# NOTE(review): parse_log_file(log_file_path) is not called in the cells visible here — confirm.
filename = 'RANDOM_gramacy_07Dec_1500trials_STANDBY_7532633'
log_file_path = f'{filename}.log'

In [13]:
# Display the per-trial table.
# NOTE(review): `df` is not reassigned in this section, so this presumably shows the
# frame built by the earlier cell (column already renamed to 'Run') — confirm on a fresh kernel.
df

Unnamed: 0,Run,Trial,MSE
0,1,1,23.3222
1,1,2,24.9122
2,1,3,32.0023
3,1,4,14.7156
4,1,5,23.2290
...,...,...,...
149995,100,1496,26.3736
149996,100,1497,27.1540
149997,100,1498,28.5878
149998,100,1499,20.7565
