In [2]:
import pandas as pd

# MLP HyperOPT

In [3]:
import pandas as pd
import re

def parse_hyperopt_log(file_path, max_iterations=100, trials_per_iteration=150):
    """
    Parse a Hyperopt log file and return the last F1 score logged for each trial.

    The log is scanned line by line while tracking two pieces of state: the
    current optimization iteration (taken from "Starting optimization iteration
    N/M" lines) and the current trial counter (the first number of a progress
    fragment such as "| 12/150"). Every "Validation F1 Score: <value>" line is
    attributed to the current (iteration, trial) pair; when several scores are
    logged for the same pair, the last one wins.

    Args:
        file_path (str): Path to the Hyperopt log file.
        max_iterations (int): Highest iteration number to parse. Parsing stops at
            the first iteration header whose number exceeds this value.
        trials_per_iteration (int): Number of trials per iteration. Accepted for
            backward compatibility only; the parser does not use it.

    Returns:
        pd.DataFrame: One row per (iteration, trial) pair with the columns
        "Iteration", "Trial" and "F1_Score", ordered by first appearance in the
        log. The frame is empty (but keeps those columns) if nothing was parsed.

    Raises:
        OSError: If the log file cannot be opened.
    """
    # The header is searched anywhere in the line so that prefixed log lines
    # (timestamps, log levels) are still recognised.
    iter_start_pattern = re.compile(r"Starting optimization iteration (\d+)/\d+")
    trial_progress_pattern = re.compile(r"\|\s*(\d+)/(\d+)")
    # Capture a well-formed decimal only, so punctuation following the number
    # (e.g. "0.5849.") is not captured and cannot make float() raise.
    f1_score_pattern = re.compile(r"Validation F1 Score:\s+(\d*\.?\d+)")

    # (iteration, trial) -> last F1 score seen for that pair
    scores = {}
    current_iteration = None
    current_trial = None

    # Progress bars commonly contain non-ASCII characters: decode explicitly as
    # UTF-8 and replace undecodable bytes instead of relying on the platform
    # default encoding.
    with open(file_path, 'r', encoding='utf-8', errors='replace') as file:
        for line in file:
            line = line.strip()

            # Start of a new iteration
            iter_match = iter_start_pattern.search(line)
            if iter_match:
                current_iteration = int(iter_match.group(1))
                if current_iteration > max_iterations:
                    # Stop parsing after reaching the maximum number of iterations
                    break
                # Forget the previous iteration's trial counter so that a score
                # logged before the first progress line of this iteration is
                # not filed under a stale trial number.
                current_trial = None
                continue

            # Progress line: remember which trial the following scores belong to
            trial_match = trial_progress_pattern.search(line)
            if trial_match and current_iteration is not None:
                current_trial = int(trial_match.group(1))
                continue

            # F1 score line: only recorded once both iteration and trial are known
            f1_match = f1_score_pattern.search(line)
            if f1_match and current_iteration is not None and current_trial is not None:
                # Later scores for the same pair overwrite earlier ones
                scores[(current_iteration, current_trial)] = float(f1_match.group(1))

    rows = [
        (iteration, trial, score)
        for (iteration, trial), score in scores.items()
    ]
    return pd.DataFrame(rows, columns=["Iteration", "Trial", "F1_Score"])

# Location of the raw Hyperopt log to analyse
hyperopt_file_path = '../../../plotting_data/raw/MLP_Hyperopt_10Jan25_7704199.log'

# Extract one row per (iteration, trial) pair from the log
trial_data_hyperopt = parse_hyperopt_log(file_path=hyperopt_file_path)

# Preview: caption followed by the first five rows
print("Trial Data (Iteration, Trial, F1_Score):", trial_data_hyperopt.head(), sep="\n")

# Descriptive statistics for every column
print("\nSummary Statistics:", trial_data_hyperopt.describe(), sep="\n")

# Highest F1 score found across all parsed trials
max_f1 = trial_data_hyperopt['F1_Score'].max()
print(f"\nMaximum F1 Score Achieved: {max_f1}")

# Last expression of the cell: show the full frame as the cell output
trial_data_hyperopt


Trial Data (Iteration, Trial, F1_Score):
   Iteration  Trial  F1_Score
0          1      0    0.3468
1          1      1    0.4447
2          1      2    0.2992
3          1      3    0.3953
4          1      4    0.3212

Summary Statistics:
          Iteration         Trial      F1_Score
count  15000.000000  15000.000000  15000.000000
mean      50.500000     74.500000      0.470367
std       28.867032     43.301751      0.078370
min        1.000000      0.000000      0.051700
25%       25.750000     37.000000      0.435900
50%       50.500000     74.500000      0.491800
75%       75.250000    112.000000      0.525250
max      100.000000    149.000000      0.584900

Maximum F1 Score Achieved: 0.5849


Unnamed: 0,Iteration,Trial,F1_Score
0,1,0,0.3468
1,1,1,0.4447
2,1,2,0.2992
3,1,3,0.3953
4,1,4,0.3212
...,...,...,...
14995,100,145,0.5609
14996,100,146,0.5497
14997,100,147,0.5749
14998,100,148,0.5679


# Save per-trial results to CSV (the optional "add 1" shift of the Trial column is disabled)

In [5]:
# The +1 shift of the Trial column below is commented out, so the CSV keeps the
# Trial values exactly as they were parsed.
# # Increment the Trial column by 1
# trial_data_hyperopt['Trial'] = trial_data_hyperopt['Trial'] + 1
# Save to CSV (index=False leaves the DataFrame index out of the file)
trial_data_hyperopt.to_csv('MLP_Hyperopt_10Jan25_7704199_TRIALS.csv', index=False)
# Last expression of the cell: display the DataFrame
trial_data_hyperopt

Unnamed: 0,Iteration,Trial,F1_Score
0,1,0,0.3468
1,1,1,0.4447
2,1,2,0.2992
3,1,3,0.3953
4,1,4,0.3212
...,...,...,...
14995,100,145,0.5609
14996,100,146,0.5497
14997,100,147,0.5749
14998,100,148,0.5679


# Per-repeat summary (one row per optimization iteration)

In [6]:
import pandas as pd
import re

# Path to your Hyperopt log file
hyperopt_file_path = '../../../plotting_data/raw/MLP_Hyperopt_10Jan25_7704199.log'

# Read the whole log at once because one summary record spans several lines.
# Decode explicitly as UTF-8 (replacing undecodable bytes) instead of relying
# on the platform default encoding.
with open(hyperopt_file_path, 'r', encoding='utf-8', errors='replace') as file:
    log_content = file.read()

# Regular expression for one "iteration complete" record. It captures:
# 1. Iteration number
# 2. Best validation F1 score
# 3. Time taken in seconds
# The gaps between the three fields refuse to run across the next
# "Optimization Iteration N Complete:" header, so an incomplete record is
# skipped instead of borrowing the score/time of the following iteration.
# Numbers are matched as well-formed decimals so a trailing "." is not captured.
pattern = re.compile(
    r"Optimization Iteration\s+(\d+)\s+Complete:"
    r"(?:(?!Optimization Iteration\s+\d+\s+Complete:).)*?"
    r"Best validation F1 score:\s*(\d*\.?\d+)"
    r"(?:(?!Optimization Iteration\s+\d+\s+Complete:).)*?"
    r"Time taken:\s*(\d*\.?\d+)\s+seconds",
    re.DOTALL
)

# Each match is an (iteration, f1_score, time) tuple of strings
matches = pattern.findall(log_content)

# Warn (but carry on) when the number of records differs from what is expected
expected_iterations = 100
if len(matches) != expected_iterations:
    print(f"Warning: Expected {expected_iterations} iterations, but found {len(matches)}.")

# Build the table, convert the captured strings to numbers and order by iteration
df = (
    pd.DataFrame(matches, columns=['Iteration', 'F1_Score', 'Time'])
    .astype({'Iteration': int, 'F1_Score': float, 'Time': float})
    .sort_values('Iteration')
    .reset_index(drop=True)
)

# Save the DataFrame to a CSV file
output_csv_path = 'MLP_Hyperopt_10Jan25_7704199_SUMMARY.csv'
df.to_csv(output_csv_path, index=False)
print(f"Data has been saved to {output_csv_path}")

# Last expression of the cell: display the per-iteration table
df


Data has been saved to MLP_Hyperopt_10Jan25_7704199_SUMMARY.csv


Unnamed: 0,Iteration,F1_Score,Time
0,1,0.5727,153.44
1,2,0.5563,121.92
2,3,0.5805,124.40
3,4,0.5745,123.88
4,5,0.5701,124.40
...,...,...,...
95,96,0.5729,122.83
96,97,0.5755,123.71
97,98,0.5721,123.05
98,99,0.5729,122.71


# Final summary

In [7]:
import pandas as pd

# Extremes of the per-iteration F1 scores
best_f1_score, worst_f1_score = df['F1_Score'].max(), df['F1_Score'].min()
# Centre and spread of the per-iteration F1 scores
avg_f1_score, std_dev_f1_score = df['F1_Score'].mean(), df['F1_Score'].std()
# Centre and spread of the per-iteration run time
avg_time, std_dev_time = df['Time'].mean(), df['Time'].std()

# Single-row table holding all summary figures
final_summary = pd.DataFrame([{
    'Best F1 Score': best_f1_score,
    'Worst F1 Score': worst_f1_score,
    'Average F1 Score': avg_f1_score,
    'F1 Score Std Dev': std_dev_f1_score,
    'Average Time': avg_time,
    'Time Std Dev': std_dev_time,
}])

# Persist the summary, then show it as the cell output
final_summary.to_csv('MLP_Hyperopt_10Jan25_7704199_FINAL_SUMMARY.csv', index=False)
final_summary


Unnamed: 0,Best F1 Score,Worst F1 Score,Average F1 Score,F1 Score Std Dev,Average Time,Time Std Dev
0,0.5849,0.541,0.573483,0.005876,124.0784,4.4793
