In [1]:
import pandas as pd
import os
import re

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


# MLP RANDOM

In [2]:
# Run identifier shared by the raw log name and the output CSV names below
run_name = 'MLP_Random_10Jan25_7704200'

# Raw log file to parse
file_path = f'../../../plotting_data/raw/{run_name}.log'
# Destinations for the parsed per-iteration and per-trial tables
iteration_file_path = f'{run_name}_ITERATION.csv'
summary_file_path = f'{run_name}_SUMMARY.csv'

def MLP_Random_updated(file_path, verbose=True):
    """
    Parse a RANDOM log file into per-iteration and per-trial tables.

    The log is scanned line by line. "Trial i/N" and "Iteration j/M" lines
    set the current position; a following "Cross-Validation F1: x" line is
    recorded against that position. A "Trial i Summary:" line opens a summary
    section in which "Best Trial F1", "Trial Time" and "Best Parameters" are
    collected in any order. A summary is stored once all three values are
    known; incomplete summaries are dropped.

    Args:
        file_path (str): Path to the log file.
        verbose (bool): When True (the default, matching the historical
            behaviour) print a "Debug: ..." line for every recognised log
            line. Pass False to parse silently.

    Returns:
        tuple: Two DataFrames:
            - Iteration DataFrame with columns "Trial", "Iteration", "F1_Score".
            - Summary DataFrame with columns "Trial", "Best_F1_Score",
              "Best_Parameters", "Time".
    """
    # A decimal number such as "0.55", "12", "3." or ".5". A bare [\d\.]+ run
    # would also swallow a sentence-ending period ("0.55.") or a row of dots,
    # and float() would then raise ValueError and abort the whole parse.
    number = r"(\d+(?:\.\d*)?|\.\d+)"

    # Regex patterns for the recognised log lines
    trial_pattern = re.compile(r"Trial (\d+)/\d+")
    iteration_pattern = re.compile(r"Iteration (\d+)/\d+")
    cv_f1_pattern = re.compile(r"Cross-Validation F1: " + number)
    trial_summary_pattern = re.compile(r"Trial (\d+) Summary:")
    best_f1_pattern = re.compile(r"Best Trial F1: " + number)
    best_params_pattern = re.compile(r"Best Parameters: (.+)")
    trial_time_pattern = re.compile(r"Trial Time: " + number + r" seconds")

    def debug(message):
        # Single switch for all progress output
        if verbose:
            print(message)

    # Rows collected for the two result tables
    iteration_data = []
    summary_data = []

    # Position inside the iteration log
    current_trial = None
    current_iteration = None
    # State of the summary section currently being collected
    summary_trial = None
    best_f1 = None
    best_params = None
    trial_time = None

    # Stream the file instead of loading every line into memory first
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()  # Remove leading/trailing whitespace

            # "Trial i/N" header. "Trial i Summary:" has no "/N" and is
            # therefore not caught here.
            trial_match = trial_pattern.search(line)
            if trial_match:
                current_trial = int(trial_match.group(1))
                debug(f"Debug: Found Trial {current_trial} during iterations")
                continue

            # "Iteration j/M" header
            iteration_match = iteration_pattern.search(line)
            if iteration_match:
                current_iteration = int(iteration_match.group(1))
                debug(f"Debug: Found Iteration {current_iteration} for Trial {current_trial}")
                continue

            # Cross-validation score; only recorded once a trial and an
            # iteration have both been seen
            cv_f1_match = cv_f1_pattern.search(line)
            if cv_f1_match and current_trial is not None and current_iteration is not None:
                f1_score = float(cv_f1_match.group(1))
                debug(f"Debug: Found Cross-Validation F1 {f1_score} for Trial {current_trial}, Iteration {current_iteration}")
                iteration_data.append((current_trial, current_iteration, f1_score))
                continue

            # Start of a summary section
            trial_summary_match = trial_summary_pattern.search(line)
            if trial_summary_match:
                # A previous summary that became complete only after its
                # "Best Parameters" line is stored before starting a new one
                if summary_trial is not None and best_f1 is not None and best_params is not None and trial_time is not None:
                    summary_data.append((summary_trial, best_f1, best_params, trial_time))
                    debug(f"Debug: Saved summary for Trial {summary_trial}")

                summary_trial = int(trial_summary_match.group(1))
                best_f1 = None
                best_params = None
                trial_time = None
                debug(f"Debug: Starting summary collection for Trial {summary_trial}")
                continue

            # Trial time (only meaningful inside a summary section)
            trial_time_match = trial_time_pattern.search(line)
            if trial_time_match and summary_trial is not None:
                trial_time = float(trial_time_match.group(1))
                debug(f"Debug: Found Trial Time {trial_time} seconds for Trial {summary_trial}")
                continue

            # Best F1 of the trial
            best_f1_match = best_f1_pattern.search(line)
            if best_f1_match and summary_trial is not None:
                best_f1 = float(best_f1_match.group(1))
                debug(f"Debug: Found Best Trial F1 {best_f1} for Trial {summary_trial}")
                continue

            # Best parameters; kept as the raw text after the label
            best_params_match = best_params_pattern.search(line)
            if best_params_match and summary_trial is not None:
                best_params = best_params_match.group(1)
                debug(f"Debug: Found Best Parameters for Trial {summary_trial}: {best_params}")
                # Store immediately when the other two values are already known
                if best_f1 is not None and trial_time is not None:
                    summary_data.append((summary_trial, best_f1, best_params, trial_time))
                    debug(f"Debug: Saved summary for Trial {summary_trial}")
                    summary_trial = None
                    best_f1 = None
                    best_params = None
                    trial_time = None
                continue

    # A summary completed after its "Best Parameters" line and not followed
    # by another "Trial i Summary:" header is still pending here
    if summary_trial is not None and best_f1 is not None and best_params is not None and trial_time is not None:
        summary_data.append((summary_trial, best_f1, best_params, trial_time))
        debug(f"Debug: Saved summary for Trial {summary_trial} at end of file")

    # Create DataFrames for the results
    iteration_df = pd.DataFrame(iteration_data, columns=["Trial", "Iteration", "F1_Score"])
    summary_df = pd.DataFrame(summary_data, columns=["Trial", "Best_F1_Score", "Best_Parameters", "Time"])

    return iteration_df, summary_df

# Parse the raw log into the per-iteration and per-trial tables
iteration_df, summary_df = MLP_Random_updated(file_path)

# Make sure every output directory exists before anything is written;
# a bare filename has no directory component, so nothing is created for it.
output_paths = (iteration_file_path, summary_file_path)
for parent_dir in map(os.path.dirname, output_paths):
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

# Write both tables without the DataFrame index column
for frame, destination in zip((iteration_df, summary_df), output_paths):
    frame.to_csv(destination, index=False)

print(f"Iteration data saved to: {iteration_file_path}")
print(f"Trial summary saved to: {summary_file_path}")


Debug: Found Trial 1 during iterations
Debug: Found Iteration 1 for Trial 1
Debug: Found Cross-Validation F1 0.2863 for Trial 1, Iteration 1
Debug: Found Iteration 2 for Trial 1
Debug: Found Cross-Validation F1 0.2515 for Trial 1, Iteration 2
Debug: Found Iteration 3 for Trial 1
Debug: Found Cross-Validation F1 0.3294 for Trial 1, Iteration 3
Debug: Found Iteration 4 for Trial 1
Debug: Found Cross-Validation F1 0.4113 for Trial 1, Iteration 4
Debug: Found Iteration 5 for Trial 1
Debug: Found Cross-Validation F1 0.336 for Trial 1, Iteration 5
Debug: Found Iteration 6 for Trial 1
Debug: Found Cross-Validation F1 0.3765 for Trial 1, Iteration 6
Debug: Found Iteration 7 for Trial 1
Debug: Found Cross-Validation F1 0.2775 for Trial 1, Iteration 7
Debug: Found Iteration 8 for Trial 1
Debug: Found Cross-Validation F1 0.341 for Trial 1, Iteration 8
Debug: Found Iteration 9 for Trial 1
Debug: Found Cross-Validation F1 0.4788 for Trial 1, Iteration 9
Debug: Found Iteration 10 for Trial 1
Debug: 

In [3]:
# Reshape the per-trial summary: "Best_F1_Score" -> "F1_Score",
# "Trial" -> "Iteration", and drop the "Best_Parameters" column.
# rename() skips labels that are no longer present and drop(errors="ignore")
# does the same, so re-running this cell is a no-op instead of a KeyError.
summary_df = (
    summary_df
    .rename(columns={"Best_F1_Score": "F1_Score", "Trial": "Iteration"})
    .drop(columns=["Best_Parameters"], errors="ignore")
)
# Overwrite the summary CSV with the reshaped columns
summary_df.to_csv(summary_file_path, index=False)

summary_df

Unnamed: 0,Iteration,F1_Score,Time
0,1,0.5537,159.40
1,2,0.5681,154.99
2,3,0.5398,142.18
3,4,0.5408,149.58
4,5,0.5667,131.00
...,...,...,...
95,96,0.5401,139.98
96,97,0.5719,132.28
97,98,0.5529,134.13
98,99,0.5553,129.42


In [4]:

# Swap the "Trial" and "Iteration" column names while keeping the values.
# The labels are assigned by position (the parser emits the columns in the
# order Trial, Iteration, F1_Score), which makes this cell idempotent: a
# name-based swap via rename() would flip the labels back on every re-run.
iteration_df = iteration_df.set_axis(["Iteration", "Trial", "F1_Score"], axis="columns")

iteration_df.to_csv('MLP_Random_10Jan25_7704200_TRIAL.csv', index=False)
iteration_df

Unnamed: 0,Iteration,Trial,F1_Score
0,1,1,0.2863
1,1,2,0.2515
2,1,3,0.3294
3,1,4,0.4113
4,1,5,0.3360
...,...,...,...
19995,100,196,0.3074
19996,100,197,0.3376
19997,100,198,0.2936
19998,100,199,0.3424


In [5]:
def get_max_f1_per_trial(iteration_df):
    """
    Return the highest F1_Score found for each distinct "Iteration" value.

    Args:
        iteration_df (pd.DataFrame): Frame containing at least the columns
            "Iteration" and "F1_Score".

    Returns:
        pd.DataFrame: One row per distinct "Iteration" value with the columns
            "Iteration" and "F1_Score", where "F1_Score" is the maximum of
            that group. The grouping key stays a regular column.
    """
    # as_index=False keeps "Iteration" as a column instead of the index
    by_iteration = iteration_df.groupby("Iteration", as_index=False)
    best_scores = by_iteration["F1_Score"].max()
    return best_scores

# Highest F1_Score within each group of the "Iteration" column.
# NOTE(review): random_final_df is not used by any later cell visible here — confirm it is still needed.
random_final_df = get_max_f1_per_trial(iteration_df)

In [6]:
import pandas as pd

# Aggregate the F1 column: extremes, mean and standard deviation
f1_scores = summary_df['F1_Score']
best_f1_score, worst_f1_score = f1_scores.max(), f1_scores.min()
avg_f1_score, std_dev_f1_score = f1_scores.mean(), f1_scores.std()

# Mean and standard deviation of the "Time" column
run_times = summary_df['Time']
avg_time, std_dev_time = run_times.mean(), run_times.std()

# One-row frame holding the aggregate figures
final_summary = pd.DataFrame([{
    'Best F1 Score': best_f1_score,
    'Worst F1 Score': worst_f1_score,
    'Average F1 Score': avg_f1_score,
    'F1 Score Std Dev': std_dev_f1_score,
    'Average Time': avg_time,
    'Time Std Dev': std_dev_time,
}])

# Save the one-row summary, then show it as the cell output
final_summary.to_csv('MLP_Random_10Jan25_7704200_FINAL_SUMMARY.csv', index=False)
final_summary


Unnamed: 0,Best F1 Score,Worst F1 Score,Average F1 Score,F1 Score Std Dev,Average Time,Time Std Dev
0,0.5743,0.5178,0.555339,0.011132,140.3555,6.636634
