In [1]:
import pandas as pd

In [11]:
import os
import glob
import pandas as pd
import re

def parse_log_file(filepath):
    """
    Parses a single log file to extract Run, Best MSE, and Elapsed time.

    Every line is searched for a record of the form
    ``Run <n>: Best MSE: <number>, Elapsed time: <number> seconds``.
    Lines that do not contain such a record are skipped. Numbers may be
    plain decimals or use scientific notation with an optional sign
    (e.g. ``3.5e-05``), so very small MSE values are not silently dropped.

    Parameters:
        filepath (str): Path to the log file.

    Returns:
        List of dictionaries with keys: filename, Run, best_MSE, Time
        (one dictionary per matching line, in file order). The filename
        value is the base name of ``filepath``.
    """
    # Float grammar: optional sign, digits with optional fraction (or a bare
    # fraction), optional exponent. Unlike a loose "[\d.]+", this never
    # captures a token that float() would reject (such as "1.2.3").
    number = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
    pattern = re.compile(
        r"Run\s+(?P<run>\d+):\s+Best MSE:\s+(?P<mse>" + number + r"),"
        r"\s+Elapsed time:\s+(?P<time>" + number + r")\s+seconds"
    )
    filename = os.path.basename(filepath)
    data = []

    with open(filepath, 'r') as file:
        for line in file:
            match = pattern.search(line)
            if match is None:
                continue
            # Named "elapsed" rather than "time" to avoid shadowing the
            # standard-library module name if it is ever imported here.
            elapsed = float(match.group('time'))
            data.append({
                'filename': filename,
                'Run': int(match.group('run')),
                'best_MSE': float(match.group('mse')),
                'Time': elapsed
            })
    return data

def Qmax_YT_extract(filename):
    """
    Pulls the Qmax and YT settings out of a log file name.

    The name is searched for the fragment ``PADDY_Qmax<digits>_YT<digits>_``,
    for example the ``PADDY_Qmax20_YT10_`` prefix of
    PADDY_Qmax20_YT10_20241213150301_QMAX20_YT10_PADDYgenerational_GAUSIANscaled_summary.log

    Parameters:
        filename (str): The name of the file.

    Returns:
        Tuple (Qmax, YT) of ints, or (None, None) when the fragment is absent.
    """
    found = re.search(r"PADDY_Qmax(?P<Qmax>\d+)_YT(?P<YT>\d+)_", filename)

    # Guard clause: nothing recognisable in the name.
    if found is None:
        return None, None

    return int(found.group('Qmax')), int(found.group('YT'))

def process_logs(input_dir, output_dir, output_filename='compiled_results.csv'):
    """
    Processes all files in the input directory whose names end with
    'summary.log', computes per-file statistics, extracts Qmax and YT,
    sorts the DataFrame, and saves the compiled data as CSV.

    Each parsed run becomes one row. The mean and standard deviation of
    best_MSE and Time are computed per filename and repeated on every row
    of that file (rounded to 6 and 2 decimals respectively).

    Parameters:
        input_dir (str): Directory to search for files ending in 'summary.log'.
        output_dir (str): Directory to save the compiled CSV (created if missing).
        output_filename (str): Name of the output CSV file.

    Returns:
        pandas.DataFrame: Compiled dataframe containing all extracted data and
        statistics, sorted by Qmax, YT and Run. If no matching file is found,
        or no run could be parsed from the files, an empty DataFrame with the
        same columns is returned and no CSV is written.

    Raises:
        NotADirectoryError: If input_dir is not an existing directory.
    """
    result_columns = [
        'filename', 'Run', 'best_MSE', 'Time',
        'Avg_Best_MSE', 'Std_Best_MSE', 'Avg_Time', 'Std_Time',
        'Qmax', 'YT'
    ]

    # Ensure input_dir exists
    if not os.path.isdir(input_dir):
        raise NotADirectoryError(f"Input directory '{input_dir}' does not exist.")

    # Create output_dir if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Search for files ending with 'summary.log'. glob returns matches in
    # arbitrary order, so sort them to make the processing order reproducible.
    search_pattern = os.path.join(input_dir, '*summary.log')
    log_files = sorted(glob.glob(search_pattern))

    if not log_files:
        print(f"No files ending with 'summary.log' found in '{input_dir}'.")
        return pd.DataFrame(columns=result_columns)

    all_data = []
    for log_file in log_files:
        file_data = parse_log_file(log_file)
        all_data.extend(file_data)
        print(f"Processed file: {log_file}, extracted {len(file_data)} runs.")

    # Files were found but none contained a parsable run. Without this guard
    # the Qmax/YT column assignment below fails on the empty frame with
    # "Columns must be same length as key".
    if not all_data:
        print(f"No runs could be extracted from the {len(log_files)} file(s) found in '{input_dir}'.")
        return pd.DataFrame(columns=result_columns)

    # Create DataFrame
    df = pd.DataFrame(all_data, columns=[
        'filename', 'Run', 'best_MSE', 'Time'
    ])

    # Compute statistics for each filename
    stats_df = df.groupby('filename').agg(
        Avg_Best_MSE=pd.NamedAgg(column='best_MSE', aggfunc='mean'),
        Std_Best_MSE=pd.NamedAgg(column='best_MSE', aggfunc='std'),
        Avg_Time=pd.NamedAgg(column='Time', aggfunc='mean'),
        Std_Time=pd.NamedAgg(column='Time', aggfunc='std')
    ).reset_index()

    # Merge statistics back into the main DataFrame
    df = pd.merge(df, stats_df, on='filename', how='left')

    # Round the statistical columns for better readability
    df = df.round({
        'Avg_Best_MSE': 6,
        'Std_Best_MSE': 6,
        'Avg_Time': 2,
        'Std_Time': 2
    })

    # Extract Qmax and YT from filename. Building one frame from the list of
    # (Qmax, YT) tuples avoids constructing a pd.Series object for every row.
    qmax_yt = pd.DataFrame(
        [Qmax_YT_extract(name) for name in df['filename']],
        columns=['Qmax', 'YT'],
        index=df.index
    )
    df[['Qmax', 'YT']] = qmax_yt

    # Sort by Qmax, YT, Run in ascending order and renumber the rows
    df_sorted = (
        df.sort_values(by=['Qmax', 'YT', 'Run'], ascending=[True, True, True])
        .reset_index(drop=True)
    )

    # Save to CSV
    output_path = os.path.join(output_dir, output_filename)
    df_sorted.to_csv(output_path, index=False)
    print(f"Compiled and sorted data with statistics saved to '{output_path}'.")

    return df_sorted

# ----------------------------
# Usage Example in Jupyter Notebook
# ----------------------------

# Locations used by this run: where the *summary.log files are read from,
# where the compiled CSV is written, and what that CSV is called.
input_dir = '../../paddy_dec15_GRID_PS/'    # Replace with your input directory path
output_dir = 'data/'  # Replace with your output directory path
output_filename = 'GRID_PS_2.csv'  # Optional: Change the output filename if desired

# Compile the logs and preview the result. Any failure is reported as a
# one-line message instead of a traceback; in that case `compiled_df` is not
# (re)assigned by this cell.
try:
    compiled_df = process_logs(
        input_dir=input_dir,
        output_dir=output_dir,
        output_filename=output_filename,
    )
    print("\nSample of the compiled and sorted DataFrame:")
    display(compiled_df.head())  # Rich preview of the first rows via Jupyter's display
except Exception as err:
    print(f"An error occurred: {err}")


Processed file: ../../paddy_dec15_GRID_PS/PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10_PADDYpopulation_GAUSIANscaled_summary.log, extracted 20 runs.
Processed file: ../../paddy_dec15_GRID_PS/PADDY_Qmax1000_YT30_20241215150318_QMAX1000_YT30_PADDYpopulation_GAUSIANscaled_summary.log, extracted 20 runs.
Processed file: ../../paddy_dec15_GRID_PS/PADDY_Qmax500_YT25_20241215150318_QMAX500_YT25_PADDYpopulation_GAUSIANscaled_summary.log, extracted 20 runs.
Processed file: ../../paddy_dec15_GRID_PS/PADDY_Qmax1000_YT25_20241215150318_QMAX1000_YT25_PADDYpopulation_GAUSIANscaled_summary.log, extracted 20 runs.
Processed file: ../../paddy_dec15_GRID_PS/PADDY_Qmax1000_YT10_20241215150318_QMAX1000_YT10_PADDYpopulation_GAUSIANscaled_summary.log, extracted 20 runs.
Processed file: ../../paddy_dec15_GRID_PS/PADDY_Qmax500_YT15_20241215150318_QMAX500_YT15_PADDYpopulation_GAUSIANscaled_summary.log, extracted 20 runs.
Processed file: ../../paddy_dec15_GRID_PS/PADDY_Qmax500_YT30_20241215150318_QMAX500_YT30

Unnamed: 0,filename,Run,best_MSE,Time,Avg_Best_MSE,Std_Best_MSE,Avg_Time,Std_Time,Qmax,YT
0,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,1,2.580441,82.08,2.181749,0.421389,93.14,7.77,500,10
1,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,2,2.372298,85.91,2.181749,0.421389,93.14,7.77,500,10
2,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,3,2.793069,91.11,2.181749,0.421389,93.14,7.77,500,10
3,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,4,1.893977,95.07,2.181749,0.421389,93.14,7.77,500,10
4,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,5,2.165112,97.36,2.181749,0.421389,93.14,7.77,500,10


In [12]:
# Show the compiled frame. Relies on `compiled_df` having been assigned by the
# process_logs call in the previous cell; if that call failed, the name is
# either undefined or left over from an earlier run.
compiled_df

Unnamed: 0,filename,Run,best_MSE,Time,Avg_Best_MSE,Std_Best_MSE,Avg_Time,Std_Time,Qmax,YT
0,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,1,2.580441,82.08,2.181749,0.421389,93.14,7.77,500,10
1,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,2,2.372298,85.91,2.181749,0.421389,93.14,7.77,500,10
2,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,3,2.793069,91.11,2.181749,0.421389,93.14,7.77,500,10
3,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,4,1.893977,95.07,2.181749,0.421389,93.14,7.77,500,10
4,PADDY_Qmax500_YT10_20241215150318_QMAX500_YT10...,5,2.165112,97.36,2.181749,0.421389,93.14,7.77,500,10
...,...,...,...,...,...,...,...,...,...,...
195,PADDY_Qmax1000_YT30_20241215150318_QMAX1000_YT...,16,1.545759,195.54,1.454002,0.293602,186.37,17.41,1000,30
196,PADDY_Qmax1000_YT30_20241215150318_QMAX1000_YT...,17,1.417193,181.18,1.454002,0.293602,186.37,17.41,1000,30
197,PADDY_Qmax1000_YT30_20241215150318_QMAX1000_YT...,18,2.158709,176.05,1.454002,0.293602,186.37,17.41,1000,30
198,PADDY_Qmax1000_YT30_20241215150318_QMAX1000_YT...,19,1.066826,210.14,1.454002,0.293602,186.37,17.41,1000,30
