In [1]:
import pandas as pd

# paddy - GRID PARSE

In [2]:
import pandas as pd
import os
import glob
import re  # Import regular expressions module

# Define the directory containing the log files
log_dir = 'data/GD/'

# Use glob to find all .log files in the directory
log_files = glob.glob(os.path.join(log_dir, '*.log'))

# Initialize an empty list to store data from all log files
all_data = []

# Function to extract Qmax and YT from the filename
def extract_qmax_yt(filename):
    """
    Extracts Qmax and YT values from a given filename.
    
    Parameters:
        filename (str): The name of the file.
        
    Returns:
        tuple: A tuple containing Qmax and YT as integers. Returns (None, None) if not found.
    """
    # Define a regex pattern to capture Qmax and YT values
    pattern = re.compile(r'Qmax(\d+)_YT(\d+)', re.IGNORECASE)
    match = pattern.search(filename)
    
    if match:
        qmax = int(match.group(1))
        yt = int(match.group(2))
        return qmax, yt
    else:
        # If pattern not found, return None
        return None, None

# Iterate through each log file
for log_file_path in log_files:
    fitness_scores = []
    filename = os.path.basename(log_file_path)  # Extract the filename
    
    try:
        with open(log_file_path, 'r') as file:
            lines = file.readlines()
        
        # Iterate through the lines using index to allow lookahead
        for i, line in enumerate(lines):
            if "Top seed in generation:" in line:
                # Ensure there is a next line to read
                if i + 1 < len(lines):
                    next_line = lines[i + 1].strip()
                    if "fitness:" in next_line:
                        try:
                            # Extract fitness score
                            fitness = float(next_line.split("fitness:")[1].split()[0])
                            fitness_scores.append(fitness)
                        except (IndexError, ValueError) as e:
                            print(f"Error parsing fitness in file {filename} on line {i+2}: {e}")
    except FileNotFoundError:
        print(f"File not found: {log_file_path}")
        continue
    except Exception as e:
        print(f"An error occurred while processing {filename}: {e}")
        continue
    
    # Generate Count and Iteration columns
    count = list(range(1, len(fitness_scores) + 1))
    iterations = [(i % 8) + 1 for i in range(len(fitness_scores))]
    
    # Create a temporary DataFrame for the current log file
    temp_df = pd.DataFrame({
        'Count': count,
        'Iteration': iterations,
        'Fitness': fitness_scores,
        'Filename': filename
    })
    
    # Append the temporary DataFrame to the all_data list
    all_data.append(temp_df)

# Concatenate all DataFrames into a single DataFrame
if all_data:
    df = pd.concat(all_data, ignore_index=True)
    # Display the DataFrame
    print("Combined DataFrame:")
    print(df.head())
else:
    print("No data found in the log files.")
    df = pd.DataFrame()  # Create an empty DataFrame to avoid errors later

# --- Extract Qmax and YT from Filename ---
if not df.empty:
    # Apply the extraction function to the 'Filename' column
    df[['Qmax', 'YT']] = df['Filename'].apply(lambda x: pd.Series(extract_qmax_yt(x)))
    
    # Display the updated DataFrame with Qmax and YT
    print("\nDataFrame with Qmax and YT:")
    print(df.head())
else:
    print("No data to process for Qmax and YT extraction.")

# Optional: Save the DataFrame to a CSV file
# df.to_csv('combined_fitness_scores_with_qmax_yt.csv', index=False)
df

Combined DataFrame:
   Count  Iteration   Fitness  \
0      1          1  0.388066   
1      2          2  0.460553   
2      3          3  0.473766   
3      4          4  0.502600   
4      5          5  0.536443   

                                            Filename  
0  MLP_PADDY_Qmax10_YT5_20241213230415_Generation...  
1  MLP_PADDY_Qmax10_YT5_20241213230415_Generation...  
2  MLP_PADDY_Qmax10_YT5_20241213230415_Generation...  
3  MLP_PADDY_Qmax10_YT5_20241213230415_Generation...  
4  MLP_PADDY_Qmax10_YT5_20241213230415_Generation...  

DataFrame with Qmax and YT:
   Count  Iteration   Fitness  \
0      1          1  0.388066   
1      2          2  0.460553   
2      3          3  0.473766   
3      4          4  0.502600   
4      5          5  0.536443   

                                            Filename  Qmax  YT  
0  MLP_PADDY_Qmax10_YT5_20241213230415_Generation...    10   5  
1  MLP_PADDY_Qmax10_YT5_20241213230415_Generation...    10   5  
2  MLP_PADDY_Qmax10_YT5_2024

Unnamed: 0,Count,Iteration,Fitness,Filename,Qmax,YT
0,1,1,0.388066,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
1,2,2,0.460553,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
2,3,3,0.473766,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
3,4,4,0.502600,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
4,5,5,0.536443,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
...,...,...,...,...,...,...
3009,156,4,0.528634,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6
3010,157,5,0.582096,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6
3011,158,6,0.577689,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6
3012,159,7,0.581697,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6


# making into 1 list

In [3]:
import pandas as pd
import os
import glob
import re

# Function to extract Qmax and YT from the filename
def extract_qmax_yt(filename):
    """
    Extracts Qmax and YT values from a given filename.
    
    Parameters:
        filename (str): The name of the file.
        
    Returns:
        tuple: A tuple containing Qmax and YT as integers. Returns (None, None) if not found.
    """
    # Define a regex pattern to capture Qmax and YT values
    pattern = re.compile(r'Qmax(\d+)_YT(\d+)', re.IGNORECASE)
    match = pattern.search(filename)
    
    if match:
        qmax = int(match.group(1))
        yt = int(match.group(2))
        return qmax, yt
    else:
        # If pattern not found, return None
        return None, None

# Function to process a single log directory and return a DataFrame
def process_log_dir(log_dir):
    """
    Processes all .log files in the specified directory and extracts relevant data into a DataFrame.
    
    Parameters:
        log_dir (str): The directory containing the log files.
        
    Returns:
        pd.DataFrame: A DataFrame containing the combined data from all log files in the directory.
    """
    # Use glob to find all .log files in the directory
    log_files = glob.glob(os.path.join(log_dir, '*.log'))
    
    # Initialize an empty list to store data from all log files
    all_data = []
    
    # Iterate through each log file
    for log_file_path in log_files:
        fitness_scores = []
        filename = os.path.basename(log_file_path)  # Extract the filename
        
        try:
            with open(log_file_path, 'r') as file:
                lines = file.readlines()
            
            # Iterate through the lines using index to allow lookahead
            for i, line in enumerate(lines):
                if "Top seed in generation:" in line:
                    # Ensure there is a next line to read
                    if i + 1 < len(lines):
                        next_line = lines[i + 1].strip()
                        if "fitness:" in next_line:
                            try:
                                # Extract fitness score
                                fitness = float(next_line.split("fitness:")[1].split()[0])
                                fitness_scores.append(fitness)
                            except (IndexError, ValueError) as e:
                                print(f"Error parsing fitness in file {filename} on line {i+2}: {e}")
        except FileNotFoundError:
            print(f"File not found: {log_file_path}")
            continue
        except Exception as e:
            print(f"An error occurred while processing {filename}: {e}")
            continue
        
        # Generate Count and Iteration columns
        count = list(range(1, len(fitness_scores) + 1))
        iterations = [(i % 8) + 1 for i in range(len(fitness_scores))]
        
        # Create a temporary DataFrame for the current log file
        temp_df = pd.DataFrame({
            'Count': count,
            'Iteration': iterations,
            'Fitness': fitness_scores,
            'Filename': filename
        })
        
        # Append the temporary DataFrame to the all_data list
        all_data.append(temp_df)
    
    # Concatenate all DataFrames into a single DataFrame
    if all_data:
        df = pd.concat(all_data, ignore_index=True)
        
        # Apply the extraction function to the 'Filename' column
        df[['Qmax', 'YT']] = df['Filename'].apply(lambda x: pd.Series(extract_qmax_yt(x)))
        
        print(f"\nProcessed directory: {log_dir}")
        print(df.head())  # Display the first few rows of the DataFrame
        return df
    else:
        print(f"No data found in the log files within directory: {log_dir}")
        return pd.DataFrame()  # Return an empty DataFrame to avoid errors later

# List of directories to process
directories = {
    'GD': 'data/GD/15dec/',
    'GS': 'data/GS/',
    'PD': 'data/PD/15dec/',
    'PS': 'data/PS/'
}

# Dictionary to hold the resulting DataFrames
dataframes = {}

# Iterate through each directory and process the log files
for key, dir_path in directories.items():
    df = process_log_dir(dir_path)
    dataframes[key] = df
    # Optionally, assign each DataFrame to a separate variable
    globals()[f'df_{key}'] = df

# Accessing the individual DataFrames
df_GD = dataframes['GD']
df_GS = dataframes['GS']
df_PD = dataframes['PD']
df_PS = dataframes['PS']

# Optionally, display summaries of each DataFrame
print("\nSummary of DataFrames:")
print(f"df_GD: {df_GD.shape[0]} records")
print(f"df_GS: {df_GS.shape[0]} records")
print(f"df_PD: {df_PD.shape[0]} records")
print(f"df_PS: {df_PS.shape[0]} records")

# Optional: Save each DataFrame to separate CSV files
# df_GD.to_csv('combined_fitness_scores_GD.csv', index=False)
# df_GS.to_csv('combined_fitness_scores_GS.csv', index=False)
# df_PD.to_csv('combined_fitness_scores_PD.csv', index=False)
# df_PS.to_csv('combined_fitness_scores_PS.csv', index=False)



Processed directory: data/GD/15dec/
   Count  Iteration   Fitness  \
0      1          1  0.389668   
1      2          2  0.456946   
2      3          3  0.506807   
3      4          4  0.512211   
4      5          5  0.556667   

                                            Filename  Qmax  YT  
0  MLP_PADDY_Qmax20_YT4_20241215222449_Generation...    20   4  
1  MLP_PADDY_Qmax20_YT4_20241215222449_Generation...    20   4  
2  MLP_PADDY_Qmax20_YT4_20241215222449_Generation...    20   4  
3  MLP_PADDY_Qmax20_YT4_20241215222449_Generation...    20   4  
4  MLP_PADDY_Qmax20_YT4_20241215222449_Generation...    20   4  

Processed directory: data/GS/
   Count  Iteration   Fitness  \
0      1          1  0.388066   
1      2          2  0.477369   
2      3          3  0.511213   
3      4          4  0.525827   
4      5          5  0.576090   

                                            Filename  Qmax  YT  
0  MLP_PADDY_Qmax20_YT4_20241213230635_Generation...    20   4  
1  MLP_PADDY_Q

# TRANSFORM

In [4]:
df_GD

Unnamed: 0,Count,Iteration,Fitness,Filename,Qmax,YT
0,1,1,0.389668,MLP_PADDY_Qmax20_YT4_20241215222449_Generation...,20,4
1,2,2,0.456946,MLP_PADDY_Qmax20_YT4_20241215222449_Generation...,20,4
2,3,3,0.506807,MLP_PADDY_Qmax20_YT4_20241215222449_Generation...,20,4
3,4,4,0.512211,MLP_PADDY_Qmax20_YT4_20241215222449_Generation...,20,4
4,5,5,0.556667,MLP_PADDY_Qmax20_YT4_20241215222449_Generation...,20,4
...,...,...,...,...,...,...
1394,35,3,0.575689,MLP_PADDY_Qmax30_YT3_20241215222449_Generation...,30,3
1395,36,4,0.579293,MLP_PADDY_Qmax30_YT3_20241215222449_Generation...,30,3
1396,37,5,0.583297,MLP_PADDY_Qmax30_YT3_20241215222449_Generation...,30,3
1397,38,6,0.583294,MLP_PADDY_Qmax30_YT3_20241215222449_Generation...,30,3


In [5]:

def transform_and_save_df(df, label, output_dir='data/results/'):
    """
    Transforms the DataFrame to the desired format and saves it as a CSV file.
    
    Parameters:
        df (pd.DataFrame): The DataFrame to transform.
        label (str): The label indicating the type (e.g., 'GD', 'GS').
        output_dir (str): The directory where the CSV file will be saved.
        
    Returns:
        None
    """
    if df.empty:
        print(f"No data to transform and save for {label}.")
        return
    
    # Transform the DataFrame
    df['Repeat'] = (df['Count'] - 1) // 8 + 1  # Calculate Repeat
    df['Generation'] = df['Iteration']       # Rename Iteration to Generation
    df.rename(columns={'Fitness': 'F1_Score'}, inplace=True)  # Rename Fitness to F1_Score
    
    # Keep only the required columns in the desired order
    transformed_df = df[['Repeat', 'Generation', 'F1_Score', 'Filename', 'Qmax', 'YT']]
    
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Define the output file path
    output_file = os.path.join(output_dir, f'GRID_{label}.csv')
    
    # Save the transformed DataFrame to CSV
    transformed_df.to_csv(output_file, index=False)
    print(f"Saved transformed data to {output_file}")
    
    # Optionally, return the transformed DataFrame
    return transformed_df


# Define the output directory for CSV files
output_directory = 'data/results/'
# Iterate through each DataFrame, transform, and save as CSV
transformed_dataframes = {}
for key, df in dataframes.items():
    transformed_df = transform_and_save_df(df, key, output_dir=output_directory)
    transformed_dataframes[key] = transformed_df

# Optionally, assign transformed DataFrames to separate variables
df_GD_transformed = transformed_dataframes.get('GD', pd.DataFrame())
df_GS_transformed = transformed_dataframes.get('GS', pd.DataFrame())
df_PD_transformed = transformed_dataframes.get('PD', pd.DataFrame())
df_PS_transformed = transformed_dataframes.get('PS', pd.DataFrame())


Saved transformed data to data/results/GRID_GD.csv
Saved transformed data to data/results/GRID_GS.csv
Saved transformed data to data/results/GRID_PD.csv
Saved transformed data to data/results/GRID_PS.csv


# FINAL SUMMARY

In [6]:
import pandas as pd
import os

def create_final_summary(df, label, output_dir='data/results/'):
    """
    Creates a final summary DataFrame from the transformed DataFrame and saves it as a CSV file.
    
    Parameters:
        df (pd.DataFrame): The transformed DataFrame to summarize.
        label (str): The label indicating the type (e.g., 'GD', 'GS', 'PD', 'PS') to be used in the output filename.
        output_dir (str): The directory where the summary CSV file will be saved.
        
    Returns:
        pd.DataFrame: The final summary DataFrame.
    """
    if df.empty:
        print(f"No data to summarize for {label}. Skipping summary creation.")
        return pd.DataFrame()
    
    # --- Group by 'Repeat' and 'Filename' to get the Best F1_Score ---
    summary_df = df.groupby(['Repeat', 'Filename'], as_index=False).agg(
        Best_F1_Score=('F1_Score', 'max'),
        Qmax=('Qmax', 'first'),  # Assuming Qmax is constant per Filename
        YT=('YT', 'first')       # Assuming YT is constant per Filename
    )
    
    # --- Calculate Final Summary Statistics per Filename ---
    final_summary_df = summary_df.groupby('Filename').agg(
        Best_F1_Score=('Best_F1_Score', 'max'),
        Worst_F1_Score=('Best_F1_Score', 'min'),
        Avg_F1_Score=('Best_F1_Score', 'mean'),
        StdDev_F1_Score=('Best_F1_Score', 'std'),
        Qmax=('Qmax', 'first'),  # Since Qmax is constant per Filename
        YT=('YT', 'first')       # Since YT is constant per Filename
    ).reset_index()
    
    # Optional: Round the statistics for better readability
    final_summary_df['Avg_F1_Score'] = final_summary_df['Avg_F1_Score'].round(4)
    final_summary_df['StdDev_F1_Score'] = final_summary_df['StdDev_F1_Score'].round(4)
    
    # Reorder columns to have Filename first, followed by Qmax and YT, then the statistics
    final_summary_df = final_summary_df[['Filename', 'Qmax', 'YT', 
                                         'Best_F1_Score', 'Worst_F1_Score', 
                                         'Avg_F1_Score', 'StdDev_F1_Score']]
    
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Define the output file path
    output_file = os.path.join(output_dir, f'SUMMARY_{label}.csv')
    
    # Save the final summary DataFrame to CSV
    final_summary_df.to_csv(output_file, index=False)
    print(f"Saved final summary to {output_file}")
    
    return final_summary_df

# --- Example Usage ---

# Assume transformed_dataframes is a dictionary containing the transformed DataFrames
# for each label ('GD', 'GS', 'PD', 'PS')
# Example:
# transformed_dataframes = {
#     'GD': df_GD_transformed,
#     'GS': df_GS_transformed,
#     'PD': df_PD_transformed,
#     'PS': df_PS_transformed
# }

# For demonstration, let's create dummy transformed DataFrames
# You should replace this with your actual transformed DataFrames

# Example Dummy DataFrames (Remove or replace these with actual data)
# -----------------------------------------------------------------------------------
# Uncomment and modify the following lines with your actual transformed DataFrames
# df_GD_transformed = pd.read_csv('data/results/GRID_GD.csv')
# df_GS_transformed = pd.read_csv('data/results/GRID_GS.csv')
# df_PD_transformed = pd.read_csv('data/results/GRID_PD.csv')
# df_PS_transformed = pd.read_csv('data/results/GRID_PS.csv')

# For the purpose of this example, let's assume transformed_dataframes is already defined
# -----------------------------------------------------------------------------------

# List of labels and corresponding transformed DataFrames
labels = ['GD', 'GS', 'PD', 'PS']
transformed_dataframes = {
    'GD': df_GD_transformed,
    'GS': df_GS_transformed,
    'PD': df_PD_transformed,
    'PS': df_PS_transformed
}

# Iterate through each transformed DataFrame, create summary, and save as CSV
for label in labels:
    df = transformed_dataframes.get(label, pd.DataFrame())
    summary = create_final_summary(df, label, output_dir='data/results/')
    # Optionally, store the summaries in a dictionary if further processing is needed
    # summaries[label] = summary

# --- Optional: Accessing the Summaries ---
# The summaries are saved as CSV files in 'data/results/' directory.
# You can also access them programmatically if stored in a dictionary.


Saved final summary to data/results/SUMMARY_GD.csv
Saved final summary to data/results/SUMMARY_GS.csv
Saved final summary to data/results/SUMMARY_PD.csv
Saved final summary to data/results/SUMMARY_PS.csv


In [4]:
# Transform the DataFrame to the desired format
df['Repeat'] = (df['Count'] - 1) // 8 + 1  # Calculate Repeat
df['Generation'] = df['Iteration']  # Rename Iteration to Generation
df.rename(columns={'Fitness': 'F1_Score'}, inplace=True)  # Rename Fitness to F1_Score

# Keep only the required columns in the desired order
transformed_df = df[['Repeat', 'Generation', 'F1_Score', 'Filename','Qmax', 'YT']]
transformed_df.to_csv('data/results/GRID_GD.csv', index=False)
transformed_df


Unnamed: 0,Repeat,Generation,F1_Score,Filename,Qmax,YT
0,1,1,0.388066,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
1,1,2,0.460553,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
2,1,3,0.473766,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
3,1,4,0.502600,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
4,1,5,0.536443,MLP_PADDY_Qmax10_YT5_20241213230415_Generation...,10,5
...,...,...,...,...,...,...
3009,20,4,0.528634,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6
3010,20,5,0.582096,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6
3011,20,6,0.577689,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6
3012,20,7,0.581697,MLP_PADDY_Qmax10_YT6_20241213230415_Generation...,10,6
