In [2]:
import pandas as pd

# MLP Paddy

In [3]:
import warnings

import pandas as pd

# Parse the optimiser log into a per-row DataFrame `df` with the columns
# 'Count', 'Iteration', 'Fitness' and 'Time'.
#
# Two kinds of log lines are used:
#   * a "Top seed in generation:" marker, whose following line is expected
#     to contain "fitness: <value>";
#   * an "Iteration ... completed in ..." line, from which a duration is read.
# Each duration is spread over `group_size` consecutive fitness entries.

log_file_path = '../../../plotting_data/raw/MLP_Paddy_10Jan25_7692464.log'

# Number of consecutive fitness entries that share one timing value.
group_size = 8

fitness_scores = []
time_values = []

with open(log_file_path, 'r') as log_file:
    for line in log_file:
        if "Top seed in generation:" in line:
            # The score sits on the line after the marker, so that line is
            # pulled from the same iterator (and is not revisited by the loop).
            try:
                next_line = next(log_file).strip()
            except StopIteration:
                # The log ended right after a marker: nothing more to read.
                break
            if "fitness:" in next_line:
                # First whitespace-delimited token after "fitness:".
                fitness = float(next_line.split("fitness:")[1].split()[0])
                fitness_scores.append(fitness)

        elif "Iteration" in line and "completed in" in line:
            parts = line.split()
            try:
                # Assumed layout: "Iteration X completed in Y seconds",
                # i.e. the duration is the fifth token.
                elapsed = float(parts[4])
            except (IndexError, ValueError):
                # Line does not follow the assumed layout; skip it.
                continue
            time_values.append(elapsed)

# Number of groups, counting a trailing incomplete group as one more.
total_groups, leftover = divmod(len(fitness_scores), group_size)
if leftover:
    total_groups += 1
    warnings.warn(
        f"{len(fitness_scores)} fitness scores is not a multiple of "
        f"{group_size}; the last group holds only {leftover} entries."
    )

# One duration is needed per group. A mismatch is still reconciled
# (pad with the last duration, or drop the surplus), but it is reported
# because it means 'Time' may no longer line up with the right rows.
if len(time_values) != total_groups:
    warnings.warn(
        f"Found {len(time_values)} timing lines for {total_groups} groups; "
        "the Time column was padded/truncated and may be misaligned."
    )
if len(time_values) < total_groups:
    # With no durations at all the padding value is None.
    last_time = time_values[-1] if time_values else None
    time_values.extend([last_time] * (total_groups - len(time_values)))
elif len(time_values) > total_groups:
    time_values = time_values[:total_groups]

# Repeat every duration `group_size` times, then cut to the number of scores
# (only matters when the last group is incomplete).
time_column = [t for t in time_values for _ in range(group_size)]
time_column = time_column[:len(fitness_scores)]

# 'Count' is a running 1-based row number; 'Iteration' is the 1-based
# position of the row inside its group.
count = list(range(1, len(fitness_scores) + 1))
iterations = [(i % group_size) + 1 for i in range(len(fitness_scores))]

df = pd.DataFrame({
    'Count': count,
    'Iteration': iterations,
    'Fitness': fitness_scores,
    'Time': time_column
})

# Quick sanity check of the first rows.
print(df.head())


   Count  Iteration   Fitness    Time
0      1          1  0.388066  197.24
1      2          2  0.460553  197.24
2      3          3  0.473766  197.24
3      4          4  0.502600  197.24
4      5          5  0.536443  197.24


In [4]:
# Inspect the parsed frame; for a long frame the notebook renders only the
# leading and trailing rows.
df

Unnamed: 0,Count,Iteration,Fitness,Time
0,1,1,0.388066,197.24
1,2,2,0.460553,197.24
2,3,3,0.473766,197.24
3,4,4,0.502600,197.24
4,5,5,0.536443,197.24
...,...,...,...,...
795,796,4,0.527030,237.32
796,797,5,0.535241,237.32
797,798,6,0.568879,237.32
798,799,7,0.576088,237.32


In [5]:
import pandas as pd

# Reshape the parsed log frame into the per-generation table that is exported.
#
# Inputs, both produced by the log-parsing cell:
#   df          - frame with 'Count', 'Iteration', 'Fitness' and 'Time'
#   group_size  - number of consecutive rows that belong to one repeat
#
# Outputs:
#   df              - rebound with a 'Repeat' column added and
#                     'Iteration'/'Fitness' renamed to 'Generation'/'F1_Score'
#   transformed_df  - the four exported columns, in export order
#   MLP_Paddy_10Jan25_7692464_GENERATIONS.csv - written to the working directory

# Every run of `group_size` consecutive counts is one repeat (numbered from 1).
# Using `group_size` instead of a literal keeps 'Repeat' in step with the way
# 'Time' was spread over rows when the log was parsed.
# A single non-in-place rename replaces the two in-place calls; `df` is rebound
# so it still ends up with the added and renamed columns.
df = (
    df.assign(Repeat=(df['Count'] - 1) // group_size + 1)
      .rename(columns={'Iteration': 'Generation', 'Fitness': 'F1_Score'})
)

# 'Time' is kept on every row (it repeats within a repeat). If one value per
# repeat is wanted instead, aggregate it per 'Repeat' in a later step.
transformed_df = df[['Repeat', 'Generation', 'F1_Score', 'Time']]

# Export the per-generation table.
transformed_df.to_csv('MLP_Paddy_10Jan25_7692464_GENERATIONS.csv', index=False)

# Show the table as the cell output (the notebook truncates long frames).
transformed_df


Unnamed: 0,Repeat,Generation,F1_Score,Time
0,1,1,0.388066,197.24
1,1,2,0.460553,197.24
2,1,3,0.473766,197.24
3,1,4,0.502600,197.24
4,1,5,0.536443,197.24
...,...,...,...,...
795,100,4,0.527030,237.32
796,100,5,0.535241,237.32
797,100,6,0.568879,237.32
798,100,7,0.576088,237.32


In [6]:
import pandas as pd

# Collapse the per-generation table (`transformed_df`, with 'Repeat',
# 'Generation', 'F1_Score' and 'Time') to one row per repeat.
#
# Output columns of `summary_df`:
#   Repeat    - the grouping key, kept as a regular column
#   F1_Score  - highest score among the repeat's rows
#   Time      - first time entry of the repeat (the parsing step repeats one
#               value across a repeat's rows, so the first is taken)
summary_df = (
    transformed_df
    .groupby('Repeat', as_index=False)
    .agg(
        F1_Score=('F1_Score', 'max'),
        Time=('Time', 'first'),
    )
)

# Write the per-repeat summary to the working directory.
summary_df.to_csv('MLP_Paddy_10Jan25_7692464_SUMMARY.csv', index=False)

# Show the summary as the cell output.
summary_df


Unnamed: 0,Repeat,F1_Score,Time
0,1,0.572085,197.24
1,2,0.582699,194.83
2,3,0.581898,202.58
3,4,0.581696,271.62
4,5,0.577088,224.40
...,...,...,...
95,96,0.583099,213.03
96,97,0.575288,217.93
97,98,0.582496,242.31
98,99,0.578491,197.04


In [8]:
import pandas as pd

# ---------------------------
# Final summary statistics
# ---------------------------
# Input: `summary_df` from the per-repeat summary cell, with the columns
# 'Repeat', 'F1_Score' and 'Time' (one row per repeat).
# Output: `final_summary_df`, a single-row frame that is also written to CSV.

repeat_scores = summary_df['F1_Score']
repeat_times = summary_df['Time']

# Spread of the per-repeat best scores (`.std()` uses pandas' default settings).
best_f1_max, best_f1_min = repeat_scores.max(), repeat_scores.min()
best_f1_mean, best_f1_std = repeat_scores.mean(), repeat_scores.std()

# Mean and spread of the per-repeat times.
time_mean, time_std = repeat_times.mean(), repeat_times.std()

# Column labels of the exported row, in export order, paired with their values.
summary_labels = [
    'F1_Score',
    'Worst_F1_Score',
    'Avg_F1_Score',
    'StdDev_F1_Score',
    'Avg_Time',
    'StdDev_Time',
]
summary_values = [
    best_f1_max,
    best_f1_min,
    best_f1_mean,
    best_f1_std,
    time_mean,
    time_std,
]
final_summary = dict(zip(summary_labels, summary_values))

# A one-element list of dicts gives a one-row frame.
final_summary_df = pd.DataFrame([final_summary])

# Write the final summary to the working directory.
final_summary_df.to_csv('MLP_Paddy_10Jan25_7692464_FINAL_SUMMARY.csv', index=False)

# Show the single-row summary as the cell output.
final_summary_df


Unnamed: 0,F1_Score,Worst_F1_Score,Avg_F1_Score,StdDev_F1_Score,Avg_Time,StdDev_Time
0,0.588904,0.564474,0.579899,0.004087,232.2227,22.470136
