In [4]:
# Copy the raw training log into a filtered log, leaving out every line that
# mentions '.pt'. Both files are opened in one `with` so they are closed
# together when the copy finishes.
with open('/home/choi/Wave_Transformer/optuna_/py_torch_test630796.log', 'r') as read_file, \
        open('filtered_py_torch_test630796.log', 'w') as write_file:
    # Stream the kept lines straight into the output file.
    write_file.writelines(
        kept_line for kept_line in read_file if '.pt' not in kept_line
    )

# Result: 'filtered_py_torch_test630796.log' holds every line of the original
# log except those containing '.pt'.


In [5]:
# Filtered logs whose 'Test Loss' lines are gathered into a single file,
# in the order listed here.
file_names = [
    'filtered_py_torch_test630796.log',
    'filtered_py_torch_test630782.log',
    'filtered_py_torch_test630781.log',
    'filtered_py_torch_test630779.log'
]

with open('merged_test_losses.log', 'w') as output_file:
    for source_name in file_names:
        with open(source_name, 'r') as source_file:
            # Only lines that begin with 'Test Loss' are carried over.
            output_file.writelines(
                entry for entry in source_file if entry.startswith('Test Loss')
            )


In [6]:
# Input (merged) and output (sorted) log locations
log_file_path = 'merged_test_losses.log'
sorted_log_file_path = 'sorted_merged_test_losses.log'

# Load every merged line into memory
with open(log_file_path, 'r') as file:
    lines = file.readlines()

# Pair each line with the number that follows its last ': ' separator
lines_with_loss = []
for line in lines:
    trailing_field = line.strip().rsplit(': ', 1)[-1]
    try:
        loss_value = float(trailing_field)
    except ValueError:
        # The trailing field is not a number: report the line and leave it out
        print("Error converting loss value to float in line:", line)
    else:
        lines_with_loss.append((loss_value, line))

# Order the pairs from smallest to largest loss (stable for equal losses)
lines_with_loss = sorted(lines_with_loss, key=lambda pair: pair[0])

# Persist just the text of each line, now in sorted order
with open(sorted_log_file_path, 'w') as file:
    file.writelines(text for _, text in lines_with_loss)

print(f'Sorted lines have been written to {sorted_log_file_path}.')


Sorted lines have been written to sorted_merged_test_losses.log.


In [9]:
import pandas as pd

# Location of the loss-sorted log produced earlier in the notebook
sorted_log_file_path = 'sorted_merged_test_losses.log'

# Load the log with no header row and a single named column, 'Log Entry'.
# NOTE(review): the default separator is ',' — if the log lines contain commas,
# pandas will not keep each line as one 'Log Entry' value; confirm this is intended.
df = pd.read_csv(sorted_log_file_path, names=['Log Entry'], header=None)



In [51]:
# Parse 'sorted_merged_test_losses.log' as header-less, comma-separated fields and
# discard the first field (column 0), which only carries the repeated
# 'Test Loss for configuration:' phrase.
df = pd.read_csv(
    'sorted_merged_test_losses.log',
    header=None,
    sep=',',
).drop(columns=0)

# Function to extract column names and their values, ensuring Test Loss is correctly captured
def extract_columns_and_values(row):
    """Convert one row of raw log fields into a Series keyed by field name.

    Each non-null item is expected to be a string in one of two shapes:

    * ``key=value`` -- stored as ``{key: value}``. Only the first ``=`` separates
      the key from the value, so values that themselves contain ``=`` are kept
      intact instead of raising.
    * ``...: value`` (contains a colon) -- split at the LAST colon. The key is
      whatever follows the last ``=`` in the left-hand part, so an item such as
      ``name=3: 0.2045`` is stored as ``{'3': '0.2045'}``; the text before the
      ``=`` is not kept. Later cells rely on these value-named columns.

    Keys and values are stripped of surrounding whitespace in both shapes.

    Parameters
    ----------
    row : iterable
        Items of a single row; null items (per ``pd.isnull``) are skipped.

    Returns
    -------
    pandas.Series
        One entry per parsed item, values kept as strings.

    Raises
    ------
    ValueError
        If a colon-free item contains no ``=`` at all.
    """
    config_dict = {}
    for item in row:
        if pd.isnull(item):
            continue
        if ':' in item:  # This indicates the presence of a Test Loss value
            # Split from the right so only the trailing number is taken as the value
            key, value = item.rsplit(':', 1)
            key = key.split('=')[-1].strip()
        else:
            # maxsplit=1: a value containing '=' must not break the unpacking
            key, value = item.split('=', 1)
            key = key.strip()
        # Strip in both branches so values are stored consistently
        config_dict[key] = value.strip()
    return pd.Series(config_dict)

# Apply the function to each row (axis=1). Every row yields a Series, so the
# result is a DataFrame whose columns are the keys collected across all rows.
df_transformed = df.apply(extract_columns_and_values, axis=1)


In [52]:
# Inspect the column labels produced by the extraction step so later cells
# reference them by their exact names
print(df_transformed.columns)

Index(['1', '3', '5', 'Data Load Type', 'Pos_Encoder_Type', 'Revin',
       'attention_type', 'batch_size', 'dropout_enabled', 'general_skip',
       'kernel_size', 'skip_enabled', 'step_size'],
      dtype='object')


In [53]:
# Collapse the '1', '3' and '5' columns into one 'Test Loss' column:
# start from '1' and fill its gaps from '3', then from '5'.
loss_columns = ['1', '3', '5']
test_loss = df_transformed[loss_columns[0]]
for fallback_column in loss_columns[1:]:
    test_loss = test_loss.fillna(df_transformed[fallback_column])
df_transformed['Test Loss'] = test_loss

# The three source columns are no longer needed once merged
df_transformed = df_transformed.drop(columns=loss_columns)


In [54]:
# Display the transformed frame: one row per logged configuration plus its 'Test Loss'
df_transformed

Unnamed: 0,Data Load Type,Pos_Encoder_Type,Revin,attention_type,batch_size,dropout_enabled,general_skip,kernel_size,skip_enabled,step_size,Test Loss
0,multivariate,Original,True,log,128,True,skip,5,True,1,0.2045
1,multivariate,Projected,True,original,64,True,skip,5,True,1,0.2051
2,multivariate,Projected,True,original,128,True,skip,5,True,1,0.2055
3,multivariate,Projected,True,original,128,True,skip,3,True,1,0.2056
4,multivariate,Projected,True,original,64,True,skip,5,True,1,0.2060
...,...,...,...,...,...,...,...,...,...,...,...
402,multivariate,Original,True,log,32,True,skip,7,True,64,1.0695
403,multivariate,Original,True,log,128,True,skip,5,True,64,1.0724
404,multivariate,Original,True,log,128,True,skip,5,True,64,1.0734
405,multivariate,Original,True,log,32,True,skip,5,True,64,1.0803


In [55]:
## 'attention_type': category codes and mean loss per category
# Labels in the order pandas assigns them when the column is treated as categorical
attention_type_mapping = df_transformed['attention_type'].astype('category').cat.categories
print(
    "Mapping for 'attention_type':",
    {code: label for code, label in enumerate(attention_type_mapping)},
)

# 'Test Loss' was parsed as text; make it numeric (unparseable entries become NaN)
df_transformed['Test Loss'] = pd.to_numeric(df_transformed['Test Loss'], errors='coerce')

# Average 'Test Loss' within each 'attention_type'
loss_by_attention_type = df_transformed.groupby('attention_type')['Test Loss']
attention_type_avg_loss = loss_by_attention_type.mean()

# Lowest average first, so the best-performing category is on top
sorted_attention_type_avg_loss = attention_type_avg_loss.sort_values()


Mapping for 'attention_type': {0: 'log', 1: 'original'}


In [56]:
# Mean 'Test Loss' per 'attention_type', lowest first
sorted_attention_type_avg_loss

attention_type
original    0.281924
log         0.412857
Name: Test Loss, dtype: float64

In [57]:
# For every categorical column, report the average 'Test Loss' of each of its
# categories, ordered from lowest to highest.
categorical_columns = [
    'Data Load Type',
    'Pos_Encoder_Type',
    'attention_type',
    'general_skip',
    'dropout_enabled',
    'skip_enabled',
]
for column_name in categorical_columns:
    mean_loss_by_category = (
        df_transformed.groupby(column_name)['Test Loss'].mean().sort_values()
    )
    print(f"Mean 'Test Loss' for each category in {column_name}:")
    print(mean_loss_by_category, "\n")


Mean 'Test Loss' for each category in Data Load Type:
Data Load Type
multivariate    0.36726
univariate      0.98770
Name: Test Loss, dtype: float64 

Mean 'Test Loss' for each category in Pos_Encoder_Type:
Pos_Encoder_Type
Projected    0.355617
Original     0.389321
Name: Test Loss, dtype: float64 

Mean 'Test Loss' for each category in attention_type:
attention_type
original    0.281924
log         0.412857
Name: Test Loss, dtype: float64 

Mean 'Test Loss' for each category in general_skip:
general_skip
skip    0.368784
Name: Test Loss, dtype: float64 

Mean 'Test Loss' for each category in dropout_enabled:
dropout_enabled
True    0.368784
Name: Test Loss, dtype: float64 

Mean 'Test Loss' for each category in skip_enabled:
skip_enabled
True    0.368784
Name: Test Loss, dtype: float64 



In [58]:
# Columns that hold numbers but were parsed as text; extend the list as needed
numeric_columns = ['batch_size', 'kernel_size', 'step_size']
for numeric_column in numeric_columns:
    # Unparseable entries become NaN rather than raising
    df_transformed[numeric_column] = pd.to_numeric(
        df_transformed[numeric_column], errors='coerce'
    )

# Correlation of every numeric feature with 'Test Loss'
correlation_matrix = df_transformed[[*numeric_columns, 'Test Loss']].corr()
numeric_correlations = correlation_matrix['Test Loss']
print("Correlations between numeric features and 'Test Loss':")
# Leave out the trivial self-correlation before sorting
print(numeric_correlations.drop(labels='Test Loss').sort_values())


Correlations between numeric features and 'Test Loss':
kernel_size    0.075527
batch_size     0.097493
step_size      0.835938
Name: Test Loss, dtype: float64


In [59]:
# Order the configurations from lowest to highest 'Test Loss'
# (sort_values sorts ascending unless told otherwise)
df_sorted_by_test_loss = df_transformed.sort_values('Test Loss')

# Show the top rows as a quick sanity check of the ordering
top_rows = df_sorted_by_test_loss.head()
print(top_rows)


  Data Load Type Pos_Encoder_Type Revin attention_type  batch_size  \
0   multivariate         Original  True            log         128   
1   multivariate        Projected  True       original          64   
2   multivariate        Projected  True       original         128   
3   multivariate        Projected  True       original         128   
4   multivariate        Projected  True       original          64   

  dropout_enabled general_skip  kernel_size skip_enabled  step_size  Test Loss  
0            True         skip            5         True          1     0.2045  
1            True         skip            5         True          1     0.2051  
2            True         skip            5         True          1     0.2055  
3            True         skip            3         True          1     0.2056  
4            True         skip            5         True          1     0.2060  


In [60]:

# Numeric feature to bin and analyze
feature = 'batch_size'

# Quantile-based binning: pd.qcut is asked for 4 bins (q=4) cut at quantiles of the
# data, not at equal-width ranges. duplicates='drop' removes repeated bin edges, so
# fewer than 4 bins can come back when the feature has few distinct values.
df_transformed[feature + '_binned'] = pd.qcut(df_transformed[feature], q=4, duplicates='drop')

# Calculate the mean 'Test Loss' for each bin.
# The bin column is categorical; observed=False is passed explicitly to keep the
# current grouping behavior and avoid the pandas warning about the changing default.
mean_test_loss_per_bin = df_transformed.groupby(feature + '_binned', observed=False)['Test Loss'].mean()

print(f"Mean 'Test Loss' for each bin in {feature}:")
print(mean_test_loss_per_bin)


Mean 'Test Loss' for each bin in batch_size:
batch_size_binned
(31.999, 64.0]    0.350475
(64.0, 128.0]     0.412060
Name: Test Loss, dtype: float64


  mean_test_loss_per_bin = df_transformed.groupby(feature + '_binned')['Test Loss'].mean()


In [61]:

# Numeric feature to bin and analyze
feature = 'step_size'

# Quantile-based binning: pd.qcut is asked for 4 bins (q=4) cut at quantiles of the
# data, not at equal-width ranges. duplicates='drop' removes repeated bin edges, so
# fewer than 4 bins can come back when the feature has few distinct values.
df_transformed[feature + '_binned'] = pd.qcut(df_transformed[feature], q=4, duplicates='drop')

# Calculate the mean 'Test Loss' for each bin.
# The bin column is categorical; observed=False is passed explicitly to keep the
# current grouping behavior and avoid the pandas warning about the changing default.
mean_test_loss_per_bin = df_transformed.groupby(feature + '_binned', observed=False)['Test Loss'].mean()

print(f"Mean 'Test Loss' for each bin in {feature}:")
print(mean_test_loss_per_bin)


Mean 'Test Loss' for each bin in step_size:
step_size_binned
(0.999, 8.0]    0.233323
(8.0, 16.0]     0.269115
(16.0, 32.0]    0.294032
(32.0, 64.0]    0.886521
Name: Test Loss, dtype: float64


  mean_test_loss_per_bin = df_transformed.groupby(feature + '_binned')['Test Loss'].mean()


In [62]:

# Numeric feature to bin and analyze
feature = 'kernel_size'

# Quantile-based binning: pd.qcut is asked for 4 bins (q=4) cut at quantiles of the
# data, not at equal-width ranges. duplicates='drop' removes repeated bin edges, so
# fewer than 4 bins can come back when the feature has few distinct values.
df_transformed[feature + '_binned'] = pd.qcut(df_transformed[feature], q=4, duplicates='drop')

# Calculate the mean 'Test Loss' for each bin.
# The bin column is categorical; observed=False is passed explicitly to keep the
# current grouping behavior and avoid the pandas warning about the changing default.
mean_test_loss_per_bin = df_transformed.groupby(feature + '_binned', observed=False)['Test Loss'].mean()

print(f"Mean 'Test Loss' for each bin in {feature}:")
print(mean_test_loss_per_bin)


Mean 'Test Loss' for each bin in kernel_size:
kernel_size_binned
(2.999, 5.0]    0.356409
(5.0, 7.0]      0.412372
Name: Test Loss, dtype: float64


  mean_test_loss_per_bin = df_transformed.groupby(feature + '_binned')['Test Loss'].mean()
