POSPac Logfile parser:

Default file location @ Z:\RESOURCES\Production\Bridge\02_Logfiles\01_POS_Processing

Created by GWarren on 8/2/2023  

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import numpy as np
import re

In [None]:
# Folder holding the POSPac processing log files (.txt) that are parsed in bulk
folder_path = r"Z:\RESOURCES\Production\Bridge\02_Logfiles\01_POS_Processing"

# Optional inclusive date range, matched against the six-digit date prefix of
# each log filename. Use None for no bound, or a quoted 'YYMMDD' string.
START_DATE = None  # e.g. '210408', or None for no lower bound
END_DATE = None    # e.g. '210410', or None for no upper bound

In [None]:
# Split a log line into a (key, value) pair at its first dot
def parse_line_corrected(line):
    """Return ``(key, value)`` parsed from a dot-separated log line.

    The key is the text before the first ``.``. The value is everything after
    it with the leading run of dots removed, so dots inside the value itself
    (e.g. ``1.5 GB``) are preserved. Both parts are stripped of surrounding
    whitespace. When the line contains no dot at all the value is ``None``.
    """
    head, dot, tail = line.partition('.')
    if not dot:
        return head.strip(), None
    return head.strip(), tail.lstrip('.').strip()

# Section headings that carry no key/value data and are skipped while parsing
exclude_keys = ["MoveOut Settings", "POF Import Settings", "MESSAGES", "METRICS"]


def should_exclude(line):
    """Return True for blank lines and for lines that are an excluded heading.

    The comparison is made after stripping surrounding whitespace, so a
    trailing newline does not prevent a match.
    """
    stripped = line.strip()
    if stripped == '':
        return True
    return stripped in exclude_keys

def parse_line_final(line):
    """Parse one log line into a ``(key, value)`` pair.

    A line containing "Processing started for" is turned into the synthetic
    key "Successful run", whose value is a bool telling whether the same line
    also contains "POS processing succeeded!". Every other line is handed to
    ``parse_line_corrected``.
    """
    if "Processing started for" not in line:
        return parse_line_corrected(line)

    succeeded = "POS processing succeeded!" in line
    return "Successful run", succeeded
    
def convert_to_MB(value):
    """Convert a size string tagged "GB" or "MB" to a float number of megabytes.

    "GB" values are multiplied by 1024; "MB" values are returned as-is (as a
    float). Anything whose text contains neither tag is returned unchanged.
    """
    text = str(value)
    # "GB" is tested first, matching the precedence of the branches it replaces.
    for unit, scale in (("GB", 1024), ("MB", 1)):
        if unit in text:
            amount = value.replace(unit, "").strip()
            return float(amount) * scale
    return value

def extract_minutes(value):
    """Return the first whitespace-separated token of ``value`` as a float.

    If the text form of ``value`` is empty or only whitespace, ``value`` is
    returned unchanged.
    """
    tokens = str(value).split()
    if not tokens:
        return value
    return float(tokens[0])

def convert_to_seconds(value):
    """Convert a duration string to a float number of seconds.

    Recognised units are minutes, hours and seconds, in singular or plural
    form (e.g. ``"5 minutes"``, ``"1 hour"``, ``"45 seconds"``). Values already
    expressed in seconds are converted to a float too, so that a column
    relabelled as seconds does not end up mixing numbers with raw strings.

    Args:
        value: The raw value read from a log file; usually a string.

    Returns:
        The duration in seconds as a float when a unit is recognised,
        otherwise ``value`` unchanged.

    Raises:
        ValueError: If a unit is found but the remaining text is not a number.
    """
    text = str(value)
    # Minutes are checked before hours, as before; seconds come last.
    for unit, factor in (("minute", 60), ("hour", 3600), ("second", 1)):
        if unit in text:
            # Drop the plural form first so no stray "s" is left behind.
            number = text.replace(unit + "s", "").replace(unit, "").strip()
            return float(number) * factor
    return value
    
def process_file(file_path):
    """Parse one log file into a single-row DataFrame.

    Lines rejected by ``should_exclude`` are skipped; each remaining line is
    stripped and parsed by ``parse_line_final``. The resulting keys become the
    columns of the returned frame and the parsed values fill its one row.
    """
    with open(file_path, 'r') as log_file:
        pairs = [
            parse_line_final(raw_line.strip())
            for raw_line in log_file
            if not should_exclude(raw_line)
        ]

    # Build a two-column Key/Value table, then pivot so every key is a column.
    key_value_table = pd.DataFrame(pairs, columns=['Key', 'Value'])
    return key_value_table.set_index('Key').transpose()

def process_folder(folder_path, start_date=None, end_date=None):
    """Parse every dated ``.txt`` log in a folder into one DataFrame.

    Only files ending in ``.txt`` whose name starts with a six-digit
    ``YYMMDD`` date are considered. Files whose prefix is not a valid date are
    skipped, as are files outside the optional inclusive date range.

    Args:
        folder_path: Directory containing the log files.
        start_date: Optional lower bound as a ``'YYMMDD'`` string, or None.
        end_date: Optional upper bound as a ``'YYMMDD'`` string, or None.

    Returns:
        A DataFrame with one row per log file, in filename order. "SBET Size"
        is converted with ``convert_to_MB``, "Processing Rate" is renamed to
        "Processing Rate (min/SN hr)" after ``extract_minutes``, and the
        timing columns are converted with ``convert_to_seconds`` and given a
        " (sec)" suffix.

    Raises:
        ValueError: If ``start_date``/``end_date`` are not valid ``YYMMDD``
            strings, or if no log file matches the selection.
        KeyError: If an expected column is absent from every parsed log.
    """
    # Convert start_date and end_date to datetime objects if they're not None
    start_date = datetime.strptime(start_date, "%y%m%d") if start_date else None
    end_date = datetime.strptime(end_date, "%y%m%d") if end_date else None

    all_dataframes = []
    # os.listdir returns entries in arbitrary order; sort for a stable row order.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith('.txt'):
            continue

        # Use regex to match the first six digits (the date) and ignore following characters
        date_match = re.match(r"(\d{6})", filename)
        if not date_match:
            continue

        try:
            file_date = datetime.strptime(date_match.group(), "%y%m%d")
        except ValueError:
            # Six leading digits that are not a real calendar date: treat the
            # file like any other undated file instead of aborting the run.
            continue

        if start_date and file_date < start_date:
            continue
        if end_date and file_date > end_date:
            continue

        all_dataframes.append(process_file(os.path.join(folder_path, filename)))

    if not all_dataframes:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(
            "No dated .txt log files found in {!r} for the requested date range".format(folder_path)
        )

    final_df = pd.concat(all_dataframes, ignore_index=True)

    # Applying the conversion to the "SBET Size" column
    final_df['SBET Size'] = final_df['SBET Size'].apply(convert_to_MB)

    # Cleaning up the "Processing Rate" column
    final_df['Processing Rate'] = final_df['Processing Rate'].apply(extract_minutes)

    # Converting specified columns to seconds
    columns_to_convert = ['SBET Extract Time', 'HTDP Transform Time', 'POF Import Time', 'Copy Time', 'Total Proc Time', 'Flight Duration']
    for col in columns_to_convert:
        final_df[col] = final_df[col].apply(convert_to_seconds)

    # Rename everything in one pass so the units appear in the column names
    new_names = {col: col + ' (sec)' for col in columns_to_convert}
    new_names['Processing Rate'] = 'Processing Rate (min/SN hr)'
    final_df = final_df.rename(columns=new_names)

    return final_df

# Parse all logs in the configured folder, honoring the optional date range,
# then list the columns that were produced.
df_final = process_folder(folder_path, start_date=START_DATE, end_date=END_DATE)
print(df_final.columns)

In [None]:
# "SBET Size" against "Copy Time": scatter plot with a straight-line fit
sbet_size = df_final['SBET Size']
copy_time = df_final['Copy Time (sec)']

# Correlation between the two columns (displayed at the end of the cell)
correlation = sbet_size.corr(copy_time)

# Degree-1 polynomial fit, used to draw the trend line
trend_line = np.poly1d(np.polyfit(sbet_size, copy_time, 1))

plt.figure(figsize=(10, 6))
plt.scatter(sbet_size, copy_time)
plt.plot(sbet_size, trend_line(sbet_size), color='red')
plt.xlabel('SBET Size (MB)')
plt.ylabel('Copy Time (seconds)')
plt.title('Scatter Plot of "SBET Size" vs "Copy Time"')
plt.tight_layout()
plt.show()

correlation

In [None]:
# Average copy time per machine ("Run from"), slowest machine first
average_copy_time_by_run_from = (
    df_final.groupby('Run from')['Copy Time (sec)']
    .mean()
    .sort_values(ascending=False)
)

# Plotting a bar chart for average "Copy Time" by "Run from"
plt.figure(figsize=(10, 6))
average_copy_time_by_run_from.plot(kind='bar')
# The plotted column is 'Copy Time (sec)', so the axis unit is seconds.
plt.ylabel('Average Copy Time (seconds)')
plt.title('Average Copy Time by "Run from" Device')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Average processing rate per device, ignoring extreme outliers
rate_column = 'Processing Rate (min/SN hr)'
processing_rates = df_final[rate_column]

# Centre and spread of the processing rate across all jobs
mean_processing_rate = processing_rates.mean()
std_processing_rate = processing_rates.std()

# Anything further than 4 standard deviations from the mean counts as extreme
threshold = 4 * std_processing_rate
extreme_values_filter = (processing_rates - mean_processing_rate).abs() > threshold

# Keep the non-extreme rows for the averages below
filtered_df_final = df_final[~extreme_values_filter]

# Report which values were dropped
removed_values = df_final.loc[extreme_values_filter, rate_column]
print("Removed values:")
print(removed_values.tolist())

# Mean processing rate for each "Run from" device, highest first
average_processing_rate_by_run_from = (
    filtered_df_final.groupby('Run from')[rate_column]
    .mean()
    .sort_values(ascending=False)
)

# Bar chart of the per-device averages
plt.figure(figsize=(10, 6))
average_processing_rate_by_run_from.plot(kind='bar')
plt.ylabel('Average Processing Rate (min/SN hr)')
plt.title('Average Processing Rate (min/SN hr) by Device')
plt.xticks(rotation=70)
plt.tight_layout()
plt.show()

In [None]:
# Number of jobs processed on each machine, busiest machine first
run_from_counts = df_final['Run from'].value_counts()
job_count_by_run_from = run_from_counts.sort_values(ascending=False)

# Bar chart of the job counts per "Run from" device
plt.figure(figsize=(10, 6))
job_count_by_run_from.plot.bar()
plt.ylabel('Count of Jobs')
plt.title('Count of Jobs by "Run from" Device')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

job_count_by_run_from

In [None]:
# Frequency tables for the processing mode, geoid and target date columns.
# Each table is printed right after it is computed.
proc_mode_counts = df_final['Proc Mode'].value_counts()
print("Processing Modes: \n",proc_mode_counts)

geoid_counts = df_final['Geoid'].value_counts()
print("\nGeoids: \n",geoid_counts)

# Printed under the heading "Target Epoch"; the source column is 'Target Date'
target_counts = df_final['Target Date'].value_counts()
print("\nTarget Epoch: \n",target_counts)

# List every column available in the combined DataFrame
print(df_final.columns)

In [None]:
# Anomaly detection with scikit-learn's IsolationForest
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

# Two features per job: total processing time and SBET size
features = df_final[['Total Proc Time (sec)', 'SBET Size']]

# Standardise the features before fitting the model
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# contamination is the expected share of anomalies; adjust as needed.
# random_state is fixed so the flagged points (and the plot) are the same on
# every run; IsolationForest is otherwise randomised.
model = IsolationForest(contamination=0.05, random_state=42)
model.fit(scaled_features)

# decision_function: lower scores are more abnormal.
# predict: -1 marks an anomaly, 1 marks a normal point.
anomaly_scores = model.decision_function(scaled_features)
anomalies = model.predict(scaled_features)

# Colour each job by its predicted label
plt.scatter(features['Total Proc Time (sec)'], features['SBET Size'], c=anomalies)
plt.xlabel('Total Proc Time (sec)')
plt.ylabel('SBET Size')
plt.title('Anomaly Detection')
plt.show()
