In [None]:
"""
Load the dataframe and drop columns that carry (almost) no information.

A column is dropped when it has at most two distinct non-missing values. Because
missing values are not counted as distinct values, this also removes columns that
are entirely missing (zero distinct values).
"""
import pandas as pd

path = "../../data/processed/8.8_ajot/1_mbps/worker1.feather"
df = pd.read_feather(path)

# Report the size of the frame exactly as loaded, before any column is removed
print(f"Loaded {len(df)} rows and {len(df.columns)} columns")

# Number of distinct values per column; nunique() skips NaN by default,
# so an all-NaN column gets a count of 0 and a constant column a count of 1
unique_counts = df.nunique()

# "Static" here means at most two distinct values (constant, all-missing, or two-valued columns)
static_columns = unique_counts[unique_counts <= 2].index

# Remove the static columns from the dataframe
df = df.drop(columns=static_columns)
print(f"Removing {len(static_columns)} static columns ({len(df.columns)} remaining)")

if len(df.columns) < 100:
    # Only display if the df is small enough to not stall the IDE (thousands of columns really slows things down).
    # Jupyter only auto-renders the last *top-level* expression of a cell; a bare `df.head()` inside
    # this `if` body would be silently discarded, so the frame is rendered explicitly.
    # `display` is provided by the IPython/Jupyter kernel without an import.
    display(df.head())

In [None]:
"""
Rename the dataframe headers to a human-readable form and index the rows by timestamp
"""

from utils.header_cleaner import clean_up_headers

# Project helper; the renaming rules themselves live in utils/header_cleaner
cleaned_df = clean_up_headers(df)

# Use the "timestamp" column as the row index; the column itself stays in the frame
cleaned_df.index = cleaned_df["timestamp"]

# Last expression of the cell: render the cleaned frame
cleaned_df



In [None]:
"""
Pick the Kepler columns closest to a search word, convert them from running totals
to per-second rates, and plot both views.

TODO: Extend this script to do some plotting (maybe copy-and-paste this code into separate file first)
"""

import difflib

# --- Find columns that are similar to the search word ---
target_word = 'package_joules_total dynamic'
search_space = [x for x in cleaned_df.columns if "kepler" in x]  # Only account for columns with "kepler" in it
# cutoff=0.05 is very permissive: nearly every candidate qualifies, so this effectively
# returns the 5 best-scoring columns rather than filtering by similarity
closest_matches = difflib.get_close_matches(target_word, search_space, cutoff=0.05, n=5)
print(closest_matches)
if not closest_matches:
    # Fail here with a clear message instead of letting the plot calls below fail with nothing to draw
    raise ValueError(f"No 'kepler' column resembles {target_word!r}; nothing to plot")

# --- Transform these columns from total count to rate (e.g., "joules total" to "joules per second") ---
per_second_values = []
df2 = cleaned_df.copy().sort_index()
if len(df2) < 2:
    raise ValueError("Need at least two samples to compute a per-second rate")

# Make the time axis relative to the first sample
df2.index -= df2.index[0]

# Time elapsed since the previous row, computed per row so that irregular sampling
# (jitter, missing samples) does not distort the rate.
elapsed = df2.index.to_series().diff()
if elapsed.dtype.kind == "m":
    # Datetime-based index: differences are timedeltas, convert them to float seconds
    elapsed = elapsed.dt.total_seconds()
# NOTE(review): a numeric index is assumed to already be in seconds - confirm against the data source.
# Rows with an identical timestamp have zero elapsed time and therefore yield inf/NaN rates.
elapsed = elapsed.to_numpy()  # plain array: divide positionally, without index alignment

for word in closest_matches:
    new_name = f"per_second:{word}"
    df2[new_name] = df2[word].diff() / elapsed
    per_second_values.append(new_name)

# The first row has no predecessor, so its rate is NaN. Drop it by position:
# dropping by label would also remove any other rows sharing the first timestamp.
df2 = df2.iloc[1:]

# --- Plot these columns ---
df2[closest_matches].plot(title="Cumulative counters")
df2[per_second_values].plot(title="Per-second rate")