In [None]:
import pandas as pd
import glob
import os
from autogluon.tabular import TabularPredictor

# Path to the directory containing feather files
feather_directory = "combined_data"

# One DataFrame per input file, each tagged with the yolomodel parsed from its filename
dataframes = []

# glob returns matches in arbitrary order; sorting keeps the row order of the
# combined output identical from run to run.
for file in sorted(glob.glob(f"{feather_directory}/*.feather")):
    filename = os.path.basename(file)

    # The yolomodel is the second underscore-separated token of the file stem.
    # Splitting the stem rather than the full name keeps ".feather" out of the
    # label when the model happens to be the last token.
    name_parts = os.path.splitext(filename)[0].split('_')
    if len(name_parts) < 2:
        raise ValueError(
            f"Cannot extract yolomodel from {filename!r}: "
            "expected an underscore-separated name with the model as second token"
        )
    yolomodel = name_parts[1]

    df = pd.read_feather(file)

    # Drop the 'timestamp' column when present; errors='ignore' makes this a
    # no-op for files that do not have it.
    df = df.drop(columns=['timestamp'], errors='ignore')

    # Tag every row with the model the file belongs to
    df['yolomodel'] = yolomodel

    dataframes.append(df)

# Fail with a clear message instead of the opaque TypeError that
# set.intersection() raises when it is given no sets at all.
if not dataframes:
    raise FileNotFoundError(f"No .feather files found in {feather_directory!r}")

# Columns present in every DataFrame. The list is built by walking the first
# frame's columns so the output column order is stable; list(set) would
# reorder the columns between runs because of string hash randomisation.
shared_columns = set.intersection(*(set(df.columns) for df in dataframes))
common_columns = [col for col in dataframes[0].columns if col in shared_columns]

# Standardize each DataFrame to have only the columns common to all of them
dataframes = [df[common_columns] for df in dataframes]

# Combine all DataFrames into a single DataFrame and persist it
os.makedirs("datasets", exist_ok=True)
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_feather("datasets/predict_yolomodel.feather")

In [None]:
import pandas as pd
import glob
import os
from autogluon.tabular import TabularPredictor

# Path to the directory containing feather files
feather_directory = "combined_data"

# One DataFrame per input file, each tagged with the yolomodel parsed from its filename
dataframes = []

# Upper bound on the number of files loaded by this cell
remaining = 2000

# glob returns matches in arbitrary order; sorting makes both the subset
# selected by the cap above and the resulting row order reproducible.
for file in sorted(glob.glob(f"{feather_directory}/*.feather")):
    filename = os.path.basename(file)

    # The yolomodel is the second underscore-separated token of the file stem.
    # Splitting the stem rather than the full name keeps ".feather" out of the
    # label when the model happens to be the last token.
    name_parts = os.path.splitext(filename)[0].split('_')
    if len(name_parts) < 2:
        raise ValueError(
            f"Cannot extract yolomodel from {filename!r}: "
            "expected an underscore-separated name with the model as second token"
        )
    yolomodel = name_parts[1]

    df = pd.read_feather(file)

    # Drop the 'timestamp' column when present; errors='ignore' makes this a
    # no-op for files that do not have it.
    df = df.drop(columns=['timestamp'], errors='ignore')

    # Tag every row with the model the file belongs to
    df['yolomodel'] = yolomodel

    dataframes.append(df)

    # Stop once the configured number of files has been loaded
    remaining -= 1
    if remaining == 0:
        break

# Determine and print the columns of each DataFrame
# for idx, df in enumerate(dataframes):
#     print(f"Columns of DataFrame {idx}: {df.columns.tolist()}")

# Determine the differences in columns among the DataFrames
def print_column_differences(dfs):
    """Print, for each DataFrame, the columns that no other DataFrame has.

    Args:
        dfs: Iterable of objects exposing a ``columns`` attribute
            (pandas DataFrames in this notebook).

    Returns:
        None. One line is printed per DataFrame that owns at least one column
        not found in any other DataFrame; the rest are skipped.
    """
    from collections import Counter  # local import keeps the function self-contained

    all_columns = [set(df.columns) for df in dfs]

    # Count in how many DataFrames each column occurs. A column is unique to a
    # DataFrame exactly when that count is 1, so a single counting pass
    # replaces comparing every DataFrame against every other one.
    occurrences = Counter(col for cols in all_columns for col in cols)

    for i, cols in enumerate(all_columns):
        differences = {col for col in cols if occurrences[col] == 1}
        if differences:
            print(f"Differences for DataFrame {i}: {differences}")

# Report the columns that are unique to individual DataFrames
print_column_differences(dataframes)

# Fail with a clear message instead of the opaque TypeError that
# set.intersection() raises when it is given no sets at all.
if not dataframes:
    raise ValueError("No DataFrames were loaded; nothing to combine")

# Columns present in every DataFrame. The list is built by walking the first
# frame's columns so the output column order is stable; list(set) would
# reorder the columns between runs because of string hash randomisation.
shared_columns = set.intersection(*(set(df.columns) for df in dataframes))
common_columns = [col for col in dataframes[0].columns if col in shared_columns]

# Standardize each DataFrame to have only the columns common to all of them
dataframes = [df[common_columns] for df in dataframes]

# Combine all DataFrames into a single DataFrame and persist it. The output
# directory is created here so this step does not depend on another cell
# having created it first.
os.makedirs("datasets", exist_ok=True)
combined_df = pd.concat(dataframes, ignore_index=True)
combined_df.to_feather("datasets/predict_yolomodel.feather")


In [None]:
# Report the size of the combined DataFrame
row_count, column_count = combined_df.shape
print(f"Number of rows: {row_count} and Number of columns: {column_count}")

In [None]:
import json
import glob
import os
import pandas as pd
import Levenshtein

# Define the target column as a JSON string
target = '{"__name__": "node_cpu_seconds_rate", "cpu": "0", "instance": "worker", "mode": "user"}'

# Directory to scan, and the files in which no candidate column could be found
feather_directory = "combined_data"
no_close_match_files = []

# Sorted so the report is printed in the same order on every run
for file in sorted(glob.glob(f"{feather_directory}/*.feather")):
    df = pd.read_feather(file)

    # Nothing to report when the exact target column is present
    if target in df.columns:
        continue

    # Find the column whose re-serialised JSON is closest to the target by edit distance
    closest_distance = float('inf')
    closest_column = None
    for col in df.columns:
        try:
            json_col = json.loads(col)
        except (TypeError, ValueError):
            # Column name is not JSON (e.g. a plain 'timestamp' column), so it
            # cannot be a candidate for the target; skip it instead of crashing.
            continue

        # Re-serialise so the comparison uses json.dumps' default formatting
        dist = Levenshtein.distance(json.dumps(json_col), target)
        if dist < closest_distance:
            closest_distance = dist
            closest_column = col

    if closest_column is not None:
        print(f"File: {file} - Closest column: {closest_column}")
    else:
        no_close_match_files.append(file)

# Optionally, print the files with no close matches
if no_close_match_files:
    print("Files with no close matches:", no_close_match_files)
# pip install python-Levenshtein