In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity

# Define the cosine similarity function
def cosine_similarity(hist1, hist2):
    """Return the cosine similarity between two equal-length numeric vectors.

    NOTE: this definition shadows the ``cosine_similarity`` name imported
    from ``sklearn.metrics.pairwise`` at the top of the file; calls made
    after this point use this function.

    Args:
        hist1: Sequence (list or array) of numbers.
        hist2: Sequence of numbers with the same length as ``hist1``.

    Returns:
        ``dot(hist1, hist2) / (||hist1|| * ||hist2||)``. When either vector
        has zero Euclidean norm (all zeros, or empty) the ratio is undefined,
        so ``0.0`` is returned instead of NaN.

    Raises:
        ValueError: If the two vectors have different lengths (raised by
            ``np.dot``) or contain values that cannot be converted to float.
    """
    # Work in floating point so integer inputs cannot overflow in the dot product.
    vec1 = np.asarray(hist1, dtype=float)
    vec2 = np.asarray(hist2, dtype=float)

    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if norm_product == 0:
        # A zero vector has no direction; avoid the 0/0 division that
        # would otherwise yield NaN and a RuntimeWarning.
        return 0.0

    return np.dot(vec1, vec2) / norm_product

# Comparison setup: the two sub-folders being compared and the base
# directory (relative to the current working directory) that contains them.
folder1, folder2 = 'baseline', 'low'
output_base_dir = os.path.join(os.getcwd(), 'median')

# Maps each report file name to the column read from that report.
metrics = {
    # CPU report
    'cpu_usage_report_combined_median.csv': '%cpu_utilization',
    # Memory report
    'memory_usage_report_combined_median.csv': '%memused',
}

# Function to create histogram from file
def create_histogram(folder, application):
    """Collect the metric values of one application from a folder's reports.

    For every (report file, column) pair in the module-level ``metrics``
    mapping, the semicolon-separated CSV at
    ``output_base_dir/<folder>/<report file>`` is read, the rows whose
    ``application`` column equals ``application`` are selected, and the
    values of the mapped column are appended to the result in that order.

    A message is printed for each report file that does not exist and for
    each report that has no rows for the application; such reports
    contribute nothing to the result.

    Args:
        folder: Name of the sub-folder under ``output_base_dir``.
        application: Value to match in the ``application`` column.

    Returns:
        A list of the collected values; empty when nothing was found.
    """
    collected = []
    for report_name, column in metrics.items():
        file_path = os.path.join(output_base_dir, folder, report_name)

        if not os.path.exists(file_path):
            print(f"File {file_path} does not exist.")
            continue

        report = pd.read_csv(file_path, sep=';')
        app_rows = report[report['application'] == application]
        if app_rows.empty:
            print(f"No data for application {application} in {file_path}.")
            continue

        collected.extend(app_rows[column].values)
    return collected

# Function to pad histograms to the same length
def pad_histograms(hist1, hist2):
    """Zero-pad the shorter of two sequences so both have the same length.

    Args:
        hist1: First sequence of values.
        hist2: Second sequence of values.

    Returns:
        A ``(hist1, hist2)`` tuple. The shorter input is returned as a NumPy
        array with trailing zeros appended up to the longer input's length;
        an input that is already long enough is returned unchanged.
    """
    target_len = max(len(hist1), len(hist2))

    def _pad_to_target(values):
        # Only the shorter sequence is converted; the other passes through as-is.
        shortfall = target_len - len(values)
        if shortfall > 0:
            return np.pad(values, (0, shortfall), 'constant')
        return values

    return _pad_to_target(hist1), _pad_to_target(hist2)

# The list of applications is taken from the first folder's CPU report.
cpu_usage_df = pd.read_csv(
    os.path.join(output_base_dir, folder1, 'cpu_usage_report_combined_median.csv'),
    sep=';',
)
applications = cpu_usage_df['application'].unique()

# Application name -> cosine similarity between the two folders' value vectors.
similarity_results = {}

# NOTE(review): each vector holds one report's values followed by the next
# report's values, and the shorter vector is zero-padded at its tail. If the
# two folders have different row counts, the per-metric segments may no
# longer line up element-wise -- confirm this is intended.
for app in applications:
    hist1 = create_histogram(folder1, app)
    hist2 = create_histogram(folder2, app)

    # Both folders must have produced at least one value to compare.
    if not (hist1 and hist2):
        print(f"Histograms for application {app} could not be created due to missing data.")
        continue

    hist1, hist2 = pad_histograms(hist1, hist2)
    similarity = cosine_similarity(hist1, hist2)
    similarity_results[app] = similarity

# One row per application, with a single 'Cosine Similarity' column.
similarity_df = pd.DataFrame.from_dict(
    similarity_results,
    orient='index',
    columns=['Cosine Similarity'],
)

# Show the per-application results.
print(similarity_df)


          Cosine Similarity
dfsioe             0.294004
kmeans             0.445941
pagerank           0.549399
terasort           0.453092
