In [2]:
import pandas as pd
from scipy.stats import ks_2samp, chi2_contingency,entropy
import numpy as np
import matplotlib.pyplot as plt
import sys, re, os
from scipy.spatial.distance import jensenshannon
import time

In [3]:
# Function to compare two columns using appropriate statistical test
def compare_columns_ks(col1, col2, column_name):
    if col1.dtype == 'O':  
        # Chi-squared test for categorical data
        contingency_table = pd.crosstab(col1, col2)
        chi2, p, _, _ = chi2_contingency(contingency_table)
        test = 'Chi-squared'
    else:
        # KS test for numerical data
        ks_statistic, p = ks_2samp(col1, col2)
        test = 'Kolmogorov-Smirnov'

    if p < 0.2:
        print(f"{column_name}: No Significant difference (p-value: {p:.3f}) - {test} test")
    else:
        print(f"{column_name}: Significant difference (p-value: {p:.3f}) - {test} test")

In [4]:
def kl_divergence(p, q):
    return entropy(p, q)

def compare_columns_kl(col1, col2, column_name):
    if col1.dtype == 'O':
        # Chi-squared test for categorical data
        contingency_table = pd.crosstab(col1, col2)
        chi2, p, _, _ = chi2_contingency(contingency_table)
        test = 'Chi-squared'
        if p < 0.05:
            print(f"{column_name}: Significant difference (p-value: {p:.3f}) - {test} test")
        else:
            print(f"{column_name}: No significant difference (p-value: {p:.3f}) - {test} test")
    else:
        # Convert data to probability distributions
        # Here, using histograms
        p_hist, _ = np.histogram(col1, bins=30, range=(min(col1.min(), col2.min()), max(col1.max(), col2.max())), density=True)
        q_hist, _ = np.histogram(col2, bins=30, range=(min(col1.min(), col2.min()), max(col1.max(), col2.max())), density=True)
        
        # Calculate KL divergence
        kl_div = kl_divergence(p_hist, q_hist)
        print(f"{column_name}: KL Divergence: {kl_div:.3f}")


In [5]:
def js_divergence(p, q):
    return jensenshannon(p, q)**2 

def compare_columns_js(col1, col2, column_name):
    if col1.dtype == 'O':
        # Chi-squared test for categorical data
        contingency_table = pd.crosstab(col1, col2)
        chi2, p, _, _ = chi2_contingency(contingency_table)
        test = 'Chi-squared'
        if p < 0.05:
            print(f"{column_name}: Significant difference (p-value: {p:.3f}) - {test} test")
        else:
            print(f"{column_name}: No significant difference (p-value: {p:.3f}) - {test} test")
    else:
        # Convert data to probability distributions
        # Here, using histograms
        p_hist, _ = np.histogram(col1, bins=30, range=(min(col1.min(), col2.min()), max(col1.max(), col2.max())), density=True)
        q_hist, _ = np.histogram(col2, bins=30, range=(min(col1.min(), col2.min()), max(col1.max(), col2.max())), density=True)
        
        # Calculate JS divergence
        js_div = js_divergence(p_hist, q_hist)
        print(f"{column_name}: JS Divergence: {js_div:.3f}")


In [6]:
def datavalue_similarity(col1, col2, column_name):
    if col1.dtype == 'O':  
        # Chi-squared test for categorical data
        contingency_table = pd.crosstab(col1, col2)
        chi2, p, _, _ = chi2_contingency(contingency_table)
        test = 'Chi-squared'
    else:
        # mean
        mean1, mean2 = col1.mean(), col2.mean()

        # mean difference
        mean_diff_percent = abs(mean1 - mean2) / mean1 * 100

        # min,max difference
        min1, max1 = col1.min(), col1.max()
        min2, max2 = col2.min(), col2.max()
        min_diff_percent = abs(min1 - min2) / min1 * 100
        max_diff_percent = abs(max1 - max2) / max1 * 100

        # check if difference within 5%
        if mean_diff_percent < 5 and min_diff_percent < 5 and max_diff_percent < 5:
            print(f"Mean and extreme differences are less than 5%")
        else:
            print(f"Mean and extreme differences are more than 5%")
        print(f"mean difference: {mean_diff_percent:.2f}%")
        print(f"minimum difference: {min_diff_percent:.2f}%")
        print(f"maximum difference: {max_diff_percent:.2f}%")


# data test

In [7]:
exp = f"load_stimulus_exp-1"
#exp_compare = f"load_stimulus_exp-2"
exp_compare = f"resource_stimulus_tree"

In [8]:
current_dir = os.getcwd()
file_path = os.path.abspath(f'../datasets/{exp}.csv')

In [9]:
file_path_compare =  os.path.abspath(f'../datasets/{exp_compare}.csv')

In [10]:
# Load data
data1 = pd.read_csv(file_path)
data2 = pd.read_csv(file_path_compare)

# List of key columns to compare
key_columns = ["input_rate","output_rate","latency"] 

# Compare key columns
for column in key_columns:
    if column in data1.columns and column in data2.columns:
        compare_columns_ks(data1[column], data2[column], column)
    else:
        print(f"{column} not found in both datasets")

input_rate: No Significant difference (p-value: 0.000) - Kolmogorov-Smirnov test
output_rate: No Significant difference (p-value: 0.000) - Kolmogorov-Smirnov test
latency: No Significant difference (p-value: 0.106) - Kolmogorov-Smirnov test


In [11]:
# kl
for column in key_columns:
    if column in data1.columns and column in data2.columns:
        compare_columns_kl(data1[column], data2[column], column)
    else:
        print(f"{column} not found in both datasets")

input_rate: KL Divergence: inf
output_rate: KL Divergence: inf
latency: KL Divergence: inf


In [12]:
# js
for column in key_columns:
    if column in data1.columns and column in data2.columns:
        compare_columns_js(data1[column], data2[column], column)
    else:
        print(f"{column} not found in both datasets")

input_rate: JS Divergence: 0.552
output_rate: JS Divergence: 0.591
latency: JS Divergence: 0.066


In [13]:
# data value difference
for column in key_columns:
    print(f'the feature is :{column}')
    if column in data1.columns and column in data2.columns:
        datavalue_similarity(data1[column], data2[column], column)
    else:
        print(f"{column} not found in both datasets")

the feature is :input_rate
Mean and extreme differences are more than 5%
mean difference: 59.03%
minimum difference: 72.06%
maximum difference: 73.64%
the feature is :output_rate
Mean and extreme differences are more than 5%
mean difference: 38.09%
minimum difference: inf%
maximum difference: 72.09%
the feature is :latency
Mean and extreme differences are more than 5%
mean difference: 32.05%
minimum difference: 0.22%
maximum difference: 0.00%


  min_diff_percent = abs(min1 - min2) / min1 * 100


# To distinguish dataset

In [14]:
exp = f"load_stimulus_exp-1"
#exp_compare = f"load_stimulus_exp-2"
exp_compare = f"resource_stimulus_tree"

In [15]:
current_dir = os.getcwd()
file_path = os.path.abspath(f'../datasets/{exp}.csv')
file_path_compare =  os.path.abspath(f'../datasets/{exp_compare}.csv')

data1 = pd.read_csv(file_path)
data2 = pd.read_csv(file_path_compare)

In [16]:
import pandas as pd

def compare_dataset_headers(file1, file2):
    data1 = pd.read_csv(file1)
    data2 = pd.read_csv(file2)

    if list(data1.columns) == list(data2.columns):
        print("same feature set")
        return True
    else:
        print("different feature set")
        return False

In [17]:
compare_dataset_headers(file_path, file_path_compare)

different feature set


False

In [18]:
# data value difference
for column in key_columns:
    print(f'the feature is :{column}')
    if column in data1.columns and column in data2.columns:
        datavalue_similarity(data1[column], data2[column], column)
    else:
        print(f"{column} not found in both datasets")

the feature is :input_rate
Mean and extreme differences are more than 5%
mean difference: 59.03%
minimum difference: 72.06%
maximum difference: 73.64%
the feature is :output_rate
Mean and extreme differences are more than 5%
mean difference: 38.09%
minimum difference: inf%
maximum difference: 72.09%
the feature is :latency
Mean and extreme differences are more than 5%
mean difference: 32.05%
minimum difference: 0.22%
maximum difference: 0.00%


  min_diff_percent = abs(min1 - min2) / min1 * 100


In [19]:
# js
for column in key_columns:
    if column in data1.columns and column in data2.columns:
        compare_columns_js(data1[column], data2[column], column)
    else:
        print(f"{column} not found in both datasets")

input_rate: JS Divergence: 0.552
output_rate: JS Divergence: 0.591
latency: JS Divergence: 0.066


In [41]:
import time 

start_time=time.time()




# explort data
exp = f"load_stimulus_exp-1"
#exp_compare = f"load_stimulus_exp-2"
exp_compare = f"resource_stimulus_tree"

current_dir = os.getcwd()
file_path = os.path.abspath(f'../datasets/{exp}.csv')
file_path_compare =  os.path.abspath(f'../datasets/{exp_compare}.csv')

data2 = pd.read_csv(file_path)
data1 = pd.read_csv(file_path_compare)

#columns to be compare

additional_columns = ["output_rate", "latency"]
key_columns = [col for col in data1.columns if 'LLC-loads' in col] +[col for col in data1.columns if 'LLC-load-misses' in col] +[col for col in data1.columns if 'L1-dcache-load-misses' in col] +[col for col in data1.columns if 'cache-references' in col] + additional_columns
            

print(key_columns)

#step 1
compare_dataset_headers(file_path, file_path_compare)
#step 2, compare data samilarity and data distribution
for column in key_columns:
    if column in data1.columns and column in data2.columns:
        datavalue_similarity(data1[column], data2[column], column)
        compare_columns_js(data1[column], data2[column], column)
    else:
        print(f"{column} not found in both datasets")
        
end_time = time.time()
elapsed_time = end_time - start_time
print(f'elapsed time :{elapsed_time}')

['flow_tracker_LLC-loads', 'nf_router_LLC-loads', 'ndpi_stats_LLC-loads', 'payload_scan_LLC-loads', 'flow_tracker_LLC-load-misses', 'nf_router_LLC-load-misses', 'ndpi_stats_LLC-load-misses', 'payload_scan_LLC-load-misses', 'flow_tracker_L1-dcache-load-misses', 'nf_router_L1-dcache-load-misses', 'ndpi_stats_L1-dcache-load-misses', 'payload_scan_L1-dcache-load-misses', 'flow_tracker_cache-references', 'nf_router_cache-references', 'ndpi_stats_cache-references', 'payload_scan_cache-references', 'output_rate', 'latency']
different feature set
flow_tracker_LLC-loads not found in both datasets
nf_router_LLC-loads not found in both datasets
ndpi_stats_LLC-loads not found in both datasets
payload_scan_LLC-loads not found in both datasets
flow_tracker_LLC-load-misses not found in both datasets
nf_router_LLC-load-misses not found in both datasets
ndpi_stats_LLC-load-misses not found in both datasets
payload_scan_LLC-load-misses not found in both datasets
flow_tracker_L1-dcache-load-misses not fo