In [None]:
from power_data_analyzer import PowerDataAnalyzer
import os

# The merged power measurements are resolved relative to the parent of the
# directory the notebook is started from.
current_directory = os.path.abspath(os.getcwd())
parent_directory = os.path.split(current_directory)[0]
root_directory = os.path.join(
    parent_directory,
    "raw_data/processed-results/merged-power-measurements/",
)

# Build the analyzer on that directory and plot an arbitrary sample.
# NOTE(review): 200 is presumably the sample size — confirm against PowerDataAnalyzer.
analyzer = PowerDataAnalyzer(root_directory)
power_comparison_plot = analyzer.plot_random_sample(200)


In [None]:
# Fetch the error data from the analyzer once so it can be handed to the plot call below.
# NOTE(review): the structure returned by get_error_df() is defined in
# power_data_analyzer and is not visible here — presumably per-workload error
# frames; confirm.
error_df = analyzer.get_error_df()

# Boxplot with SmartWatts error rate for the power measurements
analyzer.plot_error_rate_boxplot(error_df)

In [None]:
import numpy as np
import pandas as pd

try:
    from scipy.integrate import trapz
except ImportError:
    # `trapz` was removed from scipy.integrate in SciPy 1.14; `trapezoid` is
    # the same routine under its current name.
    from scipy.integrate import trapezoid as trapz

# NOTE(review): get_error_df() is assumed to return a mapping with 'High',
# 'Medium' and 'Low' entries — confirm against PowerDataAnalyzer.
dfs = analyzer.get_error_df()

# Tag each entry with its upper-case workload label.
df_high, df_medium, df_low = dfs['High'], dfs['Medium'], dfs['Low']
for tagged_frame, tag in ((df_high, 'HIGH'), (df_medium, 'MEDIUM'), (df_low, "LOW")):
    tagged_frame['workload'] = tag


def get_energy_df(df):
    """Integrate the two power traces of one measurement file into total energies.

    The 'smartwatts_power' and 'wattsup_power' columns are integrated over the
    elapsed time (seconds since the first row's 'timestamp') with the
    trapezoidal rule, and each result is rounded to a whole number.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'timestamp', 'smartwatts_power', 'wattsup_power' and
        'workload'. The frame is only read; no columns are added or converted.
        NOTE(review): power is presumably in watts, making the result joules —
        confirm against the measurement files.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns 'workload' (taken from the first input
        row), 'smartwatts_energy' and 'wattsup_energy'.

    Raises
    ------
    ValueError
        If ``df`` has no rows, since there is nothing to integrate.
    """
    if df.empty:
        raise ValueError("get_energy_df() needs at least one measurement row")

    # Work on a converted copy of the timestamps so the caller's column keeps
    # its original dtype.
    timestamps = pd.to_datetime(df['timestamp'])
    elapsed_seconds = (timestamps - timestamps.iloc[0]).dt.total_seconds().to_numpy()

    def _integrate(power_column):
        # One scalar per trace; no need to broadcast it over a whole column.
        return float(np.round(trapz(df[power_column].to_numpy(), elapsed_seconds), 0))

    return pd.DataFrame({
        'workload': [df['workload'].iloc[0]],
        'smartwatts_energy': [_integrate('smartwatts_power')],
        'wattsup_energy': [_integrate('wattsup_power')],
    })

def get_files_by_workload(run_number, workload):
    """Return the path of the CSV for one run and workload level.

    The file is looked up as ``run_<run_number>/run_<run_number>-<workload>.csv``
    underneath the module-level ``root_directory``.
    """
    run_name = f'run_{run_number}'
    csv_name = f'{run_name}-{workload}.csv'
    return os.path.join(root_directory, run_name, csv_name)

def read_and_get_energy_df(run_number, workload):
    """Load one run/workload CSV and return its energy row tagged with the run number.

    The CSV located by ``get_files_by_workload`` is read, labelled with
    ``workload``, reduced by ``get_energy_df``, and the result gains a
    'run_number' column.
    """
    csv_path = get_files_by_workload(run_number, workload)
    measurements = pd.read_csv(csv_path).assign(workload=workload)
    return get_energy_df(measurements).assign(run_number=run_number)

workloads = ['HIGH', 'MEDIUM', 'LOW']

# The output location does not depend on the run, so resolve it once and make
# sure it exists before the first per-run file is written.
root_directory2 = os.path.join(parent_directory, "raw_data/processed-results")
os.makedirs(os.path.join(root_directory2, "energy-measurements"), exist_ok=True)

# Collect the per-run frames and concatenate once at the end instead of
# growing a DataFrame inside the loop (which also avoids concatenating onto an
# empty frame).
run_frames = []
for run_number in analyzer.run_numbers:
    energy_dfs = [read_and_get_energy_df(run_number, workload) for workload in workloads]
    run_df = pd.concat(energy_dfs, ignore_index=True)
    run_df.to_csv(f"{root_directory2}/energy-measurements/run_{run_number}-energy.csv", index=False)
    run_frames.append(run_df)

# With no runs there is nothing to concatenate; fall back to an empty frame so
# the combined file is still written.
main_df = pd.concat(run_frames, ignore_index=True) if run_frames else pd.DataFrame()
main_df.to_csv("combined-energy.csv", index=False)

### Generating summary statistics for the energy measurements.


In [None]:
import pandas as pd 

# Reload the combined energies written earlier and slice them per workload level.
main_df = pd.read_csv('combined-energy.csv')
high = main_df.loc[main_df['workload'].eq('HIGH')]
medium = main_df.loc[main_df['workload'].eq('MEDIUM')]
low = main_df.loc[main_df['workload'].eq('LOW')]

def generate_summary(df):
    """Build a per-meter table of descriptive statistics for one workload's energies.

    'run_number' is dropped, ``describe()`` is taken without its 'count' row,
    and the two energy columns are truncated to integers. The table is then
    transposed so each power meter ('SmartWatts', 'Wattsup Pro') is a row and
    each statistic a capitalised column. A 'CV' column (Std / Mean * 100,
    rounded to two decimals) is appended; it is computed from the already
    truncated values.
    """
    stats = df.drop(columns='run_number').describe().drop(labels='count')
    for energy_column in ('smartwatts_energy', 'wattsup_energy'):
        stats[energy_column] = stats[energy_column].astype(int)

    display_names = {'smartwatts_energy': 'SmartWatts', 'wattsup_energy': 'Wattsup Pro'}
    stats = stats.rename(columns=display_names).rename_axis('Power meter', axis=0)

    per_meter = stats.transpose().rename(columns=str.capitalize)
    per_meter['CV'] = ((per_meter['Std'] / per_meter['Mean']) * 100).round(2)
    return per_meter

# Slice the combined frame per workload level, then summarise each slice.
low, medium, high = (
    main_df[main_df['workload'] == level] for level in ('LOW', 'MEDIUM', 'HIGH')
)

low_workload_summary, medium_workload_summary, high_workload_summary = (
    generate_summary(subset) for subset in (low, medium, high)
)


### Performing the correlation test using Pearson's correlation coefficient.

In [None]:
from scipy.stats import pearsonr

# Start this cell from the file on disk again and split it per workload level.
main_df = pd.read_csv('combined-energy.csv')
high = main_df[main_df['workload'].isin(['HIGH'])]
medium = main_df[main_df['workload'].isin(['MEDIUM'])]
low = main_df[main_df['workload'].isin(['LOW'])]

def perform_correlation_test(energy_df):
    """Compute Pearson's correlation between the two meters' energy values.

    Parameters
    ----------
    energy_df : pandas.DataFrame
        Needs the columns 'workload', 'smartwatts_energy' and 'wattsup_energy'.
        The frame is expected to hold a single workload level; the label is
        taken from its first row.

    Returns
    -------
    tuple
        ``(workload_label, correlation, p_value)``. The first element is the
        scalar label rather than the whole 'workload' column, so the tuples
        can be used directly as rows of a results table.
    """
    # Run the test first so that unusable input is reported by pearsonr itself.
    correlation, p_value = pearsonr(energy_df['smartwatts_energy'], energy_df['wattsup_energy'])
    workload_label = energy_df['workload'].iloc[0]
    return workload_label, correlation, p_value

# One correlation test per workload slice, in low / medium / high order.
results = [perform_correlation_test(workload_df) for workload_df in [low, medium, high]]

# Tabulate the outcomes; the labels are set explicitly in the same order and
# the coefficient is reported with three decimals.
df_results = pd.DataFrame(results, columns=['Workload', 'Correlation Coefficient', 'P-value'])
df_results['Workload'] = ['LOW', 'MEDIUM', 'HIGH']
df_results['Correlation Coefficient'] = df_results['Correlation Coefficient'].round(3)

In [None]:
import matplotlib.pyplot as plt

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
colors = ['lightgreen', 'orange', 'red']

# One panel per power meter: (axes, energy column, panel title).
panels = [
    (axes[0], 'wattsup_energy', 'Watts Up Pro Measurements'),
    (axes[1], 'smartwatts_energy', "SmartWatts Measurements"),
]

drawn_boxes = []
for panel_ax, energy_column, panel_title in panels:
    boxes = panel_ax.boxplot(
        [low[energy_column], medium[energy_column], high[energy_column]],
        labels=['Low', 'Medium', 'High'],
        patch_artist=True,
    )
    panel_ax.set_ylabel('Energy(J)', labelpad=10, fontsize=11)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Workload", fontsize=11, labelpad=10)

    # Colour the boxes by workload level (low, medium, high).
    for box, color in zip(boxes['boxes'], colors):
        box.set_facecolor(color)
    drawn_boxes.append(boxes)

wattsup_boxes, smartwatts_boxes = drawn_boxes

# Only the SmartWatts panel has a fixed y-range.
axes[1].set_ylim(10000, 45000)

plt.tight_layout()

plt.show()


In [None]:
import matplotlib.pyplot as plt

# `df` is the same object as `main_df`, so the column added below is visible
# through both names.
df = main_df

# Relative error (%) of the SmartWatts energy against the Watts Up Pro energy.
df['relative_error'] = (abs(df['smartwatts_energy'] - df['wattsup_energy']) / df['wattsup_energy']) * 100

# Select the workloads by label in low -> medium -> high order. Indexing into
# `unique()` positionally would tie the box order to the row order of the CSV
# and fail with fewer than three levels.
workload_order = ['LOW', 'MEDIUM', 'HIGH']
relative_errors = [df.loc[df['workload'] == level, 'relative_error'] for level in workload_order]
workloads = [level.capitalize() for level in workload_order]

plt.boxplot(relative_errors, widths=0.20)

plt.xticks(range(1, len(workloads) + 1), workloads)

# Fixed y-range; values outside 70-90 % are not shown.
plt.ylim(70, 90)

plt.ylabel('SmartWatts Error Rate (%)', labelpad=10)
plt.xlabel('Workload', labelpad=10)

plt.show()


### Generate scatterplots for a more in-depth analysis of the correlation between the energy values reported by the two measurement tools at each workload level.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

def plot_correlations(df):
    """Fit a linear regression of Watts Up Pro energy on SmartWatts energy.

    Despite its name this function draws nothing; it only prepares the data
    for plotting.

    Returns
    -------
    tuple
        ``(x, y, y_pred)`` where ``x`` is 'smartwatts_energy' as an (n, 1)
        array, ``y`` is 'wattsup_energy' as a 1-D array, and ``y_pred`` holds
        the fitted model's predictions for ``x``.
    """
    features = np.array(df['smartwatts_energy']).reshape(-1, 1)
    targets = np.array(df['wattsup_energy'])

    fitted_model = LinearRegression().fit(features, targets)
    return features, targets, fitted_model.predict(features)


In [None]:
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt

figure, axis = plt.subplots(2, 2)


def _label_energy_axes(ax, title):
    """Apply the shared axis labels, title and tick sizes to one subplot."""
    ax.set_xlabel('SmartWatts energy(J)', fontsize=8)
    ax.set_ylabel('Watts Up Pro energy(J)', fontsize=8)
    ax.set_title(title, fontsize=9)
    ax.tick_params(axis='x', labelsize=8)
    ax.tick_params(axis='y', labelsize=8)


def _plot_workload_fit(ax, frame, title):
    """Scatter one workload's energies with its regression line; return the plotted x and y."""
    xs, ys, fitted = plot_correlations(frame)
    ax.scatter(xs, ys, s=18, alpha=0.56)
    ax.plot(xs, fitted, color='red')
    _label_energy_axes(ax, title)
    return xs, ys


# One panel per workload level.
x, y = _plot_workload_fit(axis[0, 0], low, 'Low workload')
x1, y1 = _plot_workload_fit(axis[0, 1], medium, 'Medium workload')
x2, y2 = _plot_workload_fit(axis[1, 0], high, 'High workload')

# Pool the three workloads and fit a single regression over all points.
x_combined = np.concatenate([x, x1, x2]).reshape(-1, 1)
y_combined = np.concatenate([y, y1, y2])

model = LinearRegression()
model.fit(x_combined, y_combined)
y_pred = model.predict(x_combined)

# Fourth panel: all workloads together, colour-coded per level.
combined_ax = axis[1, 1]
combined_ax.scatter(x, y, s=15, alpha=0.56, color='green')
combined_ax.scatter(x1, y1, s=15, alpha=0.43, color='orange')
combined_ax.scatter(x2, y2, s=15, alpha=0.5, color='purple')
_label_energy_axes(combined_ax, 'All workloads')

legend = combined_ax.legend(['low', 'medium', 'high'], loc='lower right', labelspacing=0.05)
legend.get_frame().set_linewidth(0.5)
legend.get_frame().set_edgecolor('black')
for text in legend.get_texts():
    text.set_fontsize(6)

legend.set_bbox_to_anchor((1, 0))

combined_ax.add_artist(legend)

# Draw the pooled regression line on the combined panel explicitly instead of
# relying on pyplot's implicit "current axes". It is added after the legend
# call, as before, so the three legend labels stay with the scatter series.
combined_ax.plot(x_combined, y_pred, color='red')
plt.subplots_adjust(wspace=0.6, hspace=0.6)
plt.show()

### This part corresponds to the investigation of the interruptions, detailed in the Discussion and Threats to Validity sections of the paper.


In [None]:
import os

def get_files_by_workload(run_number, workload):
    """Return the CSV path ``run_<run_number>/run_<run_number>-<workload>.csv`` under ``root_directory``.

    NOTE(review): this repeats the definition from the energy-computation cell
    earlier in the notebook; the redefinition shadows it with the same behaviour.
    """
    return os.path.join(
        root_directory,
        f'run_{run_number}',
        f'run_{run_number}-{workload}.csv',
    )

# Collect one annotated frame per (run, workload) file and concatenate once at
# the end instead of growing a DataFrame inside the loop.
dfs = []
for run_number in analyzer.run_numbers:
    for workload in workloads:
        # `.upper()` keeps the file lookup working whatever the casing of the
        # labels currently held in `workloads`.
        df = pd.read_csv(get_files_by_workload(run_number, workload.upper()))
        df['timestamp'] = pd.to_datetime(df['timestamp'])

        # Compute difference between consecutive timestamps
        df['time_diff'] = df['timestamp'].diff().dt.total_seconds()

        # An interruption is a gap of more than 10 seconds between two samples.
        interruptions = df[df['time_diff'] > 10]
        # The per-file count is repeated on every row of that file.
        df['interruptions'] = len(interruptions)
        df['run_number'] = run_number
        df['workload'] = workload
        dfs.append(df)

concatenated = pd.concat(dfs, axis=0)

# Distribution of the (non-negative) gaps between consecutive samples, as percentages.
time_diff_counts = concatenated['time_diff'].value_counts().sort_values(ascending=True)
filtered_counts = time_diff_counts[time_diff_counts.index >=0]
total_count = filtered_counts.sum()
percentage = (filtered_counts / total_count) * 100

# 'interruptions' already holds the per-file count on every row, so take it
# once per (run, workload). Summing it would multiply the count by the number
# of rows in the file.
total_interruptions = (
    concatenated.groupby(['run_number', 'workload'])['interruptions']
    .first()
    .reset_index()
    .rename(columns={'interruptions': 'total_interruptions'})
)
total_interruptions_by_workload = total_interruptions.groupby('workload')['total_interruptions'].sum().reset_index()

total_sum = total_interruptions_by_workload['total_interruptions'].sum()

# Share of all interruptions per workload level, largest first.
total_interruptions_by_workload['percentage'] = (total_interruptions_by_workload['total_interruptions'] / total_sum) * 100
total_interruptions_by_workload = total_interruptions_by_workload.sort_values(by='total_interruptions', ascending=False)
