In [None]:
import tarfile
import os

# Unpack every ``*.tar.gz`` archive found in ``data_dir`` into ``extraction_dir``
# and remember the archive-relative names of all CSV members in
# ``extracted_files`` (consumed by the loading cell).
extraction_dir = "./extracted_files"

# Directory scanned for archives. The alternative capture set is kept here for
# quick switching; only the uncommented assignment takes effect.
# data_dir = "./lund-pi/mainswitch"
data_dir = "./erik"

# exist_ok=True replaces the check-then-create pattern and keeps the cell
# re-runnable when the directory already exists.
os.makedirs(extraction_dir, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sorting makes the processing
# order (and therefore the order of rows and legend entries) reproducible.
tar_files = sorted(
    os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.tar.gz')
)

# To analyse a hand-picked set instead, assign an explicit list, e.g.
# tar_files = [
#     "./data/2023-09-29-00-00-01raspberrypi-RRUL.tar.gz",
#     "./data/2023-09-29-06-00-01raspberrypi-RRUL.tar.gz",
#     "./data/2023-09-29-12-00-01raspberrypi-RRUL.tar.gz",
# ]

extracted_files = []

for tar_file in tar_files:
    with tarfile.open(tar_file, 'r:gz') as archive:
        if hasattr(tarfile, 'data_filter'):
            # Extraction filters are available: refuse members that would
            # escape extraction_dir and avoid the "no filter" deprecation warning.
            archive.extractall(extraction_dir, filter='data')
        else:
            # Older Python without extraction filters: fall back to plain extraction.
            archive.extractall(extraction_dir)

        # NOTE(review): all archives share one extraction directory, so members
        # with the same name in different archives overwrite each other — confirm
        # that member names are unique across archives.
        for member in archive.getmembers():
            # Only regular files can be read as CSV later on.
            if member.isfile() and member.name.endswith('.csv'):
                print("Found Data: " + member.name)
                extracted_files.append(member.name)


In [None]:
import pandas as pd

# Read each extracted CSV and tag its rows with the archive member name it came
# from, so measurements remain attributable after the frames are stacked.
df_list_with_filename = [
    pd.read_csv(os.path.join(extraction_dir, member_name)).assign(filename=member_name)
    for member_name in extracted_files
]

# Stack all per-file frames into one frame with a fresh 0..n-1 index.
combined_df_with_filename = pd.concat(df_list_with_filename, ignore_index=True)

In [None]:
#combined_df_with_filename
#combined_df_with_filename['filename'].unique()

In [None]:
import matplotlib.pyplot as plt

# Distinct source files in order of first appearance; later cells reuse this.
file_names = combined_df_with_filename['filename'].unique()

download_fig, download_ax = plt.subplots(figsize=(15, 8))

# One line per source file: 'rrul' on the x axis, 'TCP download avg' on the y axis.
for file in file_names:
    belongs_to_file = combined_df_with_filename['filename'] == file
    subset = combined_df_with_filename.loc[belongs_to_file]
    download_ax.plot(subset['rrul'], subset['TCP download avg'], label=file)

download_ax.set_title('TCP Download Average by Time')
download_ax.set_xlabel('Time (s)')
download_ax.set_ylabel('TCP Download Average')
download_ax.legend()
download_ax.grid(True)
download_fig.tight_layout()
plt.show()

In [None]:
import plotly.express as px

# Interactive Plotly line chart of 'TCP download avg' against 'rrul', one trace
# per source file. (The variable keeps its historical name `fig_upload`, but
# the plotted column is the download average.)
download_axis_labels = {'TCP download avg': 'TCP download avg', 'rrul': 'Time (s)'}
fig_upload = px.line(
    combined_df_with_filename,
    x='rrul',
    y='TCP download avg',
    color='filename',
    labels=download_axis_labels,
    title='TCP download avg by Time',
)

# Render the figure in the notebook.
fig_upload.show()

In [None]:

# Keep only samples whose TCP download average is below 30.
under_30_df = combined_df_with_filename[combined_df_with_filename['TCP download avg'] < 30]

# Interactive Plotly line chart of the filtered download averages, one trace per
# source file. The title names the filter so this figure cannot be mistaken for
# the unfiltered one. Note that each trace joins consecutive remaining rows, so
# stretches where samples were filtered out appear as straight segments.
fig_under_30 = px.line(under_30_df, x='rrul', y='TCP download avg', color='filename',
                       labels={'TCP download avg': 'TCP download avg', 'rrul': 'Time (s)'},
                       title='TCP download avg < 30 by Time')

# Display the plot
fig_under_30.show()

In [None]:
# Overlaid histograms of 'TCP download avg', one per source file.
plt.figure(figsize=(15, 8))

# Use one common value range for every histogram. Without it each plt.hist call
# derives its 50 bins from its own subset's min/max, so the overlaid bars would
# have different widths and their heights could not be compared.
all_download_avg = combined_df_with_filename['TCP download avg']
shared_range = (all_download_avg.min(), all_download_avg.max())

# Plot data for each file
for file in file_names:
    subset = combined_df_with_filename[combined_df_with_filename['filename'] == file]
    plt.hist(subset['TCP download avg'], bins=50, range=shared_range, alpha=0.5, label=file)

# Add title, labels, legend, grid and layout adjustments
plt.title('Histogram of TCP Download Average by Filename')
plt.xlabel('TCP Download Average')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.tight_layout()

# Display the histogram
plt.show()


In [None]:
# Rows whose TCP download average exceeds 50.
exceeds_50 = combined_df_with_filename['TCP download avg'] > 50
above_50_df = combined_df_with_filename.loc[exceeds_50]

# Number of such rows per source file, most frequent first.
above_50_counts = above_50_df['filename'].value_counts()

above_50_counts


In [None]:
# Interactive Plotly histogram of the download averages above 50, coloured by
# source file.
above_50_hist_options = dict(
    x='TCP download avg',
    color='filename',
    title='Distribution of TCP Download Average > 50 by Filename',
    labels={'TCP download avg': 'TCP Download Average'},
    nbins=50,
)
fig_above_50 = px.histogram(above_50_df, **above_50_hist_options)

# Render the figure in the notebook.
fig_above_50.show()


In [None]:
# Interactive Plotly histogram of every download-average sample, coloured by
# source file.
fig_all_data = px.histogram(
    combined_df_with_filename,
    x='TCP download avg',
    color='filename',
    title='Distribution of TCP Download Average by Filename',
    labels={'TCP download avg': 'TCP Download Average'},
    nbins=100,
)

# Render the figure in the notebook.
fig_all_data.show()

# Optional export of the figure as a standalone HTML file (disabled):
#html_path_all_data = "/mnt/data/tcp_download_avg_all_data_plotly.html"
#fig_all_data.write_html(html_path_all_data)

#html_path_all_data


In [None]:
# Samples with a TCP download average strictly below 30.
is_under_30 = combined_df_with_filename['TCP download avg'] < 30
under_30_df = combined_df_with_filename[is_under_30]

# Interactive 50-bin Plotly histogram of the filtered samples, coloured by
# source file.
fig_under_30 = px.histogram(
    under_30_df,
    x='TCP download avg',
    color='filename',
    title='Distribution of TCP Download Average < 30 by Filename',
    labels={'TCP download avg': 'TCP Download Average'},
    nbins=50,
)

fig_under_30.show()


In [None]:
# Static matplotlib histograms of the download averages below 30, one per
# source file, overlaid on a single figure.
plt.figure(figsize=(15, 8))

# Use one common value range for every histogram. Without it each plt.hist call
# derives its 50 bins from its own subset's min/max, so the overlaid bars would
# have different widths and their heights could not be compared.
under_30_download_avg = under_30_df['TCP download avg']
shared_range = (under_30_download_avg.min(), under_30_download_avg.max())

# Plot data for each file in the under_30_df
for file in file_names:
    subset = under_30_df[under_30_df['filename'] == file]
    plt.hist(subset['TCP download avg'], bins=50, range=shared_range, alpha=0.5, label=file)

# Add title, labels, legend, grid and layout adjustments
plt.title('Histogram of TCP Download Average < 30 by Filename')
plt.xlabel('TCP Download Average')
plt.ylabel('Frequency')
plt.legend()
plt.grid(True)
plt.tight_layout()

# Display the histogram
plt.show()


In [None]:
# Median download average among the samples below 30; shown as the cell output.
under_30_download_avg = under_30_df['TCP download avg']
median_tcp_download_avg = under_30_download_avg.median()
median_tcp_download_avg

In [None]:
# Restrict the sub-30 samples to the 20-25 band (both bounds inclusive).
subset_20_25 = under_30_df[(under_30_df['TCP download avg'] >= 20) & (under_30_df['TCP download avg'] <= 25)]
band_values = subset_20_25['TCP download avg']

# Plot the density histogram and KDE of the *filtered* band. The figure is
# titled "20-25 Mbps", so it must draw subset_20_25 rather than all of
# under_30_df.
plt.figure(figsize=(15, 8))
band_ax = plt.gca()

# A density-normalised histogram of an empty sample is undefined; skip it.
if not band_values.empty:
    band_values.hist(bins=50, alpha=0.5, label='Histogram', density=True, ax=band_ax)

# A kernel density estimate needs at least two distinct observations.
if band_values.nunique() > 1:
    band_values.plot(kind='kde', label='KDE', color='red', ax=band_ax)

# Adding title, labels, legend, grid, and layout adjustments
plt.title('Distribution of TCP Download Average (20-25 Mbps)')
plt.xlabel('TCP Download Average')
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.tight_layout()

plt.show()


In [None]:
from scipy.stats import gaussian_kde
import numpy as np

# Estimate the most likely download average (the mode) of the sub-30 samples
# from a Gaussian kernel density estimate.
kde = gaussian_kde(under_30_df['TCP download avg'])

# Evaluation grid: 1000 evenly spaced points from 0 to 40, which covers the
# whole filtered sample (all values are below 30).
x_vals = np.linspace(0, 40, 1000)

# Density at every grid point.
kde_vals = kde(x_vals)

# Grid location where the estimated density is highest. The search/evaluate
# steps previously appeared twice in this cell; one pass is sufficient.
peak_value = x_vals[np.argmax(kde_vals)]

peak_value
