In [117]:
import os
import random
import numpy   as np
import seaborn as sns
import pandas  as pd
import joblib

from typing import Dict

# Machine learning methods
from sklearn            import tree
from sklearn.tree       import plot_tree
from sklearn.tree       import export_text
from sklearn.tree       import DecisionTreeClassifier
from sklearn.metrics    import accuracy_score
from sklearn.metrics    import confusion_matrix
from sklearn.metrics    import classification_report
from sklearn.metrics    import f1_score

from collections import defaultdict
from concurrent.futures import ProcessPoolExecutor

# Plotting utilities
import matplotlib.pyplot as plt

# Import formatter
from matplotlib.ticker import FuncFormatter

In [118]:
# Constants
N = 25
K = 20

VALUES = [0, 1, 2, 3]
LABELS = ["LD", "SD", "HD", "FULL-HD"]
COLORS = ["black", "red", "green", "blue"]

# Define the name of the new columns
NUMBER          = "id"
CLASS           = "class"
PREDICTED_CLASS = "predicted_class"
TESTBED_RATE    = "testbed_rate"
AVG_VIDEO_RATE  = "avg_video_rate"

In [119]:
def get_video_class(kbits: float) -> int:
    if 0 <= kbits <= 288:
        return 0  # LD
    elif 289 <= kbits <= 1500:
        return 1  # SD
    elif 1501 <= kbits <= 6500:
        return 2  # HD
    elif 6501 <= kbits <= 8000:
        return 3  # FULL-HD
    
def format_bytes(num_bytes: float) -> str:
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size  = float(num_bytes)

    for unit in units:
        if size < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    return f"{size:.2f} {units[-1]}"

In [120]:
# Define directories
dir = os.path.join(os.getcwd(), "tcp")

In [121]:
# Define the rate of the experiments
rates = ["1500kbits", "3000kbits", "4500kbits", "6000kbits", "7500kbits", "50000kbits"]

In [122]:
# Define dictionary for staging data
data : Dict[str, list[str]] = {}

In [123]:
# For each rate, get N samples
for rate in rates:
    data[rate] = [os.path.join(dir, rate, file) for file in os.listdir(os.path.join(dir, rate))]

In [124]:
# For each rate, create training set and testing set
tests = []

In [None]:
for rate in rates:
    
    # Get k random samples at each rate
    random_samples = random.sample(data[rate], K)

    # Generate the training set
    for sample in random_samples:
        frame = pd.read_csv(sample, sep=" ")
        frame[CLASS] = frame[AVG_VIDEO_RATE].apply(get_video_class)
        tests.append(frame)
        
print(f"[TESTING SET]: number of streaming periods = {len(tests)} over {len(rates)} bandwidth")

[TRAINING SET]: number of streaming periods = 120 over 6 bandwidth


In [126]:
features = []
for i in range(0, 10):    
    # Volumetric metrics
    features.append(f"s_bytes_all_#{i}")
    features.append(f"c_bytes_all_#{i}")
    # features.append(f"s_ack_cnt_#{i}")
    # features.append(f"c_ack_cnt_#{i}")
    # features.append(f"s_ack_cnt_p_#{i}")
    # features.append(f"c_ack_cnt_p_#{i}")

    # Temporal metrics
    features.append(f"idle_#{i}")
    features.append(f"max_span_#{i}")
    features.append(f"min_span_#{i}")
    features.append(f"avg_span_#{i}")
    features.append(f"std_span_#{i}")

# Volumetric metrics (outer columns)
features.append(f"s_bytes_all")
features.append(f"c_bytes_all")
# features.append(f"s_ack_cnt")
# features.append(f"c_ack_cnt")
# features.append(f"s_ack_cnt_p")
# features.append(f"c_ack_cnt_p")

# Temporal metrics (outer columns)
features.append(f"idle")
features.append(f"max_span")
features.append(f"min_span")
features.append(f"avg_span")
features.append(f"std_span")

In [127]:
# Load the UDP model
model = joblib.load(os.path.join("model_udp", "saved_models", "udp_model.pkl"))

In [128]:
# Initialize a list to store the scores
scores = []

# Test the UDP model with TCP-based streaming periods
for num, test in enumerate(tests):
    x_test = test[features]
    y_test = test[CLASS]
    
    # Test the model
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)  # Save the score
    
    #print(f"[TEST N° {num}] scored accuracy = {score:.2f}%")

# Calculate and print statistics
max_score = max(scores)
min_score = min(scores)
avg_score = np.mean(scores)
std_dev   = np.std(scores)

print("[Statitics, classifing TCP-bases HAS flows using UDP-based trained model]")
print(f"\tMax Accuracy: {max_score:.2f}%")
print(f"\tMin Accuracy: {min_score:.2f}%")
print(f"\tAvg Accuracy: {avg_score:.2f}%")
print(f"\tStd Deviation: {std_dev:.2f}%")

[Statitics, classifing TCP-bases HAS flows using UDP-based trained model]
	Max Accuracy: 1.00%
	Min Accuracy: 0.57%
	Avg Accuracy: 0.92%
	Std Deviation: 0.09%


In [129]:
# Define directories
dir = os.path.join(os.getcwd(), "udp")

In [130]:
# Define dictionary for staging data
data : Dict[str, list[str]] = {}

In [131]:
# For each rate, get N samples
for rate in rates:
    data[rate] = [os.path.join(dir, rate, file) for file in os.listdir(os.path.join(dir, rate))]

In [132]:
# For each rate, create training set and testing set
tests = []

In [133]:
for rate in rates:
    
    # Get k random samples at each rate
    random_samples = random.sample(data[rate], K)

    # Generate the training set
    for sample in random_samples:
        frame = pd.read_csv(sample, sep=" ")
        frame[CLASS] = frame[AVG_VIDEO_RATE].apply(get_video_class)
        tests.append(frame)
        
print(f"[TESTING SET]: number of streaming periods = {len(tests)} over {len(rates)} bandwidth")

[TESTING SET]: number of streaming periods = 120 over 6 bandwidth


In [134]:
features = []
for i in range(0, 10):    
    # Volumetric metrics
    features.append(f"s_bytes_all_#{i}")
    features.append(f"c_bytes_all_#{i}")
    features.append(f"s_ack_cnt_#{i}")
    features.append(f"c_ack_cnt_#{i}")
    features.append(f"s_ack_cnt_p_#{i}")
    features.append(f"c_ack_cnt_p_#{i}")

    # Temporal metrics
    features.append(f"idle_#{i}")
    features.append(f"max_span_#{i}")
    features.append(f"min_span_#{i}")
    features.append(f"avg_span_#{i}")
    features.append(f"std_span_#{i}")

# Volumetric metrics (outer columns)
features.append(f"s_bytes_all")
features.append(f"c_bytes_all")
features.append(f"s_ack_cnt")
features.append(f"c_ack_cnt")
features.append(f"s_ack_cnt_p")
features.append(f"c_ack_cnt_p")

# Temporal metrics (outer columns)
features.append(f"idle")
features.append(f"max_span")
features.append(f"min_span")
features.append(f"avg_span")
features.append(f"std_span")

In [135]:
# Load the TCP model
model = joblib.load(os.path.join("model_tcp", "saved_models", "tcp_model.pkl"))

In [136]:
# Initialize a list to store the scores
scores = []

for test in tests:
    missing_columns = [col for col in features if col not in test.columns]

    # Add missing columns with zero values
    for col in missing_columns:
        test[col] = 0

# Test the UDP model with TCP-based streaming periods
for num, test in enumerate(tests):
    x_test = test[features]
    y_test = test[CLASS]
    
    # Test the model
    preds = model.predict(x_test)
    score = accuracy_score(y_test, preds)
    scores.append(score)  # Save the score
    
    #print(f"[TEST N° {num}] scored accuracy = {score:.2f}%")

# Calculate and print statistics
max_score = max(scores)
min_score = min(scores)
avg_score = np.mean(scores)
std_dev   = np.std(scores)

print("[Statitics, classifing UDP-bases HAS flows using TCP-based trained model]")
print(f"\tMax Accuracy: {max_score:.2f}%")
print(f"\tMin Accuracy: {min_score:.2f}%")
print(f"\tAvg Accuracy: {avg_score:.2f}%")
print(f"\tStd Deviation: {std_dev:.2f}%")

[Statitics, classifing UDP-bases HAS flows using TCP-based trained model]
	Max Accuracy: 1.00%
	Min Accuracy: 0.73%
	Avg Accuracy: 0.94%
	Std Deviation: 0.06%
