In [2]:
import os
import joblib
import random
import numpy   as np
import pandas  as pd
import seaborn as sns

# Machine Learning Stuff
from sklearn            import tree
from sklearn.tree       import plot_tree
from sklearn.tree       import export_text
from sklearn.tree       import DecisionTreeClassifier
from sklearn.metrics    import accuracy_score
from sklearn.metrics    import confusion_matrix
from sklearn.metrics    import classification_report
from sklearn.metrics    import f1_score

from collections import defaultdict

# Multi-threading Stuff
from concurrent.futures import ProcessPoolExecutor

# Plotting utilities
import matplotlib.pyplot as plt

# Import formatter
from matplotlib.ticker import FuncFormatter


# Use a monospace font family for all text rendered by matplotlib in this notebook.
plt.rcParams['font.family'] = 'monospace'

In [3]:
# Rate labels in kbit/s, spelled "<value>kbits".
# NOTE(review): presumably the bandwidth limits applied by the testbed — confirm.
RATES = [f"{kbits}kbits" for kbits in (1500, 3000, 4500, 6000, 7500, 50000)]

In [4]:
# Video-quality classes: LABELS[i] is the name of class id VALUES[i]
# (same ids that get_video_class() returns).
LABELS = ["LD", "SD", "HD", "FULL-HD"]
VALUES = list(range(len(LABELS)))
# One color per class, in the same order (presumably for plots — confirm).
COLORS = ["black", "red", "green", "violet"]

# Names of the columns added to each loaded frame
NUMBER          = "id"
CLASS           = "class"
PREDICTED_CLASS = "predicted_class"
TESTBED_RATE    = "testbed_rate"
AVG_VIDEO_RATE  = "avg_video_rate"

# Maximum number of test files sampled from each rate directory
N = 10

In [5]:
def get_video_class(kbits: float) -> int:
    """Map an average video bitrate (kbit/s) to a quality class id.

    Classes use contiguous, upper-inclusive thresholds so that fractional
    rates (e.g. 288.5 or 1500.5) are classified instead of falling into a
    gap between integer ranges:

        0 (LD)      : 0    <= kbits <= 288
        1 (SD)      : 288  <  kbits <= 1500
        2 (HD)      : 1500 <  kbits <= 6500
        3 (FULL-HD) : 6500 <  kbits <= 8000

    Args:
        kbits: Average video rate in kbit/s.

    Returns:
        The class id (0-3), or None when ``kbits`` is negative, above 8000,
        or NaN. The None for out-of-range input is kept from the original
        implementation so existing callers see no change there.
    """
    # Also rejects NaN: every comparison against NaN is False.
    if not 0 <= kbits <= 8000:
        return None

    if kbits <= 288:
        return 0  # LD
    if kbits <= 1500:
        return 1  # SD
    if kbits <= 6500:
        return 2  # HD
    return 3  # FULL-HD
    
def format_bytes(num_bytes: float) -> str:
    """Render a byte count as a human-readable string with two decimals.

    The value is divided by 1024 until its magnitude drops below 1024 or the
    largest unit (TB) is reached, e.g. ``1536 -> "1.50 KB"``.

    Args:
        num_bytes: Size in bytes; negative values keep their sign.

    Returns:
        A string of the form ``"<value> <unit>"`` with unit in B/KB/MB/GB/TB.
    """
    units = ['B', 'KB', 'MB', 'GB', 'TB']
    size  = float(num_bytes)

    # Stop before the last unit: dividing once more would report values of
    # 1024 TB and above 1024 times too small.
    for unit in units[:-1]:
        # Compare the magnitude so negative sizes are scaled like positive ones.
        if abs(size) < 1024:
            return f"{size:.2f} {unit}"
        size /= 1024
    return f"{size:.2f} {units[-1]}"

In [6]:
# Base statistic names fed to the model. Each one is selected ten times with
# an index suffix ("_#0" .. "_#9") and once more without any suffix.
# NOTE(review): the suffix presumably identifies a time window/segment — confirm.
base_feature_names = [
    "s_bytes_all",
    "c_bytes_all",
    "idle",
    "max_span",
    "min_span",
    "avg_span",
    "std_span",
]

# Candidates that are currently disabled (kept here for reference only):
#   ts, te, s_bytes_uniq, c_bytes_uniq, s_ack_cnt, c_ack_cnt,
#   s_ack_cnt_p, c_ack_cnt_p, avg_video_rate, avg_audio_rate,
#   video_requests_sequence, audio_requests_sequence
# The added columns CLASS, NUMBER and TESTBED_RATE are excluded as well.

training_set_columns = []
trailing_set_columns = []

# Suffixed columns first, grouped by index, in base-name order.
for i in range(10):
    training_set_columns.extend(f"{name}_#{i}" for name in base_feature_names)

# Un-suffixed columns last, in the same base-name order.
training_set_columns.extend(base_feature_names)

In [7]:
# Define file paths
# The saved model lives in a sibling folder of the current working directory:
#   <parent of cwd>/udp_based/saved_models/udp_decision_tree.pkl
parent_dir = os.path.dirname(os.getcwd())
UDP_BASED_MODEL = os.path.join(
    parent_dir,
    "udp_based",
    "saved_models",
    "udp_decision_tree.pkl",
)

# Load the persisted model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
udp_model = joblib.load(UDP_BASED_MODEL)

In [8]:
frames = []

# Locate on disk
curr_dir  = os.getcwd()
protocol  = "tcp_data"
data_root = os.path.join(curr_dir, protocol)

# Dedicated seeded generator: the same files are sampled on every
# "Restart & Run All", and the global `random` state is left untouched.
SAMPLING_SEED = 42
sampler = random.Random(SAMPLING_SEED)

# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# `frames` (and therefore the "Test k" numbering printed later) stable.
for rate in sorted(os.listdir(data_root)):
    rate_dir = os.path.join(data_root, rate)
    if not os.path.isdir(rate_dir):
        # Stray files next to the rate folders (e.g. .DS_Store) are skipped.
        continue

    # Keep regular files only, so nested folders such as notebook
    # checkpoints are never handed to read_csv.
    files = sorted(
        name for name in os.listdir(rate_dir)
        if os.path.isfile(os.path.join(rate_dir, name))
    )

    # At most N files per rate directory.
    random_files = sampler.sample(files, min(N, len(files)))
    for file in random_files:
        frame = pd.read_csv(os.path.join(rate_dir, file), sep=" ")
        # Ground-truth label derived from the measured average video rate.
        frame[CLASS] = frame[AVG_VIDEO_RATE].apply(get_video_class)
        frames.append(frame)

In [9]:
# Score the loaded UDP model on every sampled frame, one accuracy line each.
for idx, frame in enumerate(frames):
    # Split the frame into model inputs and ground-truth labels
    x_test, y_test = frame[training_set_columns], frame[CLASS]

    # Predict, then compare against the labels
    preds = udp_model.predict(x_test)
    score = accuracy_score(y_test, preds)

    test_number = idx + 1  # tests are reported 1-based
    print(f"Test {test_number} - Accuracy: {score * 100:.2f}%")

Test 1 - Accuracy: 86.21%
Test 2 - Accuracy: 96.55%
Test 3 - Accuracy: 96.67%
Test 4 - Accuracy: 93.10%
Test 5 - Accuracy: 100.00%
Test 6 - Accuracy: 93.10%
Test 7 - Accuracy: 93.33%
Test 8 - Accuracy: 100.00%
Test 9 - Accuracy: 100.00%
Test 10 - Accuracy: 100.00%
Test 11 - Accuracy: 72.41%
Test 12 - Accuracy: 93.33%
Test 13 - Accuracy: 86.67%
Test 14 - Accuracy: 70.00%
Test 15 - Accuracy: 46.67%
Test 16 - Accuracy: 53.33%
Test 17 - Accuracy: 53.33%
Test 18 - Accuracy: 86.67%
Test 19 - Accuracy: 86.67%
Test 20 - Accuracy: 63.33%
Test 21 - Accuracy: 90.00%
Test 22 - Accuracy: 83.33%
Test 23 - Accuracy: 96.67%
Test 24 - Accuracy: 96.67%
Test 25 - Accuracy: 96.55%
Test 26 - Accuracy: 82.76%
Test 27 - Accuracy: 83.33%
Test 28 - Accuracy: 90.00%
Test 29 - Accuracy: 86.67%
Test 30 - Accuracy: 96.67%
Test 31 - Accuracy: 96.67%
Test 32 - Accuracy: 100.00%
Test 33 - Accuracy: 100.00%
Test 34 - Accuracy: 96.67%
Test 35 - Accuracy: 100.00%
Test 36 - Accuracy: 96.55%
Test 37 - Accuracy: 89.66%
Tes