In [1]:
# Install Dependencies
!pip install gdown scikit-learn pandas

import gdown
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score



In [2]:
# Download Preprocessed Data

url = 'https://drive.google.com/uc?id=1nUNgJzVOyKywhGODfwN5TfpSGFJE1OTG'
output = 'modeling_dataset.csv'
gdown.download(url, output, quiet=False)

df = pd.read_csv('modeling_dataset.csv')
print("✅ Loaded:", df.shape)

Downloading...
From: https://drive.google.com/uc?id=1nUNgJzVOyKywhGODfwN5TfpSGFJE1OTG
To: /content/modeling_dataset.csv
100%|██████████| 318k/318k [00:00<00:00, 74.5MB/s]

✅ Loaded: (2176, 10)





In [3]:
# ***Regression Model for avg_chunk_size***
features = [
    'packet_count',
    'total_bytes',
    'avg_packet_size',
    'avg_inter_packet_delay',
    'std_inter_packet_delay'
]

X = df[features].fillna(0)

y_reg = df['avg_chunk_size']
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)

regressor = RandomForestRegressor(n_estimators=500, random_state=42)
regressor.fit(X_train_r, y_train_r)
y_pred_r = regressor.predict(X_test_r)

mse = mean_squared_error(y_test_r, y_pred_r)
print(f"avg_chunk_size Regressor MSE: {mse:.2f}")

byte_error = np.sqrt(mse)
print(f"avg_chunk_size Regressor Byte Error: {byte_error:.2f}")

avg_chunk_size Regressor MSE: 8554652684.92
avg_chunk_size Regressor Byte Error: 92491.37


In [4]:
# ***Classification Model for Rebuffering (chunk count unexpectedly drops)

# Define rebuffering as chunk_count < 8
# If predicted fewer than 8 chunks were used, this implies rebuffeirng occured!
#
#
df['rebuffered'] = (df['chunk_count'] < 8).astype(int)
y_clf = df['rebuffered']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.2, random_state=42)

classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train_c, y_train_c)
y_pred_c = classifier.predict(X_test_c)

acc = accuracy_score(y_test_c, y_pred_c)
report = classification_report(y_test_c, y_pred_c)

print(f"\nRebuffering Classifier Accuracy: {acc:.2f}")
print("Classification Report:\n", report)


Rebuffering Classifier Accuracy: 0.91
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94       348
           1       0.81      0.69      0.75        88

    accuracy                           0.91       436
   macro avg       0.87      0.83      0.85       436
weighted avg       0.90      0.91      0.90       436



In [5]:
!pip install trustee

Collecting trustee
  Downloading trustee-1.1.6-py3-none-any.whl.metadata (6.3 kB)
Collecting furo<2023.0.0,>=2022.6.21 (from trustee)
  Downloading furo-2022.12.7-py3-none-any.whl.metadata (6.0 kB)
Collecting pandas<2.0.0,>=1.1.0 (from trustee)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting prettytable==3.0.0 (from trustee)
  Downloading prettytable-3.0.0-py3-none-any.whl.metadata (22 kB)
Collecting setuptools<58.0.0,>=57.0.0 (from trustee)
  Downloading setuptools-57.5.0-py3-none-any.whl.metadata (4.9 kB)
Collecting sphinx-gallery<0.12.0,>=0.11.1 (from trustee)
  Downloading sphinx_gallery-0.11.1-py3-none-any.whl.metadata (4.9 kB)
Collecting sphinxemoji<0.3.0,>=0.2.0 (from trustee)
  Downloading sphinxemoji-0.2.0.tar.gz (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting term

In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from trustee import RegressionTrustee

#Assume df is already loaded
# df = pd.read_csv("your_dataset.csv")

#Define features
features = [
    'packet_count',
    'total_bytes',
    'avg_packet_size',
    'avg_inter_packet_delay',
    'std_inter_packet_delay'
]
X = df[features].fillna(0)

#Regression Target: avg_chunk_size
y = df['avg_chunk_size']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Train Random Forest Regressor
regressor = RandomForestRegressor(n_estimators=500, random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

#Evaluate the regressor
mse = mean_squared_error(y_test, y_pred)
print(f" avg_chunk_size Regressor MSE: {mse:.2f}")
print(f" avg_chunk_size Regressor RMSE: {np.sqrt(mse):.2f}")
print(f" avg_chunk_size Regressor R²: {r2_score(y_test, y_pred):.2f}")

#Apply Trustee to explain the regressor
trustee = RegressionTrustee(expert=regressor)
trustee.fit(X_train, y_train, num_iter=50, num_stability_iter=10, samples_size=0.3, verbose=True)
dt, pruned_dt, agreement, reward = trustee.explain()
dt_y_pred = dt.predict(X_test)

#Evaluate surrogate tree
print("\n Trustee Fidelity Report (Tree vs Forest):")
print(f"  MSE: {mean_squared_error(y_pred, dt_y_pred):.2f}")
print(f"  R²:  {r2_score(y_pred, dt_y_pred):.2f}")

print("\n Trustee Real Performance Report (Tree vs Ground Truth):")
print(f"  MSE: {mean_squared_error(y_test, dt_y_pred):.2f}")
print(f"  R²:  {r2_score(y_test, dt_y_pred):.2f}")


📦 avg_chunk_size Regressor MSE: 8554652684.92
📦 avg_chunk_size Regressor RMSE: 92491.37
📈 avg_chunk_size Regressor R²: 0.56
Initializing training dataset using RandomForestRegressor(n_estimators=500, random_state=42) as expert model
Expert model score: 0.942238968652949
Initializing Trustee outer-loop with 10 iterations
########## Outer-loop Iteration 0/10 ##########
Initializing Trustee inner-loop with 10 iterations
########## Inner-loop Iteration 0/50 ##########
Sampling 365 points from training dataset with (1218, 1218) entries
Student model 0-0 trained with depth 22 and 255 leaves:
Student model score: 0.6110940909073574
Student model 0-0 fidelity: 0.6110940909073574
########## Inner-loop Iteration 1/50 ##########
Sampling 365 points from training dataset with (1328, 1328) entries
Student model 0-1 trained with depth 15 and 249 leaves:
Student model score: 0.5328416425834113
Student model 0-1 fidelity: 0.5328416425834113
########## Inner-loop Iteration 2/50 ##########
Sampling 365 

