In [1]:
# Install Dependencies
!pip install gdown scikit-learn pandas

import gdown
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, mean_absolute_percentage_error, mean_absolute_error



In [2]:
!pip uninstall numpy -y
!pip install --no-cache-dir numpy


Found existing installation: numpy 2.0.2
Uninstalling numpy-2.0.2:
  Successfully uninstalled numpy-2.0.2
Collecting numpy
  Downloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.8/16.8 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.6 which is incompatible.
tensorflow 2.18.0 requires numpy<2.1.0,>=1.26.0, but you have numpy 2.2.6 which is incompatible.[0m[31m
[0mSuccessfully ins

In [3]:
# Download Preprocessed Data

url = 'https://drive.google.com/uc?id=1nUNgJzVOyKywhGODfwN5TfpSGFJE1OTG'
output = 'modeling_dataset.csv'
gdown.download(url, output, quiet=False)

df = pd.read_csv('modeling_dataset.csv')
print("✅ Loaded:", df.shape)

Downloading...
From: https://drive.google.com/uc?id=1nUNgJzVOyKywhGODfwN5TfpSGFJE1OTG
To: /content/modeling_dataset.csv
100%|██████████| 318k/318k [00:00<00:00, 5.27MB/s]

✅ Loaded: (2176, 10)





In [4]:
# ***Regression Model for avg_chunk_size***
features = [
    'packet_count',
    'total_bytes',
    'avg_packet_size',
    'avg_inter_packet_delay',
    'std_inter_packet_delay'
]

X = df[features].fillna(0)

y_reg = df['avg_chunk_size']
X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y_reg, test_size=0.2, random_state=42)

regressor = RandomForestRegressor(n_estimators=500, random_state=42)
regressor.fit(X_train_r, y_train_r)
y_pred_r = regressor.predict(X_test_r)

mse = mean_squared_error(y_test_r, y_pred_r)
print(f"avg_chunk_size Regressor MSE: {mse:.2f}")

byte_error = np.sqrt(mse)
print(f"avg_chunk_size Regressor Byte Error: {byte_error:.2f}")

mape = mean_absolute_percentage_error(y_test_r, y_pred_r)
print(f"MAPE: {mape:.4f}")

mae = mean_absolute_error(y_test_r, y_pred_r)
print(f"MAE: {mae:.2f}")

avg_chunk_size Regressor MSE: 8554652684.92
avg_chunk_size Regressor Byte Error: 92491.37
MAPE: 0.3651
MAE: 61765.60


In [5]:
# ***Classification Model for Rebuffering (chunk count unexpectedly drops)

# Define rebuffering as chunk_count < 4.65
# If predicted fewer than 4 chunks were used, this implies rebuffeirng occured!
#
#
df['rebuffered'] = (df['chunk_count'] < 4).astype(int)
y_clf = df['rebuffered']

X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.2, random_state=42)

classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train_c, y_train_c)
y_pred_c = classifier.predict(X_test_c)

acc = accuracy_score(y_test_c, y_pred_c)
report = classification_report(y_test_c, y_pred_c)

print(f"\nRebuffering Classifier Accuracy: {acc:.2f}")
print("Classification Report:\n", report)


Rebuffering Classifier Accuracy: 0.98
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99       430
           1       0.00      0.00      0.00         6

    accuracy                           0.98       436
   macro avg       0.49      0.50      0.50       436
weighted avg       0.97      0.98      0.98       436



In [6]:
!pip install trustee

Collecting trustee
  Downloading trustee-1.1.6-py3-none-any.whl.metadata (6.3 kB)
Collecting furo<2023.0.0,>=2022.6.21 (from trustee)
  Downloading furo-2022.12.7-py3-none-any.whl.metadata (6.0 kB)
Collecting pandas<2.0.0,>=1.1.0 (from trustee)
  Downloading pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting prettytable==3.0.0 (from trustee)
  Downloading prettytable-3.0.0-py3-none-any.whl.metadata (22 kB)
Collecting setuptools<58.0.0,>=57.0.0 (from trustee)
  Downloading setuptools-57.5.0-py3-none-any.whl.metadata (4.9 kB)
Collecting sphinx-gallery<0.12.0,>=0.11.1 (from trustee)
  Downloading sphinx_gallery-0.11.1-py3-none-any.whl.metadata (4.9 kB)
Collecting sphinxemoji<0.3.0,>=0.2.0 (from trustee)
  Downloading sphinxemoji-0.2.0.tar.gz (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting term

In [7]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from trustee import ClassificationTrustee

# Load dataset
df = pd.read_csv("/content/modeling_dataset.csv")

# Define input features
features = [
    'packet_count',
    'total_bytes',
    'avg_packet_size',
    'avg_inter_packet_delay',
    'std_inter_packet_delay'
]
X = df[features].fillna(0)

# Define binary rebuffering label: 1 if chunk_count < 8
df['rebuffered'] = (df['chunk_count'] < 8).astype(int)
y_clf = df['rebuffered']

# Train/test split
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_clf, test_size=0.2, random_state=42)

# Train Logistic Regression model
classifier = LogisticRegression(max_iter=500)
classifier.fit(X_train_c, y_train_c)
y_pred_c = classifier.predict(X_test_c)

# Evaluate base model
acc = accuracy_score(y_test_c, y_pred_c)
print(f"\nRebuffering Classifier Accuracy: {acc:.2f}")
print("Classification Report:\n", classification_report(y_test_c, y_pred_c))

# TRUSTEE EXPLANATION
trustee = ClassificationTrustee(expert=classifier)
trustee.fit(X_train_c, y_train_c, num_iter=50, num_stability_iter=10, samples_size=0.3, verbose=True)

# Extract surrogate decision tree and evaluate
dt, pruned_dt, agreement, reward = trustee.explain()
dt_y_pred = dt.predict(X_test_c)

print("\nTrustee Decision Tree Fidelity (Tree vs Model):")
print(classification_report(y_pred_c, dt_y_pred))

print("\nTrustee Decision Tree vs Ground Truth (Tree vs Actual):")
print(classification_report(y_test_c, dt_y_pred))



Rebuffering Classifier Accuracy: 0.91
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.96      0.94       348
           1       0.81      0.69      0.75        88

    accuracy                           0.91       436
   macro avg       0.87      0.83      0.85       436
weighted avg       0.90      0.91      0.90       436

Initializing training dataset using LogisticRegression(max_iter=500) as expert model
Expert model score: 0.8022773126782774
Initializing Trustee outer-loop with 10 iterations
########## Outer-loop Iteration 0/10 ##########
Initializing Trustee inner-loop with 10 iterations
########## Inner-loop Iteration 0/50 ##########
Sampling 365 points from training dataset with (1218, 1218) entries
Student model 0-0 trained with depth 3 and 4 leaves:
Student model score: 1.0
Student model 0-0 fidelity: 1.0
########## Inner-loop Iteration 1/50 ##########
Sampling 365 points from training dataset with (1328, 1328) en



In [8]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.multiclass import unique_labels
from trustee import ClassificationTrustee
from trustee.report.trust import TrustReport

# Load dataset
df = pd.read_csv("/content/modeling_dataset.csv")

# Define features for rebuffering prediction
features = [
    'packet_count',
    'total_bytes',
    'avg_packet_size',
    'avg_inter_packet_delay',
    'std_inter_packet_delay'
]
X = df[features].fillna(0)

# Define rebuffering label: 1 if chunk_count < 8, else 0
df['rebuffered'] = (df['chunk_count'] < 8).astype(int)
y = df['rebuffered']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Trustee explanation
trustee = ClassificationTrustee(expert=clf)
trustee.fit(X_train, y_train, num_iter=50, num_stability_iter=10, samples_size=0.3, verbose=True)
dt, pruned_dt, agreement, reward = trustee.explain()
dt_y_pred = dt.predict(X_test)

# Evaluate surrogate fidelity
print("\nModel Explanation Global Fidelity Report (Tree vs Forest):")
print(classification_report(y_pred, dt_y_pred))

print("\nModel Explanation Score Report (Tree vs Ground Truth):")
print(classification_report(y_test, dt_y_pred))

# Save TrustReport
OUTPUT_PATH = "out/"
REPORT_PATH = f"{OUTPUT_PATH}/report/rebuffering_trust_report.obj"
os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)

trust_report = TrustReport(
    blackbox=clf,
    X=X,
    y=y,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    max_iter=5,
    num_pruning_iter=5,
    trustee_num_iter=10,
    trustee_num_stability_iter=5,
    trustee_sample_size=0.3,
    analyze_branches=True,
    analyze_stability=True,
    top_k=10,
    verbose=True,
    class_names=["No Rebuffer", "Rebuffer"],
    feature_names=features,
    is_classify=True,
)

trust_report.save(REPORT_PATH)
print(f"\nReport saved to: {REPORT_PATH}")


Initializing training dataset using RandomForestClassifier(random_state=42) as expert model
Expert model score: 0.9986622367445565
Initializing Trustee outer-loop with 10 iterations
########## Outer-loop Iteration 0/10 ##########
Initializing Trustee inner-loop with 10 iterations
########## Inner-loop Iteration 0/50 ##########
Sampling 365 points from training dataset with (1218, 1218) entries
Student model 0-0 trained with depth 11 and 29 leaves:
Student model score: 0.7068798449612403
Student model 0-0 fidelity: 0.7068798449612403
########## Inner-loop Iteration 1/50 ##########
Sampling 365 points from training dataset with (1328, 1328) entries
Student model 0-1 trained with depth 11 and 31 leaves:
Student model score: 0.7010869565217391
Student model 0-1 fidelity: 0.7010869565217391
########## Inner-loop Iteration 2/50 ##########
Sampling 365 points from training dataset with (1438, 1438) entries
Student model 0-2 trained with depth 10 and 28 leaves:
Student model score: 0.642678660



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Student model 3-9 fidelity: 0.7193877551020409
########## Outer-loop Iteration 4/5 ##########
Initializing Trustee inner-loop with 5 iterations
########## Inner-loop Iteration 0/10 ##########
Sampling 365 points from training dataset with (5618, 5618) entries
Student model 4-0 trained with depth 3 and 6 leaves:
Student model score: 0.7912713472485768
Student model 4-0 fidelity: 0.7912713472485768
########## Inner-loop Iteration 1/10 ##########
Sampling 365 points from training dataset with (5728, 5728) entries
Student model 4-1 trained with depth 1 and 2 leaves:
Student model score: 0.8810810810810812
Student model 4-1 fidelity: 0.8810810810810812
########## Inner-loop Iteration 2/10 ##########
Sampling 365 points from training dataset with (5838, 5838) entries
Student model 4-2 trained with depth 1 and 2 leaves:
Student model score: 0.7830374753451677
Student model 4-2 fidelity: 0.7830374753451677
########## Inner-loop I



Saving decision trees...
Done!
Plotting...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Done!
Done!

Report saved to: out//report/rebuffering_trust_report.obj


<Figure size 3000x300 with 0 Axes>

<Figure size 4000x300 with 0 Axes>

<Figure size 5000x300 with 0 Axes>

<Figure size 5000x300 with 0 Axes>

<Figure size 4000x300 with 0 Axes>

<Figure size 5000x1000 with 0 Axes>

<Figure size 5000x1000 with 0 Axes>

<Figure size 4000x300 with 0 Axes>

<Figure size 5000x1000 with 0 Axes>

<Figure size 5000x1000 with 0 Axes>

<Figure size 4000x300 with 0 Axes>

In [9]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from trustee import RegressionTrustee

#Assume df is already loaded
# df = pd.read_csv("your_dataset.csv")

#Define features
features = [
    'packet_count',
    'total_bytes',
    'avg_packet_size',
    'avg_inter_packet_delay',
    'std_inter_packet_delay'
]
X = df[features].fillna(0)

#Regression Target: avg_chunk_size
y = df['avg_chunk_size']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Train Random Forest Regressor
regressor = RandomForestRegressor(n_estimators=500, random_state=42)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

#Evaluate the regressor
mse = mean_squared_error(y_test, y_pred)
print(f" avg_chunk_size Regressor MSE: {mse:.2f}")
print(f" avg_chunk_size Regressor RMSE: {np.sqrt(mse):.2f}")
print(f" avg_chunk_size Regressor R²: {r2_score(y_test, y_pred):.2f}")

#Apply Trustee to explain the regressor
trustee = RegressionTrustee(expert=regressor)
trustee.fit(X_train, y_train, num_iter=50, num_stability_iter=10, samples_size=0.3, verbose=True)
dt, pruned_dt, agreement, reward = trustee.explain()
dt_y_pred = dt.predict(X_test)

#Evaluate surrogate tree
print("\n Trustee Fidelity Report (Tree vs Forest):")
print(f"  MSE: {mean_squared_error(y_pred, dt_y_pred):.2f}")
print(f"  R²:  {r2_score(y_pred, dt_y_pred):.2f}")

print("\n Trustee Real Performance Report (Tree vs Ground Truth):")
print(f"  MSE: {mean_squared_error(y_test, dt_y_pred):.2f}")
print(f"  R²:  {r2_score(y_test, dt_y_pred):.2f}")


 avg_chunk_size Regressor MSE: 8554652684.92
 avg_chunk_size Regressor RMSE: 92491.37
 avg_chunk_size Regressor R²: 0.56
Initializing training dataset using RandomForestRegressor(n_estimators=500, random_state=42) as expert model
Expert model score: 0.9439754516988023
Initializing Trustee outer-loop with 10 iterations
########## Outer-loop Iteration 0/10 ##########
Initializing Trustee inner-loop with 10 iterations
########## Inner-loop Iteration 0/50 ##########
Sampling 365 points from training dataset with (1218, 1218) entries
Student model 0-0 trained with depth 17 and 255 leaves:
Student model score: 0.5958687619662386
Student model 0-0 fidelity: 0.5958687619662386
########## Inner-loop Iteration 1/50 ##########
Sampling 365 points from training dataset with (1328, 1328) entries
Student model 0-1 trained with depth 15 and 249 leaves:
Student model score: 0.5012763730138841
Student model 0-1 fidelity: 0.5012763730138841
########## Inner-loop Iteration 2/50 ##########
Sampling 365 po



In [10]:
import os
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
#from trustee import RegressionTrustee
from trustee.report.trust import TrustReport
OUTPUT_PATH = "out/"
REPORT_PATH = f"{OUTPUT_PATH}/report/trust_report.obj"
os.makedirs(os.path.dirname(REPORT_PATH), exist_ok=True)

trust_report = TrustReport(
        regressor,
        X=X,
        y=y,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
        max_iter=5,
        num_pruning_iter=0,
        train_size=0.7,
        trustee_num_iter=10,
        trustee_num_stability_iter=5,
        trustee_sample_size=0.3,
        analyze_branches=True,
        analyze_stability=True,
        top_k=10,
        verbose=True,
        feature_names=features,
        is_classify=False,
)

trust_report.save(REPORT_PATH)
print(f"\n Report saved to: {REPORT_PATH}")

Running Trust Report...
Preparing data...
Done!
Progress |----------------------------------------------------------------------------------------------------| 0.9% Complete
Collecting blackbox information...
Done!
Progress |█---------------------------------------------------------------------------------------------------| 1.8% Complete
Collecting trustee information...
Fitting blackbox model...
Done!
Blackbox model score report with training data:
R2 Score: 0.5607297300914976
Using Classification Trustee algorithm to extract DT...
Initializing training dataset using RandomForestRegressor(n_estimators=500, random_state=42) as expert model
Expert model score: 0.9448306217667652
Initializing Trustee outer-loop with 5 iterations
########## Outer-loop Iteration 0/5 ##########
Initializing Trustee inner-loop with 5 iterations
########## Inner-loop Iteration 0/10 ##########
Sampling 365 points from training dataset with (1218, 1218) entries
Student model 0-0 trained with depth 15 and 2



Done!
Plotting...


[1;30;43mStreaming output truncated to the last 5000 lines.[0m


Done!
Done!

 Report saved to: out//report/trust_report.obj


<Figure size 3000x300 with 0 Axes>

<Figure size 4000x300 with 0 Axes>

<Figure size 5000x300 with 0 Axes>

<Figure size 5000x300 with 0 Axes>

<Figure size 4000x300 with 0 Axes>

<Figure size 4000x300 with 0 Axes>

<Figure size 4000x300 with 0 Axes>