In [1]:
# Fetch the project repository. Later cells read the PCAPs and the video log from
# /content/CS293NProject, so this presumably runs with /content as the working
# directory (the Colab default) -- TODO confirm when running elsewhere.
!git clone https://github.com/dantle1/CS293NProject.git


Cloning into 'CS293NProject'...
remote: Enumerating objects: 1107, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 1107 (delta 1), reused 0 (delta 0), pack-reused 1100 (from 5)[K
Receiving objects: 100% (1107/1107), 1.22 GiB | 25.06 MiB/s, done.
Resolving deltas: 100% (959/959), done.
Updating files: 100% (1061/1061), done.


In [2]:
# TShark Packet Extraction Script for Google Colab (Filtered by IP)
# Project: CS 293N - Raw Packet Filtering from PCAP

import os
import pandas as pd
from datetime import datetime, timedelta
import glob
import numpy as np

In [3]:
# Install TShark: the feature-extraction cell shells out to `tshark` to read each
# PCAP file and dump per-packet fields to CSV.
!apt-get install -y tshark

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libbcg729-0 libc-ares2 liblua5.2-0 libnl-genl-3-200 libpcap0.8 libsbc1
  libsmi2ldbl libspandsp2 libspeexdsp1 libwireshark-data libwireshark15
  libwiretap12 libwsutil13 wireshark-common
Suggested packages:
  snmp-mibs-downloader geoipupdate geoip-database geoip-database-extra
  libjs-leaflet libjs-leaflet.markercluster wireshark-doc
The following NEW packages will be installed:
  libbcg729-0 libc-ares2 liblua5.2-0 libnl-genl-3-200 libpcap0.8 libsbc1
  libsmi2ldbl libspandsp2 libspeexdsp1 libwireshark-data libwireshark15
  libwiretap12 libwsutil13 tshark wireshark-common
0 upgraded, 15 newly installed, 0 to remove and 35 not upgraded.
Need to get 23.0 MB of archives.
After this operation, 120 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libpcap0.8 amd64 1.10.1-4ubuntu1.22.04.1

In [4]:
# CS 293N: Multi-PCAP Alignment with Shared video_sent.1.log (Stream ID or Fallback Temporal Matching)

# === CONFIGURATION ===
# Input locations inside the cloned repository.
PCAP_DIR = '/content/CS293NProject/pcap_qoe'
VIDEO_LOG_PATH = '/content/CS293NProject/withPcap_5k_trim6m_shape6m_pfifo_latency100/video_sent.1.log'
# Source address used as the tshark `ip.src` display filter during extraction.
FILTER_IP = '128.111.5.228'
# Sliding-window length and stride; both are passed to timedelta(seconds=...).
WINDOW_SIZE = 10
SLIDE_STEP = 1

# === LOAD FULL VIDEO LOG FILE ===
# The log is read with header=None, so the column names are supplied here.
video_cols = [
    'timestamp', 'channel', 'session_id', 'stream_id', 'user_id',
    'stream_id1', 'stream_id2', 'video_ts', 'format', 'size', 'ssim',
    'cwnd', 'in_flight', 'rtt', 'send_time', 'acked_time', 'buffer',
    'cum_rebuf', 'uuid',
]

video_df = pd.read_csv(VIDEO_LOG_PATH, header=None, names=video_cols, index_col=False)

# The raw timestamp is divided by 1000 and then parsed with unit='s', i.e. it is
# treated as epoch milliseconds. Values that fail to parse become NaN / NaT.
epoch_ms = pd.to_numeric(video_df['timestamp'], errors='coerce')
video_df['timestamp_sec'] = epoch_ms / 1000.0
video_df['timestamp'] = pd.to_datetime(video_df['timestamp_sec'], unit='s')

print("‚úÖ Loaded full video_sent log with", len(video_df), "entries")

# Keep a CSV copy of the parsed log for inspection.
video_df.to_csv('/content/video_df.csv', index=False)

‚úÖ Loaded full video_sent log with 33218 entries


In [5]:
# === PREP OUTPUT FILE ===
output_csv_path = "/content/modeling_dataset.csv"
with open(output_csv_path, "w") as f:
    f.write("bin_start,packet_count,total_bytes,avg_packet_size,avg_inter_packet_delay,std_inter_packet_delay,chunk_count,avg_ssim,avg_format,avg_chunk_size\n")

# === LOOP OVER PCAP FILES ===
all_chunks = []
pcap_files = sorted(glob.glob(os.path.join(PCAP_DIR, '*.pcap.pcap')))
print(f"üìÇ Found {len(pcap_files)} PCAPs")

for pcap_path in pcap_files:
    base_name = os.path.basename(pcap_path).replace('.pcap.pcap', '')
    output_file = f"/content/filtered_{base_name}.csv"

    # Extract user_id from filename
    user_number = base_name.split('_')[2]  # e.g. '48' from 'qoe_on_48_profile7'
    user_id = f"jaber{user_number}"

    # Filter the video log for this user only
    user_video_df = video_df[video_df['user_id'] == user_id] # Filter to just the matching user_id

    # Run TShark to extract only relevant packets
    !tshark -r "{pcap_path}" -Y "ip.src == {FILTER_IP}" -T fields \
        -e frame.time_epoch -e ip.src -e ip.dst -e frame.len \
        -E header=n -E separator=, > "{output_file}"

    with open(output_file) as f:
        lines = f.readlines()

    if len(lines) == 0:
        print(f"‚ö†Ô∏è No packets found in {pcap_path}, skipping.")
        continue

    # Load packet data
    packet_df = pd.read_csv(output_file, names=['epoch', 'ip_src', 'ip_dst', 'length'], header=None)
    packet_df['timestamp'] = pd.to_datetime(packet_df['epoch'], unit='s', errors='coerce')

    # Align sliding windows
    start_time = max(packet_df['timestamp'].min(), user_video_df['timestamp'].min())
    end_time = min(packet_df['timestamp'].max(), user_video_df['timestamp'].max())

    current = start_time
    while current + timedelta(seconds=WINDOW_SIZE) <= end_time: #step through the 10s interval
        window_start = current
        window_end = current + timedelta(seconds=WINDOW_SIZE)

        #extract the packets and video logs from the specified window
        packets = packet_df[(packet_df['timestamp'] >= window_start) & (packet_df['timestamp'] < window_end)]
        video = user_video_df[(user_video_df['timestamp'] >= window_start) & (user_video_df['timestamp'] < window_end)]

        if packets.empty or video.empty:
            current += timedelta(seconds=SLIDE_STEP)
            continue

        # Packet features
        total_bytes = packets['length'].astype(float).sum() #sum of all packets in the interval
        avg_packet_size = packets['length'].astype(float).mean()
        packet_count = len(packets)

        timestamps = packets['timestamp'].sort_values().values
        if len(timestamps) > 1:
            inter_arrival_times = (timestamps[1:] - timestamps[:-1]).astype('timedelta64[ms]').astype(float)
            avg_inter_packet_delay = inter_arrival_times.mean()
            std_inter_packet_delay = inter_arrival_times.std()
        else:
            avg_inter_packet_delay = 0
            std_inter_packet_delay = 0

        # Video features
        avg_ssim = video['ssim'].astype(float).mean()
        chunk_count = len(video)
        avg_format = video['format'].mode()[0] if not video['format'].mode().empty else None
        avg_size = video['size'].astype(float).mean()

        with open(output_csv_path, "a") as f:
            f.write(f"{window_start},{packet_count},{total_bytes},{avg_packet_size},{avg_inter_packet_delay},{std_inter_packet_delay},{chunk_count},{avg_ssim},{avg_format},{avg_size}\n")

        current += timedelta(seconds=SLIDE_STEP)

    print(f"‚úÖ Session {base_name}: Sliding windows completed")

print("\nüìä Sliding window feature extraction complete. Appended to:", output_csv_path)


üìÇ Found 990 PCAPs
Running as user "root" and group "root". This could be dangerous.
tshark: The file "/content/CS293NProject/pcap_qoe/qoe_on_10_profile1.pcap.pcap" appears to have been cut short in the middle of a packet.
‚úÖ Session qoe_on_10_profile1: Sliding windows completed
Running as user "root" and group "root". This could be dangerous.
tshark: The file "/content/CS293NProject/pcap_qoe/qoe_on_10_profile10.pcap.pcap" appears to have been cut short in the middle of a packet.
‚úÖ Session qoe_on_10_profile10: Sliding windows completed
Running as user "root" and group "root". This could be dangerous.
tshark: The file "/content/CS293NProject/pcap_qoe/qoe_on_10_profile2.pcap.pcap" appears to have been cut short in the middle of a packet.
‚úÖ Session qoe_on_10_profile2: Sliding windows completed
Running as user "root" and group "root". This could be dangerous.
tshark: The file "/content/CS293NProject/pcap_qoe/qoe_on_10_profile3.pcap.pcap" appears to have been cut short in the middle 

In [11]:
# import pandas as pd
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder
# from sklearn.metrics import classification_report, accuracy_score

# # === Load dataset ===
# df = pd.read_csv("/content/modeling_dataset.csv")

# # === Drop rows with missing labels ===
# df = df.dropna(subset=['avg_format'])

# # === Define features and target ===
# features = [
#     'packet_count',
#     'total_bytes',
#     'avg_packet_size',
#     'avg_inter_packet_delay',
#     'std_inter_packet_delay'
# ]

# X = df[features].fillna(0)

# y = df['avg_format']

# # === Encode the target variable AFTER dropping rows with missing labels ===
# le = LabelEncoder()
# y_encoded = le.fit_transform(y)

# # === Train/test split ===
# X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# # === Train model ===
# clf = RandomForestClassifier(n_estimators=100, random_state=42)
# clf.fit(X_train, y_train)

# # === Evaluate ===
# y_pred = clf.predict(X_test)
# print("‚úÖ Accuracy:", accuracy_score(y_test, y_pred))
# print("\nüìä Classification Report:")
# from sklearn.utils.multiclass import unique_labels
# labels = unique_labels(y_test, y_pred)
# print(classification_report(y_test, y_pred, labels=labels, target_names=le.inverse_transform(labels)))
