In [1]:
!git clone https://github.com/dantle1/CS293NProject.git


Cloning into 'CS293NProject'...
remote: Enumerating objects: 84, done.[K
remote: Counting objects: 100% (84/84), done.[K
remote: Compressing objects: 100% (55/55), done.[K
remote: Total 84 (delta 17), reused 71 (delta 16), pack-reused 0 (from 0)[K
Receiving objects: 100% (84/84), 40.87 MiB | 14.98 MiB/s, done.
Resolving deltas: 100% (17/17), done.


In [2]:
# TShark Packet Extraction Script for Google Colab (Filtered by IP)
# Project: CS 293N - Raw Packet Filtering from PCAP

import os
import pandas as pd
from datetime import datetime

In [4]:
# Step 2: Install TShark
!apt-get install -y tshark

# === CONFIG ===
PCAP_FILE = '/content/CS293NProject/puffer_6M_profile_on50_13/puffer_6M_profile_on50_13.pcap'
FILTER_IP = '128.111.5.228'
OUTPUT_FILE = '/content/filtered_packets_13.csv'

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tshark is already the newest version (3.6.2-2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [5]:
# === STEP 3: Run TShark on single PCAP file ===
!tshark -r "{PCAP_FILE}" -Y "ip.src == {FILTER_IP}" -T fields \
    -e frame.time_epoch -e ip.src -e ip.dst -e frame.len \
    -E header=n -E separator=, > {OUTPUT_FILE}

print("✅ Packet filtering complete. Saved to:", OUTPUT_FILE)

Running as user "root" and group "root". This could be dangerous.
✅ Packet filtering complete. Saved to: /content/filtered_packets_13.csv


In [6]:
# === STEP 4: Load and Convert TShark Output ===
df = pd.read_csv(OUTPUT_FILE, header=None,
                 names=['epoch', 'ip_src', 'ip_dst', 'length'])
df['timestamp'] = pd.to_datetime(df['epoch'], unit='s', errors='coerce')

print("\n📊 Sample filtered packet data:")
print(df.head())


📊 Sample filtered packet data:
          epoch         ip_src        ip_dst  length  \
0  1.742585e+09  128.111.5.228  192.168.40.2      74   
1  1.742585e+09  128.111.5.228  192.168.40.2      66   
2  1.742585e+09  128.111.5.228  192.168.40.2      74   
3  1.742585e+09  128.111.5.228  192.168.40.2      86   
4  1.742585e+09  128.111.5.228  192.168.40.2     387   

                      timestamp  
0 2025-03-21 19:25:54.069990873  
1 2025-03-21 19:25:54.080837965  
2 2025-03-21 19:25:54.081504107  
3 2025-03-21 19:25:54.082763910  
4 2025-03-21 19:25:54.082923174  


In [8]:
# Save the DataFrame to a file
df.to_csv('filtered_packets_13.csv', index=False)

# Trigger a download in the browser
from google.colab import files
files.download('filtered_packets_13.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [10]:
display((df['epoch']).head())

Unnamed: 0,epoch
0,1742585000.0
1,1742585000.0
2,1742585000.0
3,1742585000.0
4,1742585000.0


In [19]:
# convert timestamps (epochs in ms) to human readable time (datetime)
df['datetime'] = df['timestamp'].apply(pd.to_datetime)

In [23]:
print("Minimum timestamp: ", min(df['datetime']))
print("Maximum timestamp: ", max(df['datetime']))

Minimum timestamp:  2025-03-21 19:25:54.069990873
Maximum timestamp:  2025-03-21 19:26:47.227486849


In [21]:
# 10 second bins
df['bin'] = df['datetime'].dt.floor('10s')

# group by 10 second bins
grouped = df.groupby('bin')

# Example: count how many entries are in each bin
print(grouped.size().reset_index(name='count'))

                  bin  count
0 2025-03-21 19:25:50   1981
1 2025-03-21 19:26:00   3895
2 2025-03-21 19:26:10   2461
3 2025-03-21 19:26:20   2162
4 2025-03-21 19:26:30   2255
5 2025-03-21 19:26:40   1621


In [43]:
# Puffer dataset for videos sent
PUFFER_FILE = "/content/CS293NProject/puffer_6M_profile_on50_13/video_sent.1.log"
columns =['epoch', 'channel', 'session_id', 'stream_id', 'user_id', 'stream_id1', 'stream_id2',
              'video_ts', 'format', 'size', 'ssim', 'cwnd', 'in_flight', 'rtt',
              'send_time', 'acked_time', 'buffer', 'cum_rebuf', 'uuid', 'thing']
puffer_df = pd.read_csv(PUFFER_FILE, names=columns, sep = ",")



In [45]:
puffer_df['timestamp'] = pd.to_datetime(df['epoch'], unit='s', errors='coerce')
display(puffer_df.head())


Unnamed: 0,epoch,channel,session_id,stream_id,user_id,stream_id1,stream_id2,video_ts,format,size,...,cwnd,in_flight,rtt,send_time,acked_time,buffer,cum_rebuf,uuid,thing,timestamp
0,1742585156096,cbs,1,11,jaber1,792847205,792847205,0,426x240-26,30382,...,11,0,79808,179450,17767,0.0,0.0,1IkXIOwbDZkJLCxsRdSsZ7a6uQWuqssK,,2025-03-21 19:25:54.069990873
1,1742585156307,cbs,1,11,jaber1,792847205,792847205,180180,1280x720-22,185089,...,49,0,332,45049,716825,2.002,0.0,1IkXIOwbDZkJLCxsRdSsZ7a6uQWuqssK,,2025-03-21 19:25:54.080837965
2,1742585156680,cbs,1,11,jaber1,792847205,792847205,360360,1280x720-22,280527,...,26,0,332,111341,566825,3.66,0.169,1IkXIOwbDZkJLCxsRdSsZ7a6uQWuqssK,,2025-03-21 19:25:54.081504107
3,1742585157125,cbs,1,11,jaber1,792847205,792847205,540540,1920x1080-24,457180,...,10,10,332,21913,589212,5.225,0.169,1IkXIOwbDZkJLCxsRdSsZ7a6uQWuqssK,,2025-03-21 19:25:54.082763910
4,1742585157936,cbs,1,11,jaber1,792847205,792847205,720720,1920x1080-24,391182,...,12,0,332,20857,561361,6.417,0.169,1IkXIOwbDZkJLCxsRdSsZ7a6uQWuqssK,,2025-03-21 19:25:54.082923174


In [46]:
# 10 second bins
puffer_df['bin'] = puffer_df['timestamp'].dt.floor('10s')

# group by 10 second bins
grouped = puffer_df.groupby('bin')

# Example: count how many entries are in each bin
print(grouped.size().reset_index(name='count'))

                  bin  count
0 2025-03-21 19:25:50     33
