In [None]:
!git clone https://github.com/dantle1/CS293NProject.git


Cloning into 'CS293NProject'...
remote: Enumerating objects: 1107, done.
remote: Counting objects: 100% (7/7), done.
remote: Compressing objects: 100% (7/7), done.
remote: Total 1107 (delta 1), reused 0 (delta 0), pack-reused 1100 (from 5)
Receiving objects: 100% (1107/1107), 1.22 GiB | 29.20 MiB/s, done.
Resolving deltas: 100% (959/959), done.
Updating files: 100% (1061/1061), done.


In [None]:
# TShark Packet Extraction Script for Google Colab (Filtered by IP)
# Project: CS 293N - Raw Packet Filtering from PCAP

import os
import pandas as pd
from datetime import datetime

In [None]:
# Install TShark
!apt-get install -y tshark

folder_num = 6 #Assign this variable based on which folder we are executing

folder_name = f"puffer_6M_profile_on50_{folder_num}"
FOLDER = os.path.join('/content/CS293NProject', folder_name)
PCAP_FILE = os.path.join(FOLDER, f"{folder_name}.pcap")
VIDEO_SENT_LOG = os.path.join(FOLDER, 'video_sent.1.log')
OUTPUT_FILE = f"/content/filtered_packets_{folder_num}.csv"
FILTER_IP = '128.111.5.228'

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libbcg729-0 libc-ares2 liblua5.2-0 libnl-genl-3-200 libpcap0.8 libsbc1
  libsmi2ldbl libspandsp2 libspeexdsp1 libwireshark-data libwireshark15
  libwiretap12 libwsutil13 wireshark-common
Suggested packages:
  snmp-mibs-downloader geoipupdate geoip-database geoip-database-extra
  libjs-leaflet libjs-leaflet.markercluster wireshark-doc
The following NEW packages will be installed:
  libbcg729-0 libc-ares2 liblua5.2-0 libnl-genl-3-200 libpcap0.8 libsbc1
  libsmi2ldbl libspandsp2 libspeexdsp1 libwireshark-data libwireshark15
  libwiretap12 libwsutil13 tshark wireshark-common
0 upgraded, 15 newly installed, 0 to remove and 35 not upgraded.
Need to get 23.0 MB of archives.
After this operation, 120 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libpcap0.8 amd64 1.10.1-4ubuntu1.22.04.1

In [None]:
# Run TShark on single PCAP file ===
!tshark -r "{PCAP_FILE}" -Y "ip.src == {FILTER_IP}" -T fields \
    -e frame.time_epoch -e ip.src -e ip.dst -e frame.len \
    -E header=n -E separator=, > "{OUTPUT_FILE}"

print(f"âœ… Packet filtering complete. Saved to: {OUTPUT_FILE}")

Running as user "root" and group "root". This could be dangerous.
✅ Packet filtering complete. Saved to: /content/filtered_packets_6.csv


In [None]:
# Read the header-less TShark CSV and attach a readable timestamp column.
# Epoch values are interpreted as seconds; anything unparseable becomes NaT.
df = pd.read_csv(
    OUTPUT_FILE,
    names=['epoch', 'ip_src', 'ip_dst', 'length'],
    header=None,
).assign(
    timestamp=lambda frame: pd.to_datetime(frame['epoch'], unit='s', errors='coerce')
)

# Show the first few rows as a quick check.
print("Sample filtered packet data:")
print(df.head())

Sample filtered packet data:
          epoch         ip_src        ip_dst  length  \
0  1.742585e+09  128.111.5.228  192.168.40.2      74   
1  1.742585e+09  128.111.5.228  192.168.40.2      66   
2  1.742585e+09  128.111.5.228  192.168.40.2      74   
3  1.742585e+09  128.111.5.228  192.168.40.2      86   
4  1.742585e+09  128.111.5.228  192.168.40.2     143   

                      timestamp  
0 2025-03-21 19:19:16.285278559  
1 2025-03-21 19:19:16.296501637  
2 2025-03-21 19:19:16.297395945  
3 2025-03-21 19:19:16.298296452  
4 2025-03-21 19:19:16.298464060  


In [None]:
# Optional: download the filtered CSV through the browser.
# The download call at the bottom of this cell is commented out, so running the
# cell as-is only imports the Colab helper; uncomment the call to download.
# NOTE(review): the commented path is relative, while OUTPUT_FILE is an absolute
# path under /content — confirm the working directory before relying on it.

from google.colab import files
#files.download(f'filtered_packets_{folder_num}.csv')


In [None]:
# Sanity check: count the rows TShark wrote to the CSV (one line per packet).
with open(OUTPUT_FILE) as f:
    lines = f.readlines()

print(f"🧾 Filtered packet lines: {len(lines)}")

# An empty file means the display filter matched nothing in this capture.
if not lines:
    print("No packets found for the given IP and folder.")

🧾 Filtered packet lines: 13747


In [None]:
# === Load and bin packet data ===
# Read the header-less TShark CSV and name its four columns.
packet_df = pd.read_csv(OUTPUT_FILE, names=['epoch', 'ip_src', 'ip_dst', 'length'], header=None)
# Epoch seconds -> pandas timestamps; unparseable values become NaT.
packet_df['timestamp'] = pd.to_datetime(packet_df['epoch'], unit='s', errors='coerce')
# Floor every timestamp to the start of its 10-second window.
packet_df['bin'] = packet_df['timestamp'].dt.floor('10s')

# Column names applied to the header-less video_sent log.
# NOTE(review): the names are assumed to follow the log's field order — confirm
# against the log format before relying on individual columns.
video_cols = ['timestamp', 'channel', 'session_id', 'stream_id', 'user_id', 'stream_id1', 'stream_id2',
              'video_ts', 'format', 'size', 'ssim', 'cwnd', 'in_flight', 'rtt',
              'send_time', 'acked_time', 'buffer', 'cum_rebuf', 'uuid']

# === Load and bin the video_sent log ===
video_df = pd.read_csv(VIDEO_SENT_LOG, names=video_cols, header=None, index_col=False)
# Non-numeric timestamps become NaN instead of raising.
video_df['timestamp'] = pd.to_numeric(video_df['timestamp'], errors='coerce')
# The raw value is divided by 1000 to obtain seconds.
video_df['timestamp_sec'] = video_df['timestamp'] / 1000.0
video_df['timestamp'] = pd.to_datetime(video_df['timestamp_sec'], unit='s')
# Same 10-second flooring as the packets so both sources share bin keys.
video_df['bin'] = video_df['timestamp'].dt.floor('10s')

# === Group and merge ===
grouped_packets = packet_df.groupby('bin')
grouped_video = video_df.groupby('bin')
# Union of bins so a window present in only one source still gets a chunk.
all_bins = sorted(set(grouped_packets.groups) | set(grouped_video.groups))

# Zero-row frames that keep the full column layout. A bin with no rows from one
# source then still exposes the same columns as a populated bin, instead of a
# column-less DataFrame that breaks column lookups downstream.
empty_packets = packet_df.iloc[0:0]
empty_video = video_df.iloc[0:0]

chunks = []
for bin_time in all_bins:
    has_packets = bin_time in grouped_packets.groups
    has_video = bin_time in grouped_video.groups
    chunks.append({
        'folder': folder_name,
        'bin_start': bin_time,
        # .copy() gives every chunk its own empty frame rather than a shared one.
        'packets': grouped_packets.get_group(bin_time) if has_packets else empty_packets.copy(),
        'video_sent': grouped_video.get_group(bin_time) if has_video else empty_video.copy(),
    })

# === Preview ===
print(f"✅ Created {len(chunks)} aligned 10-second chunks\n")
# Guard the sample so an input with no bins does not raise IndexError.
if chunks:
    print("🧪 Sample chunk:")
    print("Time bin:", chunks[0]['bin_start'])
    print("# packets in bin:", len(chunks[0]['packets']))
    print("Video logs in bin:", len(chunks[0]['video_sent']))
else:
    print("No chunks were created; both inputs produced no time bins.")
# The full `chunks` list is intentionally not printed: it would dump every
# DataFrame in every chunk into the cell output.

✅ Created 9 aligned 10-second chunks

🧪 Sample chunk:
Time bin: 2025-03-21 19:18:00
# packets in bin: 0
Video logs in bin: 4
[{'folder': 'puffer_6M_profile_on50_6', 'bin_start': Timestamp('2025-03-21 19:18:00'), 'packets': Empty DataFrame
Columns: []
Index: [], 'video_sent':                       timestamp channel  session_id  stream_id user_id  \
0 2025-03-21 19:18:02.546999931     cbs           1         11  jaber1   
1 2025-03-21 19:18:04.750000000     cbs           1         11  jaber1   
2 2025-03-21 19:18:06.657999992     cbs           1         11  jaber1   
3 2025-03-21 19:18:08.752000093     cbs           1         11  jaber1   

   stream_id1  stream_id2  video_ts        format    size  ...  cwnd  \
0   219643018   219643018   3063060  1920x1080-24  523644  ...   132   
1   219643018   219643018   3243240   1280x720-20  433693  ...   132   
2   219643018   219643018   3423420  1920x1080-24  468779  ...   132   
3   219643018   219643018   3603600  1920x1080-24  451593  .

In [None]:
# One summary line per chunk: 1-based position, bin start, and row counts
# from each source.
for i, chunk in enumerate(chunks):
    packet_count = len(chunk['packets'])
    video_count = len(chunk['video_sent'])
    print(f"Chunk {i+1}: {chunk['bin_start']}  | Packets: {packet_count}, Video logs: {video_count}")


Chunk 1: 2025-03-21 19:18:00  | Packets: 0, Video logs: 4
Chunk 2: 2025-03-21 19:18:10  | Packets: 0, Video logs: 6
Chunk 3: 2025-03-21 19:18:20  | Packets: 0, Video logs: 5
Chunk 4: 2025-03-21 19:19:10  | Packets: 1054, Video logs: 5
Chunk 5: 2025-03-21 19:19:20  | Packets: 4106, Video logs: 9
Chunk 6: 2025-03-21 19:19:30  | Packets: 2397, Video logs: 5
Chunk 7: 2025-03-21 19:19:40  | Packets: 2377, Video logs: 5
Chunk 8: 2025-03-21 19:19:50  | Packets: 1703, Video logs: 4
Chunk 9: 2025-03-21 19:20:00  | Packets: 2110, Video logs: 4
