# Mount drive

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive/ so that files stored
# on Drive can be reached through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive/')

# Project root on the mounted Drive, spelled two ways:
#   base_path        - the space is backslash-escaped; used in `!shell` lines and
#                      in command strings that are handed to the shell.
#   base_path_python - the plain path; used with Python file APIs (open, pathlib).
# A raw string keeps the backslash literal. Writing '\ ' in a normal string is an
# invalid escape sequence that newer Python versions warn about.
base_path = r'/content/drive/My\ Drive/tma_project'
base_path_python = '/content/drive/My Drive/tma_project'

Mounted at /content/drive/


In [8]:
#drive.mount("/content/drive/", force_remount=True)

# Install libs

In [2]:
# Install tshark, the command-line tool that a later cell uses to export
# selected packet fields from the sampled pcap files to CSV.
!apt-get install tshark

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'apt autoremove' to remove it.
The following additional packages will be installed:
  libc-ares2 libcap2-bin liblua5.2-0 libmaxminddb0 libnl-genl-3-200 libpam-cap
  libpcap0.8 libsbc1 libsmi2ldbl libspandsp2 libwireshark-data libwireshark11
  libwiretap8 libwscodecs2 libwsutil9 wireshark-common
Suggested packages:
  mmdb-bin snmp-mibs-downloader wireshark-doc
The following NEW packages will be installed:
  libc-ares2 libcap2-bin liblua5.2-0 libmaxminddb0 libnl-genl-3-200 libpam-cap
  libpcap0.8 libsbc1 libsmi2ldbl libspandsp2 libwireshark-data libwireshark11
  libwiretap8 libwscodecs2 libwsutil9 tshark wireshark-common
0 upgraded, 17 newly installed, 0 to remove and 5 not upgraded.
Need to get 16.5 MB of archives.
After this operation, 95.6 MB of additional disk space will be used.
Get:1 h

In [3]:
# Install pcap sampler.
# Install libc6 and the libpcap development package first; presumably these are
# the build dependencies of pcapsampler -- TODO confirm against its Makefile.
!sudo apt-get install libc6
!sudo apt-get install libpcap-dev
# Build pcapsampler from the sources stored in the project folder on Drive.
!make -C $base_path/pcapsampler-master/
# Run the freshly built binary once without arguments (presumably a smoke test
# that the build produced a runnable executable).
!$base_path/pcapsampler-master/pcapsampler
# Copy the binary into the project root so it can be invoked from there.
!cp $base_path/pcapsampler-master/pcapsampler $base_path/

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libc6 is already the newest version (2.27-3ubuntu1.6).
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 5 not upgraded.
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  libpcap0.8-dev
The following NEW packages will be installed:
  libpcap-dev libpcap0.8-dev
0 upgraded, 2 newly installed, 0 to remove and 5 not upgraded.
Need to get 221 kB of archives.
After this operation, 750 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 libpcap0.8-dev amd64 1.8.1-6ubuntu1.1

# Download data, sample it and convert it to CSV

In [4]:
# Set mode 755 on the copied pcapsampler binary so it is executable, then print
# its usage summary to confirm it runs from the project root.
!sudo chmod 755 $base_path/pcapsampler
!$base_path/pcapsampler --usage

Usage: pcapsampler [-?V] [-m MODE] [-r RATE] [--mode=MODE] [--rate=RATE]
            [--help] [--usage] [--version] INPUT_FILE OUTPUT_FILE


In [7]:
import requests
import sys
from pathlib import Path
from datetime import timedelta, datetime
import pytz
import subprocess

def execute_command(command, output_file = None):
    """Run ``command`` through the shell and wait for it to finish.

    Args:
        command: Shell command line. It is executed with ``shell=True``, so
            constructs such as ``cd ... && ...`` and backslash-escaped spaces
            are interpreted by the shell. Only pass trusted, locally built
            strings; nothing is escaped here.
        output_file: Optional path. When given, the file is created/truncated
            and receives the command's stdout and stderr. When omitted, the
            command inherits the notebook's output streams.

    Returns:
        The ``subprocess.CompletedProcess`` of the run. Earlier versions
        returned nothing, so callers that ignore the result are unaffected.
    """
    if output_file:
        with open(output_file, "w") as outfile:
            result = subprocess.run(command, stdout=outfile, stderr=subprocess.STDOUT, shell=True)
    else:
        result = subprocess.run(command, shell=True)

    # Surface failures instead of silently ignoring them: a failed step (for
    # example a decompression error) would otherwise go unnoticed.
    if result.returncode != 0:
        print("[ERROR] Command exited with status {}: {}".format(result.returncode, command))

    return result

def download_file(url, year, month, timeout=60):
    """Download ``url`` into ``<base_path_python>/<year>/<month>/``.

    The destination directory is created when missing and the local file is
    named after the last path component of ``url``. A 50-character text
    progress bar is written to stdout when the server announces the size.

    Args:
        url: HTTP URL of the file to fetch.
        year: Used as the first sub-directory under ``base_path_python``.
        month: Used as the second sub-directory.
        timeout: Seconds to wait for the connection and for each read before
            ``requests`` gives up (new, optional; previously there was no
            timeout and a stalled server would hang the cell).

    Returns:
        The local file path as a string. The path is returned even when the
        download was rejected ("[ERROR] File not found" is printed); in that
        case no file is written, so callers should check that it exists.
    """
    path = '{}/{}/{}'.format(base_path_python, year, month)
    Path(path).mkdir(parents=True, exist_ok=True)
    file_name = '{}/{}'.format(path, url.split('/')[-1])

    print("Downloading %s" % file_name)
    print("URL: {}".format(url))

    # The context manager releases the streamed connection on every exit path.
    with requests.get(url, stream=True, timeout=timeout) as response:
        total_length = response.headers.get('content-length')
        if total_length is not None:
            total_length = int(total_length)

        # Reject HTTP errors, and bodies announced as smaller than 500 bytes
        # (the original heuristic, presumably for tiny "not found" pages).
        if not response.ok or (total_length is not None and total_length < 500):
            print("[ERROR] File not found")
            return file_name

        # Open the target only after the response has been validated so a
        # failed request does not leave an empty file behind.
        with open(file_name, "wb") as f:
            if total_length is None:  # no content length header
                # Stream in chunks instead of buffering the whole body in memory.
                for data in response.iter_content(chunk_size=4096):
                    f.write(data)
            else:
                print("File size: " + "{:.2f}".format(total_length/(1024*1024)) + "MB")
                dl = 0
                last_done = -1
                for data in response.iter_content(chunk_size=4096):
                    dl += len(data)
                    f.write(data)
                    done = int(50 * dl / total_length)
                    # Redraw only when the bar actually advances.
                    if done != last_done:
                        sys.stdout.write("\r[%s%s]" % ('=' * done, ' ' * (50-done)) )
                        sys.stdout.flush()
                        last_done = done
                # Finish the progress line so later output starts on a new line.
                sys.stdout.write("\n")

    return file_name

def add_0_if_single(number):
    """Return ``number`` as text, prefixed with "0" when it is one character long.

    For example 7 becomes "07" while 12 stays "12".
    """
    text = str(number)
    return "0{}".format(number) if len(text) == 1 else text


def download_mawi_dump(year, day, month, hour_min):
    """Download one MAWI samplepoint-F capture and return its local path.

    The remote file is named ``<year><MM><DD><hour_min>.pcap.gz`` and lives
    under a per-year directory. Note the argument order: ``day`` comes before
    ``month``.

    Args:
        year: Four-digit year; also selects the remote directory.
        day: Day of month; zero-padded to two digits for the file name.
        month: Month number; zero-padded to two digits for the file name.
        hour_min: Time part of the file name, inserted as given (e.g. "1400").

    Returns:
        Whatever ``download_file`` returns for the built URL.
    """
    url_template = "http://mawi.wide.ad.jp/mawi/samplepoint-F/{}/{}{}{}{}.pcap.gz"
    url = url_template.format(
        year,
        year,
        add_0_if_single(month),
        add_0_if_single(day),
        hour_min,
    )
    return download_file(url, year, month)

es_tz = pytz.timezone('Europe/Madrid')

# Process every Monday in the half-open range [date, dest_date).
date = es_tz.localize(datetime(2022, 1, 1))
dest_date = es_tz.localize(datetime(2022, 2, 1))
while date < dest_date:
    # Take the current day first and advance afterwards, so the start date is
    # considered and the (exclusive) end date is not. Advancing here also keeps
    # the `continue` statements below from stalling the loop.
    current_day = date
    date += timedelta(days=1)

    # Download all Mondays' data (weekday() == 0 is Monday).
    if current_day.weekday() != 0:
        continue

    file_name = download_mawi_dump(current_day.year, current_day.day, current_day.month, "1400")

    # A rejected download leaves either no file or an empty one; running the
    # rest of the pipeline on it would only produce follow-up errors.
    if not Path(file_name).is_file() or Path(file_name).stat().st_size == 0:
        print("[ERROR] Skipping {}: nothing was downloaded".format(file_name))
        continue

    # Backslash-escape spaces for the shell ('\\ ' is a backslash plus a space).
    file_name = file_name.replace(' ', '\\ ')

    # Unzip the file.
    command = 'cd {} && gzip -d {}'.format(base_path, file_name)
    print("Command: {}".format(command))
    execute_command(command)

    file_name_d = file_name[0:-3] # Unzipped file: drop the trailing ".gz".
    file_name_sampled = file_name_d + ".sampled.pcap"

    # Sample the pcap with pcapsampler.
    command = "cd {} && ./pcapsampler -m COUNT_SYS -r 100 {} {}".format(base_path, file_name_d, file_name_sampled)
    print("Command: {}".format(command))
    execute_command(command)

    # Reduce the size with tshark: keep only source/destination IP, frame
    # length and epoch time of TCP packets, written as CSV.
    command = 'cd {} && tshark -r {} -T fields -e ip.src -e ip.dst -e frame.len -e frame.time_epoch -E separator="," -E header=y -Y "tcp && ip && not(icmp)"'.format(base_path, file_name_sampled)
    print("Command: {}".format(command))
    # The CSV is opened by Python, so undo the shell escaping for its path.
    execute_command(command, file_name_sampled.replace('\\ ', ' ') + '.csv')


# Sampling with pcap sampler.
# ./pcapsampler -m COUNT_SYS -r 100 filetosample.pcap sampledfile.pcap


# Tshark pcap to csv with ip source, ip dest, packet len and time. Only TCP packets.
# tshark -r .\200701011400.dump -T fields -e ip.src -e ip.dst -e ip.len -e frame.time_epoch -e icmp -E separator="," -E header=y -Y "tcp and !icmp"
# 491 MB file --> 15 minutes.

# Split pcap files by packets.
# https://www.thegeekstuff.com/2009/02/editcap-guide-11-examples-to-handle-network-packet-dumps-effectively/

# All Mondays of one month --> approx. 1 h 30 min.

Downloading /content/drive/My Drive/tma_project/2022/1/202201031400.pcap.gz
URL: http://mawi.wide.ad.jp/mawi/samplepoint-F/2022/202201031400.pcap.gz
File size: 923.90MB
Command: cd /content/drive/My\ Drive/tma_project && ./pcapsampler -m COUNT_SYS -r 100 /content/drive/My\ Drive/tma_project/2022/1/202201031400.pcap /content/drive/My\ Drive/tma_project/2022/1/202201031400.pcap.sampled.pcap
Command: cd /content/drive/My\ Drive/tma_project && tshark -r /content/drive/My\ Drive/tma_project/2022/1/202201031400.pcap.sampled.pcap -T fields -e ip.src -e ip.dst -e frame.len -e frame.time_epoch -E separator="," -E header=y -Y "tcp && ip && not(icmp)"
Downloading /content/drive/My Drive/tma_project/2022/1/202201101400.pcap.gz
URL: http://mawi.wide.ad.jp/mawi/samplepoint-F/2022/202201101400.pcap.gz
File size: 1109.58MB
Command: cd /content/drive/My\ Drive/tma_project && ./pcapsampler -m COUNT_SYS -r 100 /content/drive/My\ Drive/tma_project/2022/1/202201101400.pcap /content/drive/My\ Drive/tma_proj

In [26]:
# Flush pending writes to Google Drive and unmount it, then tell the user that
# the session's changes should now be visible in Drive.
drive.flush_and_unmount()
print('All changes made in this colab session should now be visible in Drive.')

All changes made in this colab session should now be visible in Drive.
