In [25]:
import pandas as pd
import random
import heapq
import operator
import numpy as np
from collections import defaultdict

Let's read the dataset in chunks. You need to download the dataset from here https://mcfp.felk.cvut.cz/publicDatasets/CTU-Malware-Capture-Botnet-43/capture20110811.pcap.netflow.labeled and put it in the `dataset` folder

In [66]:
# read in 100 chunk size
dataset = pd.read_csv('dataset/capture20110811.pcap.netflow.labeled', delim_whitespace=True, chunksize=1000)

In [67]:
first_chunk = dataset.get_chunk()
first_chunk

Unnamed: 0,#Date_flow,start,Durat,Prot,Src_IP_Addr:Port,->,Dst_IP_Addr:Port,Flags,Tos,Packets,Bytes,Flows,Label(LEGITIMATE:Botnet:Background)
0,2011-08-11,10:10:00.003,0.967,UDP,89.31.8.11:23929,->,147.32.84.229:13363,INT,0,2,135,1,Background
1,2011-08-11,10:10:00.003,0.967,UDP,147.32.84.229:13363,->,89.31.8.11:23929,INT,0,2,276,1,Background
2,2011-08-11,10:10:00.006,0.000,UDP,208.88.186.6:34042,->,147.32.84.229:13363,INT,0,1,62,1,Background
3,2011-08-11,10:10:00.008,0.000,UDP,92.118.218.77:55246,->,147.32.84.229:13363,INT,0,1,78,1,Background
4,2011-08-11,10:10:00.009,0.000,UDP,182.185.139.181:10223,->,147.32.84.229:13363,INT,0,1,72,1,Background
5,2011-08-11,10:10:00.011,0.000,UDP,147.32.84.229:13363,->,92.118.218.77:55246,INT,0,1,60,1,Background
6,2011-08-11,10:10:00.011,0.000,UDP,147.32.84.229:13363,->,182.185.139.181:10223,INT,0,1,60,1,Background
7,2011-08-11,10:10:00.013,0.000,UDP,86.27.118.198:6386,->,147.32.84.229:13363,INT,0,1,77,1,Background
8,2011-08-11,10:10:00.014,0.000,UDP,147.32.84.229:13363,->,86.27.118.198:6386,INT,0,1,60,1,Background
9,2011-08-11,10:10:00.015,0.839,TCP,197.195.116.160:49177,->,147.32.84.229:13363,PA_,0,3,1512,1,Background


In [68]:
# preprocess: clean up columns
def preprocess(chunk):
    chunk = chunk.rename(index=str, columns={
        "#Date_flow": "Date",
        "start": "Start",
        "Durat": "Duration",
        "Prot":"Protocol",
        "Src_IP_Addr:Port" : "Source_IP",
        "Dst_IP_Addr:Port": "Destination_IP",
        "Label(LEGITIMATE:Botnet:Background)": "Label"})
    chunk = chunk.drop(axis="columns", labels="->")
    
    chunk['Source_IP'] = chunk['Source_IP'].apply(lambda x: x.split(":")[0])
    chunk['Destination_IP'] = chunk['Destination_IP'].apply(lambda x: x.split(":")[0])
    return chunk

first_chunk = preprocess(first_chunk)
first_chunk.head()

Unnamed: 0,Date,Start,Duration,Protocol,Source_IP,Destination_IP,Flags,Tos,Packets,Bytes,Flows,Label
0,2011-08-11,10:10:00.003,0.967,UDP,89.31.8.11,147.32.84.229,INT,0,2,135,1,Background
1,2011-08-11,10:10:00.003,0.967,UDP,147.32.84.229,89.31.8.11,INT,0,2,276,1,Background
2,2011-08-11,10:10:00.006,0.0,UDP,208.88.186.6,147.32.84.229,INT,0,1,62,1,Background
3,2011-08-11,10:10:00.008,0.0,UDP,92.118.218.77,147.32.84.229,INT,0,1,78,1,Background
4,2011-08-11,10:10:00.009,0.0,UDP,182.185.139.181,147.32.84.229,INT,0,1,72,1,Background


We can see from the rows that the host IP address from which the network flows were captured is `147.32.84.229`

## True Frequency

In [69]:
dataset_all = pd.read_csv('dataset/capture20110811.pcap.netflow.labeled', delim_whitespace=True)

In [70]:
dataset_all = preprocess(dataset_all)
len(dataset_all)

6351188

In [112]:
true_freq_ip = dataset_all['Source_IP'].append(dataset_all['Destination_IP']).value_counts()
true_freq_ip[:11] # we list 11 most frequent IP as we don't want to consider the host IP

147.32.84.229    3405103
147.32.80.9      1147493
147.32.84.138     643319
147.32.84.59      634581
147.32.86.165     156522
147.32.80.13       63305
147.32.85.25       62616
147.32.84.118      56830
147.32.84.165      54433
147.32.84.171      51413
147.32.85.124      46499
dtype: int64

In [113]:
true_freq_ip[:11] / float(len(dataset_all))

147.32.84.229    0.536136
147.32.80.9      0.180674
147.32.84.138    0.101291
147.32.84.59     0.099915
147.32.86.165    0.024645
147.32.80.13     0.009967
147.32.85.25     0.009859
147.32.84.118    0.008948
147.32.84.165    0.008571
147.32.84.171    0.008095
147.32.85.124    0.007321
dtype: float64

## Min-Wise Sampling
1. Read the dataset in chunksize=1
2. For each incoming sample, generate random number *hashtag* between [0,1] assign to the sample
3. Keep k samples with smallest hashtag (we use priority queue `heapq`)

In [96]:
count = 0
priority_queue = [] # we use heapq for priority queue implementation
# not really streaming the data, we just simulate the min-wise sampling
for index, chunk in dataset_all.iterrows():
    hashtag = random.random()
    ip_address = chunk["Source_IP"]
    heapq.heappush(priority_queue, (hashtag,ip_address))
    
    hashtag = random.random()
    ip_address = chunk["Destination_IP"]
    heapq.heappush(priority_queue, (hashtag,ip_address))
        
    count = count + 1

In [109]:
# reservoir size
k = 10000
ip_address_samples = []
temp_priority_queue = list(priority_queue)
for i in range(k):
    sample = heapq.heappop(temp_priority_queue)
    ip_address = sample[1]
    ip_address_samples.append(ip_address)

In [114]:
freq_ip = pd.Series(ip_address_samples).value_counts()
freq_ip[:11]

147.32.84.229    2725
147.32.80.9       905
147.32.84.59      488
147.32.84.138     482
147.32.86.165     118
147.32.85.25       53
147.32.80.13       50
147.32.84.165      44
147.32.84.118      42
147.32.84.171      42
147.32.84.131      36
dtype: int64

In [115]:
freq_ip[:11] / float(k)

147.32.84.229    0.2725
147.32.80.9      0.0905
147.32.84.59     0.0488
147.32.84.138    0.0482
147.32.86.165    0.0118
147.32.85.25     0.0053
147.32.80.13     0.0050
147.32.84.165    0.0044
147.32.84.118    0.0042
147.32.84.171    0.0042
147.32.84.131    0.0036
dtype: float64