In [1]:
%load_ext autoreload
%autoreload 2
%config Completer.use_jedi = False

import csv
import functools
import time

import dask
import dask.dataframe as dd
import pandas as pd

from incremental_learning.config import datasets_dir
from incremental_learning.misc import DataStream, reservoir_sample_with_jumps
from incremental_learning.storage import download_dataset


In [2]:
# Benchmark target. Swap the active line to time the alternative dataset.
# dataset_name = 'autompg'
dataset_name = 'ember_malware_bytes'

# Path of the CSV file inside the configured datasets directory.
dataset = datasets_dir / "{}.csv".format(dataset_name)

# On-disk size in MiB (bytes / 1024**2); the throughput figures below are
# computed against this value.
size_mb = dataset.stat().st_size / (1024 * 1024)

In [3]:
def bandwidth(func):
    """Decorator that times a call and prints elapsed time and throughput.

    Throughput is the module-level ``size_mb`` divided by the wall-clock
    duration of the call, so ``size_mb`` must be defined before a decorated
    function is invoked.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all arguments to ``func``, prints the timing
        report, and returns ``func``'s result unchanged. If ``func`` raises,
        the exception propagates and nothing is printed.
    """
    @functools.wraps(func)  # keep the benchmark's own __name__ / __doc__
    def wrapper(*args, **kwargs):
        # perf_counter is the monotonic, high-resolution clock intended for
        # measuring intervals; time.time() can jump if the system clock moves.
        start_time = time.perf_counter()
        res = func(*args, **kwargs)
        elapsed_time = time.perf_counter() - start_time
        # A zero reading is possible for very fast calls on coarse clocks;
        # report infinite throughput instead of raising ZeroDivisionError.
        throughput = size_mb / elapsed_time if elapsed_time > 0 else float("inf")
        print("Elapsed time: {} sec\nBandwidth: {} MB/sec" .format(elapsed_time, throughput))
        return res
    return wrapper

In [47]:
@bandwidth
def pdreader():
    """Benchmark a full read of the dataset CSV with ``pandas.read_csv``.

    The parsed frame is discarded; only the timing report printed by the
    ``bandwidth`` decorator is of interest. Returns ``None``.
    """
    # The result is deliberately not bound to a name: nothing uses it.
    pd.read_csv(dataset)

    
@bandwidth
def pdsampler(k, random_state=None):
    """Read the whole dataset with pandas and draw a random fraction of rows.

    Args:
        k: Fraction of rows to keep. Despite the name this is not a row
            count; it is forwarded as ``frac`` to ``DataFrame.sample``.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. The default ``None`` keeps the sampling
            unseeded, as before.

    Returns:
        The sampled rows as returned by ``DataFrame.sample``.
    """
    df = pd.read_csv(dataset)
    return df.sample(frac=k, random_state=random_state)

@bandwidth
def ddsampler(k, random_state=None):
    """Read the dataset with dask and draw a random fraction of rows.

    Args:
        k: Fraction of rows to keep. Despite the name this is not a row
            count; it is forwarded as ``frac`` to dask's ``DataFrame.sample``.
        random_state: Optional seed forwarded to ``sample`` so the draw can
            be reproduced. The default ``None`` keeps the sampling unseeded,
            as before.

    Returns:
        The computed (materialised) result of the sampled dask frame.
    """
    df = dd.read_csv(dataset)
    return df.sample(frac=k, random_state=random_state).compute()

@bandwidth
def ddsortchoose(k, column='byte_histogram_0'):
    """Index the dask frame by ``column`` and return its last ``k`` rows.

    Args:
        k: Number of trailing rows to request from ``tail``.
        column: Column to use as the index. Defaults to
            ``'byte_histogram_0'``, the column that was previously hard-coded.

    Returns:
        The result of ``tail(k)`` on the re-indexed frame.
    """
    df = dd.read_csv(dataset)
    df = df.set_index(column)
    # NOTE(review): dask documents that ``tail`` only inspects the last
    # partition, so fewer than ``k`` rows may come back if that partition is
    # small -- confirm ``k`` fits for the datasets used here.
    return df.tail(k)

@bandwidth
def ddnlargest(k, column='byte_histogram_0'):
    """Return the ``k`` rows with the largest values of ``column`` using dask.

    Args:
        k: Number of rows to select; forwarded as ``n`` to ``nlargest``.
        column: Column to rank by. Defaults to ``'byte_histogram_0'``, the
            column that was previously hard-coded.

    Returns:
        The computed (materialised) result of ``nlargest``.
    """
    df = dd.read_csv(dataset)
    return df.nlargest(n=k, columns=[column]).compute()


@bandwidth
def dictreader():
    """Time one full pass over the dataset CSV using ``csv.DictReader``.

    Each row is parsed and immediately dropped; nothing is returned.
    """
    with open(dataset, newline='') as handle:
        # Iterate only to force every row through the parser.
        for _ in csv.DictReader(handle, dialect='unix'):
            pass
            
@bandwidth
def csvreader():
    """Time one full pass over the dataset CSV using ``csv.reader``.

    Each row is parsed and immediately dropped; nothing is returned.
    """
    with open(dataset, newline='') as handle:
        # Iterate only to force every row through the parser.
        for _ in csv.reader(handle, dialect='unix'):
            pass

@bandwidth
def reservoir(k):
    """Time ``reservoir_sample_with_jumps`` on the dataset file.

    Args:
        k: Passed through as the second argument of
            ``reservoir_sample_with_jumps`` (presumably the sample size --
            confirm against that helper).

    Returns:
        Whatever ``reservoir_sample_with_jumps(dataset, k)`` returns.
    """
    return reservoir_sample_with_jumps(dataset, k)

In [9]:
dictreader()

Elapsed time: 42.04764652252197 sec
Bandwidth: 73.44685728352995 MB/sec


In [6]:
csvreader()

Elapsed time: 24.70233988761902 sec
Bandwidth: 125.01922924296927 MB/sec


In [7]:
pdreader()

Elapsed time: 23.288196563720703 sec
Bandwidth: 132.61084793740594 MB/sec


In [8]:
_ = reservoir(10000)

Elapsed time: 47.16233491897583 sec
Bandwidth: 65.4816496798469 MB/sec


In [22]:
_ = pdsampler(0.2)

Elapsed time: 22.753843784332275 sec
Bandwidth: 135.72508990215047 MB/sec


In [18]:
D = ddsampler(0.2)

Elapsed time: 8.893215894699097 sec
Bandwidth: 347.2610504248281 MB/sec


In [44]:
D = ddsortchoose(2000)

Elapsed time: 20.816734790802002 sec
Bandwidth: 148.355038591958 MB/sec


In [50]:
D = ddnlargest(2000)

Elapsed time: 8.83681583404541 sec
Bandwidth: 349.477408067042 MB/sec


In [51]:
df = dd.read_csv(dataset)

In [52]:
df.npartitions

51