In [95]:
import pandas as pd
#import dask.dataframe as pd
import time
import numpy as np
from contextlib import contextmanager
import os; os.environ['OMP_NUM_THREADS'] = '4'
import dask.dataframe as dd

@contextmanager
def timer(name):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    name : str
        Label inserted between square brackets in the printed message.

    Yields
    ------
    None
        Control is handed to the body of the ``with`` statement.

    The message ``[<name>] done in <seconds> s`` is printed when the block
    exits, including when it exits because of an exception (the exception
    is still propagated to the caller).
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed
    # by system clock adjustments the way time.time() can.
    t0 = time.perf_counter()
    try:
        yield
    finally:
        # Report the elapsed time even if the timed body raised.
        print(f'[{name}] done in {time.perf_counter() - t0:.1f} s')

# Unsigned integer dtype for each numeric CSV column; this mapping is handed
# to the CSV readers via their ``dtype=`` argument.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
)

# Directory holding the input files, and the file to load from it.
path = '../input/'
TRAINFILE = 'train06.csv'  # alternative input: "train_sample.csv"

# Upper bound on the number of rows taken from TRAINFILE.
NROWS = 9_308_567


with timer('read Pandas data'):
    # The timed section covers both the CSV load and the two derived
    # frequency columns, not just the read.
    train = pd.read_csv(
        path + TRAINFILE,
        dtype=dtypes,
        nrows=NROWS,
        skiprows=0,
    )

    # Overwrite the header with the expected names, then discard the
    # column that is not used afterwards.
    train.columns = [
        'ip', 'app', 'device', 'os', 'channel',
        'click_time', 'attributed_time', 'is_attributed',
    ]
    train = train.drop(columns=['attributed_time'])

    # freqip: for every row, how many rows share its ip.
    rows_per_ip = train.groupby('ip')['ip'].transform('count')
    train['freqip'] = rows_per_ip

    # freqipdev: for every row, how many rows share its (ip, device) pair.
    rows_per_ip_device = train.groupby(['ip', 'device'])['ip'].transform('count')
    train['freqipdev'] = rows_per_ip_device
    
with timer('read Dask data'):
    # NOTE(review): DataFrame.head(..., npartitions=-1) computes eagerly and
    # hands back a materialised (pandas) frame, so only the CSV read below is
    # executed by dask; the rename, drop and groupby steps that follow run on
    # the materialised result. Keep that in mind when comparing this timing
    # with the pandas section.
    lazy_frame = dd.read_csv(path + TRAINFILE, dtype=dtypes)
    dask_df = lazy_frame.head(NROWS, npartitions=-1)

    # Overwrite the header with the expected names, then discard the
    # column that is not used afterwards.
    dask_df.columns = [
        'ip', 'app', 'device', 'os', 'channel',
        'click_time', 'attributed_time', 'is_attributed',
    ]
    dask_df = dask_df.drop(columns=['attributed_time'])

    # freqip: for every row, how many rows share its ip.
    per_ip_counts = dask_df.groupby('ip')['ip'].transform('count')
    dask_df['freqip'] = per_ip_counts

    # freqipdev: for every row, how many rows share its (ip, device) pair.
    per_ip_device_counts = dask_df.groupby(['ip', 'device'])['ip'].transform('count')
    dask_df['freqipdev'] = per_ip_device_counts

[read Pandas data] done in 14.4 s
[read Dask data] done in 11.0 s


In [94]:
# Preview the first rows of dask_df; the saved output below includes the
# derived freqip / freqipdev columns and no attributed_time column.
dask_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,freqip,freqipdev
0,17357,3,1,19,379,2017-11-06 14:33:34,0,927,922
1,35810,3,1,13,379,2017-11-06 14:34:12,0,424,375
2,45745,14,1,13,478,2017-11-06 14:34:52,0,8949,8075
3,161007,3,1,13,379,2017-11-06 14:35:08,0,176,176
4,18787,3,1,16,379,2017-11-06 14:36:26,0,198,198


In [16]:

# Preview the first rows of dask_df.
# NOTE(review): the saved output below still lists attributed_time and has no
# freqip / freqipdev columns, and this cell's execution count (16) is lower
# than the loading cell's (95) -- the output appears to come from an earlier
# state of dask_df. Re-run top to bottom to refresh it.
dask_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,17357,3,1,19,379,2017-11-06 14:33:34,,0
1,35810,3,1,13,379,2017-11-06 14:34:12,,0
2,45745,14,1,13,478,2017-11-06 14:34:52,,0
3,161007,3,1,13,379,2017-11-06 14:35:08,,0
4,18787,3,1,16,379,2017-11-06 14:36:26,,0


In [19]:
# Count the non-null values of every remaining column within each
# (ip, device) group.
# NOTE(review): the saved output below still contains an attributed_time
# column, which the loading cell drops, and the execution count (19) is lower
# than the loading cell's (95) -- this output looks stale.
dask_df.groupby(['ip', 'device']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,app,os,channel,click_time,attributed_time,is_attributed
ip,device,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9,1,150,150,150,150,0,150
10,1,49,49,49,49,0,49
19,1,7,7,7,7,0,7
20,0,1,1,1,1,0,1
20,1,128,128,128,128,0,128
20,3032,2,2,2,2,0,2
25,0,1,1,1,1,0,1
25,1,51,51,51,51,1,51
27,1,212,212,212,212,0,212
27,3032,17,17,17,17,0,17
