In [2]:
import pandas as pd
#import dask.dataframe as pd
import time
import numpy as np
from contextlib import contextmanager
import os; os.environ['OMP_NUM_THREADS'] = '4'
import dask.dataframe as dd
import gc

@contextmanager
def timer(name):
    """Context manager that prints how long its ``with`` body took.

    Args:
        name: Label shown in square brackets in the printed message.

    Yields:
        None. The body of the ``with`` statement runs at the yield point.

    The message is printed only when the body completes without raising;
    an exception raised in the body propagates and nothing is printed.
    """
    # perf_counter() is a monotonic clock, so the measured interval cannot
    # come out negative (or jump) if the system clock is adjusted mid-run,
    # which time.time() does not guarantee.
    t0 = time.perf_counter()
    yield
    print(f'[{name}] done in {time.perf_counter() - t0:.1f} s')

# Column names assigned to the CSV fields, in file order.
names = [
    'ip', 'app', 'device', 'os', 'channel',
    'click_time', 'attributed_time', 'is_attributed',
]
# Columns actually loaded: every entry of `names` except 'attributed_time'.
usecols = [col for col in names if col != 'attributed_time']

# Compact unsigned dtypes requested for the numeric columns at read time.
dtypes = {
    'ip': 'uint32',
    **dict.fromkeys(['app', 'device', 'os', 'channel'], 'uint16'),
    'is_attributed': 'uint8',
}

# Directory holding the input CSV files.
path = '../input/'

# Row limits; presumably the row counts of the '06' and '07' train files
# (TODO confirm against the data).
NROWS06 = 9308567
NROWS07 = 59639987
NROWS = NROWS06  # limit used by the loading cells

TRAINFILE = 'train06.csv'
#TRAINFILE = "train_sample.csv"


def add_click_features(df):
    """Add click-frequency and time-of-click columns, then drop ``click_time``.

    Shared by the pandas and Dask timing sections below so both apply
    exactly the same feature steps.

    Args:
        df: pandas DataFrame with at least the columns 'ip', 'device' and
            'click_time' (a string such as '2017-11-06 14:33:34').

    Returns:
        ``df`` without the 'click_time' column and with these columns added:
        'freqip', 'freqipdev', 'freqiphr' (int32 group sizes) and
        'time_day', 'time_hr', 'time_min' (int8).

    Note:
        The new columns are assigned on ``df`` itself before the drop, so the
        frame passed in is modified; callers should rebind to the result.
    """
    # Number of rows sharing the same ip, and the same (ip, device) pair.
    df['freqip'] = df.groupby(['ip'])['ip'].transform('count').astype('int32')
    df['freqipdev'] = df.groupby(['ip', 'device'])['ip'].transform('count').astype('int32')

    # click_time is sliced by fixed character position: [8:10] day,
    # [11:13] hour, [14:16] minute. Month ([5:7]) and seconds ([17:20])
    # are deliberately not extracted.
    df['time_day'] = df.click_time.str[8:10].astype('int8')
    df['time_hr'] = df.click_time.str[11:13].astype('int8')
    df['time_min'] = df.click_time.str[14:16].astype('int8')

    # Needs 'time_hr', so it must come after the time columns.
    df['freqiphr'] = df.groupby(['ip', 'time_hr'])['ip'].transform('count').astype('int32')
    return df.drop(['click_time'], axis=1)


with timer('read Pandas data'):
    train = pd.read_csv(path+TRAINFILE, skiprows=0, nrows=NROWS,
                        dtype=dtypes,
                        names=names,
                        usecols=usecols)
    train = add_click_features(train)


with timer('read Dask data'):
    # NOTE(review): per the Dask docs, head() computes by default and returns
    # a pandas object, so only the CSV read runs through Dask here; the
    # feature steps below operate on a pandas frame.
    dask_df = dd.read_csv(path+TRAINFILE,
                          dtype=dtypes,names=names,
                          usecols=usecols).head(NROWS, npartitions=-1)
    dask_df = add_click_features(dask_df)

# Cell result: the value returned by the collector.
gc.collect()

[read Pandas data] done in 23.6 s
[read Dask data] done in 19.7 s


56

In [126]:
# Rebind `train` to just the first 1000 rows of the same file (no derived
# feature columns are added here).
usecols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
train = pd.read_csv(
    f'{path}{TRAINFILE}',
    names=names,
    usecols=usecols,
    dtype=dtypes,
    skiprows=0,
    nrows=1000,
)



In [8]:
# Per-column memory footprint of `train` (index included, shallow sizes).
train.memory_usage(index=True, deep=False)

Index                  80
ip               37234268
app              18617134
device           18617134
os               18617134
channel          18617134
is_attributed     9308567
freqip           37234268
freqipdev        37234268
time_day          9308567
time_hr           9308567
time_min          9308567
freqiphr         37234268
dtype: int64

In [110]:
# Column dtypes, taken from the first five rows of `dask_df`.
dask_df.head(n=5).dtypes

ip                int64
app               int64
device            int64
os                int64
channel           int64
click_time       object
is_attributed     int64
freqip            int32
freqipdev         int32
time_month         int8
time_day           int8
time_hr            int8
time_min           int8
time_sec           int8
dtype: object

In [16]:

# Preview the first five rows of `dask_df`.
dask_df.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,17357,3,1,19,379,2017-11-06 14:33:34,,0
1,35810,3,1,13,379,2017-11-06 14:34:12,,0
2,45745,14,1,13,478,2017-11-06 14:34:52,,0
3,161007,3,1,13,379,2017-11-06 14:35:08,,0
4,18787,3,1,16,379,2017-11-06 14:36:26,,0


In [None]:
# Non-null value counts of every remaining column per (ip, device) group.
dask_df.groupby(by=['ip', 'device']).count()