In [1]:
import pandas as pd

In [2]:
# Column -> dtype mapping handed to pd.read_csv so the integer columns are
# read directly into small unsigned types instead of the default int64.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

In [3]:
# Columns to load from each CSV. The training list ends with the
# `is_attributed` column; the test list starts with `click_id` instead.
train_usecols = 'ip app device os channel click_time is_attributed'.split()
test_usecols = 'click_id ip app device os channel click_time'.split()

In [4]:
%%time
# Read only the selected training columns, applying the dtype mapping
# while parsing.
train = pd.read_csv(
    '../data/train.csv',
    usecols=train_usecols,
    dtype=dtypes,
)

CPU times: user 1min 2s, sys: 2.72 s, total: 1min 5s
Wall time: 1min 4s


In [5]:
%%time
# Parse the raw timestamp strings into datetime64 values.
# Assign through train['click_time'] rather than train.loc[:, 'click_time']:
# on pandas >= 2.0 a full-column .loc assignment writes into the existing
# object-dtype column in place, so the column would stay `object` instead of
# becoming datetime64[ns].
train['click_time'] = pd.to_datetime(train['click_time'], format='%Y-%m-%d %H:%M:%S')

CPU times: user 22.3 s, sys: 2.95 s, total: 25.2 s
Wall time: 21.6 s


In [6]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


In [7]:
# Print the index range, per-column dtypes and memory usage of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184903890 entries, 0 to 184903889
Data columns (total 7 columns):
ip               uint32
app              uint16
device           uint16
os               uint16
channel          uint16
click_time       datetime64[ns]
is_attributed    uint8
dtypes: datetime64[ns](1), uint16(4), uint32(1), uint8(1)
memory usage: 3.6 GB


In [8]:
# Read only the selected test columns, applying the dtype mapping
# while parsing.
test = pd.read_csv(
    '../data/test.csv',
    usecols=test_usecols,
    dtype=dtypes,
)

In [9]:
# Parse the raw timestamp strings into datetime64 values.
# Assign through test['click_time'] rather than test.loc[:, 'click_time']:
# on pandas >= 2.0 a full-column .loc assignment writes into the existing
# object-dtype column in place, so the column would stay `object` instead of
# becoming datetime64[ns].
test['click_time'] = pd.to_datetime(test['click_time'], format='%Y-%m-%d %H:%M:%S')

In [10]:
# Preview the first five rows of the test frame.
test.head(n=5)

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [11]:
# Print the index range, per-column dtypes and memory usage of the test frame.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18790469 entries, 0 to 18790468
Data columns (total 7 columns):
click_id      uint32
ip            uint32
app           uint16
device        uint16
os            uint16
channel       uint16
click_time    datetime64[ns]
dtypes: datetime64[ns](1), uint16(4), uint32(2)
memory usage: 430.1 MB


In [12]:
# Load the sample submission file with the same dtype mapping used for the
# train and test CSVs.
submit = pd.read_csv(
    '../data/sample_submission.csv',
    dtype=dtypes,
)

In [13]:
# Preview the first five rows of the sample submission.
submit.head(n=5)

Unnamed: 0,click_id,is_attributed
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [14]:
# Print the index range, per-column dtypes and memory usage of the sample submission.
submit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18790469 entries, 0 to 18790468
Data columns (total 2 columns):
click_id         uint32
is_attributed    uint8
dtypes: uint32(1), uint8(1)
memory usage: 89.6 MB


In [18]:
# Path of the HDF5 file the frames are written to and read back from below.
# NOTE(review): presumably the ../derived_data directory must already exist — confirm.
hdf_path = '../derived_data/source.hdf'

In [15]:
# Store the training frame in the HDF5 file under the 'train' key.
train.to_hdf(path_or_buf=hdf_path, key='train')

In [16]:
# Store the test frame in the same HDF5 file under the 'test' key.
test.to_hdf(path_or_buf=hdf_path, key='test')

In [17]:
# Store the sample submission in the same HDF5 file under the 'submit_sample' key.
submit.to_hdf(path_or_buf=hdf_path, key='submit_sample')

In [20]:
%%time
# Read the frame stored under the 'train' key back into a new variable.
read_train = pd.read_hdf(path_or_buf=hdf_path, key='train')

CPU times: user 319 ms, sys: 6.9 s, total: 7.22 s
Wall time: 34.4 s


In [21]:
# Preview the first five rows of the frame read back from HDF5.
read_train.head(n=5)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


In [22]:
# Print the index type, per-column dtypes and memory usage of the frame read back from HDF5.
read_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184903890 entries, 0 to 184903889
Data columns (total 7 columns):
ip               uint32
app              uint16
device           uint16
os               uint16
channel          uint16
click_time       datetime64[ns]
is_attributed    uint8
dtypes: datetime64[ns](1), uint16(4), uint32(1), uint8(1)
memory usage: 5.0 GB


In [23]:
%%time
# Read the frame stored under the 'test' key back into a new variable.
read_test = pd.read_hdf(path_or_buf=hdf_path, key='test')

CPU times: user 51.7 ms, sys: 805 ms, total: 857 ms
Wall time: 4.4 s


In [24]:
# Preview the first five rows of the test frame read back from HDF5.
read_test.head(n=5)

Unnamed: 0,click_id,ip,app,device,os,channel,click_time
0,0,5744,9,1,3,107,2017-11-10 04:00:00
1,1,119901,9,1,3,466,2017-11-10 04:00:00
2,2,72287,21,1,19,128,2017-11-10 04:00:00
3,3,78477,15,1,13,111,2017-11-10 04:00:00
4,4,123080,12,1,13,328,2017-11-10 04:00:00


In [25]:
# Print the index type, per-column dtypes and memory usage of the test frame read back from HDF5.
read_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18790469 entries, 0 to 18790468
Data columns (total 7 columns):
click_id      uint32
ip            uint32
app           uint16
device        uint16
os            uint16
channel       uint16
click_time    datetime64[ns]
dtypes: datetime64[ns](1), uint16(4), uint32(2)
memory usage: 573.4 MB
