## Transform train & test.csv into numpy arrays.

Before starting, train.csv is split into parts so that they can be read in parallel.

- `split -l 10000000 train.csv train_part`
- `for i in $(ls -1 train_part*); do sed -i '1s;^;ip,app,device,os,channel,click_time,attributed_time,is_attributed\n;' $i; done`


In [20]:
import glob
import hashlib
import traceback 
from csv import DictReader
import multiprocessing as mp
from functools import lru_cache
import math
from collections import Counter
from pyhashxx import hashxx

import numpy as np
from tqdm import tqdm

# Size of the hashing space: hashed feature values are reduced modulo D.
D = 1 << 23


# Column combinations that are hashed, one output column per entry,
# in exactly this order.
FEATURES = [
    # individual columns
    ('ip',),
    ('app',),
    ('device',),
    ('os',),
    ('channel',),
    ('click_hour',),
    # two-column interactions
    ('device', 'app'),
    ('channel', 'app'),
    ('channel', 'device'),
    ('channel', 'os'),
    ('ip', 'channel'),
    ('ip', 'device'),
    ('ip', 'app'),
    ('ip', 'click_hour'),
    # three-column interaction
    ('ip', 'device', 'os'),
]

# Combinations whose hashed values are also counted: every multi-column
# entry of FEATURES, in the same order.
COUNT_FEATURES = [combo for combo in FEATURES if len(combo) > 1]

# The cache below is unbounded (maxsize=None); choose a finite maxsize
# when memory is limited.
@lru_cache(maxsize=None)
def hashed(value, D):
    """Hash the string *value* into a bucket index modulo *D*.

    The built-in ``hash`` is not used because it is not stable after
    Python 3.3 unless PYTHONHASHSEED is set; a stable hash with fewer
    collisions is needed so that the model can be pickled.

    Args:
        value: text to hash; it is UTF-8 encoded before hashing.
        D: modulus applied to the hash.

    Returns:
        ``hashxx`` of the encoded text, modulo *D*.
    """
    # An md5-based equivalent would be:
    #   int(hashlib.md5(value.encode('utf8')).hexdigest(), 16) % D
    digest = hashxx(value.encode('utf8'))
    return digest % D

        
def get_x(csv_row):
    """Hash every FEATURES combination of a single CSV row.

    A ``'click_hour'`` entry is added to *csv_row* first. Then, for each
    tuple in ``FEATURES``, the row's values for those columns are joined
    with a single space and hashed with ``hashed(..., D)``.

    Args:
        csv_row: mapping from column name to value. It must contain
            ``'click_time'`` and every raw column named in ``FEATURES``.
            The mapping is modified in place (``'click_hour'`` is added).

    Returns:
        dict mapping each feature tuple in ``FEATURES`` to its hashed value.

    Raises:
        Exception: any error raised while parsing or hashing the row. The
            traceback is printed and the exception is re-raised.
    """
    try:
        # hour; assumes click_time ends in 'HH:MM:SS' -- TODO confirm
        csv_row['click_hour'] = int(csv_row['click_time'][-8:-6])
        return {
            k: hashed(' '.join([str(csv_row[c]) for c in k]), D)
            for k in FEATURES
        }
    except Exception:
        traceback.print_exc()
        # Re-raise instead of implicitly returning None: a None result
        # would only resurface later as an unrelated TypeError when the
        # caller indexes it.
        raise


def load_part(fname, max_size=10000000):
    """Read one CSV part and turn it into hashed feature arrays.

    Args:
        fname: path of the CSV file to read; it must have a header row.
        max_size: number of rows preallocated for the output arrays (also
            used as the progress-bar total).

    Returns:
        A ``(train_x, train_y, partial_counters)`` tuple where

        * ``train_x`` is a uint32 array with one row per CSV row and one
          column per entry of ``FEATURES``,
        * ``train_y`` is a uint8 array holding 1 where ``is_attributed``
          is the string ``'1'`` and 0 otherwise,
        * ``partial_counters`` maps each entry of ``COUNT_FEATURES`` to a
          ``Counter`` of the hashed values seen in this file.

        Both arrays are truncated to the number of rows actually read.
    """
    train_x = np.zeros((max_size, len(FEATURES)), dtype=np.uint32)
    train_y = np.zeros((max_size), dtype=np.uint8)
    # hashed values are stored as uint32
    assert(D < 2**32)

    partial_counters = {combo: Counter() for combo in COUNT_FEATURES}

    n_rows = 0
    with open(fname) as f:
        progress = tqdm(enumerate(DictReader(f)), total=max_size, mininterval=30)
        for idx, row in progress:
            hashes = get_x(row)

            train_x[idx, :] = [hashes[combo] for combo in FEATURES]
            train_y[idx] = 1. if row['is_attributed'] == '1' else 0

            # per-file occurrence counts of the hashed combinations
            for combo in COUNT_FEATURES:
                partial_counters[combo][hashes[combo]] += 1

            n_rows = idx + 1

    return train_x[:n_rows], train_y[:n_rows], partial_counters

def prepare_train():
    """Build the training arrays from the CSV parts and save them to disk.

    Every file matching ``input/train_parta*`` is processed by
    ``load_part`` in a pool of 8 worker processes. The per-file arrays are
    concatenated, the per-file counters are merged, and the results are
    written to two compressed ``.npz`` archives under ``tmp/``.

    Returns:
        ``(X, y, counters)``: the concatenated feature array, the
        concatenated label array, and a dict mapping each entry of
        ``COUNT_FEATURES`` to the merged ``Counter``.

    Raises:
        ValueError: if no input file matches the glob pattern.
    """
    # glob returns files in arbitrary order; sort so that the row order of
    # X and y is reproducible between runs.
    fnames = sorted(glob.glob('input/train_parta*'))
    if not fnames:
        raise ValueError("no files match 'input/train_parta*'")

    pool = mp.Pool(8)
    try:
        parts = pool.map(load_part, fnames)
    finally:
        # Release the workers even when a part fails to load.
        pool.close()
        pool.join()

    X = np.concatenate([part[0] for part in parts])
    y = np.concatenate([part[1] for part in parts])

    counters = {}
    for k in COUNT_FEATURES:
        # Accumulate in place; summing Counters with `+` copies the
        # running total once per part.
        merged = Counter()
        for part in parts:
            merged.update(part[2][k])
        counters[k] = merged

    print(X.shape, y.shape)

    d = int(math.log(D, 2))
    # FEATURES holds tuples of different lengths. Recent NumPy refuses to
    # build an array from such a ragged sequence implicitly, so create the
    # object array explicitly.
    features = np.array(FEATURES, dtype=object)
    np.savez_compressed('tmp/train-hashxx-D{}.npz'.format(d),
                        x=X, y=y, features=features)
    np.savez_compressed('tmp/aux-hashxx-D{}.npz'.format(d),
                        counters=counters, features=features)
    return X, y, counters

def prepare_test():
    """Build the test feature array from ``input/test.csv`` and save it.

    Each CSV row is hashed with ``get_x`` into one row of a uint32 array
    (one column per entry of ``FEATURES``), and its ``click_id`` is stored
    alongside. Both arrays are written to a compressed ``.npz`` archive
    under ``tmp/``.

    Returns:
        The feature array, with one row per CSV row read.
    """
    # Number of rows preallocated; the file must not contain more rows.
    size = 18790470
    X = np.zeros((size, len(FEATURES)), dtype=np.uint32)
    click_id = np.zeros((size), dtype=np.uint32)

    rows = 0
    with open('input/test.csv') as f:
        for idx, row in tqdm(enumerate(DictReader(f)), total=size, mininterval=30):
            h = get_x(row)
            X[idx, :] = [h[k] for k in FEATURES]
            click_id[idx] = int(row['click_id'])
            rows = idx + 1

    # Keep only the rows actually read (as load_part does), so a file with
    # fewer than `size` rows does not leave trailing all-zero rows in the
    # saved arrays.
    X = X[:rows]
    click_id = click_id[:rows]

    d = int(math.log(D, 2))
    np.savez_compressed('tmp/test-hashxx-D{}.npz'.format(d), x=X, click_id=click_id)
    return X

In [None]:
# Build the training arrays (currently disabled).
#X, y, counters = prepare_train()
#print('finished train.')

# Build and save the test arrays.
X = prepare_test()
print('finished test.')

 11%|█▏        | 2136732/18790470 [01:40<13:00, 21323.91it/s]

In [10]:
# Show the shapes of the feature and label arrays.
# NOTE(review): y is only bound when a prepare_train() call has been run
# in this session -- confirm before re-running this cell.
X.shape, y.shape
# Optional cleanup to free the arrays' memory (disabled).
#import gc
#del X
#del y
#gc.collect()

((184903890, 15), (184903890,))

In [4]:
# Disabled check: reload a saved training archive and print its shapes.
# NOTE(review): prepare_train() writes 'tmp/train-hashxx-D{}.npz', not
# 'tmp/train.npz' -- confirm which file is meant before enabling this.
if False:
    train = np.load('tmp/train.npz')
    X_train, y_train = train['x'], train['y']
    print(X_train.shape, y_train.shape)

In [5]:
# Disabled check: reload a saved test archive and print its shape.
# NOTE(review): prepare_test() writes 'tmp/test-hashxx-D{}.npz', not
# 'tmp/test.npz' -- confirm which file is meant before enabling this.
if False:
    test = np.load('tmp/test.npz')
    X_test = test['x']
    print(X_test.shape)