## Transform train & test.csv into numpy arrays.

Before starting train.csv is split into parts so we can read in parallel.

- `split -l 10000000 train.csv train_part`
- `for i in $(ls -1 train_part*); do sed -i '1s;^;ip,app,device,os,channel,click_time,attributed_time,is_attributed\n;' $i`


In [4]:
from csv import DictReader
import hashlib
from tqdm import tqdm
import numpy as np
import traceback 
import multiprocessing as mp
import glob

D = 2 ** 12

def hashed(value, size=D):
    # return int(hashlib.md5((value).encode('utf8')).hexdigest(), 16)%(D-1)+1
    return  hash(value) % D
    
def get_x(csv_row):        
    try:
        x = []
        for k in ['ip', 'app', 'device', 'os', 'channel']:
            x.append(hashed(csv_row[k]))
        for k1, k2 in [('app', 'channel'), ('app', 'device'), ('ip', 'device')]:
            x.append(hashed(csv_row[k1] + csv_row[k2]))
        x.append(int(csv_row['click_time'][-8:-6])) # hour
        x.append(int(csv_row['click_time'][-5:-3])) # minute
        return x
    except Exception as e:
        #print(csv_row)
        traceback.print_exc()

def load_part(fname, max_size=10000000):
    train_x = np.zeros((max_size, 10), dtype=np.uint16) 
    train_y = np.zeros((max_size), dtype=np.uint8)
    assert(D < 2**16)

    with open(fname) as f:
        rows = 0
        for idx, row in tqdm(enumerate(DictReader(f)), total=max_size):
            train_x[idx, :] = get_x(row)
            train_y[idx] = 1. if row['is_attributed'] == '1' else 0
            rows += 1
        return train_x[:rows], train_y[:rows]

def prepare_train():
    fnames = list(glob.glob('input/train_part*'))

    p = mp.Pool(8)
    parts = p.map(load_part, fnames)
    X = np.concatenate([p[0] for p in parts])
    y = np.concatenate([p[1] for p in parts])

    print(X.shape, y.shape)
    p.close()
    
    np.savez_compressed('tmp/train.npz', x=X, y=y)

def prepare_test():
    size = 18790470
    X = np.zeros((size, 10), dtype=np.uint16)
    with open('input/test.csv') as f:
        for idx, row in tqdm(enumerate(DictReader(f)), total=size):
            X[idx, :] = get_x(row)
    np.savez_compressed('tmp/test.npz', x=X)
        

In [5]:
#prepare_train()
#print('finished train.')

prepare_test()
print('finishsed test.')


100%|█████████▉| 18790469/18790470 [06:20<00:00, 49422.85it/s]


finishsed test.


In [3]:
!ls tmp

train.npz  unused.ipynb  weights.hdf5


In [12]:
train = np.load('tmp/train.npz')
X_train, y_train = train['x'], train['y']
print(X_train.shape, y_train.shape)

(184903890, 10) (184903890,)


In [11]:
test = np.load('tmp/test.npz')
X_test = test['x']
print(X_test.shape)

(18790470, 10)
