In [1]:
import numpy as np
import sklearn as sk
import pandas as pd

In [2]:
import os
import os.path
import sys
import datetime
import hashlib

In [3]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
np.random.seed(137)

Features/Y

  - ip: ip address of click.
  - app: app id for marketing.
  - device: device type id of user mobile phone (e.g., iphone 6 plus, iphone 7, huawei mate 7, etc.)
  - os: os version id of user mobile phone
  - channel: channel id of mobile ad publisher
  - click_time: timestamp of click (UTC)
  - attributed_time: if user download the app for after clicking an ad, this is the time of the app download
  - is_attributed: the target that is to be predicted, indicating the app was downloaded

In [5]:
DATA = {
    'train-sample': '../data/mnt/ssd/kaggle-talkingdata2/competition_files/train_sample.csv',
}

COLS_ID = ['ip', 'app', 'os', 'device', 'channel']
COLS_SIG = ['app', 'os', 'device', 'channel']
COLS_Y = ['is_attributed']
COLS_DT = ['click_time', 'attributed_time']
COLS = set(COLS_ID + COLS_Y + COLS_DT)

def loadframe(frame):
    
    filepath = DATA[frame]
    
    return pd.read_csv(filepath)
    

In [6]:
df_train_sample = loadframe('train-sample')
df_train_sample.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,,0
1,105560,25,1,17,259,2017-11-07 13:40:27,,0
2,101424,12,1,19,212,2017-11-07 18:05:24,,0
3,94584,13,1,13,477,2017-11-07 04:58:08,,0
4,68413,12,1,1,178,2017-11-09 09:00:09,,0


In [7]:
df_train_sample.shape

(100000, 8)

In [8]:
for c in COLS_ID:
    n = np.max(df_train_sample[c])
    print('There are at least %s %s(s)' % (n, c))

There are at least 364757 ip(s)
There are at least 551 app(s)
There are at least 866 os(s)
There are at least 3867 device(s)
There are at least 498 channel(s)


In [9]:
def preproc(df):
    
    new_df = df.copy()
    for c in COLS:
        if c in COLS_DT:
            new_df[c] = pd.to_datetime(df[c], format='%Y-%m-%d %H:%M:%S', exact=True, errors='ignore')
    return new_df

In [10]:
df_train_sample_preproc = preproc(df_train_sample)
df_train_sample_preproc.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed
0,87540,12,1,13,497,2017-11-07 09:30:38,NaT,0
1,105560,25,1,17,259,2017-11-07 13:40:27,NaT,0
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,0
3,94584,13,1,13,477,2017-11-07 04:58:08,NaT,0
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,0


In [11]:
def encode_datetime(dt):
    """
    encode date-time into a vector
    """
    
    if pd.notnull(dt):
        enc = np.array([dt.month, dt.hour, dt.minute, dt.second, dt.isoweekday()])
    else:
        enc = np.zeros((5,))
    
    return enc

In [12]:

def one_hot(size, index):
    array = np.zeros(size)
    array[index] = 1.
    return array

def hashing_functions(n = 4):
    
    fns = []
    
    for _ in range(n):
        a, b = np.random.randint(100), np.random.randint(100)
        fn = lambda x, m=a, c=b: (m * x + c) & ((1 << 32) - 1)
        fns.append(fn)

    return fns

def feature_hashing(col, hfs, bsize = 4):
    """
    Hashing Trick for categorical variables;
    Simple implementation, relying on variable encoding;
    Better solution might be to use more advanced hashing function
    """
    hashes = []
    
    for hf in hfs:
        hashes.append(col.map(lambda x: one_hot(bsize, hf(x) % bsize) if pd.notnull(x) else np.zeros(bsize)))
        
    hashedc = np.sum(hashes, axis = 0)
    
    L = list(hashedc)
    A = L
    data_frame = pd.DataFrame(L)
    return pd.Series(A, index=data_frame.index)    

In [14]:
def generate_features(df):
    
    new_df = df.copy()
    hfs = hashing_functions(n = 16)
    
    for c in COLS:
        print('Feature engineering: %s' % c)
        if c in COLS_DT:
            new_col = '%s_enc' % c
            new_df[new_col] = df[c].map(lambda x: encode_datetime(x))
        elif c in COLS_SIG:
            new_col = '%s_ht' % c
            new_df[new_col] = feature_hashing(df[c], hfs, bsize=8)

    return new_df

In [15]:
%%time
df_features = generate_features(df_train_sample_preproc)

Feature engineering: ip
Feature engineering: os
Feature engineering: is_attributed
Feature engineering: attributed_time
Feature engineering: app
Feature engineering: channel
Feature engineering: click_time
Feature engineering: device
CPU times: user 27.1 s, sys: 1.9 s, total: 29 s
Wall time: 29.8 s


In [16]:
df_features.head()

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,os_ht,attributed_time_enc,app_ht,channel_ht,click_time_enc,device_ht
0,87540,12,1,13,497,2017-11-07 09:30:38,NaT,0,"[2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0]","[3.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0, 3.0]","[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]","[11, 9, 30, 38, 2]","[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]"
1,105560,25,1,17,259,2017-11-07 13:40:27,NaT,0,"[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0]","[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]","[3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, 1.0]","[11, 13, 40, 27, 2]","[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]"
2,101424,12,1,19,212,2017-11-07 18:05:24,NaT,0,"[3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 2.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0]","[3.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0, 3.0]","[3.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0, 3.0]","[11, 18, 5, 24, 2]","[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]"
3,94584,13,1,13,477,2017-11-07 04:58:08,NaT,0,"[2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, 1.0]","[0.0, 0.0, 0.0, 0.0, 0.0]","[2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, 1.0]","[2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 2.0, 1.0]","[11, 4, 58, 8, 2]","[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]"
4,68413,12,1,1,178,2017-11-09 09:00:09,NaT,0,"[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]","[0.0, 0.0, 0.0, 0.0, 0.0]","[3.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0, 3.0]","[2.0, 2.0, 1.0, 3.0, 2.0, 1.0, 2.0, 3.0]","[11, 9, 0, 9, 4]","[2.0, 1.0, 2.0, 3.0, 3.0, 2.0, 3.0, 0.0]"


In [101]:
def model_data(df):
    cols = sorted([c for c in df.columns if (c.endswith('_ht') or c.endswith('_enc'))])
    data = [df[c] for c in cols]
    print('X:', cols)
    X = np.array([r for r in map(lambda x: np.concatenate(x), [x for x in zip(*data)])], dtype=np.float32)
    Y = df['is_attributed'].values
    
    return X, Y

In [102]:
X_train, Y_train = model_data(df_features)
X_train[:2, :]
Y_train[:2]

X: ['app_ht', 'attributed_time_enc', 'channel_ht', 'click_time_enc', 'device_ht', 'os_ht']


array([[  3.,   3.,   1.,   2.,   1.,   1.,   2.,   3.,   0.,   0.,   0.,
          0.,   0.,   2.,   1.,   2.,   3.,   3.,   2.,   3.,   0.,  11.,
          9.,  30.,  38.,   2.,   2.,   1.,   2.,   3.,   3.,   2.,   3.,
          0.,   2.,   2.,   3.,   2.,   3.,   1.,   2.,   1.],
       [  2.,   1.,   2.,   3.,   3.,   2.,   3.,   0.,   0.,   0.,   0.,
          0.,   0.,   3.,   2.,   2.,   2.,   3.,   1.,   2.,   1.,  11.,
         13.,  40.,  27.,   2.,   2.,   1.,   2.,   3.,   3.,   2.,   3.,
          0.,   2.,   1.,   2.,   3.,   3.,   2.,   3.,   0.]], dtype=float32)

array([0, 0])

In [103]:
X_train.shape
Y_train.shape

(100000, 42)

(100000,)