In [1]:
!pip install --upgrade pip -q
!pip install progressbar -q
!pip install memory_profiler -q
!pip install --upgrade pandas>=1.2 -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.3/2.1 MB[0m [31m8.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m2.0/2.1 MB[0m [31m29.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m22.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for progressbar (setup.py) ... [?25l[?25hdone
[0m

In [2]:
%load_ext memory_profiler

In [3]:
import urllib
import tarfile
import os
from collections import OrderedDict
import warnings
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar
from scipy.sparse import csr_matrix

from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_extraction import FeatureHasher
from sklearn import linear_model
from sklearn.metrics import roc_auc_score, log_loss

# Raw Dataset

## Download Criteo Advertising Challenge dataset

In [4]:
# ProgressBar borrowed from https://stackoverflow.com/a/53643011/2015762
class ProgressBar():
    def __init__(self):
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        if not self.pbar:
            self.pbar=progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            self.pbar.finish()

def download_dataset(dataset_url, dataset_folder_path, compressed_dataset_path):
    # Download dataset
    os.makedirs(dataset_folder_path, exist_ok=True)
    urllib.request.urlretrieve(dataset_url, compressed_dataset_path, ProgressBar())

def extract_dataset(compressed_dataset_path, dataset_folder_path, dataset_path):
    # Extract train.txt (dataset with labels) and readme
    with tarfile.open(compressed_dataset_path, "r") as input_file:
        input_file.extract('readme.txt', dataset_folder_path)
        input_file.extract('train.txt', dataset_folder_path)
        os.rename(os.path.join(dataset_folder_path, 'train.txt'), dataset_path)

In [5]:
dataset_url = "https://criteostorage.blob.core.windows.net/criteo-research-datasets/kaggle-display-advertising-challenge-dataset.tar.gz"
dataset_folder_path = os.path.abspath('sync/data/criteo_dataset')
compressed_dataset_path = os.path.join(dataset_folder_path, "criteo_dataset.tar.gz")
dataset_path = os.path.join(dataset_folder_path, "criteo_dataset.txt")

In [6]:
if not os.path.exists(compressed_dataset_path):
    download_dataset(dataset_url, dataset_folder_path, compressed_dataset_path)

if not os.path.exists(dataset_path):
    extract_dataset(compressed_dataset_path, dataset_folder_path, dataset_path)




Quick look at the files we have downloaded.

Within iPython notebook, we can execute bash command by prepending the cell with `!` and insert python variable into it with `{}`

In [7]:
!ls -alh {dataset_folder_path}

total 15G
drwxr-xr-x 2 root      root  4.0K Mar 16 08:10 .
drwxr-xr-x 3 root      root  4.0K Mar 16 08:03 ..
-rw-r--r-- 1 root      root  4.3G Mar 16 08:06 criteo_dataset.tar.gz
-rw-r--r-- 1 293604138 staff  11G May 12  2014 criteo_dataset.txt
-rw-r--r-- 1 293604138 staff 1.9K Aug 22  2014 readme.txt


In [8]:
!cat {dataset_folder_path}/readme.txt

        ------ Display Advertising Challenge ------

Dataset: dac-v1

This dataset contains feature values and click feedback for millions of display 
ads. Its purpose is to benchmark algorithms for clickthrough rate (CTR) prediction.
It has been used for the Display Advertising Challenge hosted by Kaggle:
https://www.kaggle.com/c/criteo-display-ad-challenge/


Full description:

This dataset contains 2 files:
  train.txt
  test.txt
corresponding to the training and test parts of the data. 


Dataset construction:

The training dataset consists of a portion of Criteo's traffic over a period
of 7 days. Each row corresponds to a display ad served by Criteo and the first
column is indicates whether this ad has been clicked or not.
The positive (clicked) and negatives (non-clicked) examples have both been
subsampled (but at different rates) in order to reduce the dataset size.

There are 13 features taking integer values (mostly count features) and 26
categorical features. The values of th

In [9]:
label_columns = ['label']
integer_features = [f'int_feat_{i}' for i in range(1, 14)]
categorical_features = [f'cat_feat_{i}' for i in range(1, 27)]
columns = label_columns + integer_features + categorical_features

In [10]:
pd.read_csv(dataset_path, nrows=10, header=None, sep='\t', names=columns)

Unnamed: 0,label,int_feat_1,int_feat_2,int_feat_3,int_feat_4,int_feat_5,int_feat_6,int_feat_7,int_feat_8,int_feat_9,...,cat_feat_17,cat_feat_18,cat_feat_19,cat_feat_20,cat_feat_21,cat_feat_22,cat_feat_23,cat_feat_24,cat_feat_25,cat_feat_26
0,0,1.0,1,5.0,0.0,1382,4.0,15,2,181,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102,8.0,2,2,4,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767,89.0,4,2,245,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392,,0,0,0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2,0.0,3,0,0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,
5,0,,-1,,,12824,,0,0,6,...,776ce399,92555263,,,242bb710,8ec974f4,be7c41b4,72c78f11,,
6,0,,1,2.0,,3168,,0,1,2,...,776ce399,cdfa8259,,,20062612,,93bad2c0,1b256e61,,
7,1,1.0,4,2.0,0.0,0,0.0,1,0,0,...,e5ba7672,74ef3502,,,5316a17f,,32c7478e,9117a34a,,
8,0,,44,4.0,8.0,19010,249.0,28,31,141,...,e5ba7672,42a2edb9,,,0014c32a,,32c7478e,3b183c5c,,
9,0,,35,,1.0,33737,21.0,1,2,3,...,d4bb7bd8,70d0f5f9,,,0e63fca0,,32c7478e,0e8fe315,,


## Shrink it to a toy dataset

We first create a toy dataset with "only" 1 million rows (out of 45 millions)

In [11]:
toy_dataset_path = os.path.join(dataset_folder_path, "criteo_toy_dataset.txt")

In [12]:
!head -n 1000000 {dataset_path} > {toy_dataset_path}

## Shortcut to get the toy dataset

If it takes too much time, download this toy dataset there instead.

In [24]:
too_long = False
if too_long:
  from urllib import request
  os.makedirs(dataset_folder_path, exist_ok=True)
  toy_dataset_path = os.path.join(dataset_folder_path, "criteo_toy_dataset.txt")
  toy_dataset_url = 'https://www.dropbox.com/s/305lnmwphmu4cir/criteo_toy_dataset.txt?dl=1'
  request.urlretrieve(toy_dataset_url, toy_dataset_path, ProgressBar())




# Estimate Ratio of Positive samples

Let's say we want to perform a basic operation: estimate the number of positive samples within the data

### Basic approach

In [25]:
def compute_positive_label_proportion(dataset_path, columns):
    df = pd.read_csv(dataset_path, sep="\t", header=None, names=columns, usecols=['label'])
    return df['label'].mean()

Let's measure its memory footprint with the `%%memit` magic function

In [26]:
%%memit
positive_label_proportion = compute_positive_label_proportion(toy_dataset_path, columns)
print('positive_label_proportion', positive_label_proportion)

positive_label_proportion 0.254949
peak memory: 407.25 MiB, increment: 1.85 MiB


What would happen if you run the same function on a 45 times bigger dataset ?

You can give a try with `compute_positive_label_proportion(dataset_path, columns)`... at your own risks.

### Specifying schema
We can help pandas by specifying the column types to be used such that it does not need to infer it. Do so with the parameter dtype of pd.read_csv: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.

In [13]:
col_types = OrderedDict()
for col_name in columns:
    if col_name in label_columns: col_type = 'bool'
    if col_name in integer_features: col_type = 'float32'
    if col_name in categorical_features: col_type = 'str'
    col_types[col_name] = col_type

def compute_positive_label_proportion_with_dtype(dataset_path, columns, col_types):
    # Read csv with dtype and return positive_label_proportion
    df = pd.read_csv(dataset_path, sep="\t", header=None, names=columns, dtype=col_types)
    return df['label'].mean()

In [28]:
%%memit
positive_label_proportion = compute_positive_label_proportion_with_dtype(toy_dataset_path, columns, col_types)
print('positive_label_proportion', positive_label_proportion)

positive_label_proportion 0.254949
peak memory: 1128.77 MiB, increment: 721.52 MiB


### Reading data by chunks
We can control the amount of memory we need by loading only a small chunk of the data and processing it before moving to the next chunk.

See documentation at https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#iterating-through-files-chunk-by-chunk

```
reader = pd.read_csv(..., chunksize=10, nrows=100):
for chunk in reader:
    print(chunk)
```

In [29]:
def compute_positive_label_proportion_with_dtype_and_chunksize(dataset_path, columns, col_types, chunksize):
    # Read csv with dtype and chunksize and return positive_label_proportion
    reader = pd.read_csv(
        dataset_path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=chunksize, 
    )
    sum_labels = 0
    sum_rows = 0
    for chunk in reader:
        sum_labels += chunk['label'].sum()
        sum_rows += len(chunk)
    return sum_labels / sum_rows

In [30]:
%%memit
positive_label_proportion = compute_positive_label_proportion_with_dtype_and_chunksize(toy_dataset_path, columns, col_types, 100_000)
print('positive_label_proportion', positive_label_proportion)

positive_label_proportion 0.254949
peak memory: 492.44 MiB, increment: 0.00 MiB


This can now be applied to the full dataset with no memory issue.

In [None]:
%%memit
positive_label_proportion = compute_positive_label_proportion_with_dtype_and_chunksize(dataset_path, columns, col_types, 100_000)
print('positive_label_proportion', positive_label_proportion)

# Training and evaluation

## Split train and test datasets
Since the datasets contain one line per example, we can split them into train and test by simply iterating over the lines. For each line in the original dataset: write it to the test data set with a probability p and write it to the train dataset with a probability 1 - p.

In [14]:
def split_train_test(full_dataset_path, train_dataset_path, test_dataset_path, test_ratio, seed=302984, print_every=None):
    random.seed(seed)
    with open(full_dataset_path, 'r') as input_file, open(train_dataset_path, 'w') as train_file, open(test_dataset_path, 'w') as test_file:
        for i, line in enumerate(input_file):
            if random.uniform(0, 1) <= test_ratio:
                test_file.write(line)
            else:
                train_file.write(line)
            
            if print_every is not None and (i + 1) % print_every == 0:
                print(f"Processed {i + 1} lines")
        print(f"Processed {i + 1} lines")
        
train_dataset_path = os.path.join(dataset_folder_path, "criteo_train_dataset.txt")
test_dataset_path = os.path.join(dataset_folder_path, "criteo_test_dataset.txt")

In [15]:
if not os.path.exists(train_dataset_path) or not os.path.exists(test_dataset_path):
    split_train_test(dataset_path, train_dataset_path, test_dataset_path, test_ratio=0.1, print_every=10_000_000)

Processed 10000000 lines
Processed 20000000 lines
Processed 30000000 lines
Processed 40000000 lines
Processed 45840617 lines


In [15]:
!wc -l {test_dataset_path}

4585250 /content/sync/data/criteo_dataset/criteo_test_dataset.txt


## Shuffling
The convergence guarantees of SGD rely on the fact that the observations come at random. Hence, shuffling between epochs is important.

First result of "How to shuffle a file that is too big for memory" on Google: https://stackoverflow.com/a/40814865/2015762

Note that quicker pseudo-shuffling strategies exists, but this fits our "Big data on your laptop" problematic.

In [16]:
test_dataset_shuffled_path = os.path.join(dataset_folder_path, "criteo_test_dataset_shuffled.txt")
train_dataset_shuffled_path = os.path.join(dataset_folder_path, "criteo_train_dataset_shuffled.txt")

In [43]:
!awk 'BEGIN{srand();} {printf "%06d %s\n", rand()*1000000, $0;}' /content/sync/data/criteo_dataset/criteo_test_dataset.txt | sort -n | cut -c8- > /content/sync/data/criteo_dataset/criteo_test_dataset_shuffled.txt
# We can run it on the train dataset too but let's skip it since it is quite long
# !awk 'BEGIN{srand();} {printf "%06d %s\n", rand()*1000000, $0;}' /content/sync/data/criteo_dataset/criteo_train_dataset.txt | sort -n | cut -c8- > /content/sync/data/criteo_dataset/criteo_train_dataset_shuffled.txt

## Training
In order to train a logistic model on chunks of data, we will use scikit-learn `SGDClassifier` (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html) and train for its `log` loss with its `partial_fit` method.
We can now apply the previous data processing pipeline and add the training to obtain a trained classifier.

In [124]:
#  To begin with, let's not do any preprocessing and deal with "ready to use" continuous features only
def preprocess_simple(chunk, integer_features, categorical_features):
    return chunk[integer_features].fillna(-1)

In [211]:
def do_train(path, preprocess, max_training_steps=1000, chunk_size=1000, print_every=50):
  classifier = SGDClassifier(loss="log_loss")#, alpha=0.001)
  # 1. Read train data by chunks
  reader = pd.read_csv(
      path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=chunk_size, 
  )
  losses = []
  for i, chunk in enumerate(reader):
      # 2. Apply preprocess_data to return the continous features
      features = preprocess(chunk, integer_features, categorical_features)
      # 3. Train classifier on this chunk  with fit.
      classifier.partial_fit(features, chunk["label"], classes=[0, 1])
      # 4. Stop after `max_training_steps`
      if i > max_training_steps:
          break

      label_predictions = classifier.predict_proba(features)[:, 1]
      losses.append(log_loss(chunk["label"], label_predictions))
          
      if print_every is not None and (i + 1) % print_every == 0:
        print(f'{i+1} : {np.array(losses).mean()}')
  return classifier

In [126]:
classifier = do_train(train_dataset_path, preprocess_simple)

100
200
300
400
500
600
700
800
900
1000


## Testing
Let's evaluate the performances of the trained classifier. We should iterate over the test dataset and evaluate the labels predicted by the classifier with `roc_auc_score` and `log_loss`.

In [127]:
def do_test(classifier, path, preprocess, max_testing_steps = 100, chunk_size = 1000, print_every = 10):
  reader = pd.read_csv(
      path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=chunk_size, 
  )
  roc_auc_scores = []
  log_losses = []
  # 1. Read test data by chunks
  for i, chunk in enumerate(reader):
      # 2. Apply preprocess_data to return the continous features
      features = preprocess(chunk, integer_features, categorical_features)
      # 3. Predict labels with classifiers
      label_predictions = classifier.predict_proba(features)[:, 1]
      # 4. Compute AUC score and Log loss for this chunk
      roc_auc_scores += [roc_auc_score(chunk["label"], label_predictions)]
      log_losses += [log_loss(chunk["label"], label_predictions)]
      
      if i > max_testing_steps:
          return (np.mean(roc_auc_scores), np.mean(log_losses))
          
      if print_every is not None and (i + 1) % print_every == 0:
          print(i+1)
  # 6. Return metrics
  return (np.mean(roc_auc_scores), np.mean(log_losses))

In [129]:
(roc_auc_scores, log_losses) = do_test(classifier, test_dataset_path, preprocess_simple)
print(f"AUC = {roc_auc_scores}")
print(f"LogLoss = {log_losses}")

10
20
30
40
50
60
70
80
90
100
AUC = 0.5567602629608853
LogLoss = 10.021558459885304


# Feature Engineering

## Continuous features
A smart way to deal with continuous features (counting integer features are part of them), consists in transforming them into categorical features through a quantile transformation. To do so we will use scikit-learn KBinsDiscretizer : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html.

It can be used as following
```
df = pd.DataFrame({'col_1': np.random.normal(size=1000), 'col_2': np.random.poisson(lam=1, size=1000)})
bucketizer = KBinsDiscretizer(n_bins=20, encode='ordinal')
bucketizer.fit(df)
df_bucketized = pd.DataFrame(bucketizer.transform(df), columns=[f'{col}_bucketized' for col in df.columns], index=df.index)
sns.jointplot(data=pd.concat((df, df_bucketized), axis=1), x="col_1", y="col_1_bucketized")
```

1. Create a `KBinsDiscretizer` and train it on the first chunk of the dataset. Try the `fit` method of `KBinsDiscretizer` on the chunk and see the type of what is returned. This is not a dataframe any more but your classifier will accept this type for its feature matrix.
1. Update `preprocess_data` to add a bucketize step to the training pipeline.
1. Do not forget to deal with missing values, you do not want to carry on NaNs. You can for example replace them with -1.

In [130]:
reader = pd.read_csv(
    train_dataset_path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=10_000
)
chunk = reader.get_chunk(1000)

In [131]:
bucketizer = KBinsDiscretizer(n_bins=20)
with warnings.catch_warnings(record=True):
    bucketizer.fit(chunk[integer_features].fillna(-1))

In [132]:
# bin_edges contains one bin definition per column:
print(len(bucketizer.bin_edges_))
# Displaying first column bins:
print(bucketizer.bin_edges_[0])

13
[-1.  0.  1.  2.  4.  9. 88.]


In [134]:
def bucketize(df, bucketizer):
    return bucketizer.transform(df.fillna(-1))

def preprocess_bucket(chunk, integer_features, categorical_features):
    return bucketize(chunk[integer_features], bucketizer)

In [178]:
reader = pd.read_csv(
    train_dataset_path, sep="\t", header=None, names=columns, dtype=col_types, chunksize=10_000
)
small_chunk = reader.get_chunk(5)
mat = preprocess_bucket(small_chunk, integer_features, categorical_features)
mat.todense()

In [135]:
classifier_with_buckets = do_train(train_dataset_path, preprocess_bucket)

100
200
300
400
500
600
700
800
900
1000


In [136]:
(roc_auc_scores, log_losses) = do_test(classifier_with_buckets, test_dataset_path, preprocess_bucket)
print(f"AUC = {roc_auc_scores}")
print(f"LogLoss = {log_losses}")

10
20
30
40
50
60
70
80
90
100
AUC = 0.7140477968883315
LogLoss = 0.5124704331207389


In [88]:
!cat {test_dataset_path} | wc -l

4585250


## Categorical features

### Implement Hashing Trick
For categorical features we will implement the hashing trick by ourselves. As a quick reminder, for each row

1. Select the categorical features 
1. Create for each feature the string concatenating the feature name and the feature value
1. Apply a hash function to each of these string and use this value to choose the feature's column index
1. Store the transformed features in a sparse matrix

In [137]:
# Here is a function that hashes strings the deterministic way
from sklearn.utils.murmurhash import murmurhash3_bytes_s32
def hash_string(string, seed=0):
    return murmurhash3_bytes_s32(string.encode(), seed)
hash_string('my_feature=my_feature_value')
# Note, if we were using builtin function hash('my_feature=my_feature_value'), we would have had a different hash value at each run

1480568101

In [138]:
# define a function that:
# - takes as arguments:
#    - the dataframe to transform
#    - size of hash_space
# - returns a numpy array with hashes of each categorical columns
# hashes should belong to [0;hash_space[
def get_features_hashes(row, hash_space):
    # return the list of the hashes values for each categorical feature in the row
    features_as_string = [f"{label}={value}" for label, value in zip(row.index, row.values)]
    return [abs(hash_string(string)) % hash_space for string in features_as_string]

In [187]:
# try out the function one one row of your chunk with a hash_space of 2^16
hash_space = 2 ** 16
row = chunk[categorical_features].iloc[0]
np.array(get_features_hashes(row, hash_space))

array([ 6307, 50007, 16938, 64016, 52877, 55356, 40417, 44725, 34895,
       31363, 35777, 46842, 15202, 57383, 34431, 18421, 47320, 18458,
       25889, 53956,  9729, 12896, 56167, 63487, 61374, 63046])

In [188]:
# define a function that transforms a dataframe into a sparse matrix m that can be passed to the learning
# Matrix m contains `hash_space` columns
# m[i,j] = 1 if the hashed value of at least one categorical column is `j` for line `i`

# use csr_matrix to create the sparse matrix
# Fill the csr_matrix, using csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)]) constructor

def transform_with_hashing_trick(df, hash_space):
    series = df.apply(lambda row: get_features_hashes(row, hash_space), axis=1)

    rows_and_cols_by_line = [([row_index] * len(col_indexes), col_indexes) for (row_index, col_indexes) in enumerate(series)]
    all_row_indices = sum([row_indices for (row_indices, _) in rows_and_cols_by_line], [])
    all_col_indices = sum([col_indices for (_, col_indices) in rows_and_cols_by_line], [])

    data = np.ones_like(all_col_indices)
    return csr_matrix((data, (all_row_indices, all_col_indices)), shape=(len(df), hash_space), dtype=float)

In [186]:
m = transform_with_hashing_trick(small_chunk[categorical_features], 2 ** 8)
print(m.sum(axis=1))
print(len(categorical_features))

[[26.]
 [26.]
 [26.]
 [26.]
 [26.]]
26


In [216]:
# Change the `preprocess_data` method we had defined before.
# New implementation should:
# - apply bucketization on integer columns
# - apply hashing trick on categorical columns
# - return a concatenation of matrix with both features
# you can rely on scipy.sparse hstack to concatenate matrix
from scipy.sparse import hstack
hash_space = 2 ** 16
def preprocess_hash_v1(df, integer_features, categorical_features):
  bucketized_integer_features = bucketize(df[integer_features], bucketizer)
  hashed_categorical_features = transform_with_hashing_trick(df[categorical_features], hash_space)
  #return hashed_categorical_features
  return hstack((bucketized_integer_features, hashed_categorical_features))

In [217]:
# run the training
classifier_with_hashing_v1 = do_train(train_dataset_path, preprocess_hash_v1)

50 : 1.2501235214970996
Label Mean : 0.234 ; Pred Mean : 0.15926569913433536
100 : 0.8032901182249903
Label Mean : 0.259 ; Pred Mean : 0.3315099972608472
150 : 0.651252213299453
Label Mean : 0.267 ; Pred Mean : 0.4128187651365036
200 : 0.582421515901622
Label Mean : 0.259 ; Pred Mean : 0.2708464522489461
250 : 0.5446114589989384
Label Mean : 0.243 ; Pred Mean : 0.22123147945730232
300 : 0.5196043347908924
Label Mean : 0.268 ; Pred Mean : 0.26893085771222003
350 : 0.5036891876780105
Label Mean : 0.274 ; Pred Mean : 0.28610705073869125
400 : 0.4923385710767311
Label Mean : 0.254 ; Pred Mean : 0.24017421163851563
450 : 0.4840735374838529
Label Mean : 0.27 ; Pred Mean : 0.2527085231081293
500 : 0.4774353433300452
Label Mean : 0.267 ; Pred Mean : 0.34577484178490003
550 : 0.471866471398522
Label Mean : 0.27 ; Pred Mean : 0.33103182446296925
600 : 0.46771402691661534
Label Mean : 0.255 ; Pred Mean : 0.19998025264745456
650 : 0.4638227632199241
Label Mean : 0.248 ; Pred Mean : 0.2645292371968

In [218]:
# look at metrics
(roc_auc_scores, log_losses) = do_test(classifier_with_hashing_v1, test_dataset_path, preprocess_hash_v1)
print(f"AUC = {roc_auc_scores}")
print(f"LogLoss = {log_losses}")

10
20
30
40
50
60
70
80
90
100
AUC = 0.768672242857132
LogLoss = 0.48363301599994474


### Use scikit-learn Feature Hasher

Actually, the hashing trick is well known and already implemented in scikit-learn FeatureHasher:<br>
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html

In [169]:
# rewrite a new version of `transform_with_hashing_trick` that relies on FeatureHasher
# create the hasher only once
# you may need this to convert the dataframe: 
# https://stackoverflow.com/questions/29815129/pandas-dataframe-to-list-of-dictionaries
# Test it on a chunk
hasher = FeatureHasher(n_features=2 ** 20)
def transform_with_hashing_trick_v2(df, hasher):
  normalized_df = (chunk.fillna("nan"))
  return hasher.transform(normalized_df.to_dict('records'))

transform_with_hashing_trick_v2(chunk[categorical_features], hasher)

<1000x1048576 sparse matrix of type '<class 'numpy.float64'>'
	with 37489 stored elements in Compressed Sparse Row format>

Compare speed of two methods on chunk of 1000 rows

In [172]:
%timeit transform_with_hashing_trick(chunk, 2 ** 20)

361 ms ± 153 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [173]:
%timeit transform_with_hashing_trick_v2(chunk, hasher)

52.8 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


Test it on the pipeline.

In [175]:
# redefine preprocess_data
hasher = FeatureHasher(n_features=2 ** 20)
def preprocess_hash_v2(df, integer_features, categorical_features):
  bucketized_integer_features = bucketize(df[integer_features], bucketizer)
  hashed_categorical_features = transform_with_hashing_trick_v2(df, hasher, categorical_features)
  return hstack((bucketized_integer_features, hashed_categorical_features))

In [176]:
preprocess_hash_v2(chunk, integer_features, categorical_features)

TypeError: ignored

In [None]:
classifier_with_hashing_v2 = do_train(train_dataset_path, max_training_steps=2000, chunk_size=1000, print_every=200)

In [102]:
(roc_auc_scores, log_losses) = do_test(classifier_with_hashing_v2, test_dataset_path)
print(f"AUC = {roc_auc_scores}")
print(f"LogLoss = {log_losses}")

100
200
300
400
500
600
700
800
900
1000
AUC = 0.6992177807345303
LogLoss = 0.5196194999645235


### Implement Cross Features with hashing

In [42]:
def get_features_hashes(row, hash_space):
    # return the list of the hashes values for each categorical feature in the row
    features_as_string = (
        [f"{l1}={v1};{l2}={v2}"
         for i,(l1,v1) in enumerate(zip(row.index, row.values))
         for j,(l2, v2) in enumerate(zip(row.index, row.values))
         if i <= j
         ])
    return [abs(hash_string(string)) % hash_space for string in features_as_string]

In [33]:
hash_space = 2 ** 16
classifier_with_cross = do_train(train_dataset_path, max_training_steps=10_000, chunk_size=1000, print_every=1000)

KeyboardInterrupt: ignored

In [30]:
(roc_auc_scores, log_losses) = do_test(classifier_with_cross, test_dataset_path)
print(f"AUC = {roc_auc_scores}")
print(f"LogLoss = {log_losses}")

100
200
300
400
500
600
700
800
900
1000
AUC = 0.7110682780699549
LogLoss = 0.5106296209457268
