In [None]:
!pip install --upgrade pip -q
!pip install progressbar -q
!pip install memory_profiler -q
!pip install --upgrade pandas>=1.2 -q

In [None]:
# Register the `%memit` / `%%memit` magics used below to measure the memory footprint of a cell.
%load_ext memory_profiler

In [None]:
import os
import random
import subprocess
import tarfile
import urllib
import urllib.request
import warnings
from collections import OrderedDict

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import progressbar
from scipy.sparse import csr_matrix

from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_extraction import FeatureHasher
from sklearn import linear_model
from sklearn.metrics import roc_auc_score, log_loss

# Raw Dataset

## Download Criteo Advertising Challenge dataset

In [None]:
# ProgressBar borrowed from https://stackoverflow.com/a/53643011/2015762
class ProgressBar():
    """Report hook for `urllib.request.urlretrieve` that displays a console progress bar.

    The underlying bar is created lazily on the first call, because the total
    size is only known once the transfer has started.
    """

    def __init__(self):
        # Underlying `progressbar.ProgressBar`; created on the first hook call.
        self.pbar = None

    def __call__(self, block_num, block_size, total_size):
        """Refresh the bar after a block has been transferred.

        Args:
            block_num: number of blocks transferred so far.
            block_size: size of one block, in bytes.
            total_size: total size of the file, in bytes. `urlretrieve` passes a
                negative value (-1) when the server does not report the size.
        """
        if total_size < 0:
            # Unknown size: a negative maximum is not a valid bar length, so
            # skip the progress display rather than fail the download.
            return

        if self.pbar is None:
            self.pbar = progressbar.ProgressBar(maxval=total_size)
            self.pbar.start()

        downloaded = block_num * block_size
        if downloaded < total_size:
            self.pbar.update(downloaded)
        else:
            # The last block usually overshoots the total size: close the bar.
            self.pbar.finish()

def download_dataset(dataset_url, dataset_folder_path, compressed_dataset_path):
    """Download `dataset_url` to `compressed_dataset_path`, showing a progress bar.

    The file is first written to `<compressed_dataset_path>.part` and only moved
    to its final name once the transfer has completed. An interrupted download
    therefore never leaves a truncated archive that an `os.path.exists` check
    would mistake for a complete one.

    Args:
        dataset_url: URL of the file to download.
        dataset_folder_path: folder to create (if needed) before downloading.
        compressed_dataset_path: final path of the downloaded file.
    """
    os.makedirs(dataset_folder_path, exist_ok=True)
    partial_download_path = compressed_dataset_path + '.part'
    try:
        urllib.request.urlretrieve(dataset_url, partial_download_path, ProgressBar())
        os.replace(partial_download_path, compressed_dataset_path)
    finally:
        # Still present only if the download failed or was interrupted.
        if os.path.exists(partial_download_path):
            os.remove(partial_download_path)

def extract_dataset(compressed_dataset_path, dataset_folder_path, dataset_path):
    """Unpack the readme and the labelled training file, then move the latter to `dataset_path`.

    Args:
        compressed_dataset_path: path of the tar archive to read.
        dataset_folder_path: folder in which the two members are extracted.
        dataset_path: final path given to the extracted `train.txt`.
    """
    extracted_train_path = os.path.join(dataset_folder_path, 'train.txt')
    with tarfile.open(compressed_dataset_path, "r") as archive:
        # Only these two members are needed: the readme and train.txt (the dataset with labels).
        for member_name in ('readme.txt', 'train.txt'):
            archive.extract(member_name, dataset_folder_path)
        os.rename(extracted_train_path, dataset_path)

In [None]:
# Archive of the Criteo Kaggle Display Advertising Challenge dataset.
dataset_url = "https://criteostorage.blob.core.windows.net/criteo-research-datasets/kaggle-display-advertising-challenge-dataset.tar.gz"
# Folder holding every file of this notebook; the relative path is resolved against the current working directory.
dataset_folder_path = os.path.abspath('sync/data/criteo_dataset')
# Local copy of the downloaded archive.
compressed_dataset_path = os.path.join(dataset_folder_path, "criteo_dataset.tar.gz")
# Extracted dataset (`train.txt` from the archive is renamed to this path by `extract_dataset`).
dataset_path = os.path.join(dataset_folder_path, "criteo_dataset.txt")

In [None]:
# Each step is skipped when its output file already exists, so re-running this cell is cheap.
if not os.path.exists(compressed_dataset_path):
    download_dataset(dataset_url, dataset_folder_path, compressed_dataset_path)

if not os.path.exists(dataset_path):
    extract_dataset(compressed_dataset_path, dataset_folder_path, dataset_path)

Quick look at the files we have downloaded.

In an IPython notebook, we can run a shell command by prefixing the line with `!`, and interpolate a Python variable into it with `{}`.

In [None]:
!ls -alh {dataset_folder_path}

In [None]:
!cat {dataset_folder_path}/readme.txt

In [None]:
# Column names, in file order: 1 label, then 13 integer features, then 26 categorical features.
label_columns = ['label']
integer_features = [f'int_feat_{i}' for i in range(1, 14)]
categorical_features = [f'cat_feat_{i}' for i in range(1, 27)]
columns = label_columns + integer_features + categorical_features

In [None]:
# Preview the first 10 rows. The file is tab-separated and read without a header row, so column names are passed explicitly.
pd.read_csv(dataset_path, nrows=10, header=None, sep='\t', names=columns)

## Shrink it to a toy dataset

We first create a toy dataset with "only" 1 million rows (out of 45 million).

In [None]:
# Path of the toy dataset (first lines of the full dataset), stored next to the full dataset.
toy_dataset_path = os.path.join(dataset_folder_path, "criteo_toy_dataset.txt")

In [None]:
!head -n 1000000 {dataset_path} > {toy_dataset_path}

## Shortcut to get the toy dataset

If downloading and extracting the full dataset takes too much time, download this ready-made toy dataset instead.

In [None]:
# Set to True to skip the full download and fetch a ready-made toy dataset instead.
too_long = False
if too_long:
    toy_dataset_path = os.path.join(dataset_folder_path, "criteo_toy_dataset.txt")
    toy_dataset_url = 'https://www.dropbox.com/s/305lnmwphmu4cir/criteo_toy_dataset.txt?dl=1'
    # Reuse the shared helper (it creates the folder and reports progress) instead of duplicating the download logic here.
    download_dataset(toy_dataset_url, dataset_folder_path, toy_dataset_path)

# Estimate Ratio of Positive samples

Let's say we want to perform a basic operation: estimate the proportion of positive samples within the data.

### Basic approach

In [None]:
def compute_positive_label_proportion(dataset_path, columns):
  """Exercise stub: return the proportion of positive labels in the file at `dataset_path`.

  `columns` is presumably the list of column names to assign when reading the file
  (see the `pd.read_csv` preview above) -- confirm when implementing.
  Until it is filled in, this function returns None.
  """
  # fill me !
  pass

Let's measure its memory footprint with the `%%memit` magic function

In [None]:
%%memit
# Basic approach on the toy dataset; `%%memit` reports the memory used while running this cell.
positive_label_proportion = compute_positive_label_proportion(toy_dataset_path, columns)
print('positive_label_proportion', positive_label_proportion)

What would happen if you ran the same function on a dataset 45 times bigger?

You can give it a try with `compute_positive_label_proportion(dataset_path, columns)`... at your own risk.

### Specifying schema
We can help pandas by specifying the column types to be used such that it does not need to infer it. Do so with the parameter dtype of pd.read_csv: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html.

In [None]:
# Mapping column name -> dtype, meant to be passed as the `dtype` parameter of `pd.read_csv`.
col_types = OrderedDict()
# fill col types here

def compute_positive_label_proportion_with_dtype(dataset_path, columns, col_types):
    """Exercise stub: same as `compute_positive_label_proportion`, but reading the file with explicit dtypes.

    `col_types` is the column name -> dtype mapping to hand to `pd.read_csv`.
    Until it is filled in, this function returns None.
    """
    # Read csv with dtype and return positive_label_proportion
    # fill me !
    pass

In [None]:
%%memit
# Same measurement on the toy dataset, this time with an explicit schema.
positive_label_proportion = compute_positive_label_proportion_with_dtype(toy_dataset_path, columns, col_types)
print('positive_label_proportion', positive_label_proportion)

### Reading data by chunks
We can control the amount of memory we need by loading only a small chunk of the data and processing it before moving to the next chunk.

See documentation at https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#iterating-through-files-chunk-by-chunk

```
reader = pd.read_csv(..., chunksize=10, nrows=100)
for chunk in reader:
    print(chunk)
```

In [None]:
def compute_positive_label_proportion_with_dtype_and_chunksize(dataset_path, columns, col_types, chunksize):
    """Exercise stub: compute the proportion of positive labels while reading the file chunk by chunk.

    `col_types` is the column name -> dtype mapping and `chunksize` the number of
    rows per chunk, both meant for `pd.read_csv`.
    Until it is filled in, this function returns None.
    """
    # Read csv with dtype and chunksize and return positive_label_proportion
    # fill me !
    pass  # explicit body: a function containing only comments is a syntax error

In [None]:
%%memit
# Chunked read of the toy dataset, 100,000 rows at a time.
positive_label_proportion = compute_positive_label_proportion_with_dtype_and_chunksize(toy_dataset_path, columns, col_types, 100_000)
print('positive_label_proportion', positive_label_proportion)

This can now be applied to the full dataset with no memory issue.

In [None]:
%%memit
# Same chunked computation, now on the full dataset.
positive_label_proportion = compute_positive_label_proportion_with_dtype_and_chunksize(dataset_path, columns, col_types, 100_000)
print('positive_label_proportion', positive_label_proportion)

# Training and evaluation

## Split train and test datasets
Since the datasets contain one line per example, we can split them into train and test by simply iterating over the lines. For each line in the original dataset: write it to the test data set with a probability p and write it to the train dataset with a probability 1 - p.

In [None]:
def split_train_test(full_dataset_path, train_dataset_path, test_dataset_path, test_ratio, seed=302984, print_every=None):
  """Exercise stub: split the lines of `full_dataset_path` between a train file and a test file.

  Intended behaviour (from the section text): each line is written to the test
  file with probability `test_ratio`, and to the train file otherwise.
  `seed` presumably seeds the random generator and `print_every` presumably sets
  how often (in lines) progress is printed -- confirm when implementing.
  Until it is filled in, this function does nothing.
  """
  # fill me !
  pass

In [None]:
train_dataset_path = os.path.join(dataset_folder_path, "criteo_train_dataset.txt")
test_dataset_path = os.path.join(dataset_folder_path, "criteo_test_dataset.txt")
# Split only when at least one of the two outputs is missing, so re-running this cell is cheap.
if not os.path.exists(train_dataset_path) or not os.path.exists(test_dataset_path):
    split_train_test(dataset_path, train_dataset_path, test_dataset_path, test_ratio=0.1, print_every=10_000_000)

In [None]:
!wc -l {test_dataset_path}

## Shuffling
The convergence guarantees of SGD rely on the fact that the observations come at random. Hence, shuffling between epochs is important.

First result of "How to shuffle a file that is too big for memory" on Google: https://stackoverflow.com/a/40814865/2015762

Note that quicker pseudo-shuffling strategies exist, but this one fits our "Big data on your laptop" setting.

In [None]:
# Output paths for the shuffled versions of the test and train datasets.
test_dataset_shuffled_path = os.path.join(dataset_folder_path, "criteo_test_dataset_shuffled.txt")
train_dataset_shuffled_path = os.path.join(dataset_folder_path, "criteo_train_dataset_shuffled.txt")

In [None]:
def shuffle_lines(input_path, output_path):
    """Write the lines of `input_path` to `output_path` in random order, without loading the file in memory.

    Uses the awk / sort / cut pipeline from the Stack Overflow answer linked above:
    each line is prefixed with a random 6-digit key, the lines are sorted on that
    key, then the key is stripped. The order is not reproducible (`srand()` without
    an argument seeds from the clock).

    Paths are handed to the shell as positional arguments rather than interpolated
    with IPython's `{}` syntax: the braces of the awk program prevent that
    interpolation, which is why the paths used to be hard-coded.

    Raises:
        subprocess.CalledProcessError: if the pipeline exits with a non-zero
            status (the status of its last stage).
    """
    shuffle_command = (
        "awk 'BEGIN{srand();} {printf \"%06d %s\\n\", rand()*1000000, $0;}' \"$1\""
        # cut -c8- drops the key: 6 digits plus the separating space.
        " | sort -n | cut -c8- > \"$2\""
    )
    subprocess.run(['sh', '-c', shuffle_command, 'shuffle_lines', input_path, output_path], check=True)

shuffle_lines(test_dataset_path, test_dataset_shuffled_path)
# We can run it on the train dataset too but let's skip it since it is quite long
# shuffle_lines(train_dataset_path, train_dataset_shuffled_path)

## Training
In order to train a logistic model on chunks of data, we will use scikit-learn `SGDClassifier` (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html) and train it with the logistic loss (`loss='log_loss'`; it was named `'log'` in scikit-learn versions before 1.1) using its `partial_fit` method.
We can now apply the previous data processing pipeline and add the training to obtain a trained classifier.

In [None]:
# First baseline: no feature engineering at all, only the "ready to use" continuous features
def preprocess_simple(chunk, integer_features, categorical_features):
    """Return the integer feature columns of `chunk`, with missing values replaced by -1.

    `categorical_features` is accepted but not used by this baseline.
    """
    integer_columns = chunk.loc[:, integer_features]
    return integer_columns.fillna(-1)

In [None]:
def do_train(path, preprocess, max_training_steps=1000, chunk_size=1000, print_every=50):
  """Exercise stub: train an `SGDClassifier` chunk by chunk on the dataset at `path` and return it.

  `preprocess` is called as `preprocess(chunk, integer_features, categorical_features)`
  (see `preprocess_simple`) to build the feature matrix of each chunk.
  `max_training_steps`, `chunk_size` and `print_every` presumably bound the number
  of chunks, set the rows per chunk and the logging period -- confirm when implementing.
  Until it is filled in, this function returns None.
  """
  # fit some SGDClassifier, chunk by chunk, with partial_fit method, then return it
  pass  # explicit body: a function containing only comments is a syntax error

In [None]:
# Train on the train split (not the shuffled one), using only the integer features.
classifier = do_train(train_dataset_path, preprocess_simple)

## Testing
Let's evaluate the performance of the trained classifier. We should iterate over the test dataset and evaluate the probabilities predicted by the classifier with `roc_auc_score` and `log_loss`.

In [None]:
def do_test(classifier, path, preprocess, max_testing_steps = 100, chunk_size = 1000, print_every = 10):
  """Exercise stub: evaluate `classifier` chunk by chunk on the dataset at `path`.

  Intended to return the AUC and the log loss averaged over all chunks, with
  `preprocess` applied to each chunk as during training.
  Until it is filled in, this function returns None.
  """
  # return AUC and log loss averaged over all chunks
  # print average auc and log_loss at each `print_every`
  # functions to use:
  # - classifier.predict_proba
  # - log_loss
  # - roc_auc_score
  pass  # explicit body: a function containing only comments is a syntax error

In [None]:
# Compute your AUC and log_loss with simple model

# Feature Engineering

## Continuous features
A smart way to deal with continuous features (integer count features included) consists in transforming them into categorical features through a quantile transformation. To do so we will use scikit-learn `KBinsDiscretizer`: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html.

It can be used as follows:
```
df = pd.DataFrame({'col_1': np.random.normal(size=1000), 'col_2': np.random.poisson(lam=1, size=1000)})
bucketizer = KBinsDiscretizer(n_bins=20, encode='ordinal')
bucketizer.fit(df)
df_bucketized = pd.DataFrame(bucketizer.transform(df), columns=[f'{col}_bucketized' for col in df.columns], index=df.index)
sns.jointplot(data=pd.concat((df, df_bucketized), axis=1), x="col_1", y="col_1_bucketized")
```

1. Create a `KBinsDiscretizer` and fit it on the first chunk of the dataset with its `fit` method. Then apply its `transform` method to the chunk and look at the type of what is returned: it is not a dataframe any more, but your classifier will accept this type for its feature matrix.
1. Write `preprocess_bucket` to add a bucketize step to the training pipeline.
1. Do not forget to deal with missing values, you do not want to carry on NaNs. You can for example replace them with -1.

In [None]:
# get a chunk of 1000 lines of your dataset from `train_dataset_path`, you will use it to fit the bucketizer
# also get a chunk of 10 lines of your dataset, you will use it to make sure things look correct


In [None]:
# create a KBinsDiscretizer of 20 bins, fit it on big chunk, where NaN values are replaced with -1


In [None]:
# Look at the bins, test the bucketizer on small chunk. You can use `.todense()` function to look at a small sparse matrix


In [None]:
# return a matrix of bucketized integer features
def preprocess_bucket(chunk, integer_features, categorical_features):
  """Exercise stub: return the bucketized integer features of `chunk`.

  Same signature as `preprocess_simple`, so it can be passed to `do_train` / `do_test`.
  Remember to replace missing values (e.g. with -1) before bucketizing.
  Until it is filled in, this function returns None.
  """
  pass

In [None]:
# test the process function on a small chunk


In [None]:
# train your model with bucketized columns


In [None]:
# look at AUC, log_loss


## Categorical features

### Implement Hashing Trick
For categorical features we will implement the hashing trick by ourselves. As a quick reminder, for each row

1. Select the categorical features 
1. Create for each feature the string concatenating the feature name and the feature value
1. Apply a hash function to each of these strings and use the resulting value to choose the feature's column index
1. Store the transformed features in a sparse matrix

In [None]:
# Here is a function that hashes strings the deterministic way
from sklearn.utils.murmurhash import murmurhash3_bytes_s32
def hash_string(string, seed=0):
    """Hash the encoded bytes of `string` with `murmurhash3_bytes_s32`, using `seed`."""
    encoded = string.encode()
    return murmurhash3_bytes_s32(encoded, seed)
hash_string('my_feature=my_feature_value')
# Note: with the builtin hash('my_feature=my_feature_value') we would get a different value at each run,
# because Python randomizes string hashing per process by default.

In [None]:
# define a function that:
# - takes as arguments:
#    - the row to transform
#    - the size of the hash space
# - returns an array with the hash of each categorical column
# hashes should belong to the half-open interval [0, hash_space)
def get_features_hashes(row, hash_space):
  """Exercise stub: return the hashed column index of each categorical feature of `row`.

  Each returned index must lie in [0, hash_space).
  Until it is filled in, this function returns None.
  """
  pass

In [None]:
# try out the function on one row of your small chunk with a hash_space of 2^16


In [None]:
# define a function that transforms a dataframe into a sparse matrix m that can be passed to the learning algorithm
# Matrix m contains `hash_space` columns
# m[i,j] = 1 if the hashed value of at least one categorical column is `j` for line `i`

# use csr_matrix to create the sparse matrix
# Fill the csr_matrix, using csr_matrix((data, (row_ind, col_ind)), [shape=(M, N)]) constructor
# (note: this constructor sums duplicate (i, j) entries, so features colliding on the same column add up)

def transform_with_hashing_trick(df, hash_space):
  """Exercise stub: return a sparse matrix with one row per line of `df` and `hash_space` columns.

  Until it is filled in, this function returns None.
  """
  pass

In [None]:
# test the function on a small chunk.
# Make sure that sum of each row is equal to the number of categorical features


In [None]:
# Create a `preprocess_hash_v1` function.
# The new implementation should:
# - apply bucketization on integer columns
# - apply the hashing trick on categorical columns
# - return the concatenation of both feature matrices
# you can rely on scipy.sparse hstack to concatenate matrices
from scipy.sparse import hstack
hash_space = 2 ** 16
def preprocess_hash_v1(df, integer_features, categorical_features):
  """Exercise stub: return bucketized integer features concatenated with hashed categorical features.

  Same signature as the other `preprocess_*` functions, so it can be passed to `do_train` / `do_test`.
  Until it is filled in, this function returns None.
  """
  pass

In [None]:
# run the training


In [None]:
# look at metrics


### Use scikit-learn Feature Hasher

Actually, the hashing trick is well known and already implemented in scikit-learn FeatureHasher:<br>
https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html

In [None]:
# write a new version of `transform_with_hashing_trick` that relies on FeatureHasher
# create the hasher only once
# you may need this to convert the dataframe:
# https://stackoverflow.com/questions/29815129/pandas-dataframe-to-list-of-dictionaries
def transform_with_hashing_trick_v2(df, hash_space):
  """Exercise stub: same contract as `transform_with_hashing_trick`, implemented with `FeatureHasher`.

  Until it is filled in, this function returns None.
  """
  pass

In [None]:
# Test it on a small chunk


In [None]:
# Test speed of transform_with_hashing_trick and transform_with_hashing_trick_v2 with timeit function.

In [None]:
# Create a preprocess_hash_v2 function that relies on transform_with_hashing_trick_v2
def preprocess_hash_v2(df, integer_features, categorical_features):
  """Exercise stub: same contract as `preprocess_hash_v1`, using `transform_with_hashing_trick_v2`.

  Until it is filled in, this function returns None.
  """
  pass

In [None]:
# Test it on a chunk


In [None]:
# train the model


In [None]:
# Look at metrics


### Implement Cross Features with hashing

In [None]:
# Adapt the hashing trick to implement cross features.
# Maybe you just need to reimplement `get_features_hashes`


In [None]:
# Test on small chunk

In [None]:
# Train the model

In [None]:
# Look at metrics