# Logistic regression on Criteo dataset without Spark MLlib

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql import types as st
from pyspark.sql import functions as sf
from pyspark.sql import Row, DataFrame
from pyspark import RDD
from pyspark import StorageLevel

In [0]:
%matplotlib inline

In [0]:
import numpy as np
import math
import matplotlib.pyplot as plot
from typing import Tuple, Dict

In [0]:
# Build (or reuse) the Spark session used by every cell below.
# The master URL "local[4]" and the 4g driver memory are set explicitly here.
ss = (
    SparkSession.builder
    .appName("criteo-lr")
    .master("local[4]")
    .config("spark.submit.deployMode", "client")
    .config("spark.driver.memory", "4g")
    .config("spark.ui.port", "0")
    .getOrCreate()
)
ss

## Load dataset

In [0]:
# Column names: 13 integer features followed by 26 categorical features.
integer_features = [f"int_feat_{i}" for i in range(1, 14)]
categorical_features = [f"cat_feat_{i}" for i in range(1, 27)]

# The label comes first and is the only non-nullable column.
fields = [st.StructField("label", st.IntegerType(), nullable=False)]

# Integer features are read as nullable integers ...
fields.extend(
    st.StructField(name, st.IntegerType(), nullable=True)
    for name in integer_features
)

# ... and categorical features as nullable strings.
fields.extend(
    st.StructField(name, st.StringType(), nullable=True)
    for name in categorical_features
)

schema = st.StructType(fields)

In [0]:
# `urllib` is not imported by any other cell, so import it here; without this
# the urlretrieve call below fails with a NameError.
import urllib.request

# Download the toy dataset into the current working directory, then move it
# to DBFS where the next cell reads it.
# NOTE(review): `dbutils` is not defined in this notebook -- presumably the
# Databricks-provided global. The source path of the move also assumes the
# working directory is /databricks/driver -- confirm for other environments.
toy_dataset_url = 'https://www.dropbox.com/s/dle2t3szhljfevh/criteo_toy_dataset.txt?dl=1'
urllib.request.urlretrieve(toy_dataset_url, "criteo_toy_dataset.txt")
dbutils.fs.mv("file:/databricks/driver/criteo_toy_dataset.txt", "dbfs:/train.txt")

In [0]:
# Read the tab-separated file using the explicit schema; no header option is
# set (header=None).
full_df = ss.read.csv("dbfs:/train.txt", schema=schema, sep="\t", header=None)

In [0]:
# One-row summary of the dataset: size, class counts and mean label.
full_df.agg(
    sf.count('*').alias('num_examples'),
    sf.sum(sf.col('label')).alias('num_positives'),
    # A boolean cast to int sums to the number of rows whose label is 0.
    sf.sum((sf.col('label') == 0).cast('int')).alias('num_negatives'),
    sf.avg(sf.col('label')).alias('avg_label'),
).toPandas()

## Convert to vector with one hot encoding

### Select subset of features based on number of modalities

In [0]:
# Minimum number of occurrences for a modality (distinct value) to be counted.
threshold = 100

# For each categorical feature, count the non-null modalities that appear in
# strictly more than `threshold` rows.
num_modalities = {
    cat_feat: (
        full_df
        .where(sf.col(cat_feat).isNotNull())
        .groupBy(cat_feat)
        .count()
        .where(sf.col('count') > threshold)
        .count()
    )
    for cat_feat in categorical_features
}
num_modalities

In [0]:
# Keep only the categorical features with fewer than 50 frequent modalities.
low_card_cat_feat = [
    feature_name
    for feature_name, frequent_count in num_modalities.items()
    if frequent_count < 50
]
low_card_cat_feat

### Build map for one hot encoding

In [0]:
# For every retained feature, collect to the driver the list of its non-null
# modalities that occur in strictly more than `threshold` rows.
modalities = {}
for cat_feat in low_card_cat_feat:
    rows = (
        full_df
        .where(sf.col(cat_feat).isNotNull())
        .groupBy(cat_feat)
        .count()
        .where(sf.col('count') > threshold)
        .select(cat_feat)
        .collect()
    )
    # Each collected row holds a single column: the modality itself.
    modalities[cat_feat] = [row[0] for row in rows]

In [0]:
# Build the one-hot map: feature name -> {modality -> position in the vector}.
# Positions are allocated consecutively, feature after feature; every feature
# also gets one extra position keyed by None.
one_hot_encoder = {}
index = 0
for cat_feat in low_card_cat_feat:
    values = modalities[cat_feat]
    slots = {value: index + offset for offset, value in enumerate(values)}
    index += len(values)
    slots[None] = index
    index += 1
    one_hot_encoder[cat_feat] = slots
dimension = index + 1 # dimension is nb_of_modalities + 1 for the intercept
dimension

### Convert to vector

In [0]:
def row_to_vector(
    row: Row, dimension: int, encoder: Dict[str, Dict[str, int]]
) -> Tuple[np.ndarray, int]:
    """Turn one labelled row into a dense one-hot vector and its label.

    Args:
        row: Mapping-like record providing ``row['label']`` and one entry per
            feature name present in ``encoder``.
        dimension: Length of the produced vector; the last position is
            reserved for the intercept.
        encoder: For each feature name, a map from feature value to the
            position to activate. A ``None`` key, when present, is the
            position used for a missing (``None``) value.

    Returns:
        ``(x, y)`` where ``x`` is a float array of length ``dimension`` with
        1 at the intercept position and at each encoded position, and ``y``
        is ``row['label']``. Values absent from the encoder activate nothing.
    """
    x = np.zeros(dimension)
    x[-1] = 1  # for intercept
    y = row['label']
    for feat, value_to_index in encoder.items():
        index = value_to_index.get(row[feat])
        # Identity test: position 0 is a valid index and must not be skipped.
        if index is not None:
            x[index] = 1
    return x, y

In [0]:
def convert_to_vectors(
    df: DataFrame, dimension: int, encoder: Dict[str, Dict[str, int]]
) -> RDD:
    """Encode every row of ``df`` as a ``(vector, label)`` pair.

    Only the label column and the columns named in ``encoder`` are selected
    before switching to the RDD API; each row is then passed through
    ``row_to_vector`` with the given ``dimension`` and ``encoder``.
    """
    def encode(row):
        return row_to_vector(row, dimension, encoder)

    columns = ['label', *encoder.keys()]
    return df.select(*columns).rdd.map(encode)

In [0]:
# Sanity check: display the first encoded (vector, label) pair.
convert_to_vectors(df=full_df, dimension=dimension, encoder=one_hot_encoder).first()

## Compute loss

In [0]:
def sigmoid(x: float) -> float:
    """Return the logistic function ``1 / (1 + exp(-x))`` without overflowing.

    The naive formula calls ``math.exp(-x)``, which raises ``OverflowError``
    for large negative ``x``. Each branch below only ever exponentiates a
    non-positive number, so the result saturates to 0.0 or 1.0 instead.
    """
    if x >= 0:
        return 1 / (1 + math.exp(-x))
    # Algebraically identical form for x < 0: exp(x) / (1 + exp(x)).
    exp_x = math.exp(x)
    return exp_x / (1 + exp_x)

In [0]:
# Evenly spaced abscissas on [-10, 10) with step 0.01, used by the checks below.
X = np.arange(start=-10, stop=10, step=0.01)

In [0]:
# Plot the sigmoid curve over the grid X.
plot.plot(X, list(map(sigmoid, X)))

In [0]:
def point_predict(x: np.ndarray, model: np.ndarray) -> float:
    """Return the sigmoid of the dot product between ``x`` and ``model``."""
    score = np.dot(x, model)
    return sigmoid(score)

In [0]:
def point_loss(prediction: float, y: int, eps: float = 1e-15) -> float:
    """Return the log loss (binary cross-entropy) of one prediction.

    Args:
        prediction: Predicted probability of the positive class.
        y: True label, 0 or 1.
        eps: Clipping margin. ``prediction`` is clamped to
            ``[eps, 1 - eps]`` so that a probability that is exactly 0.0 or
            1.0 yields a large finite loss instead of making ``math.log(0)``
            raise ``ValueError``.

    Returns:
        ``-y * log(p) - (1 - y) * log(1 - p)`` with ``p`` the clipped
        prediction.
    """
    clipped = min(max(prediction, eps), 1 - eps)
    return - y * math.log(clipped) - (1 - y) * math.log(1 - clipped)

In [0]:
def compute_loss(vec_label_rdd: RDD, model: np.ndarray, num_examples: int) -> float:
    """Return the summed per-example log loss divided by ``num_examples``.

    ``vec_label_rdd`` holds ``(vector, label)`` pairs; each pair is scored
    with ``point_predict`` and ``point_loss``, and the per-example losses are
    added up with a distributed reduce.
    """
    def example_loss(vec_lab):
        predicted = point_predict(vec_lab[0], model)
        return point_loss(predicted, vec_lab[1])

    per_example = vec_label_rdd.map(example_loss)
    total = per_example.reduce(lambda acc, nxt: acc + nxt)
    return total / num_examples

## Compute gradient of the loss

In [0]:
def point_gradient(x: np.ndarray, y: int, model: np.ndarray) -> np.ndarray:
    """Return the gradient of the log loss of one example w.r.t. ``model``.

    Args:
        x: Feature vector of the example.
        y: True label, 0 or 1.
        model: Current weight vector.

    Returns:
        ``(p - y) * x`` where ``p`` is the sigmoid of ``x . model``: an array
        shaped like ``x``, not a scalar.
    """
    p = sigmoid(np.dot(x, model))
    return (p - y) * x

In [0]:
def compute_gradient(vec_label_rdd: RDD, model: np.ndarray, num_examples: int) -> np.ndarray:
    """Return the summed per-example gradient divided by ``num_examples``.

    ``vec_label_rdd`` holds ``(vector, label)`` pairs; each pair is turned
    into its gradient with ``point_gradient`` and the gradients are added up
    with a distributed reduce.
    """
    def example_gradient(vec_lab):
        return point_gradient(vec_lab[0], vec_lab[1], model)

    per_example = vec_label_rdd.map(example_gradient)
    total = per_example.reduce(lambda acc, nxt: acc + nxt)
    return total / num_examples

## Check gradient with finite differences

In [0]:
def point_gradient_fd(x, y, model, h=0.001):
    """Estimate the per-example loss gradient with central finite differences.

    Each coordinate of ``model`` is shifted by ``+h`` and ``-h`` in turn, and
    the slope ``(loss_up - loss_down) / (2 * h)`` is stored as that
    coordinate of the returned array, which has ``len(x)`` entries.
    """
    n_coords = len(x)
    estimate = np.zeros(n_coords)
    offset = np.zeros(n_coords)
    for coord in range(n_coords):
        # Perturb a single coordinate; all other entries of `offset` stay 0.
        offset[coord] = h

        forward = point_loss(point_predict(x, model + offset), y)
        backward = point_loss(point_predict(x, model - offset), y)

        estimate[coord] = (forward - backward) / (2 * h)
        # Reset so the next iteration perturbs only its own coordinate.
        offset[coord] = 0.0
    return estimate

In [0]:
# Random model and random example (label 0) for the gradient check below.
model = np.random.uniform(low=-1.0, high=1.0, size=dimension)
x = np.random.uniform(low=-1.0, high=1.0, size=dimension)
y = 0

In [0]:
# L1 distance between the finite-difference and the analytical gradient.
np.abs(point_gradient_fd(x, y, model) - point_gradient(x, y, model)).sum()

## Distributed Gradient Descent

In [0]:
def logit(x: float) -> float:
    """Return the log-odds ``log(x / (1 - x))`` of a probability.

    Args:
        x: Probability strictly between 0 and 1.

    Raises:
        ValueError: If ``x`` is not strictly between 0 and 1. Without this
            check the boundaries fail inconsistently: ``x == 0`` gives a
            "math domain error" and ``x == 1`` a ``ZeroDivisionError``.
    """
    if not 0 < x < 1:
        raise ValueError(f"logit is only defined for 0 < x < 1, got {x!r}")
    return math.log(x / (1 - x))

In [0]:
# logit should invert sigmoid: total absolute round-trip error over the grid X.
np.abs(np.array([v - logit(sigmoid(v)) for v in X])).sum()

In [0]:
def smart_init(dimension: int, avg_label: float) -> np.ndarray:
    """Return an all-zero model whose last entry is ``logit(avg_label)``.

    The last position is the one that ``row_to_vector`` always sets to 1
    (the intercept), so with this start every prediction is ``avg_label``.
    """
    weights = np.zeros(dimension)
    intercept = logit(avg_label)
    weights[-1] = intercept
    return weights

In [0]:
def train(
    training_set: DataFrame,
    dimension: int,
    encoder: Dict[str, Dict[str, int]],
    nb_iter: int,
    lr: float
) -> Tuple[np.ndarray, float]:
    """Fit a logistic regression with full-batch gradient descent.

    Args:
        training_set: DataFrame holding a ``label`` column and the feature
            columns named in ``encoder``.
        dimension: Size of the model vector (encoded positions + intercept).
        encoder: One-hot map passed to ``convert_to_vectors``.
        nb_iter: Number of gradient steps.
        lr: Step size applied to the averaged gradient.

    Returns:
        ``(model, final_loss)``: the weights after ``nb_iter`` steps and the
        loss evaluated on ``training_set`` with those weights.

    Raises:
        ValueError: If ``training_set`` contains no rows.
    """
    num_examples, avg_label = training_set.agg(
        sf.count('*').alias('num_examples'),
        sf.mean('label').alias('avg_label')
    ).collect()[0]
    print(f'Num examples: {num_examples}, average label: {avg_label}')
    if num_examples == 0:
        # The mean of an empty set is None; fail here with a clear message
        # rather than deep inside the initialisation.
        raise ValueError('training_set is empty; cannot train a model')
    model = smart_init(dimension, avg_label)
    vector_label_rdd = convert_to_vectors(training_set, dimension, encoder).persist()
    try:
        for it in range(nb_iter):
            loss = compute_loss(vector_label_rdd, model, num_examples)
            print(f'Loss at step {it}: {loss}')
            gradient = compute_gradient(vector_label_rdd, model, num_examples)
            model -= lr * gradient
        final_loss = compute_loss(vector_label_rdd, model, num_examples)
        print(f'Loss at step {nb_iter}: {final_loss}')
    finally:
        # Release the cached vectors even when an iteration fails; otherwise
        # every call to train() leaves one more persisted RDD behind.
        vector_label_rdd.unpersist()
    return model, final_loss

In [0]:
# Run 10 gradient steps with a step size of 1 on the full dataset.
model, loss = train(
    training_set=full_df,
    dimension=dimension,
    encoder=one_hot_encoder,
    nb_iter=10,
    lr=1,
)

In [0]:
# Print the learned weights: the intercept first (last model entry), then one
# line per (feature, modality) pair of the encoder.
print(f'intercept -> {model[-1]}')
for dim, mapping in one_hot_encoder.items():
    for mod, index in mapping.items():
        print(f'{dim}={mod} -> {model[index]}')

In [0]:
# Stop the Spark session `ss` now that the notebook is done with it.
ss.stop()

### More questions

* Use sparse vectors
* Add feature hashing
* Merge the loss and gradient computations into a single function
* Use L-BFGS from SciPy