In [126]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import sklearn.linear_model
import sklearn.metrics
import scipy.special
import numpy as np
import time
import math
from tqdm import tqdm as tqdm

In [100]:
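# One-off step: parse the first 10M rows of the raw Avazu training CSV.
# Left commented out; uncomment it (and the `to_feather` cell below) to rebuild the cache.
# df = pd.read_csv(
#     "~/data/avazu/train",
#     nrows=10000000
# )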

In [90]:
# One-off step: cache the parsed rows as a feather file; the next cell reads it back.
# df.to_feather("~/data/avazu/train_10M.feather")

In [114]:
# Load the cached feather file; every later cell derives its data from `df`.
df = pd.read_feather("~/data/avazu/train_10M.feather")

In [116]:
# Name of the label column in `df`.
target = "click"

# Categorical columns that are one-hot encoded and used as model features.
CAT_COLS = [
    "C1",
    "banner_pos",
    "site_category",
    "app_category",
    "device_type",
    "device_conn_type",
]

In [146]:
# One-hot encode every categorical feature column.
df_enc = pd.get_dummies(df[CAT_COLS], columns=CAT_COLS)

# Label column followed by the encoded feature columns, side by side.
df_final = pd.concat([df[target], df_enc], axis="columns")

In [141]:
# Number of rows to train on. Clamp it to the rows actually loaded: the
# permutation below indexes positions 0..nrows-1, so a frame shorter than the
# requested size would otherwise raise IndexError.
nrows = min(4_000_000, len(df))

# Fixed seed so the shuffle (and therefore the mini-batch order) is reproducible.
np.random.seed(0)
r_order = np.random.permutation(nrows)

# Slice first, then convert: only the first `nrows` rows are turned into a
# NumPy array instead of the whole encoded frame.
Xs = df_enc.iloc[:nrows].values[r_order]
y = df[target].iloc[:nrows].values[r_order]

In [143]:
# Prepend a column of integer ones to the features so that theta[0] plays the
# role of the intercept in the NumPy models below.
Xc = np.hstack([np.ones((Xs.shape[0], 1), dtype=int), Xs])

# Numpy Implementation

In [49]:
def predict(Xs, theta):
    """Return the logistic-model probabilities for each row of `Xs`.

    Computes sigmoid(Xs . theta), i.e. the inner product of every row with the
    weight vector `theta` passed through the logistic function.
    """
    return scipy.special.expit(np.inner(Xs, theta))

In [39]:
def calc_grad(X_b, y_b, theta):
    """Return the mean log-loss gradient of the logistic model on one batch.

    `X_b` holds the batch rows, `y_b` the matching labels and `theta` the
    current weights. The result is X_b^T (sigmoid(X_b . theta) - y_b) divided
    by the number of labels in the batch.
    """
    residual = scipy.special.expit(np.inner(X_b, theta)) - y_b
    return np.inner(X_b.T, residual) / y_b.shape[0]

# Numpy ADAM

In [145]:
# Hyper-parameters.
lr, batch_size = 0.1, 512   # base step size, rows per mini-batch
eps = 1e-8                  # added to the denominator of the update
b1, b2 = 0.9, 0.999         # decay rates of the first / second moment averages

t1 = time.time()

# Weights and both moment estimates start at zero, one entry per column of Xc.
theta, mt, vt = (np.zeros(shape=(Xc.shape[1])) for _ in range(3))
# Running powers b1**t and b2**t, used for the bias correction of the step size.
b1t, b2t = b1, b2

n_rows = len(Xc)
start_idx, t = 0, 1
# A single pass over full batches only; rows after the last full batch are not visited.
for batch_idx in range(n_rows // batch_size):
    stop_idx = start_idx + batch_size
    X_b, y_b = Xc[start_idx:stop_idx], y[start_idx:stop_idx]
    cur_grad = calc_grad(X_b, y_b, theta)

    # Exponential moving averages of the gradient and of its element-wise square.
    mt = b1*mt + (1-b1)*cur_grad
    vt = b2*vt + (1-b2)*(cur_grad * cur_grad)

    # Bias-corrected step size with an additional 1/sqrt(t) decay.
    # Variant without the decay: at = lr * np.sqrt(1-b2t)/(1-b1t)
    at = (lr/math.sqrt(t))*np.sqrt(1-b2t)/(1-b1t)
    theta -= at*mt/(np.sqrt(vt)+eps)

    # Advance to the next batch and the next power of each decay rate.
    start_idx = stop_idx
    b1t, b2t = b1t*b1, b2t*b2
    t += 1

t2 = time.time()
print("Total Time: {}".format(t2-t1))

# Log loss on the same rows the model was fitted on.
yh = predict(Xc, theta)
score = sklearn.metrics.log_loss(y, yh)
print("Log Loss: {}".format(score))

Total Time: 1.4434258937835693
Log Loss: 0.4515423413330194


# Numpy LR (plain mini-batch SGD with a constant learning rate)

In [67]:
# Plain mini-batch gradient descent: zero-initialised weights, constant step size.
theta = np.zeros(shape=(Xc.shape[1]))
lr, batch_size = 0.001, 128

n_rows = len(Xc)
start_idx = 0

t1 = time.time()
# A single pass over full batches only; rows after the last full batch are not visited.
for batch_idx in range(n_rows // batch_size):
    stop_idx = start_idx + batch_size
    X_b, y_b = Xc[start_idx:stop_idx], y[start_idx:stop_idx]
    cur_grad = calc_grad(X_b, y_b, theta)
    theta -= lr*cur_grad
    start_idx = stop_idx

t2 = time.time()
print("Total Time: {}".format(t2-t1))

Total Time: 0.3488016128540039


In [68]:
# Log loss on the training rows, using whichever `theta` was fitted most recently.
yh = predict(Xc, theta)
score = sklearn.metrics.log_loss(y, yh)
print("Log Loss: {}".format(score))

Log Loss: 0.4380691840129385


# Vowpal Wabbit

In [47]:
def process_df(df, f_name):
    """Write every row of `df` to the text file `f_name`, one example per line.

    Each line has the form "<label> | <col>=<value> ..." with one
    "<col>=<value>" token for every column in the global `CAT_COLS`. The label
    is read from the global `target` column and written as -1 when it equals 0
    and as 1 otherwise (presumably the -1/+1 convention Vowpal Wabbit expects
    for logistic loss; confirm against the VW invocation).
    """
    with open(f_name, "w") as out_file:
        for record in tqdm(df.itertuples()):
            lval = -1 if getattr(record, target) == 0 else 1
            features = " ".join(
                "{}={}".format(col, getattr(record, col)) for col in CAT_COLS
            )
            out_file.write("{} | {}\n".format(lval, features))

In [48]:
# Export the full frame to "vw.txt" (relative to the current working directory).
process_df(df, "vw.txt")

1000000it [00:07, 129191.29it/s]


# Alternative Methods

In [138]:
# Baseline: scikit-learn logistic regression with the L-BFGS solver,
# single job, capped at 30 iterations.
lm = sklearn.linear_model.LogisticRegression(
    solver="lbfgs", max_iter=30, n_jobs=1, verbose=1
)
lm.fit(Xs, y)

# Score the fitted model on the training rows using its class probabilities.
yh = lm.predict_proba(Xs)
score = sklearn.metrics.log_loss(y, yh)
print("Log Loss: {}".format(score))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   16.0s finished


Log Loss: 0.4518602557622661


In [32]:
# scikit-learn 1.1 renamed the logistic loss from "log" to "log_loss" and
# 1.3 removed the old spelling, so choose the name this installation accepts.
sk_version = tuple(int(part) for part in sklearn.__version__.split(".")[:2])
sgd_loss = "log_loss" if sk_version >= (1, 1) else "log"

# Baseline: scikit-learn SGD with logistic loss and an adaptive learning rate.
lm = sklearn.linear_model.SGDClassifier(
    loss=sgd_loss,
    l1_ratio=0,
    alpha=.1,
    learning_rate="adaptive",
    eta0=.001,
    verbose=1,
    shuffle=True,
    max_iter=20,
    n_jobs=4,
)
lm.fit(Xs, y)

# Score the fitted model on the training rows using its class probabilities.
yh = lm.predict_proba(Xs)
score = sklearn.metrics.log_loss(y, yh)
print("Log Loss: {}".format(score))

-- Epoch 1
Norm: 0.21, NNZs: 66, Bias: -1.720659, T: 1000000, Avg. loss: 0.434751
Total training time: 0.55 seconds.
-- Epoch 2
Norm: 0.20, NNZs: 66, Bias: -1.730077, T: 2000000, Avg. loss: 0.434397
Total training time: 1.10 seconds.
-- Epoch 3
Norm: 0.20, NNZs: 66, Bias: -1.708292, T: 3000000, Avg. loss: 0.434396
Total training time: 1.66 seconds.
-- Epoch 4
Norm: 0.24, NNZs: 66, Bias: -1.721131, T: 4000000, Avg. loss: 0.434430
Total training time: 2.21 seconds.
-- Epoch 5
Norm: 0.24, NNZs: 66, Bias: -1.718980, T: 5000000, Avg. loss: 0.434426
Total training time: 2.76 seconds.
-- Epoch 6
Norm: 0.20, NNZs: 66, Bias: -1.729255, T: 6000000, Avg. loss: 0.434420
Total training time: 3.31 seconds.
-- Epoch 7
Norm: 0.21, NNZs: 66, Bias: -1.712717, T: 7000000, Avg. loss: 0.434274
Total training time: 3.87 seconds.
-- Epoch 8
Norm: 0.21, NNZs: 66, Bias: -1.715451, T: 8000000, Avg. loss: 0.434283
Total training time: 4.41 seconds.
-- Epoch 9
Norm: 0.20, NNZs: 66, Bias: -1.727475, T: 9000000, Av



Log Loss: 0.4342404762471195


# PyTorch Experiments

In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as utils
# Restrict PyTorch to one CPU thread (presumably so the timing is comparable
# with the single-threaded runs elsewhere in this notebook; confirm).
torch.set_num_threads(1)

In [11]:
class LinearModel(nn.Module):
    """A single fully connected layer mapping `k` input features to 2 outputs."""

    def __init__(self, k):
        super().__init__()
        self.fc = nn.Linear(k, 2)

    def forward(self, x):
        # The raw layer outputs are returned; no log-softmax is applied here.
        return self.fc(x)

# Size the input layer from the feature matrix rather than hard-coding the
# column count: the number of one-hot columns depends on which category
# values occur in the loaded rows.
net = LinearModel(Xs.shape[1])
# net = nn.Linear(Xs.shape[1], 2)

# Cross-entropy over the two output units, optimised with Adam.
criterion = torch.nn.CrossEntropyLoss()
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.5)
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [12]:
# Features are converted to float32 before wrapping; labels keep y's dtype.
xt = torch.from_numpy(Xs.astype(np.float32))
yt = torch.from_numpy(y)

# Pair features with labels and serve them as shuffled mini-batches of 256
# rows, loaded by one worker process.
train_data = utils.TensorDataset(xt, yt)
train_loader = utils.DataLoader(train_data, batch_size=256, shuffle=True, num_workers=1)

In [13]:
start_time = time.time()
num_epochs = 1
for epoch in range(num_epochs):
    # Sum of the per-batch loss values seen during this epoch.
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()

        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print("Running Loss: {}".format(running_loss))
end_time = time.time()
print('Finished Training in :{}'.format((end_time - start_time)))

Running Loss: 1682.9635597467422
Finished Training in :14.4614098072052


In [14]:
# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    # Softmax over the two outputs turns them into per-class probabilities.
    yh = F.softmax(net(xt), dim=1)

# Log loss on the same rows the network was trained on.
score = sklearn.metrics.log_loss(y, yh)
print("Log Loss: {}".format(score))

Log Loss: 0.4254369985373933
