In [81]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import sklearn.linear_model
import sklearn.metrics
import numpy as np
from tqdm import tqdm as tqdm

In [4]:
# Read only the first million rows of the training CSV so the experiments
# below stay quick enough to iterate on interactively.
df = pd.read_csv("~/data/avazu/train", nrows=10 ** 6)

In [5]:
# Label column, followed by the categorical feature columns that are passed
# to pd.get_dummies further down.
target = "click"
CAT_COLS = [
    "C1",
    "banner_pos",
    "site_category",
    "app_category",
    "device_type",
    "device_conn_type",
]

In [6]:
# One-hot encode the categorical columns: each distinct level of each column
# becomes its own indicator column.
df_enc = pd.get_dummies(df[CAT_COLS], columns=CAT_COLS)

# Single frame holding the label column followed by the encoded features.
df_final = pd.concat([df[target], df_enc], axis="columns")

In [7]:
# NumPy arrays for the estimators: encoded feature matrix and label vector.
Xs, y = df_enc.values, df[target].values

In [44]:
# Baseline: logistic regression fitted with the SAGA solver. The log loss is
# computed on the same rows the model was fitted on (no hold-out split here).
lm = sklearn.linear_model.LogisticRegression(
    solver="saga", max_iter=100, verbose=1
).fit(Xs, y)

yh = lm.predict_proba(Xs)
score = sklearn.metrics.log_loss(y_true=y, y_pred=yh)
print("Log Loss: {}".format(score))

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


convergence after 59 epochs took 74 seconds


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.2min finished


Log Loss: 0.42532564432012976


In [45]:
# Same logistic objective, optimised with plain SGD for at most 20 epochs.
# NOTE(review): scikit-learn >= 1.1 spells this loss "log_loss" and later
# releases drop "log" -- confirm against the version this notebook is pinned to.
lm = sklearn.linear_model.SGDClassifier(
    loss="log",
    alpha=.1,
    l1_ratio=0,
    learning_rate="adaptive",
    eta0=.001,
    max_iter=20,
    shuffle=True,
    verbose=1,
).fit(Xs, y)

# Scored on the training rows, like the baseline above.
yh = lm.predict_proba(Xs)
score = sklearn.metrics.log_loss(y_true=y, y_pred=yh)
print("Log Loss: {}".format(score))

-- Epoch 1
Norm: 0.20, NNZs: 66, Bias: -1.738578, T: 1000000, Avg. loss: 0.434745
Total training time: 0.49 seconds.
-- Epoch 2
Norm: 0.17, NNZs: 66, Bias: -1.727277, T: 2000000, Avg. loss: 0.434426
Total training time: 0.99 seconds.
-- Epoch 3
Norm: 0.22, NNZs: 66, Bias: -1.726153, T: 3000000, Avg. loss: 0.434416
Total training time: 1.49 seconds.
-- Epoch 4
Norm: 0.18, NNZs: 66, Bias: -1.738806, T: 4000000, Avg. loss: 0.434413
Total training time: 1.99 seconds.
-- Epoch 5
Norm: 0.19, NNZs: 66, Bias: -1.741730, T: 5000000, Avg. loss: 0.434401
Total training time: 2.49 seconds.
-- Epoch 6
Norm: 0.21, NNZs: 66, Bias: -1.719802, T: 6000000, Avg. loss: 0.434430
Total training time: 3.00 seconds.
-- Epoch 7
Norm: 0.21, NNZs: 66, Bias: -1.719315, T: 7000000, Avg. loss: 0.434280
Total training time: 3.50 seconds.
-- Epoch 8
Norm: 0.20, NNZs: 66, Bias: -1.714909, T: 8000000, Avg. loss: 0.434277
Total training time: 3.99 seconds.
-- Epoch 9
Norm: 0.21, NNZs: 66, Bias: -1.721505, T: 9000000, Av



Log Loss: 0.4342363321805997


# PyTorch Experiments

In [61]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as utils

In [116]:
class LinearModel(nn.Module):
    """Single affine layer that emits one raw score (logit) per class.

    No softmax is applied in ``forward``; the output is intended for a loss
    such as ``nn.CrossEntropyLoss`` that normalises the logits itself.

    Args:
        k: Number of input features.
        num_classes: Number of scores produced per sample. Defaults to 2,
            which is the binary setup this class was originally written for.
    """

    def __init__(self, k, num_classes=2):
        super(LinearModel, self).__init__()
        self.fc = nn.Linear(k, num_classes)

    def forward(self, x):
        """Map ``x`` of shape ``(..., k)`` to logits of shape ``(..., num_classes)``."""
        return self.fc(x)

# Size the input layer from the encoded feature matrix rather than a literal,
# so the model still matches the data if the set of dummy columns changes.
net = LinearModel(Xs.shape[1])
# CrossEntropyLoss consumes the raw logits returned by the model together
# with integer class labels.
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

In [120]:
# Features are cast to float32 before becoming a tensor; labels are wrapped
# as-is and keep whatever dtype `y` has.
xt = torch.from_numpy(Xs.astype(np.float32))
yt = torch.from_numpy(y)

# Pair features with labels and serve them in shuffled mini-batches of 1024.
train_data = utils.TensorDataset(xt, yt)
train_loader = utils.DataLoader(dataset=train_data, batch_size=1024, shuffle=True)

In [121]:
# Number of full passes over the training data. The loop previously ignored
# this setting and hard-coded three passes; the value now matches that and
# actually drives the loop.
num_epochs = 3
for epoch in range(num_epochs):
    # Sum of the per-batch mean losses seen during this epoch.
    running_loss = 0.0
    for inputs, labels in train_loader:
        # Gradients accumulate across backward() calls, so clear them first.
        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print("Running Loss: {}".format(running_loss))
print('Finished Training')


Running Loss: 415.6251060664654
Running Loss: 415.62961107492447
Running Loss: 415.6205635666847
Finished Training


In [122]:
# Turn the network's logits into class probabilities without tracking
# gradients, then score them against the training labels.
with torch.no_grad():
    yh = F.softmax(net(xt), dim=1)

score = sklearn.metrics.log_loss(y_true=y, y_pred=yh)
print("Log Loss: {}".format(score))

Log Loss: 0.42537413860317924
