In [60]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import math

## Load data

In [72]:
filename = "./fixtures/net_20180312_201803114_100k"
features = "PageID OrderID".split()
target = "IsClick"
columns = features + [target]
data = []
with open(filename) as fd:
    header = fd.readline()[2:].strip().split("\t")
    col_to_index = dict((col, i) for i, col in enumerate(header))
    column_indices = [col_to_index[col] for col in columns]
    for line in fd:
        splitted = line.strip().split("\t")
        data.append([int(splitted[idx]) for idx in column_indices])

print(col_to_index)        
print(columns)
print(features)
print(data[:4])

{'HitLogID': 0, 'UniqID': 1, 'IsClick': 2, 'ShowTime': 3, 'PhraseID': 4, 'TargetDomainID': 5, 'PageID': 6, 'OrderID': 7, 'BannerID': 8, 'QueryLemmaH': 9, 'BannerTitleLemmaH': 10, 'DeviceType': 11, 'RegionID': 12}
['PageID', 'OrderID', 'IsClick']
['PageID', 'OrderID']
[[258763, 18951888, 0], [264633, 14829991, 1], [249430, 1026618, 0], [261025, 19335144, 1]]


## Calculate feature map and filter rare features

In [73]:
# Calculate feature map via simple enumeration
from collections import defaultdict

feature_stats = defaultdict(dict)
for example in data:
    for fid, col in zip(example[:-1], features):
        feature_stats[col][fid] = feature_stats[col].get(fid, 0) + 1

# calculate feature map + filter rare features 
min_counts = 2
X, y = [], []
unk_fid = 0
fid = 1
feature_map = defaultdict(dict)
for column, col_data in feature_stats.items():
    for feature, counts in col_data.items():
        if counts >= min_counts:
            feature_map[column][feature] = fid
            fid += 1
        else:
            feature_map[column][feature] = unk_fid
print("Number of unique features after filtering reare features = {}".format(fid))
num_features = fid

# remap features in dataset
for rec in data:
    X.append([feature_map[column][fid] for column, fid in zip(features, rec[:-1])])
    y.append(rec[-1])

from collections import namedtuple
Dataset = namedtuple("Dataset", "X y")    
dataset = Dataset(X, y)

Number of unique features after filtering reare features = 19135


In [74]:
# Train / Test split
import numpy as np

dataset = Dataset(np.asarray(dataset.X), np.asarray(dataset.y))
indices = np.arange(len(dataset.X))
np.random.shuffle(indices)
test_begin = int(0.9 * len(indices))
train = Dataset(np.take(dataset.X, indices[:test_begin], axis=0), 
                np.take(dataset.y, indices[:test_begin], axis=0))
test = Dataset(np.take(dataset.X, indices[test_begin:], axis=0), 
                np.take(dataset.y, indices[test_begin:], axis=0))

In [75]:
def batch_iter(dataset, batch_size):
    for start in range(0, len(dataset.X), batch_size):
        yield dataset.X[start:start + batch_size], dataset.y[start:start + batch_size]

## Pytorch model

In [76]:
def glorot(m):
    size = m.weight.size() # returns a tuple
    fan_out = size[0] # number of rows
    fan_in = size[1]
    if isinstance(m, nn.Linear):
        scale = math.sqrt(2.0 / (fan_in + fan_out))
    elif isinstance(m, nn.Embedding):
        scale = math.sqrt(2.0 / (1.0 + fan_in * fan_out))
    else:
        raise NotImplementedError
    m.weight.data.uniform_(-scale, scale)

    
class FFM(nn.Module):
    def __init__(self, *args, **kwargs):
        super(FFM, self).__init__()
        self.num_features = kwargs["num_features"]
        self.dim = kwargs["dim"]
        self.num_fields = kwargs["num_fields"]
        self.use_unary = kwargs["use_unary"]
        
        # create parameters
        self.embeddings = nn.Embedding(self.num_features, self.dim)
        out_dim = self.dim
        if self.use_unary:
            self.unary = nn.Embedding(self.num_features, 1)
            out_dim += self.num_fields
        self.projection = nn.Linear(out_dim, 2)
        # initialize parameters
        glorot(self.embeddings)
        glorot(self.projection)
        if self.use_unary:
            glorot(self.unary)        
        
    def forward(self, X):
        """
        :param self:
        :param X: B (batch size) x F (number of features)
        :return:
        """
        # quadratic cross embeddings: (a+b+c)**2 - a**2 - b**2 - c**2 = 2 * (ab + bc + ac)
        embeddings = self.embeddings(X)  # B x F x D
        embeddings_sum = embeddings.sum(dim=1)  # B x D
        sum_squares = torch.mul(embeddings, embeddings).sum(dim=1)  # B x D
        quadratic = 0.5 * (torch.mul(embeddings_sum, embeddings_sum) - sum_squares)
        if self.use_unary:
            unary = self.unary(X)  # B x F x 1
            unary = unary.squeeze(dim=2)  # B x F
            out = torch.cat((quadratic, unary), dim=1)  # B x (F + D)
        else:
            out = quadratic
        y = self.projection(out)
        logsoftmax = nn.LogSoftmax(dim=1)
        y = logsoftmax(y)
        return y
        

## Train model

In [77]:
USE_CUDA = False
#gpus = [0]
#torch.cuda.set_device(gpus[0])
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor

In [82]:
features = "PageID OrderID".split()
# Only factorization
conf = {
    "use_unary": False,
    "num_features": num_features,
    "dim": 10,
    "num_iter": 5,
    "batch_size": 128,
    "num_fields": len(features)
}

model = FFM(**conf)
loss_func = nn.NLLLoss()
#optimizer = optim.SGD(model.parameters(), lr=5.0)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Loss on test before learning
test_targets = autograd.Variable(LongTensor(test.y))
test_features = autograd.Variable(LongTensor(test.X))
test_logprob = model.forward(test_features)
test_loss = loss_func(test_logprob, test_targets)
print("it={it}, test loss={loss}".format(it=-1, loss=float(test_loss)))

iter_loss = []
for it in range(conf["num_iter"]):
    data_iter = batch_iter(train, batch_size=conf["batch_size"])
    batch_loss = torch.Tensor([0])
    iter_loss.append(0)

    for Xb, yb in data_iter:
        targets = autograd.Variable(LongTensor(yb))
        features = autograd.Variable(LongTensor(Xb))
        model.zero_grad()
        logprob = model.forward(features)
        loss = loss_func(logprob, targets)
        loss.backward()
        optimizer.step()

        iter_loss[-1] += loss.data
        
    model.zero_grad()
    test_logprob = model.forward(test_features)
    test_loss = loss_func(test_logprob, test_targets)
    print("it={it}, train loss={loss}, test_loss={test}".format(it=it, loss=float(iter_loss[-1]),
                                                                test=float(test_loss)))

it=-1, test loss=0.6518734693527222
it=0, train loss=409.041015625, test_loss=0.566365659236908
it=1, train loss=375.3892517089844, test_loss=0.5507497787475586
it=2, train loss=347.3578186035156, test_loss=0.5423299074172974
it=3, train loss=319.36700439453125, test_loss=0.560049831867218
it=4, train loss=299.4649963378906, test_loss=0.5908413529396057


In [81]:
features = "PageID OrderID".split()
conf = {
    "use_unary": True,
    "num_features": num_features,
    "dim": 10,
    "num_iter": 10,
    "batch_size": 128,
    "num_fields": len(features)
}

model = FFM(**conf)
loss_func = nn.NLLLoss()
#optimizer = optim.SGD(model.parameters(), lr=5.0)
optimizer = optim.Adam(model.parameters(), lr=5e-4)

# Loss on test before learning
test_targets = autograd.Variable(LongTensor(test.y))
test_features = autograd.Variable(LongTensor(test.X))
test_logprob = model.forward(test_features)
test_loss = loss_func(test_logprob, test_targets)
print("it={it}, test loss={loss}".format(it=-1, loss=float(test_loss)))

iter_loss = []
for it in range(conf["num_iter"]):
    data_iter = batch_iter(train, batch_size=conf["batch_size"])
    batch_loss = torch.Tensor([0])
    iter_loss.append(0)

    for Xb, yb in data_iter:
        targets = autograd.Variable(LongTensor(yb))
        features = autograd.Variable(LongTensor(Xb))
        model.zero_grad()
        logprob = model.forward(features)
        loss = loss_func(logprob, targets)
        loss.backward()
        optimizer.step()

        iter_loss[-1] += loss.data
        
    model.zero_grad()
    test_logprob = model.forward(test_features)
    test_loss = loss_func(test_logprob, test_targets)
    print("it={it}, train loss={loss}, test_loss={test}".format(it=it, loss=float(iter_loss[-1]),
                                                                test=float(test_loss)))

it=-1, test loss=0.6916554570198059
it=0, train loss=429.96173095703125, test_loss=0.5636969208717346
it=1, train loss=372.3305969238281, test_loss=0.535670816898346
it=2, train loss=352.6449279785156, test_loss=0.5236057639122009
it=3, train loss=337.5802917480469, test_loss=0.5173668265342712
it=4, train loss=321.42633056640625, test_loss=0.5166086554527283
it=5, train loss=303.61712646484375, test_loss=0.5264502763748169
it=6, train loss=287.7060546875, test_loss=0.5458723306655884
it=7, train loss=275.47357177734375, test_loss=0.5688799023628235
it=8, train loss=265.957763671875, test_loss=0.5922778844833374
it=9, train loss=258.05462646484375, test_loss=0.6154462695121765
