In [19]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim
import math

## Load data

In [20]:
filename = "net_20180312_201803114_1k"
features = "PageID OrderID".split()
target = "IsClick"
columns = features + [target]
data = []
with open(filename) as fd:
    header = fd.readline()[2:].strip().split("\t")
    col_to_index = dict((col, i) for i, col in enumerate(header))
    print(col_to_index)
    column_indices = [col_to_index[col] for col in columns]
    for line in fd:
        splitted = line.strip().split("\t")
        data.append([int(splitted[idx]) for idx in column_indices])
print(columns)
print(features)
print(data[:4])

{'HitLogID': 0, 'UniqID': 1, 'IsClick': 2, 'ShowTime': 3, 'PhraseID': 4, 'TargetDomainID': 5, 'PageID': 6, 'OrderID': 7, 'BannerID': 8, 'QueryLemmaH': 9, 'BannerTitleLemmaH': 10, 'DeviceType': 11, 'RegionID': 12}
['PageID', 'OrderID', 'IsClick']
['PageID', 'OrderID']
[[258763, 18951888, 0], [264633, 14829991, 1], [249430, 1026618, 0], [261025, 19335144, 1]]


## Calculate feature map and filter rare features

In [21]:
# Calculate feature map via simple enumeration
from collections import defaultdict

feature_stats = defaultdict(dict)
feature_map = defaultdict(dict)
counter = 1
for example in data:
    for fid, col in zip(example[:-1], features):        
        if fid not in feature_map[col]:
            feature_map[col][fid] = counter
            counter += 1
        feature_stats[col][fid] = feature_stats[col].get(fid, 0) + 1

#print(feature_stats)
from collections import namedtuple
Dataset = namedtuple("Dataset", "X y")

# remap features using feature map + filter rare features 
min_counts = 2
X, y = [], []
unk_fid = 0
for rec in data:
    X.append([feature_map[col][fid] if feature_stats[col][fid] >= min_counts else unk_fid 
                 for col, fid in zip(features, rec[:-1])])
    y.append(rec[-1])
dataset = Dataset(X, y)

# calculate number of features
num_features = 0
for f, fdata in feature_stats.items():
    for fid, counts in fdata.items():
        if counts >= min_counts:
            num_features += 1
print("Number of features = {}".format(num_features))

Number of features = 168


In [22]:
def batch_iter(dataset, batch_size):
    for start in range(0, len(dataset.X), batch_size):
        yield dataset.X[start:start + batch_size], dataset.y[start:start + batch_size]

## Pytorch model

In [46]:
class FFM(nn.Module):
    def __init__(self, *args, **kwargs):
        super(FFM, self).__init__()

        self.num_features = kwargs["num_features"]
        self.dim = kwargs["dim"]
        self.embeddings = nn.Embedding(self.num_features, self.dim)
        self.unary = nn.Embedding(self.num_features, 1)
        self.num_fields = kwargs["num_fields"]
        #self.logsigmoid = nn.LogSigmoid()
        
        # initialize weights
        glorot = math.sqrt(2.0 / (self.num_features * self.dim + 1.0))
        self.embeddings.weight.data.uniform_(-glorot, glorot)
        glorot = math.sqrt(2.0 / (self.num_features + 1.0))
        self.unary.weight.data.uniform_(-glorot, glorot)

        self.projection = nn.Linear(self.dim + self.num_fields, 1)
        
    def forward(self, X):
        """
        :param self:
        :param X: B (batch size) x F (number of features)
        :return:
        """
        embeddings = self.embeddings(X)  # B x F x D
        embeddings_sum = embeddings.sum(dim=1)  # B x 1 x D
        sum_squares = torch.mul(embeddings, embeddings).sum(dim=1)  # B x 1 x D
        quadratic = 0.5 * (torch.mul(embeddings_sum, embeddings_sum) - sum_squares)
        unary = self.unary(X)  # B x F x 1
        unary = unary.squeeze(dim=2)
        
        concat = torch.cat((quadratic, unary), dim=1)
        print(concat.size())
        print(unary.size())
        print(quadratic.size())
        logsigmoid = nn.LogSigmoid()
        return logsigmoid(concat)


## Train model

In [48]:
USE_CUDA = False
LongTensor = torch.cuda.LongTensor if USE_CUDA else torch.LongTensor

conf = {
    "num_features": num_features,
    "dim": 10,
    "num_iter": 1,
    "batch_size": 64,
    "num_fields": len(features)
}
num_iter = conf["num_iter"]


model = FFM(**conf)
loss_func = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001)
for it in range(num_iter):
    data_iter = batch_iter(dataset, batch_size=conf["batch_size"])
    print("Iteration {iter}".format(iter=it))
    for Xb, yb in data_iter:
        targets = autograd.Variable(LongTensor(yb))
        features = autograd.Variable(LongTensor(Xb))
        
        model.zero_grad()
        logprob = model.forward(features)
        loss = loss_func(logprob, targets)
        loss.backward()
        
        optimizer.step()

Iteration 0
torch.Size([64, 12])
torch.Size([64, 2])
torch.Size([64, 10])


RuntimeError: index out of range at /pytorch/torch/lib/TH/generic/THTensorMath.c:277