In [1]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.autograd import Variable

import numpy as np


In [2]:
import zlib

In [3]:
from nltk import ngrams

In [4]:
import spacy
# Load the English spaCy pipeline once at module level; tokenizer() below reuses it.
# NOTE(review): 'en' is a shortcut link; spaCy 3 dropped shortcut links and expects the
# installed package name (e.g. 'en_core_web_sm') — confirm against the installed version.
spacy_en = spacy.load('en')


def tokenizer(text, alpha_only=True):  # create a tokenizer function
    """Split ``text`` into token strings using the module-level spaCy tokenizer.

    :param text: string to tokenize
    :param alpha_only: when truthy, keep only tokens whose ``is_alpha`` flag is set;
        when falsy, keep every token
    :return: list of token texts in their original order
    """
    spacy_tokens = spacy_en.tokenizer(text)
    if alpha_only:
        return [token.text for token in spacy_tokens if token.is_alpha]
    return [token.text for token in spacy_tokens]

In [5]:
def cosine_similarity(x1, x2, dim=1, eps=1e-8):
    r"""Cosine similarity of ``x1`` and ``x2`` along ``dim``.

    Local stand-in kept to support old versions of PyTorch.

    .. math ::
        \text{similarity} = \dfrac{x_1 \cdot x_2}{\max(\Vert x_1 \Vert _2 \cdot \Vert x_2 \Vert _2, \epsilon)}

    Args:
        x1 (Tensor): First input.
        x2 (Tensor): Second input (of size matching x1).
        dim (int, optional): Dimension that is reduced. Default: 1
        eps (float, optional): Lower bound applied to the product of the two
            norms, so that zero vectors do not cause a division by zero.
            Default: 1e-8

    Returns:
        Tensor with the shape of the inputs minus dimension ``dim``.

    Example::

        >>> input1 = torch.randn(100, 128)
        >>> input2 = torch.randn(100, 128)
        >>> output = cosine_similarity(input1, input2)
        >>> print(output)
    """
    dot_product = (x1 * x2).sum(dim)
    norm_product = x1.norm(2, dim) * x2.norm(2, dim)
    # Clamp the denominator rather than adding eps: non-degenerate inputs are unaffected.
    return dot_product / norm_product.clamp(min=eps)

In [70]:
def loss(distances, target, alpha = 0.4):
    """
    Sum of the absolute errors that are strictly larger than the margin ``alpha``.

    Errors at or below ``alpha`` contribute nothing to the result.

    :param distances 1d Tensor shape: (num_examples, )
    :param target 1d Tensor shape: (num_examples, )
    :param alpha margin below which an error is ignored
    :return 0-dim Tensor holding the summed error
    """
    abs_error = (distances - target).abs()
    over_margin = abs_error > alpha
    return abs_error[over_margin].sum()

In [7]:
def pad_list(batch, pad=None):
    """Pad every list in ``batch`` in place to the length of the longest one.

    :param batch: list of lists; each inner list is extended in place
    :param pad: zero-argument callable that produces one padding element per
        missing slot (called once per slot, so mutable pads are not shared).
        Defaults to ``list``, i.e. each padding element is a fresh empty list.
    :return: the same ``batch`` object, now with equal-length inner lists.
        An empty batch is returned unchanged.
    """
    if pad is None:
        pad = list

    # default=0 keeps an empty batch from raising inside max().
    max_len = max(map(len, batch), default=0)

    for seq in batch:
        # Measuring the current length here means a list object that appears more
        # than once in the batch is padded only once.
        seq.extend(pad() for _ in range(max_len - len(seq)))

    return batch


def pad_numpy(sequences, max_len=None):
    """Pack variable-length integer sequences into a zero-padded 2-D array.

    :param sequences - list of lists
    :param max_len - number of columns of the result; when omitted (or falsy)
        the length of the longest sequence is used
    :return tuple ``(seq_tensor, seq_lengths)``: an int array of shape
        ``(len(sequences), max_len)`` with each sequence left-aligned and the
        remainder filled with zeros, and the list of original lengths
    :raises ValueError when a sequence is longer than ``max_len``
    """
    seq_lengths = [len(seq) for seq in sequences]
    # default=0 lets an empty ``sequences`` produce an empty (0, 0) array
    # instead of failing inside max().
    max_len = max_len or max(seq_lengths, default=0)

    seq_tensor = np.zeros((len(sequences), max_len), dtype=int)

    for idx, (seq, seqlen) in enumerate(zip(sequences, seq_lengths)):
        if seqlen > max_len:
            # Without this check the assignment below fails with an opaque
            # broadcasting error.
            raise ValueError(
                "sequence {} has length {} which exceeds max_len {}".format(idx, seqlen, max_len)
            )
        seq_tensor[idx, :seqlen] = np.array(seq, dtype=int)

    return seq_tensor, seq_lengths


def pad_batch(batch):
    """Turn a batch of encoded texts into one zero-padded 3-D integer array.

    :param batch: list of sentences, each a list of words, each a list of
        integer ids. The sentences are padded in place (via ``pad_list``) with
        empty words up to the longest sentence.
    :return: array of shape ``(len(batch), longest sentence, longest word)``
    :raises ValueError: for an empty batch or a batch with no words at all
    """
    # Width of the last axis: the longest word anywhere in the batch.
    # default=0 keeps a sentence without any words from crashing max().
    max_word_len = max(
        (len(word) for sentence in batch for word in sentence),
        default=0,
    )
    padded_sentences = pad_list(batch)
    # np.stack needs a real sequence; recent NumPy rejects a lazy map/generator.
    return np.stack([pad_numpy(sentence, max_word_len)[0] for sentence in padded_sentences])


def encode_ngrams(tokens, dict_size):
    """Hash the character trigrams of every token into ``dict_size`` buckets.

    Each token is wrapped in one leading and one trailing space before its
    trigrams are taken. A trigram's bucket is the CRC32 of its string form,
    modulo ``dict_size``.

    :param tokens: iterable of token strings
    :param dict_size: number of hash buckets
    :return: list with one list of bucket ids per token
    """
    def bucket_of(trigram):
        return zlib.crc32(str(trigram).encode()) % dict_size

    return [
        [bucket_of(trigram) for trigram in ngrams(" {} ".format(token), 3)]
        for token in tokens
    ]


def encode_texts(texts, dict_size):
    """Tokenize each text and encode its tokens as hashed trigram ids.

    :param texts: iterable of strings
    :param dict_size: number of hash buckets, passed on to ``encode_ngrams``
    :return: list with one ``encode_ngrams`` result per text
    """
    return [encode_ngrams(tokenizer(text), dict_size) for text in texts]


In [8]:
class SparseLinear(nn.Linear):
    """Linear layer fed with index lists instead of dense vectors.

    The last axis of the input holds indices into the ``dict_size`` input
    features. The output for each index list is the sum of the matching
    weight columns, plus the bias when one is configured.
    """

    def __init__(self, dict_size, out_features, bias=True):
        super().__init__(in_features=dict_size, out_features=out_features, bias=bias)

    def forward(self, inpt):
        """
        :param inpt integer Tensor of shape (..., depth) holding feature indices
        :return Tensor of shape (..., out_features)
        """
        leading_shape = inpt.shape[:-1]
        depth = inpt.shape[-1]

        # One row of the transposed weight per index, in flattened input order.
        rows = torch.index_select(self.weight.t(), 0, inpt.view(-1))
        # Regroup the rows per index list and add them up.
        summed = rows.view(-1, depth, self.out_features).sum(dim=1)
        out = summed.view(*leading_shape, self.out_features)

        if self.bias is None:
            return out
        return out + self.bias

In [39]:
class CDSSM(nn.Module):
    """Convolutional sentence encoder.

    Pipeline: hashed n-gram ids -> SparseLinear -> Conv1d over the word axis
    -> max-pooling over all positions -> Linear; a ReLU follows every layer.
    """

    conv_input_size = 500    # width of the dense per-word vector fed to the convolution
    conv_out_size = 300      # number of convolution output channels
    out_size = 128           # width of the final sentence vector
    window = 3               # convolution kernel size, in words
    embedding_size = 20000   # number of rows (dict_size) of the sparse input layer

    def __init__(self, is_cuda=False):
        """
        :param is_cuda when truthy, the module is moved to the GPU after construction
        """
        super().__init__()

        self.is_cuda = is_cuda

        self.sparse_linear = SparseLinear(
            dict_size=self.embedding_size,
            out_features=self.conv_input_size,
        )
        self.conv_nn = nn.Conv1d(
            in_channels=self.conv_input_size,
            out_channels=self.conv_out_size,
            kernel_size=self.window,
        )
        self.feed_forvard = nn.Linear(in_features=self.conv_out_size, out_features=self.out_size)

        if self.is_cuda:
            self.cuda()

    def process_sentence(self, sentences):
        """
        Encode a batch of sentences into fixed-size vectors.

        :param sentences Tensor (batch_size, sentence_length, word_depth)
        :return Tensor (batch_size, out_size)
        """
        # Dense per-word vectors from the sparse n-gram ids.
        word_vectors = F.relu(self.sparse_linear(sentences))

        # Conv1d wants channels on axis 1, so swap the word and feature axes.
        channels_first = word_vectors.transpose(1, 2)
        # Every output position mixes a window of `window` consecutive words.
        conv_features = F.relu(self.conv_nn(channels_first))

        # Pool over every position at once: one vector per sentence, whatever its length.
        pooled = F.max_pool1d(conv_features, kernel_size=conv_features.size(2))
        pooled = pooled.view(-1, self.conv_out_size)

        # Final projection down to out_size.
        return F.relu(self.feed_forvard(pooled))


In [40]:
# Three hard-coded sample passages used to smoke-test the encoder below.
# The names suggest a query, a matching passage and a non-matching passage.
query = "One (1) guide and one (1) spotter per hunter are generaly used. Top off this hunt with opportunities for " \
        "trophy XXXXX. (12 day hunts) Home | Hunts | Camps | News | Gallery | Links | Contact Copyright  2007 Bugle " \
        "Basin Outfitters. "
positive = "Wolverine Creek Outfitters . Founded in the mid 1940s, Wolverine Creek have consistently provided hunters " \
           "the opportunity to harvest trophy bull elk , moose , XXXXX and bighorn sheep . I met up with Wolverine " \
           "Creek s master hunter  Ryan Lakovitch  again at this years SHOT Show . Ryan "
negative = "2011-08-19 14:45:00 in the \" performance \" category Image by \"exfordy\" on Flickr End Point recently " \
           "started working with a new client (a startup in XXXXX, cannot name names, etc.) who is using PostgreSQL " \
           "because of the great success some of the people starting the company have had with Postgres "

In [41]:
# Build the encoder with its default is_cuda=False, so it is not moved to the GPU.
model = CDSSM()

In [47]:
# Encode the three sample texts as hashed trigram ids and pad them into one 3-D array.
# NOTE(review): ids are hashed into 1000 buckets here while CDSSM.embedding_size is 20000,
# so only the first 1000 rows of the sparse layer can ever be selected — confirm the
# intended dict_size before training.
batch = pad_batch(encode_texts([query, positive, negative], 1000))

In [48]:
# Wrap the padded NumPy array as a model input; the trailing `inp` displays it as the cell output.
inp = Variable(torch.from_numpy(batch)); inp

tensor([[[ 208,  130,  919,  ...,    0,    0,    0],
         [ 642,  795,  277,  ...,    0,    0,    0],
         [ 772,  792,  225,  ...,    0,    0,    0],
         ...,
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]],

        [[ 529,  764,  739,  ...,    0,    0,    0],
         [ 251,  101,    5,  ...,    0,    0,    0],
         [ 825,  560,  297,  ...,    0,    0,    0],
         ...,
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0],
         [   0,    0,    0,  ...,    0,    0,    0]],

        [[ 609,  929,    0,  ...,    0,    0,    0],
         [ 193,  377,  426,  ...,    0,    0,    0],
         [ 248,  443,  660,  ...,  817,    0,    0],
         ...,
         [ 402,  316,  764,  ...,    0,    0,    0],
         [ 949,   30,  602,  ...,    0,    0,    0],
         [ 768,  233,  282,  ...,    0,    0,    0]]]

In [49]:
# Encode the three padded texts; the result has one row of CDSSM.out_size values per text.
# NOTE(review): the captured output starts with a torch.Size line that no print in the
# visible CDSSM code produces — the output presumably predates the current class; re-run to refresh.
model.process_sentence(inp)

torch.Size([3, 47, 500])


tensor(1.00000e-02 *
       [[ 2.3946,  0.0000,  0.0000,  5.6208,  0.0000,  4.8701,  4.0002,
          1.3212,  0.0000,  5.1779,  3.3613,  0.0000,  0.0000,  0.0000,
          0.0000,  2.8338,  0.0000,  1.6666,  0.0000,  2.5371,  3.3030,
          0.0000,  6.9933,  0.0000,  1.5986,  3.5640,  0.0000,  0.0000,
          0.0000,  1.5470,  0.0000,  0.0000,  2.9109,  6.1972,  2.0791,
          0.0000,  3.6186,  0.0000,  0.0000,  2.3091,  0.0000,  2.8679,
          4.1418,  5.2189,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          4.9565,  0.0000,  4.0952,  4.4193,  0.0000,  0.0000,  0.0000,
          1.9381,  5.9366,  0.0000,  3.4665,  2.0813,  4.5665,  0.0000,
          0.0000,  2.8416,  0.0000,  0.8578,  2.4593,  0.0000,  0.0000,
          1.7891,  0.0000,  0.0000,  0.0000,  3.5063,  1.5199,  0.0000,
          2.0778,  0.0000,  3.9739,  0.2569,  2.5575,  3.1236,  3.2029,
          0.0000,  0.0000,  4.5519,  1.4453,  0.0000,  6.4522,  0.0000,
          0.0000,  0.0000,  1.6154,  1.4572

In [50]:
# Hand-made 2-D vectors for checking cosine_similarity row by row:
# row 0 points in opposite directions, row 1 in the same direction, row 2 is 45 degrees apart.
a = torch.FloatTensor([
    [1, 1],
    [-1, 1],
    [0, 1]
])

b = torch.FloatTensor([
    [-1, -1],
    [-2, 2],
    [1, 1]
])

In [58]:
# Target similarity for each (a, b) row pair, compared against the computed values below.
exp = torch.FloatTensor([
    1,
    -1,
    1
])

In [67]:
# Absolute error between computed and target similarity — the same first step as loss().
diff = torch.abs(cosine_similarity(a, b) - exp)

In [69]:
# Sum only the errors above the 0.4 margin: rows 0 and 1 are each off by 2, row 2 (about 0.29) is dropped.
torch.sum(diff[diff > 0.4])

tensor(4.)

In [52]:
import math

In [57]:
# Sanity check: a cosine of 0.7071 corresponds to an angle of about pi/4 radians (45 degrees).
math.acos(0.7071)

0.785407753397449

In [74]:
# Sanity check: an angle of zero gives a cosine of exactly 1.
math.cos(0)

1.0