In [1]:
import torch as t
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.nn.init import xavier_normal

class GloVe(nn.Module):
    """
    Global Vectors (GloVe) model trained directly on a dense co-occurrence matrix.

    Every class has an "input" and an "output" embedding plus a scalar bias for
    each role; ``forward`` returns the weighted least-squares GloVe loss for a
    batch of (input, output) index pairs.
    """

    def __init__(self, co_oc, embed_size, x_max=100, alpha=0.75):
        """
        :param co_oc: Co-occurrence ndarray with shape of [num_classes, num_classes]
        :param embed_size: embedding size
        :param x_max: An int representing cutoff of the weighting function
        :param alpha: A float exponent of the weighting function
        """

        super(GloVe, self).__init__()

        self.embed_size = embed_size
        self.x_max = x_max
        self.alpha = alpha

        # The co-occurrence matrix is shifted by one so that log(0) never occurs in the loss.
        self.co_oc = co_oc + 1.0

        [self.num_classes, _] = self.co_oc.shape

        # xavier_normal_ initialises the parameter in place; the non-underscore
        # alias is deprecated. Creation/initialisation order is kept so seeded
        # runs draw random numbers in the same sequence as before.
        self.in_embed = nn.Embedding(self.num_classes, self.embed_size)
        nn.init.xavier_normal_(self.in_embed.weight)

        self.in_bias = nn.Embedding(self.num_classes, 1)
        nn.init.xavier_normal_(self.in_bias.weight)

        self.out_embed = nn.Embedding(self.num_classes, self.embed_size)
        nn.init.xavier_normal_(self.out_embed.weight)

        self.out_bias = nn.Embedding(self.num_classes, 1)
        nn.init.xavier_normal_(self.out_bias.weight)

    def forward(self, input, output):
        """
        :param input: An array with shape of [batch_size] of int type
        :param output: An array with shape of [batch_size] of int type
        :return: scalar loss estimation for Global Vectors word representations
                 defined in nlp.stanford.edu/pubs/glove.pdf
        """

        # Keep every tensor on the same device as the model parameters.
        device = self.in_embed.weight.device

        input_idx = np.asarray(input, dtype=np.int64)
        output_idx = np.asarray(output, dtype=np.int64)

        # Vectorised lookup of the (shifted) co-occurrence of each pair and of
        # its weight; the weight rule is the same as in self._weight.
        co_occurences = np.asarray(self.co_oc[input_idx, output_idx], dtype=np.float64)
        weights = np.where(
            co_occurences > self.x_max,
            1.0,
            (co_occurences / self.x_max) ** self.alpha,
        )

        co_occurences = t.as_tensor(co_occurences, dtype=t.float32, device=device)
        weights = t.as_tensor(weights, dtype=t.float32, device=device)

        input = t.as_tensor(input_idx, device=device)
        output = t.as_tensor(output_idx, device=device)

        input_embed = self.in_embed(input)
        input_bias = self.in_bias(input)
        output_embed = self.out_embed(output)
        output_bias = self.out_bias(output)

        # The bias lookups are [batch, 1] while the dot product is [batch]; the
        # biases must be squeezed BEFORE the addition, otherwise broadcasting
        # yields a [batch, batch] matrix and the loss sums over every cross pair.
        prediction = (
            (input_embed * output_embed).sum(dim=1)
            + input_bias.squeeze(1)
            + output_bias.squeeze(1)
        )

        return (t.pow(prediction - t.log(co_occurences), 2) * weights).sum()

    def _weight(self, x):
        """Weighting function for one co-occurrence value: 1 above x_max, else (x / x_max) ** alpha."""
        return 1 if x > self.x_max else (x / self.x_max) ** self.alpha

    def embeddings(self):
        """Return the sum of the input and output embedding matrices as a numpy array."""
        return self.in_embed.weight.data.cpu().numpy() + self.out_embed.weight.data.cpu().numpy()

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [5]:
# Load the co-occurrence table; header=None tells pandas the file has no header row.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
cooc_frame = pd.read_csv("/var/patentmark/subset_cooc.csv", sep=",", header=None)

In [7]:
# Give the three positional columns of the header-less frame explicit names.
cooc_frame.columns = ["code1", "code2", "weight"]

In [10]:
# Sanity check: (rows, columns) of the loaded table.
cooc_frame.shape

(21513823, 3)

In [13]:
# Store code1 as a pandas categorical column.
cooc_frame["code1"] = cooc_frame["code1"].astype("category")

In [14]:
# Store code2 as a pandas categorical column.
cooc_frame["code2"] = cooc_frame["code2"].astype("category")

In [22]:
# Union of the distinct categories seen in either code column, as a plain list.
all_codes = list(
    set(
        [
            *cooc_frame["code1"].cat.categories.values.tolist(),
            *cooc_frame["code2"].cat.categories.values.tolist(),
        ]
    )
)

In [24]:
# Fit a label encoder over every distinct code; node2id.classes_ is used further down
# to enumerate the (id, code) mapping.
node2id = LabelEncoder()
node2id.fit(all_codes)

LabelEncoder()

In [30]:
# Persist the fitted encoder to disk so it can be reloaded with joblib.load.
import joblib
joblib.dump(node2id, "node2id.joblib")

['node2id.joblib']

In [36]:
# Build an (id, code) table from the encoder classes and write it as a
# header-less, index-less CSV.
node_mapping = pd.DataFrame(list(enumerate(node2id.classes_)))
node_mapping.columns = ["node_id", "node"]
node_mapping.to_csv(
    "cpc_mapping.csv",
    index=False,
    header=None,
)

In [37]:
from typing import Dict

import numpy as np


class Embedding(object):
    """
    Embedding matrix loaded from a raw float32 binary file, with an optional
    name -> row index so rows can be looked up by string key.
    """

    def __init__(self, embedding_path: str, dimensions: int, index_path: str = None):
        """
        :param embedding_path: path of the binary file holding the float32 values
        :param dimensions: number of floats per embedding row
        :param index_path: optional path of a "row_index,name" text file
        """
        self.dimensions = dimensions
        self.embeddings = self.load_embeddings(embedding_path)
        self.index: Dict[str, int] = {}
        if index_path:
            self.load_index(index_path)

    def load_embeddings(self, file_name: str) -> np.ndarray:
        """
        Read ``file_name`` as a flat float32 array and reshape it to
        [rows, self.dimensions].

        :raises AssertionError: if the float count is not a multiple of the dimensions
        """
        print("Loading embeddings...")
        embeddings = np.fromfile(file_name, dtype=np.float32)
        length = embeddings.shape[0]
        # A space before the line continuation keeps the two message halves separated.
        assert length % self.dimensions == 0, f"The number of floats ({length}) in the embeddings is not divisible by " \
                                              f"the number of dimensions ({self.dimensions})!"
        # Integer division: exact for any length, unlike int(length / dimensions).
        embedding_shape = [length // self.dimensions, self.dimensions]
        embeddings = embeddings.reshape(embedding_shape)
        print(f"Done loading embeddings (shape: {embeddings.shape}).")
        return embeddings

    def load_index(self, index_path: str) -> None:
        """
        Fill ``self.index`` from a text file with one "row_index,name" entry per line.

        Only the first comma splits a line, so names may themselves contain commas.
        Blank lines are ignored.
        """
        print("Loading uri index...")
        # Read as UTF-8 explicitly: pandas' to_csv writes UTF-8 by default,
        # independent of the platform's locale encoding.
        with open(index_path, "r", encoding="utf-8") as file:
            # Stream the file line by line instead of materialising it with readlines().
            for raw_line in file:
                line = raw_line.strip()
                if not line:
                    # A trailing or stray empty line would otherwise break the unpacking below.
                    continue
                index, uri = line.split(",", 1)
                self.index[uri] = int(index)
        print(f"Done loading {len(self.index)} items.")

    def __getitem__(self, item) -> np.ndarray:
        """
        Look up by name when an index is loaded and ``item`` is a string;
        otherwise index the embedding matrix directly with ``item``.
        """
        if self.index and isinstance(item, str):
            return self.embeddings[self.index[item]]
        return self.embeddings[item]


In [38]:
# Load the 32-dimensional embedding file together with the code index written
# to cpc_mapping.csv.
embedding_file = "/home/martin/cpc.emb.verse.32d.bin"
index_file = "cpc_mapping.csv"
embeddings = Embedding(
    embedding_path=embedding_file,
    dimensions=32,
    index_path=index_file,
)

Loading embeddings...
Done loading embeddings (shape: (164296, 32)).
Loading uri index...
Done loading 164296 items.


In [39]:
# Display the loaded Embedding object (the class defines no custom repr).
embeddings

<__main__.Embedding at 0x7fe61b887070>

In [25]:
# Number of distinct codes known to the encoder.
len(node2id.classes_)

164296

In [26]:
# Integer id of each code1 value according to the fitted encoder.
cooc_frame["code1_id"] = node2id.transform(cooc_frame["code1"])

In [27]:
# Integer id of each code2 value according to the fitted encoder.
cooc_frame["code2_id"] = node2id.transform(cooc_frame["code2"])

In [29]:
# Write the id-encoded pairs and their weight as a space-separated file
# without header or index column.
cooc_frame.loc[:, ["code1_id", "code2_id", "weight"]].to_csv(
    "code_cooc_weighs_encoded.csv",
    sep=" ",
    header=None,
    index=False,
)

In [None]:
# Adagrad is not imported by any other cell, so bring it into scope here.
from torch.optim import Adagrad

# NOTE(review): co_oc_matrix, embed_size, num_iterations, batch_size and next_batch
# are not defined in any visible cell — they must exist before this cell is run.
glove = GloVe(co_oc_matrix, embed_size)

# Adagrad over all model parameters with a learning rate of 0.05.
optimizer = Adagrad(glove.parameters(), 0.05)

for i in range(num_iterations):
    # input and target are [batch_size] shaped arrays of int type
    input, target = next_batch(batch_size)

    loss = glove(input, target)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# Final embeddings: sum of the input and output embedding tables.
word_embeddings = glove.embeddings()