In [8]:
import gensim
from gensim.models import word2vec
import pickle
import numpy as np
import torch
from transformers import BertTokenizer, BertModel, AutoModel, BertTokenizerFast
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler


class SciCiteEmbedding:
    def __init__(self, glove=False, word2vec=False, elmo=False, bert=False):
        self.word2vec = word2vec
        self.glove = glove
        self.elmo = elmo
        self.bert = bert
        self.ndim = None

    def embed(self, X_train):  # return a 3D numpy array. shape: [samples, max_length, embed_dim]
        embed_list = []
        if self.word2vec:  # time-consuming and could miss some tokens that do not appear in word2vec keys.
            # if word2vec model has not been dowloaded, plz run this:
            # path = gensim.downloader.load("word2vec-google-news-300", return_path=True)
            word2vec_model_path = "./word2vec/word2vec-google-news-300.gz"
            w2v = gensim.models.KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
            max_length = 0
            for token_list in X_train:
                np_list = []
                for token in token_list:
                    try:
                        np_list.append(w2v[token])
                    except:
                        continue
                max_length = max(max_length, len(np_list))
                embed_list.append(np.array(np_list))
            embed_list = [np.concatenate([array, np.zeros((max_length-array.shape[0], 300))], axis=0) for array in embed_list]
            return np.array(embed_list)  # [samples, max_length, 300]

        elif self.glove:
            max_length = max([len(token_list) for token_list in X_train])
            glove_model_path = './GloVe/glove2word2vec_model.sav'  # glove embed dimension=100
            glove = pickle.load(open(glove_model_path, 'rb'))
            for token_list in X_train:
                np_list = []
                for token in token_list:
                    try:
                        v = glove[token]
                    except:
                        continue
                    np_list.append(v)
                embed_list.append(
                    np.concatenate([np.array(np_list), np.zeros((max_length - len(token_list), 100))], axis=0))
            embed_array = np.array(embed_list)  # [n_sample, max_length, ndim=512]
            return embed_array

        elif self.elmo: # elmo_dim = 512
            elmo_options_file = "./elmo/elmo_2x2048_256_2048cnn_1xhighway_options.json"
            elmo_weight_file = "./elmo/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
            elmo_model = Elmo(elmo_options_file, elmo_weight_file, 1, dropout=0)
            character_ids = batch_to_ids(X_train)
            embeddings = elmo_model(character_ids)['elmo_representations'][0]
            return embeddings.detach().numpy()  # [n_sample, max_length, ndim=512]

        elif self.bert:#dim = 768
            model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
            # kiv: big file to download
            tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')
            character_ids = tokenizer.batch_encode_plus(X_train, return_token_type_ids= False,is_split_into_words=True,padding=True)
            tokens_tensor = torch.tensor(character_ids['input_ids'])
            segments_tensor = torch.tensor(character_ids['attention_mask'])
            with torch.no_grad():
                outputs = model(tokens_tensor, segments_tensor)
            hidden_states = outputs[2][0].tolist()
            embeddings = []
            for element in hidden_states:
                for array in element:
                    embeddings.append(np.array(array))
            return np.array(embeddings)


test = [['ok', ',', 'fine', 'i', 'will', 'check', 'it', 'later', '.'],
        ['love', 'you', '!'],
        ['i', 'think', 'data', 'preprocessing', 'is', 'so', 'complicated', '.']]

embed_model = SciCiteEmbedding(bert=True)  # pick an embedding method
embed_array = embed_model.embed(test)
np.save('../data/a_simple_test.npy', embed_array)
print(embed_array.shape)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(39, 768)
