In [1]:
import torch
import pandas as pd
import numpy as np
#import re
import nltk
#nltk.download()

from transformers import BertTokenizer, BertModel
# WordPiece tokenizer matching the pretrained 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

In [2]:
# Load the corpus: one sentence per line of the text file.
# `pd.read_csv(..., delimiter='\n')` is rejected by recent pandas releases
# (a line terminator cannot be used as the column separator), so the lines are
# read directly and wrapped in a single-column frame whose column label is 0,
# which is the layout the rest of the notebook indexes with `data[0]`.
with open("text_sentences.txt", encoding="utf-8") as sentence_file:
    # Blank lines are dropped, mirroring read_csv's default skip_blank_lines=True.
    sentences = [line.rstrip("\r\n") for line in sentence_file if line.strip()]

data = pd.DataFrame({0: sentences})
data.head()

Unnamed: 0,0
0,alpha-catenin inhibits beta-catenin signaling ...
1,A binary complex of birch profilin and skeleta...
2,Abnormal immunoreactivity in the tumor tissue ...
3,Absence of alpha-syntrophin leads to structura...
4,"Abundance of actin , talin , alpha 5 and beta ..."


In [3]:
# Number of sentences (rows) that were loaded.
data.shape[0]

1100

In [4]:
# Tokenize every sentence into a fixed-length BERT input (ids + attention mask).
# The result is a pandas Series holding one encoding per sentence.
#
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and the
# tokenizer is called directly instead of through the legacy `encode_plus` method.
encoded = data[0].apply(lambda sentence: tokenizer(
    text=sentence,  # the sentence to be encoded
    add_special_tokens=True,  # add [CLS] and [SEP]
    max_length=512,  # maximum number of tokens per sentence
    padding='max_length',  # add [PAD]s up to max_length
    truncation=True,  # cut sentences longer than max_length
    return_attention_mask=True,  # generate the attention mask
    # return_tensors='pt',  # would return PyTorch tensors instead of Python lists
))



In [5]:
# Spot-check one encoded sentence: its token-id sequence should have the padded length.
np.shape(encoded[1]['input_ids'])

(512,)

In [6]:
# Collect the per-sentence encodings into two 2-D arrays, one row per sentence:
# the token ids and the matching attention masks.
ids = np.array([sentence_encoding['input_ids'] for sentence_encoding in encoded])
masked = np.array([sentence_encoding['attention_mask'] for sentence_encoding in encoded])

In [7]:
# (number of sentences, padded sequence length)
np.shape(ids)

(1100, 512)

In [8]:
# Load the pretrained BERT encoder and ask it to return the hidden states of
# every layer, not only the final one.
model = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=True,
)

# Put the model in "evaluation" mode, meaning feed-forward operation.

In [9]:
# Switch the model to evaluation mode for feature extraction.
# Comment this line out when fine-tuning the model instead.
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [None]:
# Run the model to obtain the hidden states of every layer.
#
# The tokenized corpus is held in the NumPy arrays `ids` and `masked`; the model
# needs torch tensors, so they are converted here under the names passed to the
# forward call.
# NOTE(review): this pushes the whole corpus through BERT as a single batch;
# consider mini-batching if memory is limited.
input_ids = torch.tensor(ids)
attention_mask = torch.tensor(masked)

# no_grad: inference only, so no autograd graph is built.
with torch.no_grad():

    outputs = model(input_ids, attention_mask=attention_mask)

    # Evaluating the model will return a different number of objects based on
    # how it's configured in the `from_pretrained` call earlier. In this case,
    # because we set `output_hidden_states = True`, the third item will be the
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    hidden_states = outputs[2]

In [10]:
# Persist the hidden states to disk so the forward pass does not have to be repeated.
torch.save(
    hidden_states,
    'hidden_states.pt',
)

In [12]:
# Uncomment the two lines below to load the previously saved hidden states instead of recomputing them
#stat=torch.load('hidden_states.pt')
#hidden_states = stat

In [13]:
# Walk down the nesting of `hidden_states` one level at a time (layer -> sentence
# in the batch -> token -> hidden unit) and report the length of each level.
layer_i, batch_i, token_i = 0, 0, 0

print("Number of layers:", len(hidden_states), "  (initial embeddings + 12 BERT layers)")

first_layer = hidden_states[layer_i]
print("Number of batches:", len(first_layer))

first_sentence = first_layer[batch_i]
print("Number of tokens:", len(first_sentence))

first_token = first_sentence[token_i]
print("Number of hidden units:", len(first_token))

Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of batches: 1100
Number of tokens: 512
Number of hidden units: 768


In [15]:
# Sanity check: how many layers are held, and in what kind of container.
layer_count = len(hidden_states)
print(layer_count)
type(hidden_states)

13


torch.Tensor

In [16]:
# Combine the per-layer hidden states into a single tensor with the layer axis
# first (torch.stack inserts the new axis at position 0 by default).
stacked = torch.stack(hidden_states)
print(type(stacked))
stacked.shape

<class 'torch.Tensor'>


torch.Size([13, 1100, 512, 768])

In [17]:
# Concatenate the last 4 layers for every token of every sentence.
#
# `stacked` is indexed as [layer][sentence][token] -> hidden vector, so joining
# the last four layers along the final axis gives one 4x-wide vector per token.
# This single vectorised call replaces a Python double loop over every
# (sentence, token) pair and produces the same vectors in the same order
# (sentence-major, then token).
last_four_layers = torch.cat(
    (stacked[-1], stacked[-2], stacked[-3], stacked[-4]),
    dim=-1,
)

# Flatten (sentence, token) into one axis and expose the rows as a list, so
# `word_embedding` stays a list with one concatenated vector per token.
word_embedding = list(last_four_layers.reshape(-1, last_four_layers.shape[-1]))
    

In [18]:
# Report the size of the embedding table: number of token vectors x vector length.
n_vectors = len(word_embedding)
vector_dim = len(word_embedding[0])
print('Shape is: %d x %d' % (n_vectors, vector_dim))

Shape is: 563200 x 3072
