In [1]:
"""
Extract contextual BERT embeddings for the Swadesh target words.
"""

__author__ = 'Christin Beck'
__created__ = '29.06.2023'

from icecream import ic

import re
import os

from transformers import *
import torch

import numpy as np
import pandas as pd


import unicodedata
import math

2023-07-24 10:50:31.164367: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def tokens_to_words(tokenized_sentence):
    """Group WordPiece tokens back into whole words.

    Args:
        tokenized_sentence: list of token strings for one sentence, where
            continuation pieces of a word carry a leading ``##``.

    Returns:
        A list of ``(word, start, end)`` tuples in sentence order. ``word`` is
        the re-assembled word without ``##`` markers; ``start`` and ``end`` are
        the (inclusive) indices of its first and last piece in
        ``tokenized_sentence``. ``[CLS]``, ``[SEP]`` and ``[PAD]`` are each
        returned as their own single-token entry.
    """
    bert_to_words = []
    total = len(tokenized_sentence)
    for j, token in enumerate(tokenized_sentence):
        # Special tokens are never merged with their neighbours.
        if re.search(r'\[(CLS|SEP|PAD)\]', token):
            bert_to_words.append((token, j, j))
            continue

        # A continuation piece has already been consumed by the word it
        # belongs to (or is an orphan with no word start) - nothing to emit.
        if token.startswith('##'):
            continue

        # Absorb every directly following continuation piece. The bounds check
        # lives in the loop condition, so a word in the very last position is
        # emitted as well instead of being dropped.
        pieces = [token]
        end = j
        while end + 1 < total and tokenized_sentence[end + 1].startswith('##'):
            end += 1
            pieces.append(tokenized_sentence[end][2:])  # strip the '##' prefix only
        bert_to_words.append((''.join(pieces), j, end))

    return bert_to_words

In [3]:
# Choose the torch device once: CUDA when a GPU is visible, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    # Report how many GPUs are visible and the name of device 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 2 GPU(s) available.
We will use the GPU: NVIDIA A100-PCIE-40GB


In [5]:
# Input table (tab-separated).
file = 'target_sentences_swadesh.tsv'

# The quote character is set to NUL because the data contains quotation marks
# (some of them unbalanced) that must be read as ordinary text, not as string
# delimiters.
data = pd.read_csv(file, sep='\t', quotechar='\0', encoding='utf8')

# One plain Python list per column, all in row order.
concepts, target_words, positions, corpora, sentences = (
    pd.DataFrame(data, columns=[column]).values.flatten().tolist()
    for column in ('Concept', 'Word', 'Position', 'Corpus', 'Sentence')
)

# Second name for the sentence list exactly as it was loaded from the file.
original_sent = sentences

# Build the cleaned sentence list: the target word is swapped for its
# concept/lemma and punctuation is normalised so that words can be matched
# against the tokenizer output later on.
new_sent = []
for i, s in enumerate(sentences):

    #replacing target words with concept/lemma following Laicher
    target_position = int(positions[i])
    words = s.split(' ')

    #missing entries are not matched here (is ok)
    if 0 <= target_position < len(words):
        s = ' '.join(words[0:target_position]) + ' ' + concepts[i] + ' ' + ' '.join(words[target_position+1:])

    #needed for matching token embeddings
    # All replacement templates are raw strings: in a normal string literal
    # '\1' is the control character \x01, not a reference to group 1, so the
    # matched text would be overwritten with control bytes.
    s = re.sub(r'(\\&|\^|’|\'|‘|„|\"|»|″|“|”|〟|〞|«|′|‟)', '', s)   # drop quotes, carets, escaped ampersands
    s = re.sub(r'(\(|\*)(\w|\d)', r'\2', s) #only needed if not split off of word
    s = re.sub(r'(\w|\d)(\)|\()', r'\1', s)                      # bracket glued to the end of a word
    s = re.sub(r'\((\w+|\d+)\)', r'\1', s)                       # (word) -> word
    s = re.sub(r'(\w|\d)(=|-|==|--|—|—————)(\w|\d)', r'\1\3', s) # join hyphenated / dashed words
    s = re.sub(r'(\w|\d)(=|-|—) ', r'\1 ', s)                    # trailing hyphen before a space
    s = re.sub(r'(\w)(\\&|&)', r'\1', s)                         # ampersand glued to a word
    s = re.sub(r'\s+', ' ', s)                                   # collapse whitespace runs
    s = re.sub(r'(\w|\d)\.', r'\1 .', s)                         # split a full stop off the preceding word
    s = re.sub(r'(\w)( ̄)', r'\1 \2', s)                          # split a macron off the preceding word
    # NOTE(review): an empty regex pattern would insert 'NAN' at every character
    # boundary, so only a completely empty sentence is replaced by the
    # placeholder here. If a non-printing character was meant to be mapped to
    # 'NAN', add it as an explicit \u escape - TODO confirm with the author.
    if not s:
        s = 'NAN'
    s = s.lstrip() #in case replacement introduced an initial white space

    new_sent.append(s)

sentences = new_sent




In [6]:
#get embeddings
print('Load BERT tokenizer.')
#Load BERT tokenizer
# Alternative checkpoints that can be used instead of the fine-tuned model:
#model_path = '../models/bert/pretrained-bert'
#model_path = 'dbmdz/bert-base-german-cased'
model_path = '../models/bert/fine-tuned-bert/german'

tokenizer_path = '../models/bert/pretrained-tokenizer'
tokenizer = BertTokenizerFast.from_pretrained(tokenizer_path)

#hidden states = embeddings
print('Load pre-trained BERT weights.')
# output_hidden_states=True makes the model return the hidden states of all
# layers in addition to the final layer.
model = BertModel.from_pretrained(model_path, output_hidden_states=True)

model.eval() #puts model in evaluation mode, dropout regularization (used in training) is turned off

sent_len = 100  #this is no of tokens

print('Tokenize input sentences, return dictionary of tensors (attention_mask, input_ids, token_type_ids).')
# [CLS] and [SEP] are added by the tokenizer.
# padding='max_length' pads every sentence with [PAD] up to sent_len tokens;
# truncation=True cuts longer sentences down to sent_len tokens.
# The attention mask marks which positions are real tokens and which are padding.
encoded_inputs = tokenizer(sentences, max_length=sent_len, padding='max_length', truncation=True, return_tensors='pt')

#map words to tokens
# One entry per sentence: the (word, start, end) tuples from tokens_to_words.
map_words_to_tokens = []

for ids in encoded_inputs['input_ids']:
    # Convert the ids back to token strings once and reuse the result.
    tokenized_input = tokenizer.convert_ids_to_tokens(ids)
    map_words_to_tokens.append(tokens_to_words(tokenized_input))


loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file ../models/bert/pretrained-tokenizer/config.json
Model config BertConfig {
  "_name_or_path": "../models/bert/pretrained-tokenizer",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "cls_token": "[CLS]",
  "do_lower_case": true,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "mask_token": "[MASK]",
  "max_len": 512,
  "max_position_embeddings": 512,
  "model_max_length": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token": "[PAD]",
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "sep_token": "[SEP]",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "unk_token": "[UNK]",
  "use_cache": true,
  "vocab

Load BERT tokenizer.
Load pre-trained BERT weights.


Some weights of the model checkpoint at ../models/bert/fine-tuned-bert/german were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at ../models/bert/fine-tuned-bert/german and are newly initialized: ['bert.pooler.dense.we

Tokenize input sentences, return dictionary of tensors (attention_mask, input_ids, token_type_ids).


In [7]:
print('Return and collect all hidden states (layers).')

#batching
# Number of sentences per forward pass.
batch_size = 512

batches = math.ceil(len(sentences)/batch_size) #rounds up
print('No of batches: ', batches)

# One entry per successfully matched target word, all four lists in the same order.
all_embeddings_cat = []   # last four layers concatenated
all_embeddings_last = []  # NOTE(review): taken from layer -2 (second-to-last), despite the name - confirm intent
all_embeddings_sum = []   # last four layers summed
all_embeddings_avg = []   # last four layers averaged

# The context manager guarantees that the metadata file is flushed and closed,
# also when a batch raises.
with open('swadesh_meta.tsv', 'w', encoding='utf8') as sentences_out:
    sentences_out.write('Concept\tOriginal_Word\tPosition\tCorpus\tSentence\tOriginal_sentence\n')

    for batch in range(batches):
        print('Currently, batch no:', batch)
        start_batch = batch * batch_size
        end_batch = start_batch + batch_size

        # Forward pass only; torch.no_grad() skips building the autograd graph,
        # which saves memory and time.
        with torch.no_grad():
            # The attention mask must be passed explicitly: without it the
            # model treats the [PAD] positions as real tokens and attends to
            # them, which changes the embeddings of the actual words.
            outputs = model(
                input_ids=encoded_inputs['input_ids'][start_batch:end_batch].to(model.device),
                attention_mask=encoded_inputs['attention_mask'][start_batch:end_batch].to(model.device),
            )

        # Only the last four layers are needed, ordered -4, -3, -2, -1.
        # Each array is indexed as [sentence, token, hidden unit].
        last_four = [layer.cpu().numpy() for layer in outputs.hidden_states[-4:]]

        #get word embeddings and rearrange sentence embeddings
        current_positions = positions[start_batch:end_batch]
        current_concepts = concepts[start_batch:end_batch]
        current_target_words = target_words[start_batch:end_batch]
        current_sentences = sentences[start_batch:end_batch]
        current_originals = original_sent[start_batch:end_batch]
        current_corpus = corpora[start_batch:end_batch]

        for i, sent in enumerate(map_words_to_tokens[start_batch:end_batch]):
            target_position = int(current_positions[i])+1 #for CLS token
            concept = current_concepts[i]
            target_word = concept.lower()
            target_word_original = str(current_target_words[i])

            #late targets may not be within no of tokens
            if not 0 <= target_position < len(sent):
                continue

            target_in_sent, start_index, end_index = sent[target_position]
            # Skip rows where the word found at the expected position is not the concept.
            if target_word != target_in_sent:
                continue

            # One vector per layer: the mean over all word pieces of the target
            # word. For a single-piece word this is just that piece's vector.
            per_layer = [layer[i, start_index:end_index+1].mean(axis=0) for layer in last_four]

            word_embed = per_layer[-2]
            word_embed_cat = np.concatenate(per_layer, axis=0)
            word_embed_sum = np.sum(per_layer, axis=0)
            # True mean over the four layers (each layer weighted 1/4). A
            # running pairwise average would weight the layers 1/8, 1/8, 1/4, 1/2.
            word_embed_avg = np.mean(per_layer, axis=0)

            all_embeddings_cat.append(word_embed_cat)
            all_embeddings_last.append(word_embed)
            all_embeddings_sum.append(word_embed_sum)
            all_embeddings_avg.append(word_embed_avg)
            # The Position column holds the index shifted by one for [CLS].
            sentences_out.write(concept + '\t' + target_word_original + '\t' + str(target_position) + '\t' + current_corpus[i] + '\t' + current_sentences[i] + '\t' + current_originals[i] + '\n')
    



Return and collect all hidden states (layers).
No of batches:  305
Currently, batch no: 0
0 512
Number of layers: 13   (initial embeddings + 12 BERT layers)
Number of sentences: 512
Number of tokens: 100
Number of hidden units: 768
Tensor shape for each layer:  torch.Size([512, 100, 768])
[21, 21, 2, 4, 5, 1, 2, 21, 21, 19, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 6, 8, 21, 3, 3, 5, 3, 5, 10, 3, 2, 2, 4, 7, 5, 6, 3, 4, 3, 9, 4, 4, 2, 4, 4, 5, 2, 4, 6, 6, 1, 5, 2, 2, 2, 2, 2, 6, 18, 0, 8, 8, 4, 3, 4, 12, 5, 7, 8, 3, 8, 10, 8, 2, 3, 4, 10, 21, 12, 9, 21, 1, 21, 12, 4, 1, 7, 7, 3, 7, 12, 5, 3, 1, 2, 3, 2, 2, 3, 7, 4, 2, 3, 0, 0, 2, 2, 3, 8, 6, 6, 4, 3, 4, 12, 21, 2, 21, 21, 7, 8, 10, 3, 19, 10, 5, 5, 6, 2, 7, 3, 13, 4, 10, 2, 4, 3, 16, 6, 8, 9, 13, 1, 5, 2, 2, 11, 2, 4, 13, 5, 4, 1, 10, 15, 9, 1, 5, 9, 3, 1, 2, 9, 6, 5, 3, 7, 4, 6, 3, 5, 13, 2, 3, 2, 4, 7, 1, 4, 6, 4, 4, 3, 10, 11, 9, 21, 2, 6, 4, 11, 9

In [8]:
# Turn each list of per-word vectors into one array and write it to disk,
# one embedding variant at a time.

# last four layers concatenated
all_embeddings_cat = np.array(all_embeddings_cat)
np.save('swadesh_embeddings_cat.npy', all_embeddings_cat)

# single-layer embedding
all_embeddings_last = np.array(all_embeddings_last)
np.save('swadesh_embeddings_last.npy', all_embeddings_last)

# last four layers summed
all_embeddings_sum = np.array(all_embeddings_sum)
np.save('swadesh_embeddings_sum.npy', all_embeddings_sum)

# last four layers averaged
all_embeddings_avg = np.array(all_embeddings_avg)
np.save('swadesh_embeddings_avg.npy', all_embeddings_avg)

148313
148313
