In [None]:
import os
import torch
import pickle
import numpy as np

from transformers import AlbertTokenizer, AlbertForMaskedLM, AlbertConfig
from collections import OrderedDict

In [None]:
# NOTE: machine-specific absolute path -- point this at the local project checkout.
folder = '/home/ayan-yue/Documents/projects/diachronic-analysis-ALBERT'

# One model and one tokenizer per genre, keyed by the genre's directory name.
models = OrderedDict()
tokenizers = OrderedDict()

models_dir = os.path.join(folder, 'models')

# sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
# registries reproducible across runs and machines.
for genre in sorted(os.listdir(models_dir)):

    genre_dir = os.path.join(models_dir, genre)

    # Skip stray files (e.g. .DS_Store, checkpoints) that are not model folders.
    if not os.path.isdir(genre_dir):
        continue

    # Build the model from the genre's own config, then overwrite its weights
    # with the state dict stored next to that config.
    config = AlbertConfig.from_json_file(os.path.join(genre_dir, 'config.json'))
    model = AlbertForMaskedLM.from_pretrained('albert-base-v2', config=config)
    state_dict_path = os.path.join(genre_dir, 'pytorch_model.bin')
    # map_location='cpu': the notebook runs inference on CPU tensors, and this
    # lets checkpoints that were saved from a GPU load on a CPU-only machine.
    model.load_state_dict(torch.load(state_dict_path, map_location='cpu'))
    # The models are only used to read embeddings: make sure dropout is off.
    model.eval()

    models[genre] = model
    tokenizers[genre] = AlbertTokenizer.from_pretrained(genre_dir)

In [None]:
# Load the pickled corpus object `d`; the rest of the notebook indexes it by
# genre (`d[genre]`) and iterates over the sentences stored there.
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
pickle_path = folder + '/pickled-d'
with open(pickle_path, 'rb') as pickled_file:
    d = pickle.load(pickled_file)

In [None]:
#AddMoreClusters code 

def get_embedding_for_sentence(tokenized_sent, genre):
    """Return one contextual embedding per token of an already tokenized sentence.

    The sentence is run through the model registered for `genre`, and each
    token's embedding is the element-wise sum of its vectors in the last four
    entries of the model's per-layer hidden states.

    Parameters
    ----------
    tokenized_sent : list of str
        Tokens produced by the tokenizer of the same genre.
    genre : str
        Key into the module-level `tokenizers` and `models` registries.

    Returns
    -------
    list of torch.Tensor
        One vector per token, in the same order as `tokenized_sent`.

    Raises
    ------
    ValueError
        If the model output does not contain a second element (the hidden
        states), e.g. because hidden-state output is disabled in its config.
    """
    tokenizer = tokenizers[genre]
    model = models[genre]

    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_sent)
    tokens_tensor = torch.tensor([indexed_tokens])
    # An all-ones mask, passed by keyword: the second positional parameter of
    # AlbertForMaskedLM.forward is `attention_mask` (not token type ids), so
    # this simply tells the model to attend to every token.
    attention_mask = torch.tensor([[1] * len(tokenized_sent)])

    with torch.no_grad():

        outputs = model(tokens_tensor, attention_mask=attention_mask)

        # Index instead of tuple-unpacking: this works both for plain tuples
        # (of any length >= 2) and for dict-like model outputs, where
        # unpacking would yield key names rather than tensors.
        if len(outputs) < 2:
            raise ValueError(
                "Model for genre %r returned no hidden states; enable "
                "output_hidden_states in its config." % (genre,)
            )
        encoded_layers = outputs[1]

        # The sentence is the only item in the batch.
        batch_i = 0

        # Stack the last four layers and sum them in one shot, then split the
        # (tokens, hidden) result back into one vector per token.
        summed_last_4_layers = torch.stack(tuple(encoded_layers[-4:])).sum(dim=0)[batch_i]

        return [summed_last_4_layers[token_i] for token_i in range(len(tokenized_sent))]
    
    
def retrieve_sentences_with_word(word, genre):
    """Return the entries of `d[genre]` that contain `word`.

    Containment is checked with the `in` operator on each entry, so for string
    sentences this is a plain substring match. The order of `d[genre]` is
    preserved in the returned list.
    """
    matching = []

    for candidate in d[genre]:
        if word in candidate:
            matching.append(candidate)

    return matching


def get_embeddings_for_word(word, sentences, genre):
    """Collect one contextual embedding per occurrence of `word` in `sentences`.

    Each sentence is wrapped in "[CLS] ... [SEP]", tokenized with the genre's
    tokenizer and embedded with `get_embedding_for_sentence`. An occurrence is
    a token equal to the *first* piece the tokenizer produces for `word`; when
    the word is split into several pieces, only that first piece is matched
    and its vector is the one returned.

    Parameters
    ----------
    word : str
        Target word.
    sentences : iterable of str
        Candidate sentences.
    genre : str
        Key into the module-level `tokenizers` registry; also forwarded to
        `get_embedding_for_sentence`.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The embeddings, one row per occurrence, and the sentence each
        occurrence came from (a sentence is repeated once per occurrence).
        Both arrays are empty when nothing matches.
    """
    tokenizer = tokenizers[genre]

    print("Getting ALBERT embeddings for word:", word)

    # Tokenize the target once instead of twice per sentence.
    word_pieces = tokenizer.tokenize(word)
    if not word_pieces:
        # Nothing to look for (e.g. an empty word): return empty results
        # instead of failing on word_pieces[0].
        return np.array([]), np.array([])
    target_piece = word_pieces[0]

    word_embeddings = []
    valid_sentences = []

    for sentence in sentences:

        marked_sent = "[CLS] " + sentence + " [SEP]"
        tokenized_sent = tokenizer.tokenize(marked_sent)

        # Skip sentences with at most 3 tokens, and those with 512 tokens or
        # more (presumably the model's maximum input length -- confirm).
        if not 3 < len(tokenized_sent) < 512:
            continue

        # Compare every token with the single target piece. Comparing against
        # the whole list of pieces breaks as soon as the word is split into
        # more than one piece.
        word_indexes = [i for i, token in enumerate(tokenized_sent) if token == target_piece]
        if not word_indexes:
            continue

        sent_embedding = get_embedding_for_sentence(tokenized_sent, genre)

        for index in word_indexes:
            word_embeddings.append(np.array(sent_embedding[index]))
            valid_sentences.append(sentence)

    return np.array(word_embeddings), np.array(valid_sentences)


def get_embeddings(word, genre):
    """Look up the sentences of `genre` containing `word` and embed each occurrence.

    Thin wrapper: the sentences come from `retrieve_sentences_with_word` and
    the result is whatever `get_embeddings_for_word` returns for them.
    """
    return get_embeddings_for_word(
        word,
        retrieve_sentences_with_word(word, genre),
        genre,
    )

In [None]:
# Example: embed every occurrence of 'innovation' found in the 'reports_texts'
# genre. As the last expression of the cell, the value returned by
# get_embeddings (the result of get_embeddings_for_word) is displayed.
get_embeddings('innovation', 'reports_texts')
    