In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F

#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Mask-aware mean of the token embeddings along dimension 1.

    Args:
        model_output: indexable model result whose first element holds the
            token embeddings.
        attention_mask: mask tensor; it is given a trailing dimension and
            expanded to the shape of the token embeddings, so positions with
            a 0 contribute nothing to the sum or to the divisor.

    Returns:
        The masked sum of the token embeddings over dimension 1 divided by the
        summed mask over dimension 1.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # Clamp the divisor so a fully masked row does not divide by zero.
    mask_total = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / mask_total


In [2]:

# Texts for which sentence embeddings are computed in the cells below
sentences = [
    'This is an example sentence',
    'Each sentence is converted',
]

In [3]:
# Fetch the tokenizer and the encoder for one and the same HuggingFace Hub checkpoint
model_checkpoint = 'sentence-transformers/all-MiniLM-L6-v2'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModel.from_pretrained(model_checkpoint)


In [4]:
# Turn the sentences into one padded, truncated batch of PyTorch tensors
encoded_input = tokenizer(
    sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

In [5]:
# Compute token embeddings: run the model on the tokenized batch.
# torch.no_grad() turns off gradient tracking for this forward pass.
with torch.no_grad():
    model_output = model(**encoded_input)

In [6]:
# Attention mask produced by the tokenizer; used below for mask-aware pooling
attention_mask = encoded_input['attention_mask']

In [16]:
token_embeddings = model_output[0]
token_embeddings.shape
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
mean_pooling  = torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [19]:
# Perform pooling
#sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalize embeddings
sentence_embeddings = F.normalize(mean_pooling, p=2, dim=1)

In [20]:
from sentence_transformers import SentenceTransformer, models


In [25]:

# Assemble a SentenceTransformer from explicit modules: a 'bert-base-uncased'
# transformer followed by a pooling layer sized to its word-embedding dimension.
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=256)
embedding_dim = word_embedding_model.get_word_embedding_dimension()
pooling_model = models.Pooling(embedding_dim)
pipeline_modules = [word_embedding_model, pooling_model]

# NOTE: this rebinds `model`, which until now held the AutoModel loaded earlier.
model = SentenceTransformer(modules=pipeline_modules)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# Show the sentences currently bound to `sentences`
sentences

['This is an example sentence', 'Each sentence is converted']

In [31]:
# Experiment: build the SentenceTransformer WITHOUT the pooling module.
# encode() then fails with KeyError: 'sentence_embedding'. The error is caught
# and displayed so that the notebook still runs top to bottom; `res` is only
# bound if encode() succeeds.
model = SentenceTransformer(modules=[word_embedding_model])
try:
    res = model.encode(sentences)
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'sentence_embedding'

In [33]:
# Reload the MiniLM encoder: `model` was rebound to a SentenceTransformer in the
# cells above, while the cells below call it directly on the tokenizer output.
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

In [64]:
# New inputs: one long and one short sentence
sentences = [
    'that that This is an example sentence that I like a lot and I will tell you that I',
    'Each sentence is converted',
]

In [65]:
# Re-tokenize the updated sentences into one padded, truncated batch of PyTorch tensors
encoded_input = tokenizer(
    sentences,
    return_tensors='pt',
    truncation=True,
    padding=True,
)

In [66]:
# Forward pass on the re-tokenized batch, with gradient tracking turned off
with torch.no_grad():
    model_output = model(**encoded_input)

In [69]:
# Token ids of the first sentence in the batch
encoded_input['input_ids'][0]

tensor([ 101, 2008, 2008, 2023, 2003, 2019, 2742, 6251, 2008, 1045, 2066, 1037,
        2843, 1998, 1045, 2097, 2425, 2017, 2008, 1045,  102])

In [70]:
# Token ids of the second sentence in the batch; the trailing 0s in the output
# presumably come from padding=True filling it up to the length of the first row
encoded_input['input_ids'][1]

tensor([ 101, 2169, 6251, 2003, 4991,  102,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0])

In [77]:
# Request an embedding for a single string through the `openai` client.
# NOTE(review): `openai` is imported here but only installed in the next cell, so
# on a fresh kernel this cell fails with ModuleNotFoundError (see recorded output).
# NOTE(review): no API key is set in any visible cell — presumably it is expected
# to come from the environment; confirm before running.
import openai
response = openai.Embedding.create(
    input="This is an example",
    engine="text-similarity-davinci-001")

ModuleNotFoundError: No module named 'openai'

In [78]:
!pip install openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting openai
  Downloading openai-0.16.0.tar.gz (41 kB)
[K     |████████████████████████████████| 41 kB 1.2 MB/s  eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting openpyxl>=3.0.7
  Downloading openpyxl-3.0.9-py2.py3-none-any.whl (242 kB)
[K     |████████████████████████████████| 242 kB 19.1 MB/s eta 0:00:01
Collecting pandas-stubs>=1.1.0.11
  Downloading pandas_stubs-1.2.0.53-py3-none-any.whl (162 kB)
[K     |████████████████████████████████| 162 kB 96.7 MB/s eta 0:00:01
Collecting et-xmlfile
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Building wheels fo