#### Implement a Dual Encoder 
- Using `all-MiniLM-L6-v2`
- Also called a bi-encoder

In [2]:
#pip install transformers
#pip install datasets

In [16]:
from transformers import AutoTokenizer, AutoModel
import torch

import torch.nn.functional as F

In [5]:
# Name of the pretrained checkpoint; the same name is passed to
# from_pretrained for both the tokenizer and the model.
model_name = "sentence-transformers/all-MiniLM-L6-v2"

In [8]:
# Load the tokenizer and the encoder model for `model_name`.
# Both must come from the same checkpoint so token ids match the model's vocabulary.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model     = AutoModel.from_pretrained(model_name)

#### Define the encoding function
- query 
- documents

In [13]:
def encode_text(texts, tokenizer, model):
    """Embed a batch of texts into one fixed-size vector per text.

    The texts are tokenized together (padded to the longest item in the batch
    and truncated by the tokenizer), passed through the model with gradient
    tracking disabled, and the per-token vectors in ``last_hidden_state`` are
    averaged into a single embedding per text.

    The average is weighted by the tokenizer's ``attention_mask`` so that
    padding positions contribute nothing. A plain ``mean(dim=1)`` would mix
    padding vectors into the result, making a text's embedding depend on how
    long the other texts in the same batch are.

    Args:
        texts: The strings to embed (a list; one embedding is returned per item).
        tokenizer: Callable that turns ``texts`` into a mapping of PyTorch
            tensors for the model, including an ``attention_mask`` entry.
        model: Callable that accepts those tensors as keyword arguments and
            returns an object exposing a ``last_hidden_state`` tensor
            (assumed to be laid out as batch x tokens x hidden).

    Returns:
        A tensor with one row per input text holding its mean-pooled embedding.
    """
    # tokenize the input text
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

    # pass the inputs through the model (inference only, so no gradients)
    with torch.no_grad():
        outputs = model(**inputs)
        token_embeddings = outputs.last_hidden_state

        # 1 for real tokens, 0 for padding; add a trailing axis so the mask
        # broadcasts across the hidden dimension.
        mask = inputs['attention_mask'].unsqueeze(-1).to(token_embeddings.dtype)

        # masked mean: sum the real-token vectors, divide by the real-token count
        summed = (token_embeddings * mask).sum(dim=1)
        counts = mask.sum(dim=1).clamp(min=1e-9)   # guard against division by zero
        emb = summed / counts

    return emb

#### Encode the query and documents

In [11]:
# The search query to be matched against the candidate documents.
query = "what is the capital of France?"

# Candidate documents to score against the query.
# NOTE(review): the third sentence is factually wrong (the US capital is
# Washington, D.C.) — presumably a deliberate distractor; confirm.
documents = [
    "Paris is the capital of France",
    "Berlin is the capital of Germany",
    "California is the capital of USA",
]

In [15]:
# Embed the query and the documents with the same encoder.
# The query is wrapped in a list so it is encoded as a batch of one.
query_embeddings = encode_text(texts=[query], tokenizer=tokenizer, model=model)
docs_embeddings = encode_text(texts=documents, tokenizer=tokenizer, model=model)

#### compute the similarities

In [17]:
# Cosine similarity along the embedding axis; the single query row is
# broadcast against every document row, giving one score per document.
cos_sim = F.cosine_similarity(query_embeddings, docs_embeddings, dim=1)

In [18]:
# Show the similarity scores, in the same order as `documents`.
cos_sim

tensor([0.8501, 0.3943, 0.3895])

#### try and understand the dual encoder model

In [19]:
import pandas as pd
from datasets import load_dataset

In [20]:
# Load the SNLI dataset (premise / hypothesis / label) with its train, validation and test splits.
snli = load_dataset("snli")

In [21]:

# Inspect the available splits, their columns and row counts.
snli

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
})

In [22]:
# Convert the SNLI training split to a pandas DataFrame for inspection.
# Dataset.to_pandas() is the datasets library's own conversion and avoids
# handing the ~550k-row dataset to the generic pandas constructor.
df = snli['train'].to_pandas()

In [24]:
# Do not truncate long text cells, so full premise/hypothesis sentences are visible.
pd.options.display.max_colwidth = None

In [25]:
# Preview the first ten premise / hypothesis / label rows.
df.head(10)

Unnamed: 0,premise,hypothesis,label
0,A person on a horse jumps over a broken down airplane.,A person is training his horse for a competition.,1
1,A person on a horse jumps over a broken down airplane.,"A person is at a diner, ordering an omelette.",2
2,A person on a horse jumps over a broken down airplane.,"A person is outdoors, on a horse.",0
3,Children smiling and waving at camera,They are smiling at their parents,1
4,Children smiling and waving at camera,There are children present,0
5,Children smiling and waving at camera,The kids are frowning,2
6,A boy is jumping on skateboard in the middle of a red bridge.,The boy skates down the sidewalk.,2
7,A boy is jumping on skateboard in the middle of a red bridge.,The boy does a skateboarding trick.,0
8,A boy is jumping on skateboard in the middle of a red bridge.,The boy is wearing safety equipment.,1
9,An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.,An older man drinks his juice as he waits for his daughter to get off work.,1


label
- entailment    0
- contradiction 2
- neutral       1

#### Issues with Dual Encoders

- Embedding quality depends heavily on the positive and negative pairs used when the underlying BERT-style model was trained
- A pretrained model may therefore not be relevant to a custom (domain-specific) project