In [1]:
# !pip install datasets

In [2]:
import datasets

# Load the 'validation' split of the 'squad' dataset.
qa = datasets.load_dataset('squad', split='validation')
# Bare expression: show the dataset summary (features, row count) as cell output.
qa

Found cached dataset squad (C:/Users/u042/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10570
})

In [3]:
# Inspect the first record to see the fields each example carries.
qa[0]

{'id': '56be4db0acb8001400a502ec',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Which NFL team represented the AFC at Super Bowl 50?',
 'answers': {'text': ['Denver Broncos', 'Denver Broncos', 'Denver Broncos'],


In [4]:
# Inspect the second record; per the displayed output it shares its context
# with the first record and differs in the question.
qa[1]

{'id': '56be4db0acb8001400a502ed',
 'title': 'Super_Bowl_50',
 'context': 'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.',
 'question': 'Which NFL team represented the NFC at Super Bowl 50?',
 'answers': {'text': ['Carolina Panthers',
   'Carolina Panthers',
   'Caroli

In [5]:
# Many questions share one context paragraph. Keep a single row per distinct
# context so each paragraph is encoded and indexed only once.
unique_contexts = []
unique_ids = []
# Membership is tested against a set (constant time) instead of the growing
# list, which made the original loop quadratic in the number of rows.
seen_contexts = set()

# Record the ID of the first row seen for each distinct context.
for row in qa:
    context = row['context']
    if context not in seen_contexts:
        seen_contexts.add(context)
        unique_contexts.append(context)
        unique_ids.append(row['id'])

# Now drop every sample whose ID was not recorded above.
keep_ids = set(unique_ids)
qa = qa.filter(lambda x: x['id'] in keep_ids)
qa

Loading cached processed dataset at C:\Users\u042\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-c2c2462b8b5781fc.arrow


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 2067
})

In [6]:
# !pip install sentence_transformers

### Create context vectors with the retriever model

In [7]:
from sentence_transformers import SentenceTransformer

# Retriever: the sentence-embedding model used below to encode both the
# contexts and the query.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
# Bare expression: display the model's module stack as cell output.
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
)

### Encode the context vectors

In [8]:
def _encode_contexts(batch):
    """Return an 'encoding' column holding one vector per context in the batch."""
    vectors = model.encode(batch['context'])
    # Convert to plain Python lists so the values can be stored in the dataset.
    return {'encoding': vectors.tolist()}


# Encode the contexts 32 rows at a time and attach the result as a new column.
qa = qa.map(_encode_contexts, batched=True, batch_size=32)

qa

Loading cached processed dataset at C:\Users\u042\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-8e560c6ebb305938.arrow


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers', 'encoding'],
    num_rows: 2067
})

## Create Vector Database (and index context vectors)

#### Use either Faiss (there is a whole set of articles and videos dedicated to Faiss at Pinecone) or Pinecone. For Pinecone we need an API key from https://app.pinecone.io.

`<YOUR_PINECONE_API_KEY>` (keep the real key out of the notebook)

#### Also install the client:

!pip install pinecone-client


In [9]:
# !pip install pinecone-client

In [10]:
import os
from getpass import getpass

# Never store credentials in the notebook: anyone who can read the file (or its
# version history) can use the key. Read it from the PINECONE_API_KEY
# environment variable and fall back to a hidden interactive prompt when the
# variable is unset or empty.
API_KEY = os.environ.get("PINECONE_API_KEY") or getpass("Pinecone API key: ")

In [11]:
import pinecone

# Configure the Pinecone client with the API key and the target environment.
pinecone.init(
    api_key=API_KEY,
    environment='us-east1-gcp',
)

In [12]:
# Create the index only when it does not exist yet, so the notebook runs
# end-to-end on a fresh project and re-running this cell is harmless.
if 'qa-index' not in pinecone.list_indexes():
    # The index dimension must equal the retriever's embedding size; measure it
    # by encoding a throwaway sentence.
    pinecone.create_index('qa-index', dimension=len(model.encode('hello world').tolist()))

In [13]:
# Handle to the 'qa-index' index; used below for upserting and querying vectors.
index = pinecone.Index('qa-index')

In [14]:
from tqdm.auto import tqdm

# Pair every row's ID with its context vector: (id, vector).
upserts = [(row['id'], row['encoding']) for row in qa]

# Send the vectors to the index in chunks of 50. Slicing past the end of a list
# simply stops at the last element, so the final short chunk needs no special case.
for start in tqdm(range(0, len(upserts), 50)):
    chunk = upserts[start:start + 50]
    index.upsert(vectors=chunk)

  0%|          | 0/42 [00:00<?, ?it/s]

## QA Inference

In [15]:
# Question to answer; the same text appears as the 'question' of the first
# record displayed earlier in the notebook.
query = "Which NFL team represented the AFC at Super Bowl 50?"
# query = "Do NFL teams only care about playing at the Super Bowl?"
# Encode the query as a one-element batch with the retriever model and convert
# the result to nested Python lists for the index client.
xq = model.encode([query]).tolist()

In [16]:
# Ask the index for the 5 best matches to the encoded query.
xc = index.query(xq, top_k=5)
# xc

In [17]:
# Display the returned matches (each carries an 'id' and a 'score').
xc['matches']

[{'id': '56be4db0acb8001400a502ec',
  'score': 0.685847402,
  'sparseValues': {},
  'values': []},
 {'id': '56be53b8acb8001400a50314',
  'score': 0.586465776,
  'sparseValues': {},
  'values': []},
 {'id': '56be4e1facb8001400a502f6',
  'score': 0.54540956,
  'sparseValues': {},
  'values': []},
 {'id': '56becb823aeaaa14008c948b',
  'score': 0.538329065,
  'sparseValues': {},
  'values': []},
 {'id': '56bec0dd3aeaaa14008c9357',
  'score': 0.520058692,
  'sparseValues': {},
  'values': []}]

In [18]:
# Collect the IDs of the matches returned by the index query.
ids = [match['id'] for match in xc['matches']]
# Keep only the dataset rows whose ID is among those matches.
# NOTE(review): the rows appear to come back in dataset order rather than in
# score order — confirm before relying on their position.
contexts = qa.filter(lambda row: row['id'] in ids)

Loading cached processed dataset at C:\Users\u042\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-b0600eaa0da71746.arrow


In [19]:
from transformers import pipeline

# Reader: a question-answering pipeline whose model and tokenizer are both
# loaded from the same checkpoint name.
model_name = 'deepset/electra-base-squad2'
nlp = pipeline(
    task='question-answering',
    model=model_name,
    tokenizer=model_name,
)

In [20]:
# Run the reader over each retrieved context and print its prediction for the query.
for passage in contexts['context']:
    prediction = nlp(question=query, context=passage)
    print(prediction)

{'score': 0.999852180480957, 'start': 177, 'end': 191, 'answer': 'Denver Broncos'}
{'score': 6.596066555175639e-07, 'start': 525, 'end': 539, 'answer': 'Dallas Cowboys'}
{'score': 1.1175001418450847e-05, 'start': 15, 'end': 93, 'answer': 'NFL Commissioner Roger Goodell stated that the league planned to make the 50th'}
{'score': 2.344022979189342e-12, 'start': 564, 'end': 579, 'answer': 'Super Bowl XXXV'}
{'score': 0.009671404957771301, 'start': 68, 'end': 74, 'answer': 'Denver'}
