In [1]:
from keybert import KeyBERT

# Shared KeyBERT keyword extractor, constructed with the library defaults.
# Project page: https://github.com/MaartenGr/KeyBERT
kw_model = KeyBERT()


def get_keywords(payload):
    """Extract keywords from ``payload`` with the notebook-level ``kw_model``.

    Args:
        payload: Text handed straight to ``kw_model.extract_keywords``.

    Returns:
        Whatever ``kw_model.extract_keywords`` returns for ``payload``; in this
        notebook that is a list of ``(keyword, score)`` tuples.
    """
    return kw_model.extract_keywords(payload)


# Sample passage (about preprocessing steps for question answering) used as the
# keyword-extraction input below.
payload = """
There are a few preprocessing steps particular to question answering tasks you should be aware of:

Some examples in a dataset may have a very long context that exceeds the maximum input length of the model. To deal with longer sequences, truncate only the context by setting truncation="only_second".
Next, map the start and end positions of the answer to the original context by setting return_offset_mapping=True.
With the mapping in hand, now you can find the start and end tokens of the answer. Use the sequence_ids method to find which part of the offset corresponds to the question and which corresponds to the context.
"""


# Extract keywords from the sample passage and show them as the cell output.
kws = get_keywords(payload)
kws

  from .autonotebook import tqdm as notebook_tqdm


[('answering', 0.4536),
 ('sequence_ids', 0.4062),
 ('sequences', 0.3786),
 ('context', 0.3025),
 ('tasks', 0.2706)]

In [11]:
# Inspect the embedding backend object held by the KeyBERT instance.
kw_model.model

<keybert.backend._sentencetransformers.SentenceTransformerBackend at 0x7bed60457a00>

# Embedding Model

In [None]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model.
# Bind it to a dedicated name: the QA section further down rebinds `model` to a
# different kind of object, which would otherwise leave this embedding model
# unreachable after that cell runs.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")
# Backward-compatible alias for any code that still refers to `model` here.
model = embedding_model

# Question-Answering (QA) Model

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

model_name = "JinuAugustine/llama-2-7b-chat-gdpr"

# Load the tokenizer and the question-answering model from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
# NOTE(review): the load warning printed for this cell lists embed_tokens and
# decoder-layer weights as "newly initialized", which suggests the QA model is
# running on untrained weights — confirm before trusting any prediction.

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.45it/s]
Some weights of LlamaForQuestionAnswering were not initialized from the model checkpoint at JinuAugustine/llama-2-7b-chat-gdpr and are newly initialized: ['embed_tokens.weight', 'layers.0.input_layernorm.weight', 'layers.0.mlp.down_proj.weight', 'layers.0.mlp.gate_proj.weight', 'layers.0.mlp.up_proj.weight', 'layers.0.post_attention_layernorm.weight', 'layers.0.self_attn.k_proj.weight', 'layers.0.self_attn.o_proj.weight', 'layers.0.self_attn.q_proj.weight', 'layers.0.self_attn.v_proj.weight', 'layers.1.input_layernorm.weight', 'layers.1.mlp.down_proj.weight', 'layers.1.mlp.gate_proj.weight', 'layers.1.mlp.up_proj.weight', 'layers.1.post_attention_layernorm.weight', 'layers.1.self_attn.k_proj.weight', 'layers.1.self_attn.o_proj.weight', 'layers.1.self_attn.q_proj.weight', 'layers.1.self_attn.v_proj.weight', 'layers.10.input_layernorm.weight', 'layers.10.mlp.down_pro

In [6]:
question = "What specific aspects of GDPR make it applicable to organizations outside the European Union?"

# Tokenize the question into PyTorch tensors.
# NOTE(review): only the question is encoded — no context passage is passed — so
# the span decoded later can only be a slice of the question itself; confirm intent.
inputs = tokenizer(question, return_tensors="pt")
inputs

{'input_ids': tensor([[    1,  1724,  2702, 21420,   310,   402, 29928, 10593,  1207,   372,
         22903,   304, 25700,  5377,   278,  7824,  7761, 29973]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [4]:
# Forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(**inputs)
# Display the raw model output (start/end logits).
outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 0.6231, -0.7905,  1.5907,  0.5335, -0.1430, -1.3357, -0.3122, -0.5169,
         -2.0500, -1.0132,  0.5962,  0.0624, -1.1320, -0.5693, -0.6617, -1.1763,
         -1.5075,  1.3859]]), end_logits=tensor([[ 1.3263,  1.2028,  2.3109,  2.2422,  1.9583,  1.3100,  1.7135, -0.0387,
          0.6528, -0.6529,  1.5098,  0.6660,  1.1995,  0.8337,  3.0473,  0.2523,
          0.7147,  1.2713]]), hidden_states=None, attentions=None)

In [5]:
# Most likely start token of the answer, taken from row 0 (the same row the
# decoding cell slices from).
answer_start_index = outputs.start_logits[0].argmax()
# Search for the end token only at or after the start position: two independent
# argmax calls can place the end before the start, which yields an empty span.
# When the unconstrained end already lies at/after the start, the result is identical.
answer_end_index = answer_start_index + outputs.end_logits[0, answer_start_index:].argmax()

In [7]:
# Slice the predicted answer span (end index inclusive) out of the first input row.
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
# Skip special tokens so markers such as the one at position 0 of the encoded
# input never leak into the answer text when the span happens to include them.
tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)

'specific aspects of GDPR make it applicable to organizations outside the'