In [35]:
import os

import torch

from transformers import(
    AutoModelForSeq2SeqLM, 
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    AutoConfig,
    T5Tokenizer
)

In [30]:
# Relative path (resolved against the current working directory at load time)
# of the directory used for the question-generation model checkpoint.
MODEL_AND_TOKENIZER_PATH = os.path.join(
    'model',
    't5-small-qg-hl',
)

# Absolute path of a separate tokenizer directory, anchored at the current
# working directory.
# NOTE(review): the pipeline instantiation visible in this notebook passes
# MODEL_AND_TOKENIZER_PATH for both the model and the tokenizer; confirm
# whether TOKENIZER_PATH is still needed.
TOKENIZER_PATH = os.path.join(
    os.getcwd(),
    'model',
    't5-qg-tokenizer',
)
print(TOKENIZER_PATH)

c:\git\projetos-bi-master\estudo-nlp\semantic-search\question_generator\model\t5-qg-tokenizer


In [33]:
class E2EQGPipeline:
    """End-to-end question generation on top of a seq2seq language model.

    The pipeline prefixes a context passage with ``"generate questions: "``,
    runs ``generate`` on the model and splits the decoded text on the
    ``"<sep>"`` marker to obtain a list of questions.

    Only ``T5ForConditionalGeneration`` and ``BartForConditionalGeneration``
    models are accepted.
    """

    def __init__(
        self,
        model_path_or_name: str,
        tokenizer_path_or_name: str,
        use_cuda: bool
    ):
        """Load the model and tokenizer and move the model to the target device.

        Args:
            model_path_or_name: Value forwarded to
                ``AutoModelForSeq2SeqLM.from_pretrained`` and
                ``AutoConfig.from_pretrained``.
            tokenizer_path_or_name: Value forwarded to the tokenizer's
                ``from_pretrained``.
            use_cuda: When true and CUDA is available, the model is placed on
                ``"cuda"``; otherwise it stays on ``"cpu"``.

        Raises:
            AssertionError: If the loaded model is neither a T5 nor a BART
                conditional-generation model.
        """
        self.model: PreTrainedModel = AutoModelForSeq2SeqLM.from_pretrained(model_path_or_name)

        # Validate the architecture before doing any further (possibly slow) work.
        model_class_name = self.model.__class__.__name__
        assert model_class_name in ["T5ForConditionalGeneration", "BartForConditionalGeneration"], (
            f"Unsupported model class for question generation: {model_class_name}"
        )
        self.model_type = "t5" if model_class_name == "T5ForConditionalGeneration" else "bart"

        # T5 checkpoints keep using T5Tokenizer exactly as before; any other
        # accepted architecture (BART) gets its tokenizer resolved by
        # AutoTokenizer instead of being forced through the T5 tokenizer.
        tokenizer_cls = T5Tokenizer if self.model_type == "t5" else AutoTokenizer
        self.tokenizer: PreTrainedTokenizer = tokenizer_cls.from_pretrained(
            tokenizer_path_or_name,
            config=AutoConfig.from_pretrained(model_path_or_name),
        )

        self.device = "cuda" if use_cuda and torch.cuda.is_available() else "cpu"
        self.model.to(self.device)

        # Generation settings applied on every call unless overridden per key.
        self.default_generate_kwargs = {
            "max_length": 256,
            "num_beams": 4,
            "length_penalty": 1.5,
            "no_repeat_ngram_size": 3,
            "early_stopping": True,
        }

    def __call__(self, context: str, **generate_kwargs):
        """Generate questions for ``context``.

        Args:
            context: Passage to generate questions from.
            **generate_kwargs: Overrides for ``self.default_generate_kwargs``.
                Keys given here replace the matching defaults; defaults that
                are not overridden are still applied.

        Returns:
            A list of stripped question strings. The decoded text is split on
            ``"<sep>"`` and the segment after the last separator is discarded.
        """
        inputs = self._prepare_inputs_for_e2e_qg(context)

        # Merge instead of replace, so overriding one setting (e.g. num_beams)
        # does not silently drop every other default.
        effective_kwargs = {**self.default_generate_kwargs, **generate_kwargs}

        outs = self.model.generate(
            input_ids=inputs["input_ids"].to(self.device),
            attention_mask=inputs["attention_mask"].to(self.device),
            **effective_kwargs
        )

        prediction = self.tokenizer.decode(outs[0], skip_special_tokens=True)
        questions = prediction.split("<sep>")
        return [question.strip() for question in questions[:-1]]

    def _prepare_inputs_for_e2e_qg(self, context):
        """Build the prompt for ``context`` and tokenize it without padding.

        For T5 models the literal ``" </s>"`` is appended to the prompt.
        """
        source_text = f"generate questions: {context}"
        if self.model_type == "t5":
            source_text = source_text + " </s>"

        return self._tokenize([source_text], padding=False)

    def _tokenize(
        self,
        inputs,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        max_length=512
    ):
        """Tokenize a batch of strings into PyTorch tensors.

        Args:
            inputs: List of strings to encode.
            padding: When true, pad every sequence to ``max_length``; when
                false, do not pad.
            truncation: Forwarded to the tokenizer.
            add_special_tokens: Forwarded to the tokenizer.
            max_length: Maximum length used for truncation and padding.

        Returns:
            The tokenizer output, requested with ``return_tensors="pt"``.
        """
        # The tokenizer is called directly (batch_encode_plus is the deprecated
        # spelling) and the deprecated pad_to_max_length flag is omitted: the
        # padding argument already carries the same information.
        return self.tokenizer(
            inputs,
            max_length=max_length,
            add_special_tokens=add_special_tokens,
            truncation=truncation,
            padding="max_length" if padding else False,
            return_tensors="pt"
        )


In [36]:
# Build the pipeline once; the same directory is used for both the model and
# the tokenizer, and CUDA is requested (the class falls back to CPU when it is
# not available).
e2e_qg = E2EQGPipeline(
    model_path_or_name=MODEL_AND_TOKENIZER_PATH,
    tokenizer_path_or_name=MODEL_AND_TOKENIZER_PATH,
    use_cuda=True,
)

In [38]:
# Sample passages used to exercise the question-generation pipeline.
# Each passage is a single line of text (no embedded newlines).

# Short factual paragraph about the Python language.
text = (
    "Python is an interpreted, high-level, general-purpose programming language. Created by Guido van Rossum "
    "and first released in 1991, Python's design philosophy emphasizes code "
    "readability with its notable use of significant whitespace."
)

# Longer paragraph about gravity.
text2 = (
    "Gravity (from Latin gravitas, meaning 'weight'), or gravitation, is a natural phenomenon by which all "
    "things with mass or energy—including planets, stars, galaxies, and even light—are brought toward (or gravitate toward) "
    "one another. On Earth, gravity gives weight to physical objects, and the Moon's gravity causes the ocean tides. "
    "The gravitational attraction of the original gaseous matter present in the Universe caused it to begin coalescing "
    "and forming stars and caused the stars to group together into galaxies, so gravity is responsible for many of "
    "the large-scale structures in the Universe. Gravity has an infinite range, although its effects become increasingly "
    "weaker as objects get further away"
)

# Single-sentence passage.
text3 = "42 is the answer to life, universe and everything."

# Paragraph about the film Forrest Gump.
text4 = (
    "Forrest Gump is a 1994 American comedy-drama film directed by Robert Zemeckis and written by Eric Roth. "
    "It is based on the 1986 novel of the same name by Winston Groom and stars Tom Hanks, Robin Wright, Gary Sinise, "
    "Mykelti Williamson and Sally Field. The story depicts several decades in the life of Forrest Gump (Hanks), "
    "a slow-witted but kind-hearted man from Alabama who witnesses and unwittingly influences several defining "
    "historical events in the 20th century United States. The film differs substantially from the novel."
)

In [39]:
# Generate questions for the Python passage; no generation kwargs are passed,
# so the pipeline's default generation settings apply.
e2e_qg(text)



['What language is Python interpreted?',
 'Who created Python?',
 'When was Python first released?',
 "What is Python's design philosophy?"]

In [40]:
# Generate questions for the gravity passage with the default generation settings.
e2e_qg(text2)



['What is a natural phenomenon called gravitation?',
 'What does gravity give weight to physical objects on Earth?']

In [41]:
# Generate questions for the single-sentence passage with the default generation settings.
e2e_qg(text3)



['What is the answer to life, universe, and everything?',
 'What is 42?',
 'How is 42 the answer?']

In [42]:
# Generate questions for the Forrest Gump passage with the default generation settings.
e2e_qg(text4)



['Who is the director of Forrest Gump?',
 'What is the name of the American comedy-drama film?']