In [None]:
import os
import pathlib
import re

from transformers import pipeline

import ipywidgets as widgets
from ipywidgets import interact, interact_manual

from IPython.display import display, Markdown

import utils
import importlib
importlib.reload(utils)


In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide display setting: with "all", IPython shows the value of every
# top-level expression in a cell instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Grammar correction model exploration

In [None]:
# Identifier of the grammar-error-correction checkpoint
# (presumably a Hugging Face Hub id rather than a URL; confirm in utils.load_s2s_model).
GRAMFORMER_MODEL_URL = "prithivida/grammar_error_correcter_v1"
# Load the model and its tokenizer through the local `utils` helper; both are
# passed to utils.grammar_correct in the cells below.
grammar_model, grammar_tokenizer = utils.load_s2s_model(GRAMFORMER_MODEL_URL)

In [None]:
# Deliberately ungrammatical sentences used to probe the grammar corrector.
influent_sentences = [
    # subject-verb agreement
    "He are moving here.",
    "I am doing fine. How is you?",
    "Anna and Mike is going skiing",
    # inconsistent tense
    "I walk to the store and I bought milk",
    "We all eat the fish and then made dessert",
    # broken question forms
    "what be the reason for everyone leave the company",
    "Is you sure?",
    "Are he sure?",
]

### T5: Text To Text Transfer Transformer:
Many tasks are cast into this text-to-text framework: machine translation, classification, regression (for example, predicting how similar two sentences are, with a similarity score in the range 1 to 5), and other sequence-to-sequence tasks such as document summarization (for example, summarizing articles from the CNN/Daily Mail corpus).

<p align="center">
    <img src="t5_img.png"  width=500>
    <br>
</p>

In [None]:
# Passage fed to the model with the 'summarize' prefix. It is spelled out as
# explicit pieces so that the trailing space at the end of each line (part of
# the input text) is visible and survives editors that strip whitespace.
s = (
    "T5 uses common crawl web extracted text. The authors apply some pretty simple heuristic filtering. \n"
    "T5 removes any lines that didn’t end in a terminal punctuation mark. \n"
    "It also removes line with the word javascript and any pages that had a curly bracket (since it often appears in code). \n"
    "It deduplicates the dataset by taking a sliding window of 3 sentence chunks and deduplicated \n"
    "it so that only one of them appeared the dataset. For example, above 3 pages, \n"
    "the last paragraph on the middle page is removed since the same content appears on the first page. \n"
    "It ends up with 750 gigabytes of clean-ish English text. \n"
    "The dataset is publicly available on tensorlow.text.c4."
)
# The result is stored in `c`; nothing in this cell displays it.
c = utils.grammar_correct(
    s,
    grammar_model,
    grammar_tokenizer,
    prefix='summarize',
)

In [None]:
# Run the corrector on a single sentence, this time without a `prefix` argument.
s = "How many channels are there for each cell"
corrected = utils.grammar_correct(s, grammar_model, grammar_tokenizer)
# Bare expression so the notebook displays the returned value.
corrected

*a summarized version of the initial corpus.*

<p>

**HERE**, it is used as a grammar corrector.
<p>

In [None]:
# Send each ungrammatical example through the grammar model and print the
# input next to one of the returned candidates, followed by a separator line.
for sentence in influent_sentences:
    print(f'Incorrect: {sentence}')
    corrected = utils.grammar_correct(sentence, grammar_model, grammar_tokenizer)
    # pop() removes one candidate from the returned collection and prints it.
    print(f"Predicted: {corrected.pop()}", "-"*100, sep="\n")

**NOW** we apply the grammar model to an md file composed of several lines, which we split and analyze
separately, because the model has a maximum number of characters as input size (max_length=128).

In [None]:
# Read the markdown file and collect the unique sentence-like fragments that
# are later sent to the grammar model one at a time.
# The encoding is explicit so the result does not depend on the platform's
# default locale encoding.
with open('grammar_exploration.md', 'r', encoding='utf-8') as f:
    texts = f.read().split('\n')  # one entry per line of the file

# Split every line on sentence-ending punctuation (., ?, !). Fragments are
# stripped of surrounding whitespace and empty ones are dropped, so the set
# holds no blank entries or whitespace-only duplicates of the same sentence.
tobe_corrected = set()
for line in texts:
    for fragment in re.split(r"[.?!]", line):
        fragment = fragment.strip()
        if fragment:
            tobe_corrected.add(fragment)

In [None]:
# Inspect the collected fragments before sending them to the grammar model.
tobe_corrected

In [None]:
# Correct every collected fragment, skipping blanks and bare HTML tags.
# NOTE(review): '</b>' sits next to '<br>' and '<p>' and may have been meant
# as '</p>'; confirm against the markdown file.
for s in tobe_corrected:
    if s in ('', ' ', '<br>', '<p>', '</b>'):
        continue
    print(f'Incorrect: {s}')
    corrected = utils.grammar_correct(s, grammar_model, grammar_tokenizer)
    # pop() removes one candidate from the returned collection and prints it.
    print(f"Predicted: {corrected.pop()}", "-"*100, sep="\n")

# Question-Answering models exploration

Question-answering NLP models take a context and a question and extract the part of the context
that answers the question. We use a tiny version (130 MB) of the RoBERTa-SQuAD2 model.

In [None]:
# Checkpoint name used for question answering; the same name is passed to
# utils.load_qa_model further down.
model_name = "deepset/tinyroberta-squad2"
# transformers question-answering pipeline, with model and tokenizer taken from the same checkpoint.
nlp = pipeline('question-answering', model=model_name, tokenizer=model_name)

In [None]:
# questions = ['What is the definition of an adjoint operator?',
#             'What is the definition of an unitary operator?',
#             'Is T=T* a self-adjoint operator?']

# context = """Let T be an operator over an Hilbert space, its adjoint T* is defined by (Tx,y)=(x,T*y).\n
#              When T=T* then T is a self-adjoint operator.\n 
#            An unitary operator U is such that UU*=U*U=I, where U* is its adjoint, and I is the identity operator."""

# Questions to ask, and the passage in which the answers are to be found.
questions = [
    "Where are used self-adjoint operators?",
    "Why are self-adjoint operators important in quantum mechanics?",
]
# The passage is spelled out piece by piece so that its trailing spaces and
# doubled newlines are visible in the source; the value is unchanged.
context = (
    "\n"
    "Self-adjoint operators are used in functional analysis and quantum mechanics. \n"
    "In quantum mechanics their importance lies in the Dirac-von Neumann formulation of quantum mechanics, \n"
    "in which physical observables such as position, energy, angular momentum, spin, are represented by \n"
    "self-adjoint operators on a Hilbert space. Let T be an operator over an Hilbert space, \n"
    "its adjoint T* is defined by (Tx,y)=(x,T*y).\n\n"
    "When T=T* then T is a self-adjoint operator.\n \n"
    "An unitary operator U is such that UU*=U*U=I, where U* is its adjoint, and I is the identity operator.\n"
)

# for q in questions:
#     qa = {'question': q, 'context': context}
#     res = nlp(qa)
#     print(res)

In [None]:
# nlp2 = pipeline('question-answering', 
#         model='bert-large-uncased-whole-word-masking-finetuned-squad',
#         tokenizer='bert-large-uncased-whole-word-masking-finetuned-squad')

# for q in questions:
#     qa = {'question': q, 'context': context}
#     res = nlp2(qa)
#     print(res)

In [None]:
# Load the QA model and tokenizer through the local `utils` helper. In the
# visible cells they are referenced only by commented-out utils.qa_inference
# calls; the live widgets use the `nlp` pipeline instead.
qa_model, qa_tokenizer = utils.load_qa_model(model_name)

### Interactive Widgets for grammar and question-answering models.

In [None]:
@interact_manual(
    # The input box is sized through an explicit Text widget. The former
    # `layot=` keyword was misspelled and matched no parameter of the
    # function, so the layout was never applied to any widget.
    sentence=widgets.Text(
        value='How are angular momentum represented in quantum mechanics?',
        layout=widgets.Layout(width='500px'),
    ),
)
def get_correction(sentence):
    """Run the grammar model on `sentence` and print the input and the result.

    Parameters
    ----------
    sentence : str
        Text typed into the widget.

    Returns
    -------
    None
        The output is printed; nothing is returned, so the widget shows no
        extra value.
    """
    corrected = utils.grammar_correct(sentence, grammar_model, grammar_tokenizer)
    print(f'Original sentence: {sentence}')
    print(f'Correction: {corrected}')

In [None]:
@interact_manual(
    # Explicit widgets carry the layout. The former `layot=` keyword was
    # misspelled and matched no parameter of the function, so the layout was
    # never applied to any widget.
    question=widgets.Text(
        value='How is angular momentum represented in quantum mechanics?',
        layout=widgets.Layout(width='500px'),
    ),
    # The context spans several lines, so it is edited in a multi-line Textarea.
    context=widgets.Textarea(
        value=context,
        layout=widgets.Layout(width='500px'),
    ),
)
def get_answer(question, context):
    """Answer `question` from `context` with the QA pipeline and print the result.

    Parameters
    ----------
    question : str
        Question typed into the widget.
    context : str
        Passage in which the answer is searched.

    Returns
    -------
    None
        Question, context and answer are printed; nothing is returned.
    """
    # An alternative path through
    # utils.qa_inference(qa_model, qa_tokenizer, question, context)
    # was tried here earlier; the pipeline is the one in use.
    res = nlp({'question': question, 'context': context})
    print(f'Question: {question}')
    print()
    print(f'Context: {context}')
    print(f'Answer: {res["answer"]}')


# widgets.Checkbox(
#     value=True,
#     description='Is the answer valid?',
#     disabled=False,
#     indent=False
# )
@interact(value=['yes', 'no'])
def is_valid(value):
    """Print the validity prompt and return a Valid indicator widget.

    `value` is the option selected in the widget built from ['yes', 'no'];
    the indicator is set to True only when it equals 'yes'.
    """
    print('Is the answer valid?')
    return widgets.Valid(
        value=(value == 'yes'),
        description='valid?',
        layout=widgets.Layout(width='200px'),
    )
#     return widgets.Checkbox(
#     value=v,
#     description='Is the answer valid?',
#     disabled=False,
#     indent=False
# )