In [74]:
import numpy as np
import requests
import html2text
from googlesearch import search
import json
import re
from simpletransformers.question_answering import QuestionAnsweringModel
from IPython.display import display
from IPython.html import widgets
from bs4 import BeautifulSoup
from markdown import markdown

In [64]:
def query_pages(query, n=5):
    return list(search(query, num=n, stop=n, pause=2))

def query_to_text(query, n=5):
    html_conv = html2text.HTML2Text()
    html_conv.ignore_links = True
    html_conv.escape_all = True
    
    text = []
    for link in query_pages(query, n):
        req = requests.get(link)
        text.append(html_conv.handle(req.text))
        
    return text

In [75]:
# Source: https://gist.github.com/lorey/eb15a7f3338f959a78cc3661fbc255fe
def markdown_to_text(markdown_string):
    """ Converts a markdown string to plaintext """

    # md -> html -> text since BeautifulSoup can extract text cleanly
    html = markdown(markdown_string)

    # remove code snippets
    html = re.sub(r'<pre>(.*?)</pre>', ' ', html)
    html = re.sub(r'<code>(.*?)</code >', ' ', html)

    # extract text
    soup = BeautifulSoup(html, "html.parser")
    text = ''.join(soup.findAll(text=True))

    return text

def format_text(text):
    text = markdown_to_text(text)
    text.replace('\n', ' ')
#     text = re.sub(r'^https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
#     text = re.sub(r'^http?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    return text

In [76]:
format_text(query_to_text('How many fingers do humans have?', n=1)[0])

"writing\nHow ToGrammar TipsTrendsInspiration\nProductCompany\nstart writing\n\n\n\n\n\nCan You Call Your Thumb a Finger?\nShundalyn Allen\nWriting\n\nA thumb is a digit, but not technically a finger.\nMany people don’t make the distinction between thumbs and other digits.\n\n\nHow many fingers do you have? Your answer might depend on whether you consider\nyour thumb to be a finger. Let’s look into whether or not you can accurately\ncall your thumb a finger.\nHere’s a tip: Want to make sure your writing always looks great? Grammarly\ncan save you from misspellings, grammatical and punctuation mistakes, and\nother writing issues on all your favorite websites.\nYour writing, at its best.\nBe the best writer in the office.\nGet Grammarly\nFinger Definition\nIf you look up finger on OxfordDictionaries.com, you will find this\ndefinition: “Each of the four slender jointed parts attached to either hand. .\n.” Doesn’t this definition seem to exclude the thumb? Why exclude it?\nThumb vs. Finge

In [59]:
def create_model():
     return QuestionAnsweringModel('distilbert', 'distilbert-base-uncased-distilled-squad')
    
def predict_answer(model, question, contexts, seq_len=512, debug=False):
    split_context = []
    
    if not isinstance(contexts, list):
        contexts = [contexts]
    
    for context in contexts:
        for i in range(0, len(context), seq_len):
            split_context.append(context[i:i+seq_len])
    
    f_data = []
    
    for i, c in enumerate(split_context):
        f_data.append(
            {'qas': 
              [{'question': question,
               'id': i,
               'answers': [{'text': ' ', 'answer_start': 0}],
               'is_impossible': False}],
              'context': c
            })
        
    prediction = model.predict(f_data)
    if debug:
        print(prediction)
    preds = [x['answer'].lower() for x in prediction]
    return max(set(preds), key = preds.count)

In [60]:
def q_to_a(model, question, n=2, debug=False):
    context = query_to_text(question, n=n)
    pred = predict_answer(model, question, context, debug=debug)
    return pred

In [5]:
model = create_model()

In [6]:
predict_answer(model, 'what color is the bird?', 'the bird is red.')

100%|██████████| 1/1 [00:00<00:00, 436.23it/s]

Converting to features started.





HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




[{'id': 0, 'answer': 'red'}]

In [32]:
question = 'What is the bone on the back of your skull called?'
context = query_to_text(question, n=3)
pred = predict_answer(model, question, context)
print(pred)

 14%|█▎        | 13/95 [00:00<00:00, 127.41it/s]

Converting to features started.


100%|██████████| 95/95 [00:00<00:00, 150.34it/s]


HBox(children=(IntProgress(value=0, max=12), HTML(value='')))


occipital bone


In [35]:
q_to_a(model, 'What is the bone on the back of your skull called?')

 33%|███▎      | 15/45 [00:00<00:00, 143.47it/s]

Converting to features started.


100%|██████████| 45/45 [00:00<00:00, 131.05it/s]


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




'occipital bone'

In [45]:
model = create_model()

In [54]:
text = widgets.Text(description='Question:', width=300)
display(text)

button = widgets.Button(description='Get an Answer')
display(button)

def on_button_click(b)
    answer = q_to_a(model, text.value)
    print('Answer:', answer)
    
button.on_click(on_button_click)

Text(value='', description='Question:')

Button(description='Get an Answer', style=ButtonStyle())

How many fingers do humans have?


 28%|██▊       | 15/53 [00:00<00:00, 144.69it/s]

Converting to features started.


100%|██████████| 53/53 [00:00<00:00, 141.50it/s]


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


Answer: 


In [57]:
q_to_a(model, 'How many fingers do humans have?')

 32%|███▏      | 17/53 [00:00<00:00, 167.23it/s]

Converting to features started.


100%|██████████| 53/53 [00:00<00:00, 150.85it/s]


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




''

In [62]:
answer = q_to_a(model, 'How many fingers do humans have?', debug=True)

 32%|███▏      | 17/53 [00:00<00:00, 165.33it/s]

Converting to features started.


100%|██████████| 53/53 [00:00<00:00, 149.82it/s]


HBox(children=(IntProgress(value=0, max=7), HTML(value='')))


[{'id': 0, 'answer': ''}, {'id': 1, 'answer': 'How many'}, {'id': 2, 'answer': 'four'}, {'id': 3, 'answer': 'four'}, {'id': 4, 'answer': 'five'}, {'id': 5, 'answer': 'five'}, {'id': 6, 'answer': 'thumb'}, {'id': 7, 'answer': ''}, {'id': 8, 'answer': ''}, {'id': 9, 'answer': '356'}, {'id': 10, 'answer': ''}, {'id': 11, 'answer': '443cf72c056c479de112086ea9ccadf9-235x124.jpeg'}, {'id': 12, 'answer': ''}, {'id': 13, 'answer': '2019'}, {'id': 14, 'answer': ''}, {'id': 15, 'answer': 'five'}, {'id': 16, 'answer': 'five'}, {'id': 17, 'answer': 'five'}, {'id': 18, 'answer': 'four'}, {'id': 19, 'answer': ''}, {'id': 20, 'answer': 'two'}, {'id': 21, 'answer': 'Fingers do not contain muscles'}, {'id': 22, 'answer': 'two'}, {'id': 23, 'answer': 'human thumb also has other muscles in the thenar group'}, {'id': 24, 'answer': 'two'}, {'id': 25, 'answer': 'small finger'}, {'id': 26, 'answer': '5'}, {'id': 27, 'answer': '6'}, {'id': 28, 'answer': '8'}, {'id': 29, 'answer': 'fingertips.[8] ### Brain re

In [61]:
query_to_text('How many fingers do humans have?', n=2)

TypeError: query_to_text() got an unexpected keyword argument 'debug'