Skip to content

Commit

Permalink
Add ability to fetch examples
Browse files Browse the repository at this point in the history
Co-authored-by: Gabriel Benhur Schuck <gabrielschuck.tecladista@gmail.com>
  • Loading branch information
felipemfp and gabrielschuck committed Apr 23, 2019
1 parent 62856ce commit ddb538f
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 9 deletions.
37 changes: 29 additions & 8 deletions dicio/dicio.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,19 @@
# Start marker / end tag pair for the "extra" information section.
# NOTE(review): presumably consumed by scrape_extra -- confirm against its body.
TAG_EXTRA = ('class="adicional"', '</p>')
# Tag name assumed to separate individual entries inside the extra section.
TAG_EXTRA_SEP = 'br'
# Opening / closing tags assumed to wrap the label of each extra entry.
TAG_EXTRA_DELIMITER = ('<b>', '</b>')
# Marker checked by search(): when the page contains the first element,
# the lookup is reported as not found (search returns None).
TAG_ENCHANT = ('id="enchant"', '</div>')
# Opening marker and closing tag of one example block; used by
# scrape_examples to locate every example on the page.
TAG_PHRASE_DELIMITER = ('<div class="frase"', '</div>')


class Word(object):

def __init__(self, word, meaning=None, synonyms=[], extra={}):
def __init__(self, word, meaning=None, synonyms=None, examples=None, extra=None):
    """
    Build a Word from its textual form and optional scraped data.

    The word is stripped and lower-cased; the lookup URL is derived from
    BASE_URL and the accent-free form of the word.

    :param word: the word itself (string).
    :param meaning: meaning of the word, or None when not loaded yet.
    :param synonyms: list of synonyms; a new empty list when omitted.
    :param examples: list of example phrases; a new empty list when omitted.
    :param extra: dict of extra information; a new empty dict when omitted.
    """
    self.word = word.strip().lower()
    self.url = BASE_URL.format(Utils.remove_accents(self.word))
    self.meaning = meaning
    # Defaults are created per instance. Mutable default arguments would be
    # shared by every Word built without them, so appending to one word's
    # synonyms/examples/extra would leak into all the others.
    self.synonyms = [] if synonyms is None else synonyms
    self.extra = {} if extra is None else extra
    self.examples = [] if examples is None else examples

def load(self, dicio=None, get=urlopen):
if dicio:
Expand All @@ -38,6 +41,7 @@ def load(self, dicio=None, get=urlopen):
self.meaning = found.meaning
self.synonyms = found.synonyms
self.extra = found.extra
self.examples = found.examples

def __repr__(self):
return 'Word({!r})'.format(self.word)
Expand Down Expand Up @@ -70,13 +74,15 @@ def search(self, word):
except:
return None

found = Word(word)

found.meaning = self.scrape_meaning(page)
found.synonyms = self.scrape_synonyms(page)
found.extra = self.scrape_extra(page)
if page.find(TAG_ENCHANT[0]) > -1:
return None

return found
return Word(word,
meaning=self.scrape_meaning(page),
synonyms=self.scrape_synonyms(page),
examples=self.scrape_examples(page),
extra=self.scrape_extra(page),
)

def scrape_meaning(self, page):
"""
Expand Down Expand Up @@ -109,6 +115,21 @@ def first_synonym(self, html):
_html = _html.replace(TAG_SYNONYMS_DELIMITER[1], "", 1)
return Word(synonym), _html

def scrape_examples(self, page):
    """
    Return a list with every example phrase found in the page.

    The page is scanned for each occurrence of the opening marker in
    TAG_PHRASE_DELIMITER. For every occurrence, the HTML between the
    delimiters is extracted and passed through Utils.remove_tags and
    Utils.remove_spaces before being collected.

    :param page: HTML of the page as a string.
    :return: list of example strings (empty when no marker is present).
    """
    opening_marker = TAG_PHRASE_DELIMITER[0]
    collected = []
    remaining = page
    position = remaining.find(opening_marker)
    while position != -1:
        raw_example = Utils.text_between(
            remaining, *TAG_PHRASE_DELIMITER, force_html=True)
        cleaned = Utils.remove_spaces(Utils.remove_tags(raw_example))
        collected.append(cleaned)
        # Drop everything up to and including the marker just handled so
        # the next search finds the following example block.
        remaining = remaining[position + len(opening_marker):]
        position = remaining.find(opening_marker)
    return collected


def scrape_extra(self, page):
"""
Return a dictionary of extra information.
Expand All @@ -125,4 +146,4 @@ def scrape_extra(self, page):
dict_extra[key] = value
except:
pass
return dict_extra
return dict_extra
9 changes: 8 additions & 1 deletion tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
],
'word': 'comilão',
'url': 'http://www.dicio.com.br/comilao',
'examples': [
'"Em geral, quem consome carne é um bom comilão , come batata, não gosta muito de peixe e bebe mais. Folha de São Paulo, 11/08/2011',
'O urso mais comilão dos desenhos e seu amigo Catatau ganharam uma versão "live action" (com atores) na tela grande. Folha de São Paulo, 21/01/2011',
'Ringo consegue ainda mais comilão e destrói de tudo na casa Eurides Nascimento, em Matão (SP). Folha de São Paulo, 14/10/2011'
],
'extra': {
'Classe gramatical': 'adjetivo e substantivo masculino',
'Separação das sílabas': 'co-mi-lão',
Expand Down Expand Up @@ -162,6 +167,7 @@ def test_search(self):
expected = Word(comilao['word'])
expected.meaning = comilao['meaning']
expected.synonyms = comilao['synonyms']
expected.examples = comilao['examples']
expected.extra = comilao['extra']

# act
Expand All @@ -173,6 +179,7 @@ def test_search(self):
self.assertEqual(expected.meaning, result.meaning)
self.assertListEqual(list(map(str, expected.synonyms)),
list(map(str, result.synonyms)))
self.assertListEqual(expected.examples, result.examples)
self.assertDictEqual(expected.extra, result.extra)

def test_search_with_invalid_word(self):
Expand All @@ -197,4 +204,4 @@ def test_search_with_not_real_word_or_not_found(self):


if __name__ == '__main__':
unittest.main()
unittest.main()

0 comments on commit ddb538f

Please sign in to comment.