In [1]:
import nltk
# nltk.download()

# Interesting reading:
https://www.nltk.org/book/ch08.html

# Ambiguity

In [11]:
# PP -> Indica a circunstancia (Sintagma), refere-se ao NP

simple_grammar = nltk.CFG.fromstring("""
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'eu'
VP -> V NP | VP PP
Det -> 'um' | 'meus'
N -> 'elefante' | 'pijamas' | 'cachorro'
V -> 'mordi'
P -> 'em' | 'com' | 'para'
""")

In [3]:
# Para resolver a ambiguidade sem olhar a semantica podemos olhar a frequencia de meus pijamas se refere
sentence = 'Eu mordi um cachorro com meus pijamas'.lower().split()
parser = nltk.ChartParser(simple_grammar)
trees = parser.parse(sentence)
for tree in trees:
    print(tree)

(S
  (NP eu)
  (VP
    (VP (V mordi) (NP (Det um) (N cachorro)))
    (PP (P com) (NP (Det meus) (N pijamas)))))
(S
  (NP eu)
  (VP
    (V mordi)
    (NP
      (Det um)
      (N cachorro)
      (PP (P com) (NP (Det meus) (N pijamas))))))


In [4]:
from nltk.corpus import floresta

In [6]:
floresta.parsed_sents()[3].draw()

In [7]:
productions = {}
for ps in floresta.parsed_sents():
    for prod in ps.productions():
        if prod.lhs() in productions:
            productions[prod.lhs()].add(prod.rhs())
        else:
            productions[prod.lhs()] = set()
            productions[prod.lhs()].add(prod.rhs())

Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered 

Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered by returning a flat parse.
Bad tree detected; trying to recover...
  Recovered 

In [8]:
CFG = []
for L, R in productions.items():
    for tup in R:
        Rs = []
        for item in tup:
            parts = []
            if not isinstance(item, nltk.grammar.Nonterminal):
                parts.append("'{}'".format(str(item)))
            else:
                parts.append(str(item))
        Rs.append(" ".join(parts))
    line = "{} -> {}".format(L, " | ".join(Rs))
    CFG.append(line)

In [13]:
for d in CFG:
    print(d)
    d.replace("+", "+")
    print(d)
    break

PIV+icl -> ACC+icl
PIV+icl -> ACC+icl


In [11]:
rules = nltk.CFG.fromstring(" \n ".join(CFG))

ValueError: Unable to parse line 1: >N+advp -> KOMP<+acl
Expected a nonterminal, found: >N+advp -> KOMP<+acl

# Parsers 

## ChartParser

- Is slow and not memory efficient.
- Can handle left-recursive grammars.
- Always finds all possible syntax trees.
- Is the best parser to start with. Consider other parsers only if this one is not fast enough or it’s not memory efficient enough.

## RecursiveDescentParser¶

- Does top-down parsing.
- Does not work with left-recursive productions (they sent it to an infinite loop).
- Wastes a lot of time, considering words and structures that do not correspond to the input sentence.
- When backtracking, this parser discards parsed constituents that will need to be rebuilt again later.

## ShiftReduceParser¶

- Does bottom-up parsing.
- Does not implement any backtracking, so it is not guaranteed to find a parse for a text, even if one exists. - Furthermore, it will only find at most one parse, even if more parses exist.


In [None]:
parser =  nltk.parse.BottomUpChartParser(rules)

sentence = 'Eu mordi um elefante com meus pijamas'.lower().split()
trees = parser.parse(sentence)
for tree in trees:
    print(tree)

# Extending the lexicon
- use http://nilc.icmc.usp.br/embeddings

In [None]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format('skip_s50.txt')

In [None]:
sentence = 'Eu mordi um elefante com minhas calças'.lower().split()
parser = nltk.ChartParser(simple_grammar)
trees = parser.parse(sentence)
for tree in trees:
    try:
        print(tree)
    except e:
        print(e)