In [0]:
import nltk
text = nltk.word_tokenize("We are going to the party")
nltk.pos_tag(text)

[('We', 'PRP'),
 ('are', 'VBP'),
 ('going', 'VBG'),
 ('to', 'TO'),
 ('the', 'DT'),
 ('party', 'NN')]

In [0]:
from nltk.parse.generate import generate, demo_grammar
from nltk import CFG

In [0]:
print(demo_grammar)


  S -> NP VP
  NP -> Det N
  PP -> P NP
  VP -> 'slept' | 'saw' NP | 'walked' PP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'dog'
  P -> 'in' | 'with'



In [0]:
from __future__ import print_function

import itertools
import sys
from nltk.grammar import Nonterminal


def generate(grammar, start=None, depth=None, n=None):
    """
    Generates an iterator of all sentences from a CFG.

    :param grammar: The Grammar used to generate sentences.
    :param start: The Nonterminal from which to start generate sentences.
    :param depth: The maximal depth of the generated tree.
    :param n: The maximum number of sentences to return.
    :return: An iterator of lists of terminal tokens.
    """
    if not start:
        start = grammar.start()
    if depth is None:
        depth = sys.maxsize

    iter = _generate_all(grammar, [start], depth)

    if n:
        iter = itertools.islice(iter, n)

    return iter



def _generate_all(grammar, items, depth):
    if items:
        try:
            for frag1 in _generate_one(grammar, items[0], depth):
                for frag2 in _generate_all(grammar, items[1:], depth):
                    yield frag1 + frag2
        except RuntimeError as _error:
            if _error.message == "maximum recursion depth exceeded":
                # Helpful error message while still showing the recursion stack.
                raise RuntimeError("The grammar has rule(s) that yield infinite recursion!!")
            else:
                raise
    else:
        yield []


def _generate_one(grammar, item, depth):
    if depth > 0:
        if isinstance(item, Nonterminal):
            for prod in grammar.productions(lhs=item):
                for frag in _generate_all(grammar, prod.rhs(), depth-1):
                    yield frag
        else:
            yield [item]

demo_grammar = """
  S -> NP VP
  NP -> Det N
  PP -> P NP
  VP -> 'slept' | 'saw' NP | 'walked' PP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'dog'
  P -> 'in' | 'with'
"""


def demo(N=23):
    from nltk.grammar import CFG

    print('Generating the first %d sentences for demo grammar:' % (N,))
    print(demo_grammar)
    grammar = CFG.fromstring(demo_grammar)
    for n, sent in enumerate(generate(grammar, n=N), 1):
        print('%3d. %s' % (n, ' '.join(sent)))



if __name__ == '__main__':
    demo()

Generating the first 23 sentences for demo grammar:

  S -> NP VP
  NP -> Det N
  PP -> P NP
  VP -> 'slept' | 'saw' NP | 'walked' PP
  Det -> 'the' | 'a'
  N -> 'man' | 'park' | 'dog'
  P -> 'in' | 'with'

  1. the man slept
  2. the man saw the man
  3. the man saw the park
  4. the man saw the dog
  5. the man saw a man
  6. the man saw a park
  7. the man saw a dog
  8. the man walked in the man
  9. the man walked in the park
 10. the man walked in the dog
 11. the man walked in a man
 12. the man walked in a park
 13. the man walked in a dog
 14. the man walked with the man
 15. the man walked with the park
 16. the man walked with the dog
 17. the man walked with a man
 18. the man walked with a park
 19. the man walked with a dog
 20. the park slept
 21. the park saw the man
 22. the park saw the park
 23. the park saw the dog


In [0]:
import nltk
from nltk.corpus import brown
prepchoices = nltk.ConditionalFreqDist((v[0], p[0]) 
    for (v, p) in nltk.bigrams(brown.tagged_words(tagset="universal")) 
        if v[1] == "VERB" and p[1] == "ADP") 

In [0]:
prepchoices["writing"]

FreqDist({'in': 5, 'at': 3, 'from': 3, 'to': 2, 'on': 1, 'for': 1, 'about': 1, 'since': 1, 'under': 1, 'with': 1})

In [0]:
grammar = {}
grammar["sitting"] = {}
grammar["sitting"]["table"] = "on"
grammar["sitting"]["van"] = "in"

In [0]:
print(grammar)

{'sitting': {'van': 'in', 'table': 'on'}}


In [0]:
import spacy
nlp = spacy.load('en')
sent = "when the bell rang, saurav went out"
doc=nlp(sent)

sub_toks = [tok for tok in doc if (tok.dep_ == "nsubj") ]

print(sub_toks) 

[saurav]


In [0]:
from __future__ import unicode_literals, print_function

raw_text = 'Hello, world. Here are two sentences.'
nlp = spacy.load('en')
doc = nlp(raw_text)
sentences = [sent.string.strip() for sent in doc.sents]
print(sentences)

['Hello, world.', 'Here are two sentences.']


In [0]:
import spacy
nlp = spacy.load('en')

doc = nlp(u"the shop is closed.")

for token in doc:
    print(token, token.lemma, token.lemma_)

the 7425985699627899538 the
shop 15809682053778148938 shop
is 10382539506755952630 be
closed 16417442958758597567 close
. 12646065887601541794 .


In [0]:
def noun_chunks(doc, drop_determiners=True, min_freq=1):
    """
    Extract an ordered sequence of noun chunks from a spacy-parsed doc, optionally
    filtering by frequency and dropping leading determiners.
    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        drop_determiners (bool): remove leading determiners (e.g. "the")
            from phrases (e.g. "the quick brown fox" => "quick brown fox")
        min_freq (int): remove chunks that occur in ``doc`` fewer than
            ``min_freq`` times
    Yields:
        ``spacy.Span``: the next noun chunk from ``doc`` in order of appearance
        in the document
    """
    if hasattr(doc, 'spacy_doc'):
        ncs = doc.spacy_doc.noun_chunks
    else:
        ncs = doc.noun_chunks
    if drop_determiners is True:
        ncs = (nc if nc[0].pos != DET else nc[1:]
               for nc in ncs)
    if min_freq > 1:
        ncs = list(ncs)
        freqs = itertoolz.frequencies(nc.lower_ for nc in ncs)
        ncs = (nc for nc in ncs
               if freqs[nc.lower_] >= min_freq)

    for nc in ncs:
        yield nc

In [0]:
noun_chunks("the boy")

<generator object noun_chunks at 0x7f1477330a98>

In [0]:
import spacy
from nltk import Tree


en_nlp = spacy.load('en')

doc = en_nlp("The quick brown fox jumps over the lazy dog.")

def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(node.orth_, [to_nltk_tree(child) for child in node.children])
    else:
        return node.orth_


[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

        jumps                    
  ________|______________         
 |        |             over     
 |        |              |        
 |       fox            dog      
 |    ____|_____      ___|____    
 .  The quick brown the      lazy



[None]

In [0]:
import spacy
from nltk import Tree


en_nlp = spacy.load('en')

doc = en_nlp("I am walk down the street.")

def tok_format(tok):
    return "_".join([tok.orth_, tok.tag_])


def to_nltk_tree(node):
    if node.n_lefts + node.n_rights > 0:
        return Tree(tok_format(node), [to_nltk_tree(child) for child in node.children])
    else:
        return tok_format(node)


[to_nltk_tree(sent.root).pretty_print() for sent in doc.sents]

      walk_VB              
   ______|____________      
  |      |     |   down_IN 
  |      |     |      |     
  |      |     |  street_NN
  |      |     |      |     
I_PRP  am_VBP ._.   the_DT 



[None]

In [0]:
doc2 = en_nlp("He was swimming in the river")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

       swimming_VBG         
   _________|__________      
  |         |        in_IN  
  |         |          |     
  |         |       river_NN
  |         |          |     
He_PRP   was_VBD     the_DT 



[None]

In [0]:
doc3 = en_nlp("When he got the email, he came to my small office house and started shouting.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

                               came_VBD                                                 
  ________________________________|______________________________________________        
 |    |      |     |           got_VBD                    to_IN                  |      
 |    |      |     |      ________|________                 |                    |       
 |    |      |     |     |        |     email_NN         house_NN           started_VBD 
 |    |      |     |     |        |        |         _______|_________           |       
,_, he_PRP and_CC ._. When_WRB  he_PRP   the_DT  my_PRP$ small_JJ office_NN shouting_VBG

came
to
house


In [0]:
doc3 = en_nlp("I am walking on the road.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

      walking_VBG            
   ________|_____________     
  |        |       |   on_IN 
  |        |       |     |    
  |        |       |  road_NN
  |        |       |     |    
I_PRP    am_VBP   ._.  the_DT

walking
on
road


In [0]:
doc3 = en_nlp("The little boys were playing in the garden")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

         playing_VBG                             
    __________|_____________________________      
   |                    |                 in_IN  
   |                    |                   |     
   |                 boys_NNS           garden_NN
   |           _________|_________          |     
were_VBD    The_DT            little_JJ   the_DT 

playing
in
garden


In [0]:
doc3 = en_nlp("Admist all confusion, Salman was found guilty in the case.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]
for sent in doc3.sents:
    print(sent.root)
    for ch in sent.root.children:
        if(ch.tag_ == "IN"):
            print(ch)
            for sec in ch.children:
                print(sec)

                       found_VBN                                          
  _________________________|__________________________________________     
 |      |         |        |      |                 |               in_IN 
 |      |         |        |      |                 |                 |    
 |      |         |        |      |            confusion_NN        case_NN
 |      |         |        |      |       __________|_________        |    
,_, Salman_NNP was_VBD guilty_JJ ._. Admist_VB              all_DT  the_DT

found
in
case


In [0]:
doc3 = en_nlp("The mother was cooking dinner in the home kitchen and the boys were playing in the garden.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc3.sents]

grammar = {}

def VB_IN_NN(payload):
    if(payload.tag_[:2] != 'VB'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            VB_IN_NN(ch)
    temp = [payload]
    for ch in payload.children:
        if(ch.tag_ == "IN"):
            temp.append(ch)
            for sec in ch.children:
                temp.append(sec)
                if(len(temp) == 3):
                    grammar[payload.text.lower()] = {}
                    grammar[payload.text.lower()][sec.text.lower()] = ch.text.lower()
                return
    

for sent in doc3.sents:
    VB_IN_NN(sent.root)
print(grammar)

                                   cooking_VBG                                                           
    ____________________________________|_______________________________________                          
   |        |       |        |                     |                       playing_VBG                   
   |        |       |        |                     |                  __________|___________________      
   |        |       |        |                   in_IN               |          |         |       in_IN  
   |        |       |        |                     |                 |          |         |         |     
   |        |       |    mother_NN             kitchen_NN            |          |      boys_NNS garden_NN
   |        |       |        |           __________|_________        |          |         |         |     
was_VBD dinner_NN and_CC   The_DT     the_DT              home_NN were_VBD     ._.      the_DT    the_DT 

{'cooking': {'kitchen': 'in'}, 'playing':

In [0]:
import nltk
from nltk.corpus import brown
from nltk.tokenize.moses import MosesDetokenizer
mdetok = MosesDetokenizer()

In [0]:
for sent in brown.sents('cb01')[:20]:
    munged_sentence = ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'")
    print(mdetok.detokenize(munged_sentence.split(), return_str=True))
    print()

Assembly session brought much good

The General Assembly, which adjourns today, has performed in an atmosphere of crisis and struggle from the day it convened.

It was faced immediately with a showdown on the schools, an issue which was met squarely in conjunction with the governor with a decision not to risk abandoning public education.

There followed the historic appropriations and budget fight, in which the General Assembly decided to tackle executive powers.

The final decision went to the executive but a way has been opened for strengthening budgeting procedures and to provide legislators information they need.

Long-range planning of programs and ways to finance them have become musts if the state in the next few years is to avoid crisis-to-crisis government.

This session, for instance, may have insured a financial crisis two years from now.

In all the turmoil, some good legislation was passed.

Some other good bills were lost in the shuffle and await future action.

Certainly

In [0]:
count = 0

for cps in brown.fileids()[:10]:
    
    for sent in brown.sents(cps):
        count += 1
        munged_sentence = ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'")
        doc4 = en_nlp(mdetok.detokenize(munged_sentence.split(), return_str=True))
        #[to_nltk_tree(sent.root).pretty_print() for sent in doc4.sents]
        for sent in doc4.sents:
            VB_IN_NN(sent.root)

print(grammar)
#print(str(len(combos)) + " " + str(count))

{'agreed': {'agenda': 'upon'}, 'send': {'homes': 'to'}, 'propose': {'amount': 'by'}, 'comes': {'walk': 'within'}, 'view': {'conspiracy': 'as'}, 'needed': {'program': 'for'}, 'run': {'nomination': 'for'}, 'rising': {'massachusetts': 'in'}, 'penalized': {'services': 'for'}, 'came': {'ballot': 'on'}, 'credits': {'reduction': 'with'}, 'paying': {'it': 'for'}, 'looks': {'administration': 'to'}, 'alloted': {'municipalities': 'to'}, 'freeze': {'laos': 'in'}, 'warned': {'meeting': 'in'}, 'count': {'aid': 'on'}, 'modernized': {'station': 'with'}, 'arranging': {'graduates': 'for'}, 'boost': {'5,000': 'to'}, 'offer': {'developments': 'in'}, 'saved': {'mitchell': 'for'}, 'passed': {'dissent': 'without'}, 'change': {'insurance': 'to'}, 'back': {'hilt': 'to'}, 'attend': {'portland': 'in'}, 'encourage': {'nations': 'in'}, 'examined': {'session': 'at'}, 'authorized': {'session': 'at'}, 'launched': {'be': 'into'}, 'feeling': {'even': 'for'}, 'support': {'efforts': 'in'}, 'reported': {'day': 'after'}, '

In [0]:
grammar["joined"]["1925"]

'in'

In [0]:
import numpy as np
np.save('correctly.npy', grammar)

In [0]:
temp_grammar = np.load('correctly.npy').item()

In [0]:
def VB_IN_NN_correction(payload, raw_text, master_dictionary):
	if(payload.tag_[:2] != 'VB'):
		return
	for ch in payload.children:
		if(ch.tag_[:2] == 'VB'):
			VB_IN_NN(ch)
	temp = [payload]
	for ch in payload.children:
		if(ch.tag_ == "IN"):
			temp.append(ch)
			for sec in ch.children:
				temp.append(sec)
				if(len(temp) == 3):
					try:
						correct_prep = master_dictionary[payload.text.lower()][sec.text.lower()]
						if(correct_prep != ch.text.lower()):
							raw_text = raw_text[:ch.idx] + raw_text[ch.idx:].replace(temp[1].text, correct_prep, 1)
							return raw_text
					except KeyError:
						return raw_text
				return

In [0]:
text = "i was dancing with the park."
doc = en_nlp(text)
for sent in doc.sents:
    text = VB_IN_NN_correction(sent.root, text, grammar)
    print(text)

i was dancing with the park.


In [0]:
from pattern.en import conjugate, lemma, lexeme, INFINITIVE, PRESENT, PAST, PARTICIPLE, FUTURE, SG, PL, INDICATIVE, IMPERATIVE, CONDITIONAL, SUBJUNCTIVE, PROGRESSIVE 

In [0]:
print(conjugate(verb='downloading', tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)) # add aspect=PROGRESSIVE to indicate continuous tense

downloading


In [0]:
doc2 = en_nlp("has ram taken the ball?")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

        taken_VBN            
    ________|____________     
   |        |      |  ball_NN
   |        |      |     |    
has_VBZ  ram_VBN  ?_.  the_DT



[None]

In [0]:
doc2 = en_nlp("ram has been watching tv.")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

               watching_VBG          
   _________________|______________   
ram_NN has_VBZ   been_VBN   tv_NN ._.



[None]

In [0]:
for sent in doc2.sents:
    for comp in sent.root.children:
        if(comp.tag_ == 'VBD'):
            print(comp)

In [0]:
def VB_VB_VB(payload):
    if(payload.tag_[:2] != 'VB'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            VB_VB_VB(ch)
    temp = []
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            temp.append(ch.lower_ + '_' + ch.tag_)
        if(len(temp) == 2):
            temp.append(payload.lower_+ '_' + ch.tag_)
            combos.append(temp)
            temp = []

In [0]:
import spacy
from nltk import Tree

en_nlp = spacy.load('en')
rtext = "Ramu has been travel since early this year."
doc2 = en_nlp(rtext)
combos = []
for sent in doc2.sents:
    rtext = VB_VB_VB_correction(sent.root, rtext)
print(rtext)

Ramu has been travelling since early this year.


In [0]:
def VB_VB_VB_correction(payload, raw_text):
    if(payload.tag_[:2] != 'VB' and payload.tag_[:2] != 'NN'  and payload.tag_[:2] != 'JJ'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'): # this might need to be removed
            VB_VB_VB_correction(ch, raw_text)
    temp = []
    nounBeforeVerb = False
    nounAfterVerb = False
    verbFound = False
    since = False
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            verbFound = True
        if((not verbFound) and (ch.tag_[:2] == 'NN' or ch.tag_[:2] == 'PR')):
            nounBeforeVerb = True
        if(verbFound and (ch.tag_[:2] == 'NN' or ch.tag_[:2] == 'PR')):
            nounAfterVerb = True
        if(ch.lower_ == 'since'):
            since = True
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'):
            # print(ch.idx)
            temp.append(ch.lower_ + '_' + ch.tag_)
        if(len(temp) == 2):
            temp.append(payload.lower_+ '_' + ch.tag_)
            #print(temp)
            if (temp[0][-3:] == 'VBZ' or temp[0][-3:] == 'VBP') and temp[1][-3:] == 'VBN':
                if nounAfterVerb or since:
                    x = conjugate(verb=lemma(temp[2][:-4]), tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)
                elif nounBeforeVerb:
                    x = conjugate(verb=lemma(temp[2][:-4]), tense=PAST+PARTICIPLE, mood=INDICATIVE, person=1, number=PL)
                # print(temp[2][:-4] + ' -> ' + x)
            combos.append(temp)
            # print(nounBeforeVerb)
            raw_text = raw_text[:payload.idx] + raw_text[payload.idx:].replace(temp[2][:-4], x, 1)
            #print(raw_text)
            temp = []
            return raw_text
    return raw_text

In [0]:
def VB_IN_NN(payload):
	if(payload.tag_[:2] != 'VB'):
		return
	for ch in payload.children:
		if(ch.tag_[:2] == 'VB'):
			VB_IN_NN(ch)
	temp = [payload]
	for ch in payload.children:
		if(ch.tag_ == "IN"):
			temp.append(ch)
			for sec in ch.children:
				temp.append(sec)
				if(len(temp) == 3):
					grammar[payload.text.lower()] = {}
					grammar[payload.text.lower()][sec.text.lower()] = ch.text.lower()
				return
            
def VB_IN_NN_correction(payload, raw_text, master_dictionary):
	if(payload.tag_[:2] != 'VB'):
		return
	for ch in payload.children:
		if(ch.tag_[:2] == 'VB'):
			VB_IN_NN_correction(ch, raw_text, master_dictionary)
	temp = [payload]
	for ch in payload.children:
		if(ch.tag_ == "IN"):
			temp.append(ch)
			for sec in ch.children:
				temp.append(sec)
				if(len(temp) == 3):
					try:
						correct_prep = master_dictionary[payload.text.lower()][sec.text.lower()]
						if(correct_prep != ch.text.lower()):
							raw_text = raw_text[:ch.idx] + raw_text[ch.idx:].replace(temp[1].text, correct_prep, 1)
							return raw_text
					except KeyError:
						return raw_text
	return raw_text

In [0]:
def VB_VB_correction(payload, raw_text):
    if(payload.tag_[:2] != 'VB'):
        return
    for ch in payload.children:
        if(ch.tag_[:2] == 'VB'): # this might need to be removed
            VB_VB_VB_correction(ch, raw_text)
            
            if(ch.lower_ == 'has') or (ch.lower_ == 'have') or (ch.lower_ == 'had'):
                x = conjugate(verb=lemma(payload.text), tense=PAST+PARTICIPLE, mood=INDICATIVE, person=1, number=PL)
            else:
                x = conjugate(verb=lemma(payload.text), tense=PRESENT, mood=INDICATIVE, aspect=PROGRESSIVE, person=1, number=PL)
        
            raw_text = raw_text[:payload.idx] + raw_text[payload.idx:].replace(payload.text, x, 1)
            return raw_text
    return raw_text

In [0]:
doc2 = en_nlp("he has doing his homework")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

       doing_VBG            
   ________|__________       
  |        |     homework_NN
  |        |          |      
he_PRP  has_VBZ    his_PRP$ 



[None]

In [0]:
doc2 = en_nlp("he is walking on the road")
[to_nltk_tree(sent.root).pretty_print() for sent in doc2.sents]

       walking_VBG        
   _________|_________     
  |         |       on_IN 
  |         |         |    
  |         |      road_NN
  |         |         |    
he_PRP    is_VBZ    the_DT



[None]

In [0]:
import spacy
from nltk import Tree

en_nlp = spacy.load('en')
rtext = "He has done his homework."
doc2 = en_nlp(rtext)
combos = []
for sent in doc2.sents:
    rtext = VB_VB_correction(sent.root, rtext)
print(rtext)

He has done his homework.
