# POS tagging using spacy

In [0]:
import spacy
from spacy.pipeline import SentenceSegmenter

In [0]:
# `create_pipe` is a method of a pipeline object, so `nlp` must exist before
# it is called; on a fresh kernel this cell would otherwise fail with a
# NameError because no earlier cell defines `nlp`.
from spacy.lang.en import English
from spacy.pipeline import Sentencizer

nlp = English()

# Construction via create_pipe
sentencizer = nlp.create_pipe("sentencizer")

# Construction from class (rebinds `sentencizer`; both forms are shown only
# to illustrate the two ways of building the component)
sentencizer = Sentencizer()

In [9]:
from spacy.lang.en import English

# Blank English pipeline with the sentencizer added as a component.
nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

# The text is split over several adjacent literals for readability only;
# they are concatenated into one string before being passed to the pipeline.
doc = nlp(
    "A simple pipeline component, to allow custom sentence boundary "
    "detection logic that doesn’t require the dependency parse. "
    "By default, sentence segmentation is performed by the "
    "DependencyParser, so the Sentencizer lets you implement a simpler, "
    "rule-based strategy that doesn’t require a statistical model to be "
    "loaded. The component is also available via the string name "
    "sentencizer. After initialization, it is typically added to the "
    "processing pipeline using nlp.add_pipe."
)

# Number of sentences found (left as the last expression so the notebook
# displays it)
len([sent for sent in doc.sents])

4

In [10]:
import spacy

# Load the 'en_core_web_sm' model and run it over a single sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp('A simple pipeline component, to allow custom sentence boundary detection logic that doesn’t require the dependency parse.')

# Map each token's text to its part-of-speech tag (`token.pos_`).
# The key is the token text, so a word that occurs more than once keeps only
# the tag of its last occurrence.
pos_tags = {token.text: token.pos_ for token in doc}

print(pos_tags)


{'A': 'DET', 'simple': 'ADJ', 'pipeline': 'NOUN', 'component': 'NOUN', ',': 'PUNCT', 'to': 'PART', 'allow': 'VERB', 'custom': 'NOUN', 'sentence': 'NOUN', 'boundary': 'ADJ', 'detection': 'NOUN', 'logic': 'NOUN', 'that': 'DET', 'does': 'VERB', 'n’t': 'ADV', 'require': 'VERB', 'the': 'DET', 'dependency': 'NOUN', 'parse': 'NOUN', '.': 'PUNCT'}


In [0]:
import nltk

In [0]:
# Toy context-free grammar used by the parsing cells below.
# Unquoted symbols (S, NP, VP, ...) are non-terminals; quoted symbols are the
# words of the vocabulary. `|` separates alternative right-hand sides.

adhoc_grammar = nltk.CFG.fromstring("""
  S  -> NP VP
  NP -> Det Nom | PropN
  Nom -> Adj Nom | N
  VP -> V Adj | V NP | V S | V NP PP
  PP -> P NP
  PropN -> 'Adam' | 'Nicole' | 'Sam'
  Det -> 'the' | 'a'
  N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'| 'cat'|'dog' 
  Adj  -> 'angry' | 'frightened' |  'little' | 'tall'
  V ->  'chased'  | 'saw' | 'said' | 'thought' | 'was' | 'put'| 'rescued'
  P -> 'on'
  """)

In [0]:
# Sentence 1, split on whitespace into a list of word tokens
sentence_1 = 'the angry dog chased the frightened little cat'.split()

# Sentence 2, split on whitespace into a list of word tokens
sentence_2= 'Adam rescued the squirrel'.split()


In [0]:
# Function for parsing

def parse(sentence):
    """Return the first parse tree of ``sentence`` as a bracketed string.

    Parameters
    ----------
    sentence : list of str
        Word tokens to parse with the module-level ``adhoc_grammar``.

    Returns
    -------
    str
        ``str()`` of the first tree produced by the chart parser.

    Raises
    ------
    ValueError
        If the parser produces no tree for ``sentence``. (Previously this
        surfaced as an uninformative ``IndexError`` from indexing an empty
        list.)
    """
    parser = nltk.ChartParser(adhoc_grammar)

    # Only the first tree is returned, so stop after it instead of collecting
    # every parse into a list first.
    first_tree = next(iter(parser.parse(sentence)), None)
    if first_tree is None:
        raise ValueError(
            "No parse tree found for sentence: {!r}".format(sentence)
        )

    return str(first_tree)



In [20]:
# Parse sentence 2, then show its tokens followed by the resulting tree
pt2 = parse(sentence_2)
print(sentence_2)
print(pt2)

['Adam', 'rescued', 'the', 'squirrel']
(S
  (NP (PropN Adam))
  (VP (V rescued) (NP (Det the) (Nom (N squirrel)))))


In [17]:
# Parse sentence 1, then show its tokens followed by the resulting tree
pt1 = parse(sentence_1)
print(sentence_1)
print(pt1)

['the', 'angry', 'dog', 'chased', 'the', 'frightened', 'little', 'cat']
(S
  (NP (Det the) (Nom (Adj angry) (Nom (N dog))))
  (VP
    (V chased)
    (NP
      (Det the)
      (Nom (Adj frightened) (Nom (Adj little) (Nom (N cat)))))))


## Context-free grammar

In [21]:
# Toy context-free grammar used by the parsing code in this cell.
# Unquoted symbols (S, NP, VP, ...) are non-terminals; quoted symbols are the
# words of the vocabulary. `|` separates alternative right-hand sides.

adhoc_grammar = nltk.CFG.fromstring("""
  S  -> NP VP
  NP -> Det Nom | PropN
  Nom -> Adj Nom | N
  VP -> V Adj | V NP | V S | V NP PP
  PP -> P NP
  PropN -> 'Adam' | 'Nicole' | 'Sam'
  Det -> 'the' | 'a'
  N -> 'bear' | 'squirrel' | 'tree' | 'fish' | 'log'| 'cat'|'dog' 
  Adj  -> 'angry' | 'frightened' |  'little' | 'tall'
  V ->  'chased'  | 'saw' | 'said' | 'thought' | 'was' | 'put'| 'rescued'
  P -> 'on'
  """)

# Sentence 1, split on whitespace into a list of word tokens
sentence_1 = 'the angry dog chased the frightened little cat'.split()

# Sentence 2, split on whitespace into a list of word tokens
sentence_2= 'Adam rescued the squirrel'.split()


# Function for parsing

def parse(sentence):
    """Return the first parse tree of ``sentence`` as a bracketed string.

    Parameters
    ----------
    sentence : list of str
        Word tokens to parse with the module-level ``adhoc_grammar``.

    Returns
    -------
    str
        ``str()`` of the first tree produced by the chart parser.

    Raises
    ------
    ValueError
        If the parser produces no tree for ``sentence``. (Previously this
        surfaced as an uninformative ``IndexError`` from indexing an empty
        list.)
    """
    parser = nltk.ChartParser(adhoc_grammar)

    # Only the first tree is returned, so stop after it instead of collecting
    # every parse into a list first.
    first_tree = next(iter(parser.parse(sentence)), None)
    if first_tree is None:
        raise ValueError(
            "No parse tree found for sentence: {!r}".format(sentence)
        )

    return str(first_tree)


# Parse sentence 1 and print its tree
pt1 = parse(sentence_1)
print(pt1)

# Parse sentence 2 and print its tree
pt2 = parse(sentence_2)
print(pt2)

# Swap the round brackets of each tree string for square ones, for
# visualization purposes
pt1 = pt1.replace('(', '[').replace(')', ']')
pt2 = pt2.replace('(', '[').replace(')', ']')


(S
  (NP (Det the) (Nom (Adj angry) (Nom (N dog))))
  (VP
    (V chased)
    (NP
      (Det the)
      (Nom (Adj frightened) (Nom (Adj little) (Nom (N cat)))))))
(S
  (NP (PropN Adam))
  (VP (V rescued) (NP (Det the) (Nom (N squirrel)))))


## Statistical parsing using a PCFG

In [22]:
# Imports

from nltk import PCFG
from nltk.probability import DictionaryProbDist
from nltk import nonterminals, Nonterminal, Production
from nltk.corpus import treebank
from nltk import treetransforms
from nltk import induce_pcfg
from nltk.parse import pchart

# Probabilistic CFG: every alternative carries its probability in square
# brackets after the right-hand side.

toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
""")

# All productions (rules) of the grammar
pcfg_prods = toy_pcfg1.productions()

# Pick a single production to inspect; in the recorded output of this cell
# index 10 is `VP -> V NP [0.7]`.
pcfg_prod = pcfg_prods[10]

# Show the production together with its left-hand side, right-hand side and
# probability
print(f'A PCFG production: {pcfg_prod}')
print(f'pcfg_prod.lhs()  => {pcfg_prod.lhs()}')
print(f'pcfg_prod.rhs()  => {pcfg_prod.rhs()}')
print(f'pcfg_prod.prob() => {pcfg_prod.prob()}')

# Take all productions whose left-hand side is N
n_productions = toy_pcfg1.productions(Nonterminal('N'))

# Map each right-hand side to its probability.
# The mapping used to be stored in a variable called `dict`, which replaced
# the builtin `dict` for every cell run afterwards in the same kernel; it now
# has its own name, and the comprehension keeps the loop variable out of the
# notebook namespace.
n_rhs_probs = {pr.rhs(): pr.prob() for pr in n_productions}
n_probDist = DictionaryProbDist(n_rhs_probs)

# Draw three random samples according to those probabilities; the printed
# values may differ from run to run.

print(n_probDist.generate())

print(n_probDist.generate())

print(n_probDist.generate())

A PCFG production: VP -> V NP [0.7]
pcfg_prod.lhs()  => VP
pcfg_prod.rhs()  => (V, NP)
pcfg_prod.prob() => 0.7
('telescope',)
('man',)
('man',)
