# Syntactic Parsing for English in Action: Using NLTK and ErgApi




In [0]:
!pip install --user -U nltk

Requirement already up-to-date: nltk in /root/.local/lib/python3.6/site-packages (3.5)


In [0]:
import nltk
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> d

Download which package (l=list; x=cancel)?
  Identifier> treebank
    Downloading package treebank to /root/nltk_data...
      Unzipping corpora/treebank.zip.

---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> q


True

## PCFG

In [0]:
from nltk.corpus import treebank
from nltk import PCFG
from nltk.grammar import CFG, Nonterminal
from nltk.parse import pchart
from nltk import induce_pcfg
from nltk.tree import Tree

In [0]:
t1 = treebank.parsed_sents("wsj_0001.mrg")[0]
s = '(S (NP (DT the) (NN cat)) (VP (VBD ate) (NP (DT a) (NN cookie))))'
t2 = Tree.fromstring(s)
#t1.draw()

In [0]:
toy_pcfg1 = PCFG.fromstring("""
    S -> NP VP [1.0]
    NP -> Det N [0.5] | NP PP [0.25] | 'John' [0.1] | 'I' [0.15]
    Det -> 'the' [0.8] | 'my' [0.2]
    N -> 'man' [0.5] | 'telescope' [0.5]
    VP -> VP PP [0.1] | V NP [0.7] | V [0.2]
    V -> 'ate' [0.35] | 'saw' [0.65]
    PP -> P NP [1.0]
    P -> 'with' [0.61] | 'under' [0.39]
""")

In [0]:
pcfg_prods = toy_pcfg1.productions()

pcfg_prod = pcfg_prods[9]
print('A PCFG production:', pcfg_prod)
print('pcfg_prod.lhs()  =>', pcfg_prod.lhs())
print('pcfg_prod.rhs()  =>', pcfg_prod.rhs())
print('pcfg_prod.prob() =>', pcfg_prod.prob())

A PCFG production: VP -> VP PP [0.1]
pcfg_prod.lhs()  => VP
pcfg_prod.rhs()  => (VP, PP)
pcfg_prod.prob() => 0.1


In [0]:
# extract productions from three trees and induce the PCFG
print("Induce PCFG grammar from treebank data:")

productions = []
for item in treebank.fileids()[:1]:
    for tree in treebank.parsed_sents(item):
        # perform optional tree transformations, e.g.:
        tree.collapse_unary(collapsePOS = False)# Remove branches A-B-C into A-B+C
        tree.chomsky_normal_form(horzMarkov = 2)# Remove A->(B,C,D) into A->B,C+D->D
        productions += tree.productions()
print(productions)

Induce PCFG grammar from treebank data:
[S -> NP-SBJ S|<VP-.>, NP-SBJ -> NP NP-SBJ|<,-ADJP>, NP -> NNP NNP, NNP -> 'Pierre', NNP -> 'Vinken', NP-SBJ|<,-ADJP> -> , NP-SBJ|<ADJP-,>, , -> ',', NP-SBJ|<ADJP-,> -> ADJP ,, ADJP -> NP JJ, NP -> CD NNS, CD -> '61', NNS -> 'years', JJ -> 'old', , -> ',', S|<VP-.> -> VP ., VP -> MD VP, MD -> 'will', VP -> VB VP|<NP-PP-CLR>, VB -> 'join', VP|<NP-PP-CLR> -> NP VP|<PP-CLR-NP-TMP>, NP -> DT NN, DT -> 'the', NN -> 'board', VP|<PP-CLR-NP-TMP> -> PP-CLR NP-TMP, PP-CLR -> IN NP, IN -> 'as', NP -> DT NP|<JJ-NN>, DT -> 'a', NP|<JJ-NN> -> JJ NN, JJ -> 'nonexecutive', NN -> 'director', NP-TMP -> NNP CD, NNP -> 'Nov.', CD -> '29', . -> '.', S -> NP-SBJ S|<VP-.>, NP-SBJ -> NNP NNP, NNP -> 'Mr.', NNP -> 'Vinken', S|<VP-.> -> VP ., VP -> VBZ NP-PRD, VBZ -> 'is', NP-PRD -> NP PP, NP -> NN, NN -> 'chairman', PP -> IN NP, IN -> 'of', NP -> NP NP|<,-NP>, NP -> NNP NNP, NNP -> 'Elsevier', NNP -> 'N.V.', NP|<,-NP> -> , NP, , -> ',', NP -> DT NP|<NNP-VBG>, DT -> 'the'

In [0]:
S = Nonterminal('S')
grammar = induce_pcfg(S, productions)
print(grammar)

Grammar with 53 productions (start state = S)
    S -> NP-SBJ S|<VP-.> [1.0]
    NP-SBJ -> NP NP-SBJ|<,-ADJP> [0.5]
    NP -> NNP NNP [0.25]
    NNP -> 'Pierre' [0.125]
    NNP -> 'Vinken' [0.25]
    NP-SBJ|<,-ADJP> -> , NP-SBJ|<ADJP-,> [1.0]
    , -> ',' [1.0]
    NP-SBJ|<ADJP-,> -> ADJP , [1.0]
    ADJP -> NP JJ [1.0]
    NP -> CD NNS [0.125]
    CD -> '61' [0.5]
    NNS -> 'years' [1.0]
    JJ -> 'old' [0.5]
    S|<VP-.> -> VP . [1.0]
    VP -> MD VP [0.333333]
    MD -> 'will' [1.0]
    VP -> VB VP|<NP-PP-CLR> [0.333333]
    VB -> 'join' [1.0]
    VP|<NP-PP-CLR> -> NP VP|<PP-CLR-NP-TMP> [1.0]
    NP -> DT NN [0.125]
    DT -> 'the' [0.666667]
    NN -> 'board' [0.25]
    VP|<PP-CLR-NP-TMP> -> PP-CLR NP-TMP [1.0]
    PP-CLR -> IN NP [1.0]
    IN -> 'as' [0.5]
    NP -> DT NP|<JJ-NN> [0.125]
    DT -> 'a' [0.333333]
    NP|<JJ-NN> -> JJ NN [1.0]
    JJ -> 'nonexecutive' [0.5]
    NN -> 'director' [0.25]
    NP-TMP -> NNP CD [1.0]
    NNP -> 'Nov.' [0.125]
    CD -> '29' [0.5]
    . -

## Parsers

In [0]:
print("Parse sentence using induced grammar:")

parser = pchart.InsideChartParser(grammar)
parser.trace(3)

sent = treebank.parsed_sents('wsj_0001.mrg')[0].leaves()
print(sent)

for parse in parser.parse(sent):
    print(parse)

Parse sentence using induced grammar:
['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']
  |[-] . . . . . . . . . . . . . . . . .| [0:1] 'Pierre' [1.0]
  |. [-] . . . . . . . . . . . . . . . .| [1:2] 'Vinken' [1.0]
  |. . [-] . . . . . . . . . . . . . . .| [2:3] ','  [1.0]
  |. . . [-] . . . . . . . . . . . . . .| [3:4] '61' [1.0]
  |. . . . [-] . . . . . . . . . . . . .| [4:5] 'years' [1.0]
  |. . . . . [-] . . . . . . . . . . . .| [5:6] 'old' [1.0]
  |. . . . . . [-] . . . . . . . . . . .| [6:7] ','  [1.0]
  |. . . . . . . [-] . . . . . . . . . .| [7:8] 'will' [1.0]
  |. . . . . . . . [-] . . . . . . . . .| [8:9] 'join' [1.0]
  |. . . . . . . . . [-] . . . . . . . .| [9:10] 'the' [1.0]
  |. . . . . . . . . . [-] . . . . . . .| [10:11] 'board' [1.0]
  |. . . . . . . . . . . [-] . . . . . .| [11:12] 'as' [1.0]
  |. . . . . . . . . . . . [-] . . . . .| [12:13] 'a' [1.0]
  |. . . . . . . . . . . .

## ERG를 사용하려면
DELPH-IN(Deep Linguistic Processing with HPSG Initiative)은 핵어중심구구조문법(Head-Driven Phrase Structure Grammar, HPSG)과 최소귀환의미론(Minimal Recursion Semantics)와 함께, 언어학적 방법과 통계 처리 방법을 사용해 텍스트나 발화의 **의미**를 추출하는, 자연어의 딥러닝 기반 언어 처리를 목표로 하는 세계의 전산언어학자들의 모임입니다.

1993년 스탠포드 대학교 언어정보연구센터에서 개발을 시작한 English Resource Grammar(ERG)는 규칙기반 기계번역을 목표로 만들어졌지만, 이후 더 많은 영역에 활용될 수 있도록 확장되었습니다. [온라인 데모 페이지](http://erg.delph-in.net/logon)에서 별도의 설치 없이 ERG를 통한 parsing을 살펴볼 수 있습니다.

ERG는 세 가지 방법으로 사용할 수 있습니다.

1. DELPH-IN 배포판 사용하기 (리눅스 필요; full 버전)
1. 독립실행 가능한 ACE, PET 파서 사용하기 (윈도우 및 맥OS 지원)
1. [API](http://moin.delph-in.net/ErgApi) 사용하기


첫 번째 경우 서버를 제공하는 오슬로대학교 통신 상태에 따라 시간도 오래 걸리고, 무엇보다 root 권한이 없어야만 실행이 되는데, Colab에서 돌아가는 코드는 모두 root권한이기에 오류가 발생합니다. 해당 방법은 아래 '참고'페이지를 참고해 로컬에서 돌려 보세요.

두 번째 방법 역시 독립된 실행 파일을 설치하는 방법이기에 Colab에서 실행하기 힘듭니다. '참고'페이지를 참고해 로컬에서 실행해 보세요.

따라서 이 실습에서는 세 번째 방법을 사용합니다.

[참고](http://moin.delph-in.net/ErgProcessing)

In [0]:
!pip install pydelphin

Collecting pydelphin
[?25l  Downloading https://files.pythonhosted.org/packages/e5/79/0e230983438e96e3c58db898d94a7b94e31b860bd0b379679df48b96e5c9/PyDelphin-1.2.4-py3-none-any.whl (172kB)
[K     |██                              | 10kB 15.6MB/s eta 0:00:01[K     |███▉                            | 20kB 1.6MB/s eta 0:00:01[K     |█████▊                          | 30kB 2.3MB/s eta 0:00:01[K     |███████▋                        | 40kB 1.6MB/s eta 0:00:01[K     |█████████▌                      | 51kB 2.0MB/s eta 0:00:01[K     |███████████▍                    | 61kB 2.4MB/s eta 0:00:01[K     |█████████████▎                  | 71kB 2.8MB/s eta 0:00:01[K     |███████████████▏                | 81kB 3.0MB/s eta 0:00:01[K     |█████████████████               | 92kB 3.4MB/s eta 0:00:01[K     |███████████████████             | 102kB 2.8MB/s eta 0:00:01[K     |█████████████████████           | 112kB 2.8MB/s eta 0:00:01[K     |██████████████████████▉         | 122kB 2.8MB/s et

[PyDelphin](http://moin.delph-in.net/PyDelphinTop?action=show&redirect=PyDelphin)은 DELPH-IN representation을 위한 파이썬 라이브러리로, 우리가 사용할 API의 클라이언트 역할 또한 합니다.

In [0]:
import json
from delphin.web import client
url = "http://erg.delph-in.net/rest/0.9/"
r = client.parse('Dogs chase cats.', params={"mrs":"json"}, server=url)
result = r.result(0)['mrs']
print(json.dumps(result, indent=4, sort_keys=False))

{
    "top": "h1",
    "index": "e3",
    "relations": [
        {
            "label": "h4",
            "predicate": "udef_q",
            "lnk": {
                "from": 0,
                "to": 4
            },
            "arguments": {
                "ARG0": "x6",
                "RSTR": "h5",
                "BODY": "h7"
            }
        },
        {
            "label": "h8",
            "predicate": "_dog_n_1",
            "lnk": {
                "from": 0,
                "to": 4
            },
            "arguments": {
                "ARG0": "x6"
            }
        },
        {
            "label": "h2",
            "predicate": "_chase_v_1",
            "lnk": {
                "from": 5,
                "to": 10
            },
            "arguments": {
                "ARG0": "e3",
                "ARG1": "x6",
                "ARG2": "x9"
            }
        },
        {
            "label": "h10",
            "predicate": "udef_q",
            "lnk": {
  

Param을 조절해 다른 의미 표상 방법을 사용할 수 있습니다.

In [0]:
import json
from delphin.web import client
url = "http://erg.delph-in.net/rest/0.9/"
r = client.parse('Dogs chase cats.', params={"eds":"json"}, server=url)
result = r.result(0)['eds']
print(json.dumps(result, indent=4, sort_keys=False))

{
    "top": "e3",
    "nodes": {
        "_1": {
            "label": "udef_q",
            "lnk": {
                "from": 0,
                "to": 4
            },
            "edges": {
                "BV": "x6"
            }
        },
        "x6": {
            "label": "_dog_n_1",
            "lnk": {
                "from": 0,
                "to": 4
            },
            "type": "x",
            "properties": {
                "PERS": "3",
                "NUM": "pl",
                "IND": "+"
            },
            "edges": {}
        },
        "e3": {
            "label": "_chase_v_1",
            "lnk": {
                "from": 5,
                "to": 10
            },
            "type": "e",
            "properties": {
                "SF": "prop",
                "TENSE": "pres",
                "MOOD": "indicative",
                "PROG": "-",
                "PERF": "-"
            },
            "edges": {
                "ARG1": "x6",
               

MRS에 관한 학습을 원하시면 [Copestake, et al. (2005)](http://lingo.stanford.edu/sag/papers/copestake.pdf)를 참조하세요!

## 의존관계 알아보기

In [0]:
from delphin.web import client
url = "http://erg.delph-in.net/rest/0.9/"
r = client.parse("Is it true that John loves Mary?", params={"dm":"sdp"}, server=url)
dm = r.result(0)["dm"]
print(dm)

#15951
1	Is	is	VBZ	-	-	_	_	_
2	it	it	PRP	-	-	_	_	_
3	true	true	JJ	+	+	a_of:e-h	_	_
4	that	that	IN	-	-	_	_	_
5	John	John	NNP	-	-	named:x-c	_	ARG1
6	loves	love	VBZ	-	+	v:e-i-p	ARG1	_
7	Mary	Mary	NNP	-	-	named:x-c	_	ARG2
8	?	_	.	-	-	_	_	_


## 활용 예제 : 핵어 추출하기

In [0]:
def getHead(sentence):
  from delphin.web import client
  url = "http://erg.delph-in.net/rest/0.9/"
  r = client.parse(sentence, params={"eds":"json"}, server=url)
  eds = r.result(0)["eds"]
  head = eds["nodes"][eds["top"]]["label"]
  return head

In [0]:
print(getHead("I am a boy."))
print(getHead("I love natural language processing."))
print(getHead("Your Sentence Here"))

_be_v_id
_love_v_1
unknown


## 의미 표상을 논문에 사용하기 : LaTeX 활용

In [0]:
from delphin import dmrs
from delphin.web import client
url = "http://erg.delph-in.net/rest/0.9/"
r = client.parse("John loves Mary as much as Mark loves her.", params={"dm":"latex"}, server=url)

In [0]:
dm = r.result(0)["dm"]
dm = dm.replace("amp", "&")

In [0]:
print(dm)

\begin{dependency}[edge above, edge slant=0.15ex, edge unit distance=2ex]
  \begin{deptext}[column sep=1ex]
    John \& loves \& Mary \& as \& much \& as \& Mark \& loves \& her \& .\\
\end{deptext}
\deproot{2}{top}
\depedge{2}{3}{ARG2}
\depedge{2}{1}{ARG1}
\depedge{5}{8}{than}
\depedge{5}{2}{ARG1}
\depedge{8}{9}{ARG2}
\depedge{8}{7}{ARG1}
\end{dependency}
