# Practice of Basic NLP pre-processing using spaCy
 - Reference https://ungodly-hour.tistory.com/37

## 1. Install packages

In [1]:
!pip install spacy
!python -m spacy download xx_sent_ud_sm
!python -m spacy download en_core_web_md 
import spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## 2. Tokenization

In [10]:
nlp = spacy.load('en_core_web_md')
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'
doc = nlp(text)
tokenized = list(doc)
print(tokenized)

[I, have, studied, at, Ecole, Normal, Superieure, (, ENS, ), In, Paris, ,, Leipzig, University, in, Germany, ,, and, KAIST, in, Korea, during, my, Ph, ., D, and, Master, .]


## 3. POS tagging and Dependency

In [23]:
nlp = spacy.load('en_core_web_md')
text = 'text = "Sejong of Joseon (15 May 1397 – 8 April 1450), personal name Yi Do (Korean: 이도; Hanja: 李祹), widely known as Sejong the Great (Korean: 세종대왕; Hanja: 世宗大王), was the fourth ruler of the Joseon dynasty of Korea."'
doc = nlp(text)

str_format = "{:>10}"*8
print(str_format.format('Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is alpha', 'is stop'))
print("=="*40)

for token in doc:
    print(str_format.format(token.text, token.lemma_, token.pos_, token.tag_, 
                            token.dep_, token.shape_, str(token.is_alpha), str(token.is_stop)))

      Text     Lemma       POS       Tag       Dep     Shape  is alpha   is stop
      text      text      NOUN        NN     nsubj      xxxx      True     False
         =         =     PUNCT         .     punct         =     False     False
         "         "     PUNCT        ``     punct         "     False     False
    Sejong    Sejong     PROPN       NNP     appos     Xxxxx      True     False
        of        of       ADP        IN      prep        xx      True      True
    Joseon    Joseon     PROPN       NNP      pobj     Xxxxx      True     False
         (         (     PUNCT     -LRB-     punct         (     False     False
        15        15       NUM        CD    nummod        dd     False     False
       May       May     PROPN       NNP     appos       Xxx      True      True
      1397      1397       NUM        CD     appos      dddd     False     False
         –         –     PUNCT         :     punct         –     False     False
         8         8       N

## 3. Stop words

In [12]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(stopwords)

{'ours', 'namely', 'his', 'which', 'hence', 'never', 'however', 'several', 'she', 'serious', 'could', 'hereupon', 'sixty', '’d', 'ourselves', 'at', 'in', '‘ll', 'too', 'four', "'ve", 'therein', 'throughout', 'itself', 'would', 'always', 'n’t', 'rather', 'across', 'other', 'some', 'he', 'has', 'almost', 'whereafter', 'thereupon', 'they', 'fifteen', '‘re', 'thereby', 'behind', 'many', 'had', 'three', 'perhaps', 'hundred', 'others', 'somehow', '’m', 'so', 'any', 'over', 'without', 'mostly', 'yours', 'name', 'why', 'front', 'whether', 'unless', 'else', 'by', 'become', 'due', 'with', 'six', 'not', 'this', 'do', 'themselves', 'what', 'otherwise', '’ll', 'few', 'sometime', 'who', 'all', 'the', 'really', 'beside', 'and', 'enough', 'where', 'beforehand', 'off', 'formerly', 'we', 'their', 'amount', 'for', 'than', 'twenty', 'using', 'among', 'nowhere', 'put', 'even', 'none', 'twelve', 'cannot', 'alone', 'ever', 'everyone', 'eleven', 'a', 'whatever', 'you', 'down', 'sometimes', 'already', 'on', 't

In [13]:
nlp = spacy.load('en_core_web_md')
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'
doc = nlp(text)

filtered = []
for word in doc:
    if not word.is_stop:
        filtered.append(word)

print(filtered)

[studied, Ecole, Normal, Superieure, (, ENS, ), Paris, ,, Leipzig, University, Germany, ,, KAIST, Korea, Ph, ., D, Master, .]


## 4. Dependency Parsing
 - nlp() 메서드 결과물에 noun_chunks() 메소드를 호출하면, 자동으로 dependecy graph를 고려하여 noun phrase를 뽑아준다. (return 값은 Span 클래스의 generator)

In [14]:
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'
doc = nlp(text)

noun_chunks = doc.noun_chunks

print("=="*40)
str_format = "{:>30}{:>15}{:>15}{:>20}"
print(str_format.format('Text', 'Root Text', 'Root Dep', 'Root Head Text'))
print("=="*40)

for chunk in doc.noun_chunks:
    print(str_format.format(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text))

                          Text      Root Text       Root Dep      Root Head Text
                             I              I          nsubj             studied
       Ecole Normal Superieure     Superieure           pobj                  at
                          (ENS            ENS          appos          Superieure
                         Paris          Paris           pobj                  In
                       Germany        Germany           pobj                  in
                         KAIST          KAIST           conj          University
                         Korea          Korea           pobj                  in
                       my Ph.D              D           pobj              during
                        Master         Master           conj                   D


# 5. Navigating Parse Tree

In [16]:
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'
doc = nlp(text)

for tok in doc:
    print(tok.text)
    children = list(tok.children)
    print('children:', children, 'head:', tok.head if tok.head != tok else "!this is root node")
    print("=="*28)

I
children: [] head: studied
have
children: [] head: studied
studied
children: [I, have, at] head: !this is root node
at
children: [Superieure] head: studied
Ecole
children: [] head: Superieure
Normal
children: [] head: Superieure
Superieure
children: [Ecole, Normal, ENS, )] head: at
(
children: [] head: ENS
ENS
children: [(] head: Superieure
)
children: [] head: Superieure
In
children: [Paris] head: University
Paris
children: [] head: In
,
children: [] head: University
Leipzig
children: [] head: University
University
children: [In, ,, Leipzig, in, ,, and, KAIST, .] head: !this is root node
in
children: [Germany] head: University
Germany
children: [] head: in
,
children: [] head: University
and
children: [] head: University
KAIST
children: [in, during] head: University
in
children: [Korea] head: KAIST
Korea
children: [] head: in
during
children: [D] head: KAIST
my
children: [] head: D
Ph
children: [] head: D
.
children: [] head: D
D
children: [my, Ph, ., and, Master] head: during
and
c

## 6. Named Entity Recognition

In [22]:
text = "Sejong of Joseon (15 May 1397 – 8 April 1450), personal name Yi Do (Korean: 이도; Hanja: 李祹), widely known as Sejong the Great (Korean: 세종대왕; Hanja: 世宗大王), was the fourth ruler of the Joseon dynasty of Korea."
doc = nlp(text)

print("="*40)
str_format = "{:>20}"*2
print(str_format.format('Text', 'NER'))
print("="*40)
for ent in doc.ents:
    print(str_format.format(ent.text, ent.label_))

                Text                 NER
              Sejong              PERSON
              Joseon                 GPE
         15 May 1397                DATE
        8 April 1450                DATE
                  Yi              PERSON
              Korean                NORP
               Hanja              PERSON
    Sejong the Great              PERSON
              Korean                NORP
               Hanja              PERSON
              fourth             ORDINAL
  the Joseon dynasty                DATE
               Korea                 GPE
