# Practice of Basic NLP pre-processing using spaCy
 - Reference https://ungodly-hour.tistory.com/37

# 0. Setting

##### 0) 패키지 설치

In [2]:
!pip install spacy
!python -m spacy download xx_sent_ud_sm
!python -m spacy download en_core_web_md

Collecting spacy
  Downloading spacy-3.5.1-cp39-cp39-win_amd64.whl (12.2 MB)
     --------------------------------------- 12.2/12.2 MB 46.7 MB/s eta 0:00:00
Collecting wasabi<1.2.0,>=0.9.1
  Downloading wasabi-1.1.1-py3-none-any.whl (27 kB)
Collecting typer<0.8.0,>=0.3.0
  Downloading typer-0.7.0-py3-none-any.whl (38 kB)
Collecting catalogue<2.1.0,>=2.0.6
  Downloading catalogue-2.0.8-py3-none-any.whl (17 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0
  Downloading spacy_loggers-1.0.4-py3-none-any.whl (11 kB)
Collecting cymem<2.1.0,>=2.0.2
  Downloading cymem-2.0.7-cp39-cp39-win_amd64.whl (30 kB)
Collecting thinc<8.2.0,>=8.1.8
  Downloading thinc-8.1.9-cp39-cp39-win_amd64.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 18.8 MB/s eta 0:00:00
Collecting langcodes<4.0.0,>=3.2.0
  Using cached langcodes-3.3.0-py3-none-any.whl (181 kB)
Collecting pathy>=0.10.0
  Downloading pathy-0.10.1-py3-none-any.whl (48 kB)
     ---------------------------------------- 48.9/48.9 kB ? e

# 1. Tokenization

In [3]:
# 0) 라이브러리 불러오기
import spacy

# 1) 데이터 불러오기
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'

# 2) Tokenization 모델 불러오기
nlp = spacy.load('en_core_web_md') # !!!
# 3) Tokenization 수행
doc = nlp(text)
tokenized = list(doc)

# 4) 결과 출력
print(tokenized)

[I, have, studied, at, Ecole, Normal, Superieure, (, ENS, ), In, Paris, ,, Leipzig, University, in, Germany, ,, and, KAIST, in, Korea, during, my, Ph, ., D, and, Master, .]


# 2. POS tagging and Dependency

In [4]:
# 0) 라이브러리 불러오기
import spacy

# 1) 데이터 불러오기
text = 'text = "Sejong of Joseon (15 May 1397 – 8 April 1450), personal name Yi Do (Korean: 이도; Hanja: 李祹), widely known as Sejong the Great (Korean: 세종대왕; Hanja: 世宗大王), was the fourth ruler of the Joseon dynasty of Korea."'

# 2) Tokenization 모델 불러오기
nlp = spacy.load('en_core_web_md')
# 3) Tokenization 수행
doc = nlp(text)

# 4) POS Tagging
str_format = "{:>10}"*8 # !!!
print(str_format.format('Text', 'Lemma', 'POS', 'Tag', 'Dep', 'Shape', 'is alpha', 'is stop')) # !!!
print("=="*40)

for token in doc: # !!!
    print(str_format.format(token.text, token.lemma_, token.pos_, token.tag_, # !!!
                            token.dep_, token.shape_, str(token.is_alpha), str(token.is_stop)))

      Text     Lemma       POS       Tag       Dep     Shape  is alpha   is stop
      text      text      NOUN        NN      ROOT      xxxx      True     False
         =         =         X        FW     punct         =     False     False
         "         "     PUNCT        ``     punct         "     False     False
    Sejong    Sejong     PROPN       NNP     appos     Xxxxx      True     False
        of        of       ADP        IN      prep        xx      True      True
    Joseon    Joseon     PROPN       NNP      pobj     Xxxxx      True     False
         (         (     PUNCT     -LRB-     punct         (     False     False
        15        15       NUM        CD    nummod        dd     False     False
       May       May     PROPN       NNP     appos       Xxx      True      True
      1397      1397       NUM        CD    nummod      dddd     False     False
         –         –     PUNCT         :     punct         –     False     False
         8         8       N

# 3. Stop words 제거

In [5]:
stopwords = spacy.lang.en.stop_words.STOP_WORDS
print(stopwords)

{'too', 'who', 'next', 'might', 'yourself', 'throughout', 'via', 'since', 'beforehand', 'first', 'something', 'hereafter', 'only', 'itself', 'moreover', 'wherever', 'does', 'again', 'to', 'both', 'used', 'others', '‘re', 'whereafter', 'nine', 'among', 'cannot', 'enough', 'our', 'must', 'n’t', 'which', 'sixty', 'also', 'twenty', 'how', 'every', 'see', 'further', 'except', 'ours', 'go', 'towards', 'ten', '‘s', 'twelve', 'made', 'anyone', 'whose', 'whereupon', "'re", 'whenever', 'you', '’re', 'into', 'though', 'noone', 'serious', 'whither', "'ll", 'please', 'whereby', 'be', 'back', 'seemed', 'ourselves', 'nor', 'whole', 'him', 'top', "'ve", "n't", 'least', 'should', "'m", 'may', 'each', 'herein', 'amongst', '’m', 'thus', 'nevertheless', 'sometime', 'hereby', "'s", 'have', 'hence', 'did', 'anywhere', 'an', 'besides', "'d", 'hers', 'a', 're', 'before', 'own', 'ever', 'they', 'beyond', 'above', 'whatever', 'well', 'otherwise', 'else', 'yet', 'eleven', 'hereupon', 'almost', 'behind', 'and', '

In [6]:
# 0) 라이브러리 불러오기
import spacy

# 1) 데이터 불러오기
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'

# 2) Tokenization 모델 불러오기
nlp = spacy.load('en_core_web_md')
# 3) Tokenization 수행
doc = nlp(text)

# 4) Stop Words 제거
filtered = []
for token in doc:
    if not token.is_stop: # !!!
        filtered.append(token)

# 5) 결과 출력
print(filtered)

[studied, Ecole, Normal, Superieure, (, ENS, ), Paris, ,, Leipzig, University, Germany, ,, KAIST, Korea, Ph, ., D, Master, .]


# 4. Dependency Parsing
 - nlp() 메서드 결과물에 noun_chunks() 메소드를 호출하면, 자동으로 dependecy graph를 고려하여 noun phrase를 뽑아준다. (return 값은 Span 클래스의 generator)

In [7]:
# 0) 라이브러리 불러오기
import spacy

# 1) 데이터 불러오기
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'

# 2) Tokenization 모델 불러오기
nlp = spacy.load('en_core_web_md')
# 3) Tokenization 수행
doc = nlp(text)

# 4) Chunk Dependency Parsing
noun_chunks = doc.noun_chunks

print("=="*40)
str_format = "{:>30}{:>15}{:>15}{:>20}" # !!!
print(str_format.format('Text', 'Root Text', 'Root Dep', 'Root Head Text')) # !!!
print("=="*40)

for chunk in doc.noun_chunks: # !!!
    print(str_format.format(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)) # !!!

                          Text      Root Text       Root Dep      Root Head Text
                             I              I          nsubj             studied
       Ecole Normal Superieure     Superieure           pobj                  at
                          (ENS            ENS          appos          Superieure
                         Paris          Paris           pobj                  In
                       Germany        Germany           pobj                  in
                         KAIST          KAIST           conj          University
                         Korea          Korea           pobj                  in
                       my Ph.D              D           pobj              during
                        Master         Master           conj                   D


# 5. Navigating Parse Tree

In [8]:
# 0) 라이브러리 불러오기
import spacy

# 1) 데이터 불러오기
text = 'I have studied at Ecole Normal Superieure (ENS) In Paris, Leipzig University in Germany, and KAIST in Korea during my Ph.D and Master.'

# 2) Tokenization 모델 불러오기
nlp = spacy.load('en_core_web_md')
# 3) Tokenization 수행
doc = nlp(text)

# 4) Navigating Parse Tree
for token in doc:
    print(token.text) # !!!
    children = list(token.children) # !!!
    print('children:', children, 'head:', token.head if token.head != token else "!this is root node") # !!!
    print("=="*28)

I
children: [] head: studied
have
children: [] head: studied
studied
children: [I, have, at] head: !this is root node
at
children: [Superieure] head: studied
Ecole
children: [] head: Superieure
Normal
children: [] head: Superieure
Superieure
children: [Ecole, Normal, ENS, )] head: at
(
children: [] head: ENS
ENS
children: [(] head: Superieure
)
children: [] head: Superieure
In
children: [Paris] head: University
Paris
children: [] head: In
,
children: [] head: University
Leipzig
children: [] head: University
University
children: [In, ,, Leipzig, in, ,, and, KAIST, .] head: !this is root node
in
children: [Germany] head: University
Germany
children: [] head: in
,
children: [] head: University
and
children: [] head: University
KAIST
children: [in, during] head: University
in
children: [Korea] head: KAIST
Korea
children: [] head: in
during
children: [D] head: KAIST
my
children: [] head: D
Ph
children: [] head: D
.
children: [] head: D
D
children: [my, Ph, ., and, Master] head: during
and
c

## 6. Named Entity Recognition

In [9]:
# 0) 라이브러리 불러오기
import spacy

# 1) 데이터 불러오기
text = "Sejong of Joseon (15 May 1397 – 8 April 1450), personal name Yi Do (Korean: 이도; Hanja: 李祹), widely known as Sejong the Great (Korean: 세종대왕; Hanja: 世宗大王), was the fourth ruler of the Joseon dynasty of Korea."

# 2) Tokenization 모델 불러오기
nlp = spacy.load('en_core_web_md')
# 3) Tokenization 수행
doc = nlp(text)

# 4) Named Entity Recognition
print("="*40)
str_format = "{:>20}"*2
print(str_format.format('Text', 'NER'))
print("="*40)
for ent in doc.ents: # !!!
    print(str_format.format(ent.text, ent.label_)) # !!!

                Text                 NER
              Sejong              PERSON
                  15            CARDINAL
        8 April 1450                DATE
               Yi Do              PERSON
              Korean                NORP
               Hanja              PERSON
              Korean                NORP
               Hanja              PERSON
              fourth             ORDINAL
  the Joseon dynasty                DATE
               Korea                 GPE
