# 字詞前置處理

In [1]:
# Anaconda 已內建
!pip install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl.metadata (2.8 kB)
Collecting click (from nltk)
  Downloading click-8.1.7-py3-none-any.whl.metadata (3.0 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.5.15-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/42.0 kB ? eta -:--:--
     --------- ------------------------------ 10.2/42.0 kB ? eta -:--:--
     --------------------------- ---------- 30.7/42.0 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 42.0/42.0 kB 406.8 kB/s eta 0:00:00
Collecting tqdm (from nltk)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.6 kB ? eta -:--:--
     ---------------------------------------- 57.6/57.6 kB 1.5 MB/s eta 0:00:00
Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ----- ---------------------------------- 0.2/1.5 MB 5.9 

In [1]:
# 載入相關套件
import nltk

In [2]:
nltk.download()  # 會出現選擇視窗,執行過一次就可

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# 透過 fileid 可以找到該語料庫底下的文本檔有哪些
from nltk.corpus import gutenberg

gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [5]:
# 測試文章段落
text = (
    "Today is a great day. It is even better than yesterday."
    + " And yesterday was the best day ever."
)

## 分割字句(斷句)

In [6]:
# 分割字句
nltk.sent_tokenize(text)

['Today is a great day.',
 'It is even better than yesterday.',
 'And yesterday was the best day ever.']

## 分詞

In [7]:
# 分詞
nltk.word_tokenize(text)

['Today',
 'is',
 'a',
 'great',
 'day',
 '.',
 'It',
 'is',
 'even',
 'better',
 'than',
 'yesterday',
 '.',
 'And',
 'yesterday',
 'was',
 'the',
 'best',
 'day',
 'ever',
 '.']

## 詞形還原

In [8]:
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer

tokens = [
    "the",
    "spectators",
    "all",
    "stood",
    "and",
    "sang",
    "the",
    "national",
    "anthem",
    "are",
    "looking",
    "saw",
]

# stemming
port = PorterStemmer()  # 運用現有的規則移除常見單詞字尾
stemmed_port = [port.stem(token) for token in tokens]

lanc = LancasterStemmer()  # 有超過120條規則來刪除或替換詞綴。
stemmed_lanc = [lanc.stem(token) for token in tokens]

snow = SnowballStemmer("english")  # 支援多種語言加以優化辨識
stemmed_snow = [snow.stem(token) for token in tokens]

# showing stemmed results
print("Porter: {}".format(stemmed_port))
print("Lancaster: {}".format(stemmed_lanc))
print("Snowball: {}".format(stemmed_snow))

Porter: ['the', 'spectat', 'all', 'stood', 'and', 'sang', 'the', 'nation', 'anthem', 'are', 'look', 'saw']
Lancaster: ['the', 'spect', 'al', 'stood', 'and', 'sang', 'the', 'nat', 'anthem', 'ar', 'look', 'saw']
Snowball: ['the', 'spectat', 'all', 'stood', 'and', 'sang', 'the', 'nation', 'anthem', 'are', 'look', 'saw']


In [11]:
from nltk.stem import WordNetLemmatizer  # 載入WordNet語料庫

wnl = WordNetLemmatizer()
# lemmatize nouns
print(wnl.lemmatize("cars", "n"))  # 單詞，指定詞性：名詞
print(wnl.lemmatize("men", "n"))

# lemmatize verbs
print(wnl.lemmatize("running", "v"))
print(wnl.lemmatize("ate", "v"))

# lemmatize adjectives
print(wnl.lemmatize("saddest", "a"))
print(wnl.lemmatize("fancier", "a"))

car
men
run
eat
sad
fancy


In [12]:
word = "changes"
ps = nltk.porter.PorterStemmer()
ps.stem(word)

'chang'

In [13]:
# 字根詞形還原(Stemming)
text = (
    "Today is a great day. It is even better than yesterday."
    + " And yesterday was the best day ever."
)
ps = nltk.porter.PorterStemmer()
ps_text = " ".join([ps.stem(word) for word in text.split()])
ps_text

'today is a great day. it is even better than yesterday. and yesterday wa the best day ever.'

In [14]:
# 字根詞形還原(Stemming)X
ps = nltk.porter.PorterStemmer()
" ".join([ps.stem(word) for word in nltk.word_tokenize(text)])

'today is a great day . it is even better than yesterday . and yesterday wa the best day ever .'

In [15]:
# 依字典規則的詞形還原(Lemmatization)
word = "better"  # crashed
lem = nltk.WordNetLemmatizer()
lem.lemmatize(word)

'better'

In [16]:
# 依字典規則的詞形還原(Lemmatization)
text = "My system keeps crashing his crashed yesterday, ours crashes daily"
lem = nltk.WordNetLemmatizer()
" ".join([lem.lemmatize(word) for word in text.split()])

'My system keep crashing his crashed yesterday, ours crash daily'

In [17]:
text = """
Are fish complicated or is everything else in our diet just incredibly dumbed down? We eat basically four different mammals — cows, pigs, sheep, and goats. We eat basically four different birds — chickens, turkeys, ducks and geese. We eat basically three farmed grains — rice, corn and wheat. All of it farmed."""

## 停用詞(Stopwords)

In [18]:
import string

print("標點符號:", string.punctuation)

標點符號: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [19]:
# nltk.corpus.stopwords
nltk.corpus.stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [20]:
# 標點符號(Punctuation)
import string

print("標點符號:", string.punctuation)

# 測試文章段落
text = (
    "Today is a great day. It is even better than yesterday."
    + " And yesterday was the best day ever."
)
# 讀取停用詞
stopword_list = set(nltk.corpus.stopwords.words("english") + list(string.punctuation))


# 移除停用詞(Removing Stopwords)
def remove_stopwords(text, is_lower_case=False):
    if is_lower_case:
        text = text.lower()
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = " ".join(filtered_tokens)
    return filtered_text, filtered_tokens


filtered_text, filtered_tokens = remove_stopwords(text)
filtered_text

標點符號: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


'Today great day It even better yesterday And yesterday best day ever'

## BOW 測試

### 文章來源：
1. https://www.bloomberg.com/news/articles/2018-06-18/google-is-training-machines-to-predict-when-a-patient-will-die
2. https://medium.com/@sebastian_andrei/south-koreas-convenience-store-culture-187c33a649a6


In [19]:
# 測試文章段落
with open("./news.txt", "r+", encoding="UTF-8") as f:
    text = f.read()

filtered_text, filtered_tokens = remove_stopwords(text, True)

import collections

# 生字表的集合
word_freqs = collections.Counter()
for word in filtered_tokens:
    word_freqs[word] += 1
print(word_freqs.most_common(20))

[('’', 35), ('stores', 15), ('convenience', 14), ('one', 8), ('—', 8), ('even', 8), ('seoul', 8), ('city', 7), ('korea', 6), ('korean', 6), ('cities', 6), ('people', 5), ('summer', 4), ('new', 4), ('also', 4), ('find', 4), ('store', 4), ('would', 4), ('like', 4), ('average', 4)]


In [20]:
# 移除停用詞(Removing Stopwords)
lem = nltk.WordNetLemmatizer()


def remove_stopwords_regex(text, is_lower_case=False):
    if is_lower_case:
        text = text.lower()
    tokenizer = nltk.tokenize.RegexpTokenizer(r"\w+")  # 篩選文數字(Alphanumeric)
    tokens = tokenizer.tokenize(text)
    tokens = [lem.lemmatize(token.strip()) for token in tokens]  # 詞形還原
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = " ".join(filtered_tokens)
    return filtered_text, filtered_tokens


filtered_text, filtered_tokens = remove_stopwords_regex(text, True)
word_freqs = collections.Counter()
for word in filtered_tokens:
    word_freqs[word] += 1
print(word_freqs.most_common(20))

[('store', 19), ('convenience', 14), ('city', 13), ('one', 8), ('even', 8), ('seoul', 8), ('korea', 6), ('korean', 6), ('night', 6), ('food', 5), ('ha', 5), ('people', 5), ('summer', 4), ('new', 4), ('life', 4), ('also', 4), ('find', 4), ('would', 4), ('like', 4), ('chain', 4)]


In [21]:
lem.lemmatize("korean")

'korean'

## 相似詞(Synonyms)

In [22]:
# 找出相似詞(Synonyms)
synonyms = nltk.corpus.wordnet.synsets("love")
synonyms

[Synset('love.n.01'),
 Synset('love.n.02'),
 Synset('beloved.n.01'),
 Synset('love.n.04'),
 Synset('love.n.05'),
 Synset('sexual_love.n.02'),
 Synset('love.v.01'),
 Synset('love.v.02'),
 Synset('love.v.03'),
 Synset('sleep_together.v.01')]

In [23]:
# 單字說明
synonyms[0].definition()

'a strong positive emotion of regard and affection'

In [24]:
# 單字說明
synonyms[1].definition()

'any object of warm affection or devotion'

In [25]:
# 單字的例句
synonyms[0].examples()

['his love for his work', 'children need a lot of love']

## 相反詞(Antonyms)

In [26]:
# 找出相反詞(Antonyms)
antonyms = []
for syn in nltk.corpus.wordnet.synsets("ugly"):
    for l in syn.lemmas():
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
antonyms

['beautiful']

## 詞性標籤(POS Tagging)

In [27]:
nltk.pos_tag(["happy"])

[('happy', 'JJ')]

In [28]:
# 找出詞性標籤(POS Tagging)
text = "I am a human being, capable of doing terrible things"
sentences = nltk.sent_tokenize(text)
for sent in sentences:
    print(nltk.pos_tag(nltk.word_tokenize(sent)))

[('I', 'PRP'), ('am', 'VBP'), ('a', 'DT'), ('human', 'JJ'), ('being', 'VBG'), (',', ','), ('capable', 'JJ'), ('of', 'IN'), ('doing', 'VBG'), ('terrible', 'JJ'), ('things', 'NNS')]


## 命名實體識別(Named Entity Recognition, NER)

In [29]:
from nltk import ne_chunk  # NER套件
from nltk import word_tokenize

"""
ne_chunk():命名實體識別
pos_tag():詞性分析
word_tokenize():分詞
"""
sent = "Mark is studying at Stanford University in California"
print(ne_chunk(nltk.pos_tag(word_tokenize(sent)), binary=False))

(S
  (PERSON Mark/NNP)
  is/VBZ
  studying/VBG
  at/IN
  (ORGANIZATION Stanford/NNP University/NNP)
  in/IN
  (GPE California/NNP))


## 實體關聯

In [30]:
for doc in nltk.corpus.ieer.parsed_docs("NYT_19980315"):
    print(doc)

<IEERDocument NYT19980315.0063: 'PUBLIC RADIO HOSTS DROP IN AND MAYBE STAY TOO LONG'>
<IEERDocument NYT19980315.0064: 'IN CYBERSPACE, IS THERE LAW WHERE THERE IS NO LAND?'>
<IEERDocument NYT19980315.0067: 'THE SITES: TUNING INTO MUSIC ON THE WEB'>
<IEERDocument NYT19980315.0069: 'ANALYSIS: TAXING INTERNET SALES _ GOVERNORS VS. TAX FREEDOM ACT'>
<IEERDocument NYT19980315.0070: 'A SEARCH ENGINE THAT CHARGES FOR TOP BILLING'>
<IEERDocument NYT19980315.0071: 'COMING SOON: TV DOCUMENTARIES TO A BOOKSTORE NEAR YOU'>
<IEERDocument NYT19980315.0072: 'WEATHER'>
<IEERDocument NYT19980315.0073: 'DICAPRIO , CHARISMATIC STAR, BALKS AT TEEN IDOL IMAGE'>
<IEERDocument NYT19980315.0074: "You've read the book? Now see the television program. Inspired by ``Angela's...">
<IEERDocument NYT19980315.0084: 'A DIRECTOR WHO DARES, AND TAKES THE HEAT'>
<IEERDocument NYT19980315.0085: 'ADVERTISING: AFTER 32 YEARS , WELLS BDDP WILL CLOSE'>
<IEERDocument NYT19980315.0086: "`THE GIFTS OF THE JEWS': DESERT NOMADS CH

In [31]:
for doc in nltk.corpus.ieer.parsed_docs("NYT_19980315"):
    print(doc.text)
    break

(DOCUMENT
  For
  almost
  (DURATION 20 years)
  ,
  since
  its
  debut
  in
  (DATE 1979)
  ,
  (PERSON Bob Edwards)
  has
  presided
  over
  the
  (ORGANIZATION National Public Radio)
  news
  magazine
  ``Morning
  Edition.''
  But
  from
  the
  start,
  the
  soothing,
  avuncular
  tone
  that
  is
  (PERSON Edwards)
  '
  trademark
  raised
  certain
  questions.
  ``Isn't
  that
  man
  dangerous?''
  a
  (MEASURE 10-year)
  -old
  in
  the
  school
  car
  pool
  I
  was
  driving
  asked
  in
  (DATE 1980)
  .
  I
  couldn't
  imagine
  what
  could
  be
  less
  dangerous.
  ``What
  are
  you
  talking
  about?''
  I
  asked.
  ``It's
  early
  in
  the
  morning,''
  she
  said.
  ``There
  are
  all
  these
  people
  driving
  around.
  He's
  going
  to
  make
  them
  all
  go
  back
  to
  sleep.''
  Like
  a
  number
  of
  other
  high-profile
  (ORGANIZATION NPR)
  news-magazine
  hosts
  or
  news
  readers,
  (PERSON Edwards)
  conveys
  a
  distinct
  sense
  

In [32]:
import re

"""
https://docs.python.org/zh-tw/3/library/re.html
.* :0~多個任意字
\b :邊界符,不可有其他字符 
?!\b.+ing :不可包含?!以後的字
"""
IN = re.compile(r".*\bin\b(?!\b.+ing)")
for doc in nltk.corpus.ieer.parsed_docs("NYT_19980315"):
    for rel in nltk.sem.extract_rels("ORG", "LOC", doc, corpus="ieer", pattern=IN):
        print(nltk.sem.rtuple(rel))

[ORG: 'WHYY'] 'in' [LOC: 'Philadelphia']
[ORG: 'McGlashan &AMP; Sarrail'] 'firm in' [LOC: 'San Mateo']
[ORG: 'Freedom Forum'] 'in' [LOC: 'Arlington']
[ORG: 'Brookings Institution'] ', the research group in' [LOC: 'Washington']
[ORG: 'Idealab'] ', a self-described business incubator based in' [LOC: 'Los Angeles']
[ORG: 'Open Text'] ', based in' [LOC: 'Waterloo']
[ORG: 'WGBH'] 'in' [LOC: 'Boston']
[ORG: 'Bastille Opera'] 'in' [LOC: 'Paris']
[ORG: 'Omnicom'] 'in' [LOC: 'New York']
[ORG: 'DDB Needham'] 'in' [LOC: 'New York']
[ORG: 'Kaplan Thaler Group'] 'in' [LOC: 'New York']
[ORG: 'BBDO South'] 'in' [LOC: 'Atlanta']
[ORG: 'Georgia-Pacific'] 'in' [LOC: 'Atlanta']
