In [1]:
import nltk
from pprint import pprint

In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Woojin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Woojin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Woojin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Woojin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\Woojin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\Woojin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\words.zip.


True

# Basic Tokenize

In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize

sentences = """
Your time is limited, so don't waste it living someone else's life. 
Don't be trapped by dogma – which is living with the results of other people's thinking."""

word_tokenize_result = word_tokenize(sentences)
wordpunct_tokenize_result = wordpunct_tokenize(sentences)

print("Word_tokenize")
pprint(word_tokenize_result)
print("\n")
print("Wordpunct_tokenize")
pprint(wordpunct_tokenize_result)


Word_tokenize
['Your',
 'time',
 'is',
 'limited',
 ',',
 'so',
 'do',
 "n't",
 'waste',
 'it',
 'living',
 'someone',
 'else',
 "'s",
 'life',
 '.',
 'Do',
 "n't",
 'be',
 'trapped',
 'by',
 'dogma',
 '–',
 'which',
 'is',
 'living',
 'with',
 'the',
 'results',
 'of',
 'other',
 'people',
 "'s",
 'thinking',
 '.']


Wordpunct_tokenize
['Your',
 'time',
 'is',
 'limited',
 ',',
 'so',
 'don',
 "'",
 't',
 'waste',
 'it',
 'living',
 'someone',
 'else',
 "'",
 's',
 'life',
 '.',
 'Don',
 "'",
 't',
 'be',
 'trapped',
 'by',
 'dogma',
 '–',
 'which',
 'is',
 'living',
 'with',
 'the',
 'results',
 'of',
 'other',
 'people',
 "'",
 's',
 'thinking',
 '.']


# Part-of-speech (POS) tagging

In [4]:
pos_result=nltk.pos_tag(nltk.tokenize.wordpunct_tokenize(sentences))
pprint(pos_result)

[('Your', 'PRP$'),
 ('time', 'NN'),
 ('is', 'VBZ'),
 ('limited', 'VBN'),
 (',', ','),
 ('so', 'IN'),
 ('don', 'JJ'),
 ("'", 'POS'),
 ('t', 'NN'),
 ('waste', 'NN'),
 ('it', 'PRP'),
 ('living', 'VBG'),
 ('someone', 'NN'),
 ('else', 'RB'),
 ("'", 'POS'),
 ('s', 'JJ'),
 ('life', 'NN'),
 ('.', '.'),
 ('Don', 'NNP'),
 ("'", 'POS'),
 ('t', 'NN'),
 ('be', 'VB'),
 ('trapped', 'VBN'),
 ('by', 'IN'),
 ('dogma', 'NN'),
 ('–', 'NN'),
 ('which', 'WDT'),
 ('is', 'VBZ'),
 ('living', 'VBG'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('results', 'NNS'),
 ('of', 'IN'),
 ('other', 'JJ'),
 ('people', 'NNS'),
 ("'", 'POS'),
 ('s', 'NN'),
 ('thinking', 'NN'),
 ('.', '.')]


# Normalization

In [5]:
# Stemming

from nltk.stem.snowball import SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer

text = list(nltk.word_tokenize(sentences))

snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
porter = PorterStemmer()

for stemmer in (snowball, lancaster, porter):
    stemmed_text = [stemmer.stem(t) for t in text]
    print(" ".join(stemmed_text))
    print(stemmed_text)

your time is limit , so do n't wast it live someon els 's life . do n't be trap by dogma – which is live with the result of other peopl 's think .
['your', 'time', 'is', 'limit', ',', 'so', 'do', "n't", 'wast', 'it', 'live', 'someon', 'els', "'s", 'life', '.', 'do', "n't", 'be', 'trap', 'by', 'dogma', '–', 'which', 'is', 'live', 'with', 'the', 'result', 'of', 'other', 'peopl', "'s", 'think', '.']
yo tim is limit , so do n't wast it liv someon els 's lif . do n't be trap by dogm – which is liv with the result of oth peopl 's think .
['yo', 'tim', 'is', 'limit', ',', 'so', 'do', "n't", 'wast', 'it', 'liv', 'someon', 'els', "'s", 'lif', '.', 'do', "n't", 'be', 'trap', 'by', 'dogm', '–', 'which', 'is', 'liv', 'with', 'the', 'result', 'of', 'oth', 'peopl', "'s", 'think', '.']
your time is limit , so do n't wast it live someon els 's life . Do n't be trap by dogma – which is live with the result of other peopl 's think .
['your', 'time', 'is', 'limit', ',', 'so', 'do', "n't", 'wast', 'it', '

In [6]:
# Lemmatization

from nltk.stem.wordnet import WordNetLemmatizer

# Note: use part of speech tag, we'll see this in machine learning! 
lemmatizer = WordNetLemmatizer()
lemmas = [lemmatizer.lemmatize(t) for t in text]
print(" ".join(lemmas))

Your time is limited , so do n't waste it living someone else 's life . Do n't be trapped by dogma – which is living with the result of other people 's thinking .


In [7]:
import string
from nltk.corpus import wordnet as wn

## Module constants
lemmatizer  = WordNetLemmatizer()
stopwords   = set(nltk.corpus.stopwords.words('english'))
punctuation = string.punctuation

def tagwn(tag):
    """
    Returns the WordNet tag from the Penn Treebank tag.
    """

    return {
        'N': wn.NOUN,
        'V': wn.VERB,
        'R': wn.ADV,
        'J': wn.ADJ
    }.get(tag[0], wn.NOUN)


def normalize(text):
    for token, tag in nltk.pos_tag(nltk.wordpunct_tokenize(text)):
        #if you're going to do part of speech tagging, do it here
        token = token.lower()
        if token in stopwords or token in punctuation:
            continue
        token = lemmatizer.lemmatize(token, tagwn(tag))
        yield token

print(list(normalize(sentences)))

['time', 'limit', 'waste', 'live', 'someone', 'else', 'life', 'trap', 'dogma', '–', 'live', 'result', 'people', 'thinking']


In [8]:
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [9]:
print(stopwords)

{'for', "should've", 'hadn', 'can', 'themselves', 'doing', 'didn', 're', 'what', 'our', 'do', 'hasn', "wouldn't", 'over', 'under', 'again', 'above', 'where', 'having', 'ourselves', "she's", 've', 'during', 'isn', "mightn't", 'weren', 'why', 'before', 'each', 'out', 'himself', 'those', 'been', 'be', 'her', 't', 'ours', 'your', 'yours', 'only', "that'll", 'i', 'against', 'because', 'now', 'won', 'you', 'there', 'its', 'them', 'which', 'in', "weren't", 'own', 'did', "you'd", 'than', 'most', 'about', 'and', 'haven', 'whom', 'at', 'doesn', 'other', 'the', 'couldn', 'but', 'some', 'then', 'until', 'how', 'am', 'into', 'has', 'by', 'were', 'yourself', 'mightn', 'any', 'hers', 'this', 'ma', 'through', "isn't", 'from', 'myself', "needn't", 'between', "you'll", 'it', 'here', 'should', 'a', 'down', 'few', 'herself', 'that', 'are', 'all', 'too', 'nor', 'if', 'off', 'below', "it's", "don't", 'm', 'needn', 'does', 'or', 'while', 'wouldn', 'after', 'he', 'further', 'such', 'was', 'o', 'just', 'shan',

# Named-entity recognition (NER)

In [10]:
text = sentences
print(nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text))))

(S
  Your/PRP$
  time/NN
  is/VBZ
  limited/VBN
  ,/,
  so/RB
  do/VBP
  n't/RB
  waste/VB
  it/PRP
  living/VBG
  someone/NN
  else/RB
  's/POS
  life/NN
  ./.
  Do/VBP
  n't/RB
  be/VB
  trapped/VBN
  by/IN
  dogma/NN
  –/NN
  which/WDT
  is/VBZ
  living/VBG
  with/IN
  the/DT
  results/NNS
  of/IN
  other/JJ
  people/NNS
  's/POS
  thinking/NN
  ./.)


In [12]:
from nltk.tag import StanfordNERTagger

stanford_data = 'C:\\Users\\Woojin\\Desktop\\stanford-ner-4.0.0\\stanford-ner-4.0.0\\classifiers\\english.all.3class.distsim.crf.ser.gz'
stanford_jar =  'C:\\Users\\Woojin\\Desktop\\stanford-ner-4.0.0\\stanford-ner-4.0.0\\stanford-ner-4.0.0.jar'

text = sentences
st = StanfordNERTagger(stanford_data, stanford_jar, 'utf-8')
for i in st.tag(text.split()):
    print('[' + i[1] + '] ' + i[0])

[O] Your
[O] time
[O] is
[O] limited,
[O] so
[O] don't
[O] waste
[O] it
[O] living
[O] someone
[O] else's
[O] life.
[O] Don't
[O] be
[O] trapped
[O] by
[O] dogma
[O] –
[O] which
[O] is
[O] living
[O] with
[O] the
[O] results
[O] of
[O] other
[O] people's
[O] thinking.


# Parsing

In [15]:
grammar = nltk.grammar.CFG.fromstring("""

S -> NP PUNCT | NP
NP -> N N | ADJP NP | DET N | DET ADJP
ADJP -> ADJ NP | ADJ N

DET -> 'an' | 'the' | 'a' | 'that' | 'my'
N -> 'airplane' | 'runway' | 'face' | 'chair' | 'person' 
ADJ -> 'red' | 'slow' | 'tired' | 'handsome'
PUNCT -> '.'
""")

In [16]:
def parse(sent):
    sent = sent.lower()
    parser = nltk.parse.ChartParser(grammar)
    for p in parser.parse(nltk.word_tokenize(sent)):
        yield p 

        
for tree in parse("my handsome face"): 
    tree.pprint()
#     tree[0].draw()

(S (NP (DET my) (ADJP (ADJ handsome) (N face))))


In [17]:
from nltk.parse.stanford import StanfordParser

stanford_model = 'C:\\Users\\Woojin\\Desktop\\stanford-parser-4.0.0\\stanford-parser-4.0.0\\stanford-parser-4.0.0-models.jar'
stanford_jar = 'C:\\Users\\Woojin\\Desktop\\stanford-parser-4.0.0\\stanford-parser-4.0.0\\stanford-parser.jar'

st = StanfordParser(stanford_model, stanford_jar)
sent = "My gorgeous face is brighter in night."
for tree in st.parse(nltk.wordpunct_tokenize(sent)):
    tree.pprint()
#     tree.draw()

Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  


(ROOT
  (S
    (NP (PRP$ My) (JJ gorgeous) (NN face))
    (VP
      (VBZ is)
      (ADJP (ADJP (JJR brighter)) (PP (IN in) (NP (NN night)))))
    (. .)))
