# Learning NLTK

In [None]:
import nltk
nltk.download()

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize

EXAMPLE_TEXT = "Hello Mr. Smith, how are you doing today? The weather is great, and Python is awesome. The sky is pinkish-blue. You shouldn't eat cardboard."

print(word_tokenize(EXAMPLE_TEXT))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'weather', 'is', 'great', ',', 'and', 'Python', 'is', 'awesome', '.', 'The', 'sky', 'is', 'pinkish-blue', '.', 'You', 'should', "n't", 'eat', 'cardboard', '.']


In [36]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration."

stop_words = set(stopwords.words('english'))

word_tokens = set(word_tokenize(example_sent))

filtered_sentence = [w for w in word_tokens if not w in stop_words]
print (word_tokens)
print (filtered_sentence)


{'a', 'stop', 'the', ',', 'is', '.', 'filtration', 'words', 'sentence', 'sample', 'showing', 'This', 'off'}
['stop', ',', '.', 'filtration', 'words', 'sentence', 'sample', 'showing', 'This']


# Detour: List Comprehensions and Set Comprehensions

$\{x^2|x \epsilon R\}$

In [34]:
Celsius = [39.2, 36.5, 37.3, 37.8]
Fahrenheit = [ ((float(9)/5)*x + 32) for x in Celsius ]
[(x,y,z) for x in range(1,30) for y in range(x,30) for z in range(y,30) if x**2 + y**2 == z**2]

from math import sqrt
n = 100
sqrt_n = int(sqrt(n))
#list
no_primes = [j for i in range(2,sqrt_n) for j in range(i*2, n, i)]
#set
no_primes = {j for i in range(2,sqrt_n) for j in range(i*2, n, i)}

# Back to NLTK

In [10]:
from nltk.stem import PorterStemmer

example_words = ["python","pythoner","pythoning","pythoned","pythonly"]
for word in example_words:
    print (PorterStemmer().stem(word))

python
python
python
python
pythonli


In [12]:
new_text = "It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once."

In [13]:
new_text

'It is important to by very pythonly while you are pythoning with python. All pythoners have pythoned poorly at least once.'

In [16]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer


In [17]:
for word in word_tokenize(new_text):
    print(PorterStemmer().stem(word))

It
is
import
to
by
veri
pythonli
while
you
are
python
with
python
.
All
python
have
python
poorli
at
least
onc
.


In [18]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [19]:
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

In [20]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [21]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [22]:
def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)

    except Exception as e:
        print(str(e))


process_content()


[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'NNP'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'NNP'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'NNP'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'DT'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NNS'), (',', ','), ('distinguished', 'VBD'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'NN'), ('our', 'PRP$'), (

# Regex Detour

In [25]:
'''
identifiers

\d any number
\D anything but a number
\s space
\S anything but a space
\w anycharacter
\W any thing but a character
. is anything except a new line
\b white space around words
\. a period


Modifiers

{1,3} we are expecting 1- 3 of those \d{1-3}
+ 1 or more
? match 0 o 1
* match 0 or more
$ match the 
^ matching the beginning of a string
| either of 

'''



'\nidentifiers\n'

In [26]:
import re

exampleString = '''
Jessica is 15 years old, and Daniel is 27 years old.
Edward is 97 years old, and his grandfather, Oscar, is 102. 
'''

In [27]:
exampleString

'\nJessica is 15 years old, and Daniel is 27 years old.\nEdward is 97 years old, and his grandfather, Oscar, is 102. \n'