# NLTK Tutorial

## Testing NLTK installation and import

In [None]:
from nltk.corpus import brown

In [2]:
# Display the word tokens of the Brown corpus; a successful result also
# confirms that the corpus data can be loaded.
brown.words()

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [4]:
# Loading all items from NLTK's book module

from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [5]:
# Exploring the Brown corpus: list the category names its texts are grouped under.

from nltk.corpus import brown
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [6]:
# Restrict the Brown corpus to the "reviews" category and display its word tokens.
reviews_words = brown.words(categories="reviews")
reviews_words

['It', 'is', 'not', 'news', 'that', 'Nathan', ...]

In [7]:
# Count the word tokens in the "reviews" category.
len(reviews_words)

40704

## Text Extraction and Tokenization

In [9]:
# Sentence tokenizing: split a piece of text into a list of sentences.

from nltk.tokenize import sent_tokenize

EXAMPLE_TEXT = "Hello World! This is sentence tokenizing."

sentences = sent_tokenize(EXAMPLE_TEXT)
print(sentences)

['Hello World!', 'This is sentence tokenizing.']


In [11]:
# Word tokenizing: split a piece of text into word and punctuation tokens.

from nltk.tokenize import word_tokenize

EXAMPLE_WORD = "Hello World! This is word tokenizing."

tokens = word_tokenize(EXAMPLE_WORD)
print(tokens)

['Hello', 'World', '!', 'This', 'is', 'word', 'tokenizing', '.']


In [13]:
# Stop Words: very common words (e.g. 'a', 'about', 'and') that are usually
# filtered out of a text before further NLP processing takes place.

import nltk
from nltk.corpus import stopwords
set(stopwords.words('english')) # display NLTK's English stop-word list as a set

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [14]:
# Using stop words to remove unnecessary words from an example text.
# `stopwords` and `word_tokenize` are imported in earlier cells.

example_sent = "an Apple a day keeps diseases at bay."

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# Keep only the tokens that are not stop words (built once; the original
# computed this list twice, discarding the first result).
# NOTE: the match is case-sensitive and the stop-word list is lowercase, so a
# capitalised stop word such as "An" would NOT be removed.
filtered_sentence = [w for w in word_tokens if w not in stop_words]

print(filtered_sentence)

['Apple', 'day', 'keeps', 'diseases', 'bay', '.']


In [15]:
# Stemming: reduce each word to its stem using the Porter stemmer.
# `word_tokenize` is imported in an earlier cell.

from nltk.stem import PorterStemmer

ps = PorterStemmer()
new_text = "importance of caving as explained by cavers"

words = word_tokenize(new_text)

# Print one stem per line, in token order.
print(*map(ps.stem, words), sep="\n")

import
of
cave
as
explain
by
caver


In [17]:
# Lemmatization: map an inflected word to its dictionary form (lemma).

from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Irregular plurals are resolved to their singular lemma.
for plural in ("feet", "cacti", "geese"):
    print(lemmatizer.lemmatize(plural))

# Without a POS tag the lemmatizer treats every word as a noun, so "loving"
# comes back unchanged; tagging it as a verb yields "love".
print(lemmatizer.lemmatize("loving"))
print(lemmatizer.lemmatize("loving", pos='v'))  # with POS tag

foot
cactus
goose
loving
love


In [18]:
# Text Extraction / POS Tagging
# `nltk`, `stopwords` and `sent_tokenize` are imported in earlier cells.

# Build the stop-word set here so this cell does not depend on a
# `stop_words` variable left behind by a previously executed cell.
stop_words = set(stopwords.words('english'))

txt = '''
    Text mining, also referred to as text data mining,
    roughly equivalent to text analytics, is the process of deriving high-quality information from text.
    High-quality information is typically derived through the devising of patterns and trends through means
    such as statistical pattern learning.
    '''
tokenized = sent_tokenize(txt)
for sentence in tokenized:
    # Tokenize the sentence, drop stop words, then POS-tag what remains.
    wordsList = nltk.word_tokenize(sentence)
    wordsList = [w for w in wordsList if w not in stop_words]
    tagged = nltk.pos_tag(wordsList)

    # One list of (token, tag) pairs is printed per sentence.
    print(tagged)

[('Text', 'NNP'), ('mining', 'NN'), (',', ','), ('also', 'RB'), ('referred', 'VBD'), ('text', 'JJ'), ('data', 'NN'), ('mining', 'NN'), (',', ','), ('roughly', 'RB'), ('equivalent', 'JJ'), ('text', 'NN'), ('analytics', 'NNS'), (',', ','), ('process', 'NN'), ('deriving', 'VBG'), ('high-quality', 'NN'), ('information', 'NN'), ('text', 'NN'), ('.', '.')]
[('High-quality', 'NNP'), ('information', 'NN'), ('typically', 'RB'), ('derived', 'VBD'), ('devising', 'VBG'), ('patterns', 'NNS'), ('trends', 'NNS'), ('means', 'VBZ'), ('statistical', 'JJ'), ('pattern', 'NN'), ('learning', 'NN'), ('.', '.')]


In [21]:
# Text Extraction and pre-processing - Named Entity Recognition (NER)
# `nltk` is imported in an earlier cell.

doc = '''Google is an American multinational technology company that specializes in related 
services and products, which include online advertising technologies, search engine, cloud 
computing, and hardware. it was founded in 1998 by Larry Page and Sergey Brin while they were 
Ph.D. students at Stanford University in California'''

# Pipeline: word tokens -> POS-tagged tokens -> named-entity chunk tree.
tokenized_doc = nltk.word_tokenize(doc)
tagged_sentences = nltk.pos_tag(tokenized_doc)
ne_chunked_sents = nltk.ne_chunk(tagged_sentences)

# Only children that expose `label` are entity subtrees; for each one, join
# the words of its leaves and pair the resulting name with the subtree label.
named_entities = [
    (' '.join(word for word, _tag in subtree.leaves()), subtree.label())
    for subtree in ne_chunked_sents
    if hasattr(subtree, "label")
]
print(named_entities)

[('Google', 'GPE'), ('American', 'GPE'), ('Larry Page', 'PERSON'), ('Sergey Brin', 'PERSON'), ('Stanford University', 'ORGANIZATION'), ('California', 'GPE')]
