In [1]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [2]:
text1

<Text: Moby Dick by Herman Melville 1851>

### Brown Corpus

In [3]:
# load the Brown Corpus
from nltk.corpus import brown

In [5]:
print("Total categories:",len(brown.categories()))

Total categories: 15


In [6]:
print(brown.categories())

['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


In [8]:
# tokenized sentences
brown.sents(categories='mystery')


[['There', 'were', 'thirty-eight', 'patients', 'on', 'the', 'bus', 'the', 'morning', 'I', 'left', 'for', 'Hanover', ',', 'most', 'of', 'them', 'disturbed', 'and', 'hallucinating', '.'], ['An', 'interne', ',', 'a', 'nurse', 'and', 'two', 'attendants', 'were', 'in', 'charge', 'of', 'us', '.'], ...]

In [9]:
# POS tagged sentences
brown.tagged_sents(categories='mystery')

[[('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')], [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')], ...]

In [12]:
# get sentences in natural form
sentences = brown.sents(categories='mystery')
sentences= [' '.join(sentence_token) for sentence_token in sentences]
print(sentences[0:5])


['There were thirty-eight patients on the bus the morning I left for Hanover , most of them disturbed and hallucinating .', 'An interne , a nurse and two attendants were in charge of us .', "I felt lonely and depressed as I stared out the bus window at Chicago's grim , dirty West Side .", 'It seemed incredible , as I listened to the monotonous drone of voices and smelled the fetid odors coming from the patients , that technically I was a ward of the state of Illinois , going to a hospital for the mentally ill .', 'I suddenly thought of Mary Jane Brennan , the way her pretty eyes could flash with anger , her quiet competence , the gentleness and sweetness that lay just beneath the surface of her defenses .']


In [13]:
# get tagged words
tagged_words= brown.tagged_words(categories='mystery')


In [14]:
# get nouns from tagged words
nouns = [(word, tag) for word, tag in tagged_words if any(noun_tag in tag for noun_tag in ['NP', 'NN'])]

In [15]:
print( nouns[0:10]) # prints the first 10 nouns

[('patients', 'NNS'), ('bus', 'NN'), ('morning', 'NN'), ('Hanover', 'NP'), ('interne', 'NN'), ('nurse', 'NN'), ('attendants', 'NNS'), ('charge', 'NN'), ('bus', 'NN'), ('window', 'NN')]


In [20]:
# build frequency distribution for nouns
import nltk
nouns_freq = nltk.FreqDist([word for word, tag in nouns])

In [23]:
# print top 10 occuring nouns
print(nouns_freq.most_common(10))

[('man', 106), ('time', 82), ('door', 80), ('car', 69), ('room', 65), ('Mr.', 63), ('way', 61), ('office', 50), ('eyes', 48), ('hand', 46)]


### Reuters corpus

In [25]:
#import the reuters corpus
from nltk.corpus import reuters

In [26]:
# categories of reuters corpus
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [27]:
# len of reuters corpus
print(len(reuters.categories()))


90


In [28]:
# get sentences in housing and income categories
sentences = reuters.sents(categories=['housing','income'])
sentences = [' '.join(sentence_token) for sentence_token in sentences]


In [29]:
# prints the first 5 sentences
print(sentences[0:5])


["YUGOSLAV ECONOMY WORSENED IN 1986 , BANK DATA SHOWS National Bank economic data for 1986 shows that Yugoslavia ' s trade deficit grew , the inflation rate rose , wages were sharply higher , the money supply expanded and the value of the dinar fell .", 'The trade deficit for 1986 was 2 . 012 billion dlrs , 25 . 7 pct higher than in 1985 .', 'The trend continued in the first three months of this year as exports dropped by 17 . 8 pct , in hard currency terms , to 2 . 124 billion dlrs .', 'Yugoslavia this year started quoting trade figures in dinars based on current exchange rates , instead of dollars based on a fixed exchange rate of 264 . 53 dinars per dollar .', "Yugoslavia ' s balance of payments surplus with the convertible currency area fell to 245 mln dlrs in 1986 from 344 mln in 1985 ."]


In [31]:
#  fileid based access
print(reuters.fileids(categories=['hoosing','income']))


['test/16118', 'test/18534', 'test/18540', 'test/18664', 'test/18665', 'test/18672', 'test/18911', 'training/10602', 'training/10604', 'training/2618', 'training/7005', 'training/7006', 'training/7015', 'training/7036', 'training/7098', 'training/7099']


In [32]:
print(reuters.sents(fileids=['test/16118','test/18534']))

[['YUGOSLAV', 'ECONOMY', 'WORSENED', 'IN', '1986', ',', 'BANK', 'DATA', 'SHOWS', 'National', 'Bank', 'economic', 'data', 'for', '1986', 'shows', 'that', 'Yugoslavia', "'", 's', 'trade', 'deficit', 'grew', ',', 'the', 'inflation', 'rate', 'rose', ',', 'wages', 'were', 'sharply', 'higher', ',', 'the', 'money', 'supply', 'expanded', 'and', 'the', 'value', 'of', 'the', 'dinar', 'fell', '.'], ['The', 'trade', 'deficit', 'for', '1986', 'was', '2', '.', '012', 'billion', 'dlrs', ',', '25', '.', '7', 'pct', 'higher', 'than', 'in', '1985', '.'], ...]


### Accessing the WordNet Corpus

<font size=2>The WordNet corpus is perhaps one of the most used corpora out there because it consists of a vast corpus of words and semantically linked synsets for each word. We will explore some of the basic features of the WordNet Corpus here, including synsets and methods of accessing the corpus data.</font>

In [35]:
# load the Wordnet Corpus
from nltk.corpus import wordnet as wn

In [36]:
word = 'hike' # taking hike as our word of interest

In [37]:
# get word synsets
word_synsets = wn.synsets(word)
print( word_synsets)

[Synset('hike.n.01'), Synset('rise.n.09'), Synset('raise.n.01'), Synset('hike.v.01'), Synset('hike.v.02')]


In [39]:
# get details for each synonym in synset
for synset in word_synsets:
    print( 'Synset Name:', synset.name())
    print('POS Tag:', synset.pos())
    print ('Definition:', synset.definition())
    print( 'Examples:', synset.examples())
    print("**************")

Synset Name: hike.n.01
POS Tag: n
Definition: a long walk usually for exercise or pleasure
Examples: ['she enjoys a hike in her spare time']
**************
Synset Name: rise.n.09
POS Tag: n
Definition: an increase in cost
Examples: ['they asked for a 10% rise in rates']
**************
Synset Name: raise.n.01
POS Tag: n
Definition: the amount a salary is increased
Examples: ['he got a 3% raise', 'he got a wage hike']
**************
Synset Name: hike.v.01
POS Tag: v
Definition: increase
Examples: ['The landlord hiked up the rents']
**************
Synset Name: hike.v.02
POS Tag: v
Definition: walk a long way, as for pleasure or physical exercise
Examples: ['We were hiking in Colorado', 'hike the Rockies']
**************


### Application of Natural Language Processing(NLP)
- **Machine Translation**
    - Machine translation is perhaps one of the most coveted and sought-after applications
for NLP. It is defined as the technique that helps in providing syntactic, grammatical,
and semantically correct translation between any two pair of languages. It was perhaps
the first major area of research and development in NLP. On a simple level, machine
translation is the translation of natural language carried out by a machine. By default, the
basic building blocks for the machine translation process involve simple substitution of
words from one language to another, but in that case we ignore things like grammar and
phrasal structure consistency. Hence, more sophisticated techniques have evolved over a
period of time, including combining large resources of text corpora along with statistical
and linguistic techniques. One of the most popular machine translation systems is Google
Translate. Figure 1-19 shows a successful machine translation operation executed by
Google Translate for the sentence What is the fare to the airport? from English to Italian.
![](http://www.k-international.com/wp-content/uploads/2013/12/Google-Translate.jpg)
- **Speech Recognition Systems**
    - This is perhaps the most difficult application for NLP. Perhaps the most difficult test of
intelligence in artificial intelligence systems is the Turing Test. This test is defined as a
test of intelligence for a computer. A question is posed to a computer and a human, and
the test is passed if it is impossible to say which of the answers given was given by the
human. Over time, a lot of progress has been made in this area by using techniques like
speech synthesis, analysis, syntactic parsing, and contextual reasoning. But one chief
limitation for speech recognition systems still remains: They are very domain specific and
will not work if the user strays even a little bit from the expected scripted inputs needed
by the system. Speech-recognition systems are now found in many places, from desktop
computers to mobile phones to virtual assistance systems.
- **Question Answering Systems**
    - Question Answering Systems (QAS) are built upon the principle of Question Answering,
based on using techniques from NLP and information retrieval (IR). QAS is primarily
concerned with building robust and scalable systems that provide answers to questions
given by users in natural language form. Imagine being in a foreign country, asking a
question to your personalized assistant in your phone in pure natural language, and
getting a similar response from it. This is the ideal state toward which researchers and
technologists are working. Some success in this field has been achieved with personalized
assistants like Siri and Cortana, but their scope is still limited because they understand
only a subset of key clauses and phrases in the entire human natural language. To build a successful QAS, you need a huge knowledgebase consisting of data about
various domains. Efficient querying systems into this knowledgebase would be leveraged
by the QAS to provide answers to questions in natural language form. Creating and
maintaining a queryable vast knowledgebase is extremely difficult—hence, you find the
rise of QAS in niche domains like food, healthcare, e-commerce, and so on. Chatbots are
one emerging trend that makes extensive use of QAS.
- **Contextual Recognition and Resolution**
    - This covers a wide area in understanding natural language and includes both syntactic
and semantic-based reasoning. Word sense disambiguation is a popular application,
where we want to find out the contextual sense of a word in a given sentence. Consider
the word book . It can mean an object containing knowledge and information when used
as a noun, and it can also mean to reserve a seat or a table when used as a verb. Detecting
these differences in sentences based on context is the main premise of word sense
disambiguation—a daunting task covered in Chapter 7 .
Coreference resolution is another problem in linguistics NLP is trying to address. By
definition, coreference is said to occur when two or more terms or expressions in a body
of text refer to the same entity. Then they are said to have the same referent . Consider
John just told me that he is going to the exam hall . In this sentence, the pronoun he has the
referent John . Resolving such pronouns is a part of coreference resolution, and it becomes
challenging once we have multiple referents in a body of text. For example, John just talked
with Jim. He told me we have a surprise test tomorrow . In this body of text, the pronoun he
could refer to either John or Jim , thus making pinpointing the exact referent difficult.
- **Text Summarization**
    - The main aim of text summarization is to take a corpus of text documents—which could
be a collection of texts, paragraphs, or sentences—and reducing the content appropriately
to create a summary that retains the key points of the collection. Summarization can
be carried out by looking at the various documents and trying to find out the keywords,
phrases, and sentences that have an important prominence in the whole collection. Two
main types of techniques for text summarization include extraction-based summarization
and abstraction-based summarization . With the advent of huge amounts of text and
unstructured data, the need for text summarization in getting to valuable insights quickly
is in great demand.
Text-summarization systems usually perform two main types of operations. The first
is generic summarization , which tries to provide a generic summary of the collection of
documents under analysis. The second type of operation is query-based summarization ,
which provides query-relevant text summaries where the corpus is filtered further based
on specific queries, relevant keywords and phrases are extracted relevant to the query,
and the summary is constructed. Chapter 5 covers this in detail .
- **Text Categorization**
    - The main aim of text categorization is identifying to which category or class a specific
document should be placed based on the contents of the document. This is one of the
most popular applications of NLP and machine learning because with the right data, it
is extremely simple to understand the principles behind its internals and implement a
working text categorization system. Both supervised and unsupervised machine learning
techniques can be used in solving this problem, and sometimes a combination of both is
used. This has helped build a lot of successful and practical applications, including spam
filters and news article categorization.
- **Text Analytics**
   - As mentioned before, with the advent of huge amounts of computing power, unstructured
data, and success with machine learning and statistical analysis techniques, it wasn’t long
before text analytics started garnering a lot of attention. However, text analytics poses
some challenges compared to regular analytical methods. Free-flowing text is highly
unstructured and rarely follows any specific pattern—like weather data or structured
attributes in relational databases. Hence, standard statistical methods aren’t helpful when
applied out of the box on unstructured text data. This section covers some of the main
concepts in text analytics and also discusses the definition and scope of text analytics,
which will give you a broad idea of what you can expect in the upcoming chapters.
Text analytics , also known as text mining , is the methodology and process followed
to derive quality and actionable information and insights from textual data. This involves
using NLP, information retrieval, and machine learning techniques to parse unstructured
text data into more structured forms and deriving patterns and insights from this data
that would be helpful for the end user. Text analytics comprises a collection of machine
learning, linguistic, and statistical techniques that are used to model and extract
information from text primarily for analysis needs, including business intelligence,
exploratory, descriptive, and predictive analysis. Here are some of the main techniques
and operations in text analytics:.
    - Text classification
    - Text clustering
    - Text summarization
    - Sentiment analysis
    - Entity extraction and recognition
    - Similarity analysis and relation modeling
    - Spam detection
    - News articles categorization
    - Social media analysis and monitoring
    - Bio-medical
    - Security intelligence
    - Marketing and CRM
    - Sentiment analysis
    - Ad placements
    - Chatbots
    - Virtual assistants