In [1]:
import os
import nltk
import nltk.corpus

In [2]:
print(os.listdir(nltk.data.find("corpora")))

['abc', 'abc.zip', 'alpino', 'alpino.zip', 'biocreative_ppi', 'biocreative_ppi.zip', 'brown', 'brown.zip', 'brown_tei', 'brown_tei.zip', 'cess_cat', 'cess_cat.zip', 'cess_esp', 'cess_esp.zip', 'chat80', 'chat80.zip', 'city_database', 'city_database.zip', 'cmudict', 'cmudict.zip', 'comparative_sentences', 'comparative_sentences.zip', 'comtrans.zip', 'conll2000', 'conll2000.zip', 'conll2002', 'conll2002.zip', 'conll2007.zip', 'crubadan', 'crubadan.zip', 'dependency_treebank', 'dependency_treebank.zip', 'dolch', 'dolch.zip', 'europarl_raw', 'europarl_raw.zip', 'floresta', 'floresta.zip', 'framenet_v15', 'framenet_v15.zip', 'framenet_v17', 'framenet_v17.zip', 'gazetteers', 'gazetteers.zip', 'genesis', 'genesis.zip', 'gutenberg', 'gutenberg.zip', 'ieer', 'ieer.zip', 'inaugural', 'inaugural.zip', 'indian', 'indian.zip', 'jeita.zip', 'kimmo', 'kimmo.zip', 'knbc.zip', 'lin_thesaurus', 'lin_thesaurus.zip', 'machado.zip', 'mac_morpho', 'mac_morpho.zip', 'masc_tagged.zip', 'movie_reviews', 'movie

In [3]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
hamlet = nltk.corpus.gutenberg.words('shakespeare-hamlet.txt')
hamlet

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

In [5]:
for word in hamlet[:500]:
    print(word, sep=' ', end=' ')

[ The Tragedie of Hamlet by William Shakespeare 1599 ] Actus Primus . Scoena Prima . Enter Barnardo and Francisco two Centinels . Barnardo . Who ' s there ? Fran . Nay answer me : Stand & vnfold your selfe Bar . Long liue the King Fran . Barnardo ? Bar . He Fran . You come most carefully vpon your houre Bar . ' Tis now strook twelue , get thee to bed Francisco Fran . For this releefe much thankes : ' Tis bitter cold , And I am sicke at heart Barn . Haue you had quiet Guard ? Fran . Not a Mouse stirring Barn . Well , goodnight . If you do meet Horatio and Marcellus , the Riuals of my Watch , bid them make hast . Enter Horatio and Marcellus . Fran . I thinke I heare them . Stand : who ' s there ? Hor . Friends to this ground Mar . And Leige - men to the Dane Fran . Giue you good night Mar . O farwel honest Soldier , who hath relieu ' d you ? Fra . Barnardo ha ' s my place : giue you goodnight . Exit Fran . Mar . Holla Barnardo Bar . Say , what is Horatio there ? Hor . A peece of him Bar 

## Sentence and word tokenizer

In [6]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [7]:
abs = '''This paper analyses the work of the Eritrea-Ethiopia Claims Commission, which was established in December 2000 to adjudicate claims for loss, damage and injury arising from violations of international law that occurred during the 1998-2000 conflict between Eritrea and Ethiopia. In particular, this paper critically considers some of the Commission’s legal and practical innovations and attempts, from its analysis, to draw out some key lessons for future bodies that may have to work under similar situations with similar mandates. After providing contextual background on the conflict and the efforts to achieve a peaceful resolution of it in the first section, the second section of this paper describes the workings of the Commission. In the third section, the authors seek to critically analyse the legal and practical contributions of the Commission. The fourth section is devoted to considering the circumstances under which the constitution of such a body is appropriate. As part of this section, and with specific reference to the Ethiopia-Eritrea dispute, the paper weighs the strengths of such bodies against bodies with a purely fact-finding mandate. The fifth and final section then presents a summary of the key findings of the paper.'''

In [8]:
abs_tok_words = word_tokenize(abs)
abs_tok_sent = sent_tokenize(abs)
abs_tok_words, abs_tok_sent

(['This',
  'paper',
  'analyses',
  'the',
  'work',
  'of',
  'the',
  'Eritrea-Ethiopia',
  'Claims',
  'Commission',
  ',',
  'which',
  'was',
  'established',
  'in',
  'December',
  '2000',
  'to',
  'adjudicate',
  'claims',
  'for',
  'loss',
  ',',
  'damage',
  'and',
  'injury',
  'arising',
  'from',
  'violations',
  'of',
  'international',
  'law',
  'that',
  'occurred',
  'during',
  'the',
  '1998-2000',
  'conflict',
  'between',
  'Eritrea',
  'and',
  'Ethiopia',
  '.',
  'In',
  'particular',
  ',',
  'this',
  'paper',
  'critically',
  'considers',
  'some',
  'of',
  'the',
  'Commission',
  '’',
  's',
  'legal',
  'and',
  'practical',
  'innovations',
  'and',
  'attempts',
  ',',
  'from',
  'its',
  'analysis',
  ',',
  'to',
  'draw',
  'out',
  'some',
  'key',
  'lessons',
  'for',
  'future',
  'bodies',
  'that',
  'may',
  'have',
  'to',
  'work',
  'under',
  'similar',
  'situations',
  'with',
  'similar',
  'mandates',
  '.',
  'After',
  'prov

In [9]:
from nltk.probability import FreqDist

In [10]:
fdist = FreqDist()
for word in abs_tok_words:
    fdist[word.lower()] += 1
fdist

FreqDist({'the': 23, 'of': 12, ',': 9, 'and': 8, 'to': 7, '.': 7, 'section': 6, 'paper': 5, 'this': 4, 'commission': 4, ...})

In [11]:
fdist_top_10 = fdist.most_common(10)
fdist_top_10

[('the', 23),
 ('of', 12),
 (',', 9),
 ('and', 8),
 ('to', 7),
 ('.', 7),
 ('section', 6),
 ('paper', 5),
 ('this', 4),
 ('commission', 4)]

## Blank tokenizer - Paragraphs

In [12]:
from nltk.tokenize import blankline_tokenize
abs_tok_blank = blankline_tokenize(abs)
len(abs_tok_blank)

1

## Tokenization - Bi, Tri, n-grams

In [13]:
from nltk.util import bigrams, trigrams, ngrams

In [14]:
s = 'The best and the most beautiful things in the world cannot be seen or even touched, they must be felt with the heart'
quote_tok = word_tokenize(s)
quote_tok

['The',
 'best',
 'and',
 'the',
 'most',
 'beautiful',
 'things',
 'in',
 'the',
 'world',
 'can',
 'not',
 'be',
 'seen',
 'or',
 'even',
 'touched',
 ',',
 'they',
 'must',
 'be',
 'felt',
 'with',
 'the',
 'heart']

In [15]:
quotes_bi = list(bigrams(quote_tok))
quotes_tri = list(trigrams(quote_tok))
quotes_5 = list(ngrams(quote_tok, 5))

In [16]:
quotes_bi

[('The', 'best'),
 ('best', 'and'),
 ('and', 'the'),
 ('the', 'most'),
 ('most', 'beautiful'),
 ('beautiful', 'things'),
 ('things', 'in'),
 ('in', 'the'),
 ('the', 'world'),
 ('world', 'can'),
 ('can', 'not'),
 ('not', 'be'),
 ('be', 'seen'),
 ('seen', 'or'),
 ('or', 'even'),
 ('even', 'touched'),
 ('touched', ','),
 (',', 'they'),
 ('they', 'must'),
 ('must', 'be'),
 ('be', 'felt'),
 ('felt', 'with'),
 ('with', 'the'),
 ('the', 'heart')]

In [17]:
quotes_tri

[('The', 'best', 'and'),
 ('best', 'and', 'the'),
 ('and', 'the', 'most'),
 ('the', 'most', 'beautiful'),
 ('most', 'beautiful', 'things'),
 ('beautiful', 'things', 'in'),
 ('things', 'in', 'the'),
 ('in', 'the', 'world'),
 ('the', 'world', 'can'),
 ('world', 'can', 'not'),
 ('can', 'not', 'be'),
 ('not', 'be', 'seen'),
 ('be', 'seen', 'or'),
 ('seen', 'or', 'even'),
 ('or', 'even', 'touched'),
 ('even', 'touched', ','),
 ('touched', ',', 'they'),
 (',', 'they', 'must'),
 ('they', 'must', 'be'),
 ('must', 'be', 'felt'),
 ('be', 'felt', 'with'),
 ('felt', 'with', 'the'),
 ('with', 'the', 'heart')]

In [18]:
quotes_5

[('The', 'best', 'and', 'the', 'most'),
 ('best', 'and', 'the', 'most', 'beautiful'),
 ('and', 'the', 'most', 'beautiful', 'things'),
 ('the', 'most', 'beautiful', 'things', 'in'),
 ('most', 'beautiful', 'things', 'in', 'the'),
 ('beautiful', 'things', 'in', 'the', 'world'),
 ('things', 'in', 'the', 'world', 'can'),
 ('in', 'the', 'world', 'can', 'not'),
 ('the', 'world', 'can', 'not', 'be'),
 ('world', 'can', 'not', 'be', 'seen'),
 ('can', 'not', 'be', 'seen', 'or'),
 ('not', 'be', 'seen', 'or', 'even'),
 ('be', 'seen', 'or', 'even', 'touched'),
 ('seen', 'or', 'even', 'touched', ','),
 ('or', 'even', 'touched', ',', 'they'),
 ('even', 'touched', ',', 'they', 'must'),
 ('touched', ',', 'they', 'must', 'be'),
 (',', 'they', 'must', 'be', 'felt'),
 ('they', 'must', 'be', 'felt', 'with'),
 ('must', 'be', 'felt', 'with', 'the'),
 ('be', 'felt', 'with', 'the', 'heart')]

## Stemming - Normalizing words into their base form

In [19]:
from nltk.stem import PorterStemmer
pst = PorterStemmer()

In [20]:
pst.stem("having")

'have'

In [21]:
words_to_stem = ["give", "given", "giving", "gave"]
for word in words_to_stem:
    print(word + " : ", pst.stem(word))

give :  give
given :  given
giving :  give
gave :  gave


In [22]:
from nltk.stem import LancasterStemmer
lst = LancasterStemmer()

In [23]:
for word in words_to_stem:
    print(word + " : ", lst.stem(word))

give :  giv
given :  giv
giving :  giv
gave :  gav


## Lemmatization - Morphological analysis of the words

- Groups together different inflected forms of a word, called lemma.
- Somehow similar to Stemming as it maps several words into one common root.
- Output of lemmatization is a proper word unlike stemming.
- For example : Lemmatizer would map gone, going and went to go

Wordnet imported because lemmatizer requires a dictionary

In [24]:
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer

In [25]:
word_lem = WordNetLemmatizer()

In [26]:
word_lem.lemmatize('corpora'), pst.stem('corpora'), lst.stem('corpora')

('corpus', 'corpora', 'corpor')

In [27]:
for word in words_to_stem:
    print(word + ' : ', word_lem.lemmatize(word))

give :  give
given :  given
giving :  giving
gave :  gave


The words are same because we have not assigned any pos tags hence all are treated as nouns

## Stopwords

Helpful in the creation of sentences, but not in the processing of the language

In [1]:
from nltk.corpus import stopwords

In [2]:
a = stopwords.words('english')

In [3]:
a.index('not')

118

In [4]:
a[115:120]

['such', 'no', 'nor', 'not', 'only']

In [30]:
fdist_top_10

[('the', 23),
 ('of', 12),
 (',', 9),
 ('and', 8),
 ('to', 7),
 ('.', 7),
 ('section', 6),
 ('paper', 5),
 ('this', 4),
 ('commission', 4)]

In [31]:
import re
punc = re.compile(r'[-.?!,:;()|0-9]')

In [35]:
len(abs_tok_words)

211

In [36]:
post_punct = []
for word in abs_tok_words:
    word = punc.sub("", word)
    if len(word) > 0:
        post_punct.append(word)

In [37]:
len(post_punct)

193

## POS

In [38]:
sent = 'Timothy is a natural when it comes to drawing.'
sent_tok = word_tokenize(sent)

In [39]:
for tok in sent_tok:
    print(nltk.pos_tag([tok]))

[('Timothy', 'NN')]
[('is', 'VBZ')]
[('a', 'DT')]
[('natural', 'JJ')]
[('when', 'WRB')]
[('it', 'PRP')]
[('comes', 'VBZ')]
[('to', 'TO')]
[('drawing', 'VBG')]
[('.', '.')]


In [40]:
sent_2 = 'John is eating a delicious cake'
sent_2_tok = word_tokenize(sent_2)

In [41]:
for tok in sent_2_tok:
    print(nltk.pos_tag([tok]))

[('John', 'NNP')]
[('is', 'VBZ')]
[('eating', 'VBG')]
[('a', 'DT')]
[('delicious', 'JJ')]
[('cake', 'NN')]


"is eating" is considered as s single term

## Named Entity Recognition

In [42]:
from nltk import ne_chunk

In [43]:
NE_sent = 'US president stays in the WHITE HOUSE'
NE_sent_2 = 'US president stays in the White House'
NE_sent_3 = 'US president stays in the white house'

In [44]:
NE_1_tok = word_tokenize(NE_sent)
NE_1_tags = nltk.pos_tag(NE_1_tok)
NE_2_tok = word_tokenize(NE_sent_2)
NE_2_tags = nltk.pos_tag(NE_2_tok)
NE_3_tok = word_tokenize(NE_sent_3)
NE_3_tags = nltk.pos_tag(NE_3_tok)

In [45]:
NE_NER_1 = ne_chunk(NE_1_tags)
NE_NER_1

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [Tree('GPE', [('US', 'NNP')]), ('president', 'NN'), ('stays', 'NNS'), ('in', 'IN'), ('the', 'DT'), Tree('FACILITY', [('WHITE', 'NNP'), ('HOUSE', 'NNP')])])

In [46]:
NE_NER_2 = ne_chunk(NE_2_tags)
NE_NER_2

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [Tree('GPE', [('US', 'NNP')]), ('president', 'NN'), ('stays', 'NNS'), ('in', 'IN'), ('the', 'DT'), Tree('FACILITY', [('White', 'NNP'), ('House', 'NNP')])])

In [47]:
NE_NER_3 = ne_chunk(NE_3_tags)
NE_NER_3

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [Tree('GPE', [('US', 'NNP')]), ('president', 'NN'), ('stays', 'NNS'), ('in', 'IN'), ('the', 'DT'), ('white', 'JJ'), ('house', 'NN')])

## Syntax
- Principles
- Rules
- Process

### Chunking

In [49]:
new = 'The big cat ate the little mouse which was after the fresh cheese'
new_tok = nltk.pos_tag(word_tokenize(new))
new_tok

[('The', 'DT'),
 ('big', 'JJ'),
 ('cat', 'NN'),
 ('ate', 'VBD'),
 ('the', 'DT'),
 ('little', 'JJ'),
 ('mouse', 'NN'),
 ('which', 'WDT'),
 ('was', 'VBD'),
 ('after', 'IN'),
 ('the', 'DT'),
 ('fresh', 'JJ'),
 ('cheese', 'NN')]

In [50]:
grammar_np = r'NP: {<DT>?<JJ>*<NN>}'
chunk_parser = nltk.RegexpParser(grammar_np)
chunk_res = chunk_parser.parse(new_tok)
chunk_res

LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [Tree('NP', [('The', 'DT'), ('big', 'JJ'), ('cat', 'NN')]), ('ate', 'VBD'), Tree('NP', [('the', 'DT'), ('little', 'JJ'), ('mouse', 'NN')]), ('which', 'WDT'), ('was', 'VBD'), ('after', 'IN'), Tree('NP', [('the', 'DT'), ('fresh', 'JJ'), ('cheese', 'NN')])])

## ML classifier on movie reviews in nltk corpora

In [51]:
import pandas as pd
import numpy as np

In [52]:
from nltk.corpus import movie_reviews

In [56]:
movie_reviews.categories()

['neg', 'pos']

In [58]:
len(movie_reviews.fileids('neg')), len(movie_reviews.fileids('pos'))

(1000, 1000)

In [60]:
rev_pos = movie_reviews.fileids('pos')
rev_neg = movie_reviews.fileids('neg')

In [59]:
rev_pos = nltk.corpus.movie_reviews.words('pos/cv000_29590.txt')

In [61]:
rev_list_pos = []
for rev in rev_pos:
    rev_text = nltk.corpus.movie_reviews.words(rev)
    review_string = " ".join(rev_text)
    review_string = review_string.replace(' ,', ',')
    review_string = review_string.replace(' .', '.')
    review_string = review_string.replace("\' ", "'")
    review_string = review_string.replace(" \'", "'")
    rev_list_pos.append(review_string)
    
rev_list_neg = []
for rev in rev_neg:
    rev_text = nltk.corpus.movie_reviews.words(rev)
    review_string = " ".join(rev_text)
    review_string = review_string.replace(' ,', ',')
    review_string = review_string.replace(' .', '.')
    review_string = review_string.replace("\' ", "'")
    review_string = review_string.replace(" \'", "'")
    rev_list_neg.append(review_string)

In [63]:
review_list = rev_list_pos + rev_list_neg

In [64]:
review_list.__len__()

2000