**NLP Practice with NLTK library**

In [1]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import Counter 

In [2]:
nltk.download("punkt")
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
f = open("data.txt", "r")
text = f.read()

In [4]:
text[:1000]

"The Project Gutenberg EBook of Man to Man, by Jackson Gregory\n\nThis eBook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  You may copy it, give it away or\nre-use it under the terms of the Project Gutenberg License included\nwith this eBook or online at www.gutenberg.org\n\n\nTitle: Man to Man\n\nAuthor: Jackson Gregory\n\nRelease Date: July 29, 2006 [EBook #18933]\n\nLanguage: English\n\n\n*** START OF THIS PROJECT GUTENBERG EBOOK MAN TO MAN ***\n\n\n\n\nProduced by Al Haines\n\n\n\n\n\n\n\n\n\n\n[Frontispiece: The blazing heat was such that men and horses and steers\nsuffered terribly.]\n\n\n\n\n\n\nMAN TO MAN\n\n\nBY\n\nJACKSON GREGORY\n\n\n\nAUTHOR OF\n\nJUDITH OF BLUE LAKE RANCH, THE BELLS OF SAN JUAN, SIX FEET FOUR, ETC.\n\n\n\n\nILLUSTRATED BY\n\nJ. G. SHEPHERD\n\n\n\n\n\nGROSSET & DUNLAP\n\nPUBLISHERS -------- NEW YORK\n\n\n\n\nCOPYRIGHT, 1920, BY\n\nCHARLES SCRIBNER'S SONS\n\n\nPublished October, 1920\n\n\n\n\nCONTENTS\n\n\nCHAPTER

In [5]:
len(text)

436574

**Sentence Tokenize**

In [6]:
sentences = sent_tokenize(text.lower())

In [7]:
len(sentences)

5551

In [8]:
sentences[0:2]

['the project gutenberg ebook of man to man, by jackson gregory\n\nthis ebook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.',
 'you may copy it, give it away or\nre-use it under the terms of the project gutenberg license included\nwith this ebook or online at www.gutenberg.org\n\n\ntitle: man to man\n\nauthor: jackson gregory\n\nrelease date: july 29, 2006 [ebook #18933]\n\nlanguage: english\n\n\n*** start of this project gutenberg ebook man to man ***\n\n\n\n\nproduced by al haines\n\n\n\n\n\n\n\n\n\n\n[frontispiece: the blazing heat was such that men and horses and steers\nsuffered terribly.]']

In [9]:
corpus = []
for i in sentences:
    sent = word_tokenize(i)
    corpus.append(sent)

In [10]:
corpus[0:2]

[['the',
  'project',
  'gutenberg',
  'ebook',
  'of',
  'man',
  'to',
  'man',
  ',',
  'by',
  'jackson',
  'gregory',
  'this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions',
  'whatsoever',
  '.'],
 ['you',
  'may',
  'copy',
  'it',
  ',',
  'give',
  'it',
  'away',
  'or',
  're-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included',
  'with',
  'this',
  'ebook',
  'or',
  'online',
  'at',
  'www.gutenberg.org',
  'title',
  ':',
  'man',
  'to',
  'man',
  'author',
  ':',
  'jackson',
  'gregory',
  'release',
  'date',
  ':',
  'july',
  '29',
  ',',
  '2006',
  '[',
  'ebook',
  '#',
  '18933',
  ']',
  'language',
  ':',
  'english',
  '***',
  'start',
  'of',
  'this',
  'project',
  'gutenberg',
  'ebook',
  'man',
  'to',
  'man',
  '***',
  'produced',
  'by',
  'al',
  'haines',
  '[',
  'f

**Word Tokenization of words**

In [11]:
corpus = word_tokenize(text.lower())

In [12]:
len(corpus)

97042

In [13]:
# corpus has 97042 tokens.

In [14]:
corpus[:15]

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'man',
 'to',
 'man',
 ',',
 'by',
 'jackson',
 'gregory',
 'this',
 'ebook',
 'is']

In [15]:
corpus_alt = text.lower().split()

In [16]:
# we can implement tokenizatoin even with split() function.

In [17]:
corpus_alt[0:10]

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'man',
 'to',
 'man,',
 'by',
 'jackson']

In [18]:
len(corpus_alt)

78078

**Remove Punctuation and numbers and chars like "*", "#"**

In [19]:
corpus_1 = [w for w in corpus if w.isalpha()]

In [20]:
len(corpus_1)

77105

In [21]:
corpus_1[0:15]

['the',
 'project',
 'gutenberg',
 'ebook',
 'of',
 'man',
 'to',
 'man',
 'by',
 'jackson',
 'gregory',
 'this',
 'ebook',
 'is',
 'for']

**Remove Stopwords**

In [22]:
stop_words = stopwords.words("english")

In [23]:
corpus_2 = [t for t in corpus_1 if t not in stop_words]

In [24]:
len(corpus_2)

38117

In [25]:
corpus_2[:20]

['project',
 'gutenberg',
 'ebook',
 'man',
 'man',
 'jackson',
 'gregory',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'cost',
 'almost',
 'restrictions',
 'whatsoever',
 'may',
 'copy',
 'give',
 'away',
 'terms']

**stemming**

In [26]:
st = PorterStemmer()

In [27]:
corpus_3 = [st.stem(t) for t in corpus_2]

In [28]:
corpus_3[:20]

['project',
 'gutenberg',
 'ebook',
 'man',
 'man',
 'jackson',
 'gregori',
 'ebook',
 'use',
 'anyon',
 'anywher',
 'cost',
 'almost',
 'restrict',
 'whatsoev',
 'may',
 'copi',
 'give',
 'away',
 'term']

**lemma**

In [29]:
lem = WordNetLemmatizer()

In [30]:
corpus_4 = [lem.lemmatize(t) for t in corpus_2]

In [31]:
corpus_4[:20]

['project',
 'gutenberg',
 'ebook',
 'man',
 'man',
 'jackson',
 'gregory',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'cost',
 'almost',
 'restriction',
 'whatsoever',
 'may',
 'copy',
 'give',
 'away',
 'term']

In [32]:
len(corpus_4)

38117

**Most Common Words**

In [33]:
print(Counter(corpus_4).most_common(5))

[('steve', 542), ('packard', 541), ('blenham', 524), ('man', 445), ('terry', 409)]


**joining**

In [34]:
corpus_4[:20]

['project',
 'gutenberg',
 'ebook',
 'man',
 'man',
 'jackson',
 'gregory',
 'ebook',
 'use',
 'anyone',
 'anywhere',
 'cost',
 'almost',
 'restriction',
 'whatsoever',
 'may',
 'copy',
 'give',
 'away',
 'term']

In [35]:
original = " ".join(corpus_4)

In [36]:
original[:100]

'project gutenberg ebook man man jackson gregory ebook use anyone anywhere cost almost restriction wh'

**Part of Speech Tagging (PoST)**

In [37]:
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [38]:
text="Steven Paul Jobs (/dʒɒbz/; February 24, 1955 – October 5, 2011) was an American business magnate, industrial designer, investor, and media proprietor. He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc., the chairman and majority shareholder of Pixar, a member of The Walt Disney Company's board of directors following its acquisition of Pixar, and the founder, chairman, and CEO of NeXT. Jobs is widely recognized as a pioneer of the personal computer revolution of the 1970s and 1980s, along with Apple co-founder Steve Wozniak."

In [39]:
text

"Steven Paul Jobs (/dʒɒbz/; February 24, 1955 – October 5, 2011) was an American business magnate, industrial designer, investor, and media proprietor. He was the chairman, chief executive officer (CEO), and co-founder of Apple Inc., the chairman and majority shareholder of Pixar, a member of The Walt Disney Company's board of directors following its acquisition of Pixar, and the founder, chairman, and CEO of NeXT. Jobs is widely recognized as a pioneer of the personal computer revolution of the 1970s and 1980s, along with Apple co-founder Steve Wozniak."

In [40]:
text_tokens=nltk.word_tokenize(text)
tokens_without_punc = [w for w in text_tokens if w.isalpha()]
tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
text_cleaned = [lem.lemmatize(t) for t in tokens_without_sw]

In [41]:
" ".join(text_cleaned)

'Steven Paul Jobs February October American business magnate industrial designer investor medium proprietor He chairman chief executive officer CEO Apple chairman majority shareholder Pixar member The Walt Disney Company board director following acquisition Pixar founder chairman CEO NeXT Jobs widely recognized pioneer personal computer revolution along Apple Steve Wozniak'

In [42]:
tagged = nltk.pos_tag(text_cleaned)

In [43]:
tagged

[('Steven', 'NNP'),
 ('Paul', 'NNP'),
 ('Jobs', 'NNP'),
 ('February', 'NNP'),
 ('October', 'NNP'),
 ('American', 'NNP'),
 ('business', 'NN'),
 ('magnate', 'NN'),
 ('industrial', 'JJ'),
 ('designer', 'NN'),
 ('investor', 'NN'),
 ('medium', 'NN'),
 ('proprietor', 'NN'),
 ('He', 'PRP'),
 ('chairman', 'NN'),
 ('chief', 'JJ'),
 ('executive', 'JJ'),
 ('officer', 'NN'),
 ('CEO', 'NNP'),
 ('Apple', 'NNP'),
 ('chairman', 'NN'),
 ('majority', 'NN'),
 ('shareholder', 'NN'),
 ('Pixar', 'NNP'),
 ('member', 'NN'),
 ('The', 'DT'),
 ('Walt', 'NNP'),
 ('Disney', 'NNP'),
 ('Company', 'NNP'),
 ('board', 'NN'),
 ('director', 'NN'),
 ('following', 'VBG'),
 ('acquisition', 'NN'),
 ('Pixar', 'NNP'),
 ('founder', 'NN'),
 ('chairman', 'NN'),
 ('CEO', 'NNP'),
 ('NeXT', 'NNP'),
 ('Jobs', 'NNP'),
 ('widely', 'RB'),
 ('recognized', 'VBD'),
 ('pioneer', 'NN'),
 ('personal', 'JJ'),
 ('computer', 'NN'),
 ('revolution', 'NN'),
 ('along', 'IN'),
 ('Apple', 'NNP'),
 ('Steve', 'NNP'),
 ('Wozniak', 'NNP')]

In [44]:
"""
CC     coordinating conjunction
CD     cardinal digit
DT     determiner
EX     existential there (like: "there is" ... think of it like "there exists")
FW     foreign word
IN     preposition/subordinating conjunction
JJ     adjective 'big'
JJR    adjective, comparative 'bigger'
JJS    adjective, superlative 'biggest'
LS     list marker 1)
MD     modal could, will
NN     noun, singular 'desk'
NNS    noun plural 'desks'
NNP    proper noun, singular 'Harrison'
NNPS   proper noun, plural 'Americans'
PDT    predeterminer 'all the kids'
POS    possessive ending parent's
PRP    personal pronoun I, he, she
PRP$   possessive pronoun my, his, hers
RB     adverb very, silently,
RBR    adverb, comparative better
RBS    adverb, superlative best
RP     particle give up
TO     to go 'to' the store.
UH     interjection errrrrrrrm
VB     verb, base form take
VBD    verb, past tense took
VBG    verb, gerund/present participle taking
VBN    verb, past participle taken
VBP    verb, sing. present, non-3d take
VBZ    verb, 3rd person sing. present takes
WDT    wh-determiner which
WP     wh-pronoun who, what
WP$    possessive wh-pronoun whose
WRB    wh-abverb where, when
"""

'\nCC     coordinating conjunction\nCD     cardinal digit\nDT     determiner\nEX     existential there (like: "there is" ... think of it like "there exists")\nFW     foreign word\nIN     preposition/subordinating conjunction\nJJ     adjective \'big\'\nJJR    adjective, comparative \'bigger\'\nJJS    adjective, superlative \'biggest\'\nLS     list marker 1)\nMD     modal could, will\nNN     noun, singular \'desk\'\nNNS    noun plural \'desks\'\nNNP    proper noun, singular \'Harrison\'\nNNPS   proper noun, plural \'Americans\'\nPDT    predeterminer \'all the kids\'\nPOS    possessive ending parent\'s\nPRP    personal pronoun I, he, she\nPRP$   possessive pronoun my, his, hers\nRB     adverb very, silently,\nRBR    adverb, comparative better\nRBS    adverb, superlative best\nRP     particle give up\nTO     to go \'to\' the store.\nUH     interjection errrrrrrrm\nVB     verb, base form take\nVBD    verb, past tense took\nVBG    verb, gerund/present participle taking\nVBN    verb, past par

**NER (Named Entity Regocnition)**

In [45]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping chunkers\maxent_ne_chunker.zip.
[nltk_data] Error with downloaded zip file
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [46]:
named_ent = nltk.ne_chunk(tagged)

LookupError: 
**********************************************************************
  Resource [93mmaxent_ne_chunker[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('maxent_ne_chunker')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mchunkers/maxent_ne_chunker/english_ace_multiclass.pickle[0m

  Searched in:
    - 'C:\\Users\\pc/nltk_data'
    - 'C:\\Program Files\\anaconda\\nltk_data'
    - 'C:\\Program Files\\anaconda\\share\\nltk_data'
    - 'C:\\Program Files\\anaconda\\lib\\nltk_data'
    - 'C:\\Users\\pc\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
print(named_ent)

In [None]:
named_ent.draw()

**word cloud**

In [None]:
#!pip install wordcloud

In [None]:
import matplotlib.pyplot as plt

In [None]:
from wordcloud import WordCloud

In [None]:
original = " ".join(corpus_4)

In [None]:
print(Counter(corpus_4).most_common(5))

In [None]:
wordcloud = WordCloud(background_color="white", max_words=1000)
wordcloud.generate(original)

plt.figure(figsize=(13,13))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()