## Install and import

In [12]:
# install nltk
!pip install nltk



In [2]:
# import nltk
import nltk

## Sentiment Analysis through nltk

In [13]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()

# a positive example
text1 = "I love thie movie!!!"

# a negative example
text2 = "I hate this movie!!!"

# a neutual example
text3 = "I watched a movie."

# use polarity_scores function to return the dictionary of values
dict1 = sia.polarity_scores(text1)
dict1

{'neg': 0.0, 'neu': 0.283, 'pos': 0.717, 'compound': 0.7249}

In [15]:
dict2 = sia.polarity_scores(text2)
dict2

{'neg': 0.696, 'neu': 0.304, 'pos': 0.0, 'compound': -0.6784}

In [16]:
dict3 = sia.polarity_scores(text3)
dict3

{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}

In [21]:
# usually, we use the compound score as the overall sentiment score for a text
print(f"Sentiment score for '{text1}' is {dict1['compound']}.")
print(f"Sentiment score for '{text2}' is {dict2['compound']}.")
print(f"Sentiment score for '{text3}' is {dict3['compound']}.")

Sentiment score for 'I love thie movie!!!' is 0.7249.
Sentiment score for 'I hate this movie!!!' is -0.6784.
Sentiment score for 'I watched a movie.' is 0.0.


In [31]:
# ulternatively, we could use the category that has the largest value to classify a text
print(f"'{text1}' is classified as {list(dict1.keys())[list(dict1.values()).index(max(list(dict1.values())[:-1]))]}.")
print(f"'{text2}' is classified as {list(dict2.keys())[list(dict2.values()).index(max(list(dict2.values())[:-1]))]}.")
print(f"'{text3}' is classified as {list(dict3.keys())[list(dict3.values()).index(max(list(dict3.values())[:-1]))]}.")

'I love thie movie!!!' is classified as pos.
'I hate this movie!!!' is classified as neg.
'I watched a movie.' is classified as neu.


## Datasets in nltk

In [19]:
# Install corpora datasets

# the code below will wake the nltk data downloader that is displayed in a seperate window;
# click the "download" button and wait for the download to complete;
# close the window to finish the execution of this function;
# should return True
nltk.download()

# the documentation for all datasets could be found here: https://www.nltk.org/nltk_data/

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [21]:
# list of all datasets names
import os
print(os.listdir(nltk.data.find("corpora")))

['abc', 'abc.zip', 'alpino', 'alpino.zip', 'biocreative_ppi', 'biocreative_ppi.zip', 'brown', 'brown.zip', 'brown_tei', 'brown_tei.zip', 'cess_cat', 'cess_cat.zip', 'cess_esp', 'cess_esp.zip', 'chat80', 'chat80.zip', 'city_database', 'city_database.zip', 'cmudict', 'cmudict.zip', 'comparative_sentences', 'comparative_sentences.zip', 'comtrans.zip', 'conll2000', 'conll2000.zip', 'conll2002', 'conll2002.zip', 'conll2007.zip', 'crubadan', 'crubadan.zip', 'dependency_treebank', 'dependency_treebank.zip', 'dolch', 'dolch.zip', 'europarl_raw', 'europarl_raw.zip', 'extended_omw', 'extended_omw.zip', 'floresta', 'floresta.zip', 'framenet_v15', 'framenet_v15.zip', 'framenet_v17', 'framenet_v17.zip', 'gazetteers', 'gazetteers.zip', 'genesis', 'genesis.zip', 'gutenberg', 'gutenberg.zip', 'ieer', 'ieer.zip', 'inaugural', 'inaugural.zip', 'indian', 'indian.zip', 'jeita.zip', 'kimmo', 'kimmo.zip', 'knbc.zip', 'lin_thesaurus', 'lin_thesaurus.zip', 'machado.zip', 'mac_morpho', 'mac_morpho.zip', 'masc_

In [5]:
# using "names" as an example
from nltk.corpus import names

# .words() returns a list of str
print(names.words()[:10])

['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale']


In [11]:
# using addtional inputs to get all male names
print(names.words('male.txt')[:10])

['Aamir', 'Aaron', 'Abbey', 'Abbie', 'Abbot', 'Abbott', 'Abby', 'Abdel', 'Abdul', 'Abdulkarim']


In [12]:
# using addtional inputs to get all female names
print(names.words('female.txt')[:10])

['Abagael', 'Abagail', 'Abbe', 'Abbey', 'Abbi', 'Abbie', 'Abby', 'Abigael', 'Abigail', 'Abigale']


In [22]:
# using gutenberg as other example
from nltk.corpus import gutenberg

# .fileids() returns a list of sub-file names
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [23]:
# get specific text data by passing the file name to the words function
gutenberg.words("shakespeare-hamlet.txt")

['[', 'The', 'Tragedie', 'of', 'Hamlet', 'by', ...]

## Tokenization through nltk

In [24]:
# import word_tokenize function from nltk
from nltk.tokenize import word_tokenize

In [36]:
paragraph = """Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data. 

The goal is a computer capable of 'understanding the contents of documents, including the contextual nuances of the language within them. 

The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. 

Challenges in natural language processing frequently involve speech recognition, natural-language understanding, and natural-language generation."""

tokens = word_tokenize(paragraph)
tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'linguistics',
 ',',
 'computer',
 'science',
 ',',
 'and',
 'artificial',
 'intelligence',
 'concerned',
 'with',
 'the',
 'interactions',
 'between',
 'computers',
 'and',
 'human',
 'language',
 ',',
 'in',
 'particular',
 'how',
 'to',
 'program',
 'computers',
 'to',
 'process',
 'and',
 'analyze',
 'large',
 'amounts',
 'of',
 'natural',
 'language',
 'data',
 '.',
 'The',
 'goal',
 'is',
 'a',
 'computer',
 'capable',
 'of',
 "'understanding",
 'the',
 'contents',
 'of',
 'documents',
 ',',
 'including',
 'the',
 'contextual',
 'nuances',
 'of',
 'the',
 'language',
 'within',
 'them',
 '.',
 'The',
 'technology',
 'can',
 'then',
 'accurately',
 'extract',
 'information',
 'and',
 'insights',
 'contained',
 'in',
 'the',
 'documents',
 'as',
 'well',
 'as',
 'categorize',
 'and',
 'organize',
 'the',
 'documents',
 'themselves',
 '.',
 'Challenges',
 'in',
 'natural',
 'language',
 'pr

In [37]:
# count frequency through freqdist
from nltk.probability import FreqDist
fd = FreqDist()
for token in tokens:
    fd[token.lower()] += 1
fd

FreqDist({'the': 8, ',': 6, 'and': 6, 'language': 5, 'of': 5, '.': 4, 'natural': 3, 'in': 3, 'documents': 3, 'processing': 2, ...})

In [38]:
# using freqdist instead of a python dictionary gives us more functionalities;
# such as finding the most common tokens through most_common() function
fd.most_common(10)

[('the', 8),
 (',', 6),
 ('and', 6),
 ('language', 5),
 ('of', 5),
 ('.', 4),
 ('natural', 3),
 ('in', 3),
 ('documents', 3),
 ('processing', 2)]

In [39]:
# blankline_tokenize
from nltk.tokenize import blankline_tokenize
lines = blankline_tokenize(paragraph)
lines[0]

'Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.'

## Grams in nltk

In [46]:
from nltk.util import bigrams, trigrams, ngrams

bigram = list(nltk.bigrams(tokens))
bigram[:5]

[('Natural', 'language'),
 ('language', 'processing'),
 ('processing', '('),
 ('(', 'NLP'),
 ('NLP', ')')]

In [47]:
trigram = list(nltk.trigrams(tokens))
trigram[:5]

[('Natural', 'language', 'processing'),
 ('language', 'processing', '('),
 ('processing', '(', 'NLP'),
 ('(', 'NLP', ')'),
 ('NLP', ')', 'is')]

In [43]:
ngram = list(nltk.ngrams(tokens, 5))
ngram[:5]

[('Natural', 'language', 'processing', '(', 'NLP'),
 ('language', 'processing', '(', 'NLP', ')'),
 ('processing', '(', 'NLP', ')', 'is'),
 ('(', 'NLP', ')', 'is', 'a'),
 ('NLP', ')', 'is', 'a', 'subfield')]

## Stemming through nltk

In [48]:
# stemming refers to the process of finding the root/normalized form of a word

# import porterstemmer
from nltk.stem import PorterStemmer

ps = PorterStemmer()
ps.stem("having")

'have'

In [51]:
# import lancasterstemmer
from nltk.stem import LancasterStemmer

ls = LancasterStemmer()
ls.stem("having")

'hav'

In [62]:
# import snowballstemmer
from nltk.stem import SnowballStemmer

# for snowball stemmer an language must be provided in lowercase as the input of the constructor
ss = SnowballStemmer("english")
ss.stem("having")

'have'

## Lemmatization through nltk

In [75]:
# lemmatization refers to the process of finding the root/normalized form of a word, beyond steamming and with fewer roots

# import lemmatizer
from nltk.stem import WordNetLemmatizer

lem = WordNetLemmatizer()
lem.lemmatize("feels")

'feel'

In [65]:
# difference between stemming and lemmatization
ss.stem("mice")

'mice'

In [66]:
lem.lemmatize("mice")

'mouse'

## Stopwords in nltk

In [78]:
from nltk.corpus import stopwords
stopwords.words("english")

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Part of speech through nltk

In [81]:
# import pos tag
from nltk import pos_tag
tags = pos_tag(tokens)
tags

[('Natural', 'JJ'),
 ('language', 'NN'),
 ('processing', 'NN'),
 ('(', '('),
 ('NLP', 'NNP'),
 (')', ')'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('subfield', 'NN'),
 ('of', 'IN'),
 ('linguistics', 'NNS'),
 (',', ','),
 ('computer', 'NN'),
 ('science', 'NN'),
 (',', ','),
 ('and', 'CC'),
 ('artificial', 'JJ'),
 ('intelligence', 'NN'),
 ('concerned', 'VBN'),
 ('with', 'IN'),
 ('the', 'DT'),
 ('interactions', 'NNS'),
 ('between', 'IN'),
 ('computers', 'NNS'),
 ('and', 'CC'),
 ('human', 'JJ'),
 ('language', 'NN'),
 (',', ','),
 ('in', 'IN'),
 ('particular', 'JJ'),
 ('how', 'WRB'),
 ('to', 'TO'),
 ('program', 'NN'),
 ('computers', 'NNS'),
 ('to', 'TO'),
 ('process', 'VB'),
 ('and', 'CC'),
 ('analyze', 'VB'),
 ('large', 'JJ'),
 ('amounts', 'NNS'),
 ('of', 'IN'),
 ('natural', 'JJ'),
 ('language', 'NN'),
 ('data', 'NNS'),
 ('.', '.'),
 ('The', 'DT'),
 ('goal', 'NN'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('computer', 'NN'),
 ('capable', 'NN'),
 ('of', 'IN'),
 ("'understanding", 'VBG'),
 ('the', 'DT'),
 ('co

In [None]:
# for all meanings of the tags, see https://stackoverflow.com/questions/15388831/what-are-all-possible-pos-tags-of-nltk

## Named entity recognition through nltk

In [90]:
# import ne chunk
from nltk import ne_chunk

t = "President Biden stepped out of the White House and gave a speech to the US Navy."
ne = ne_chunk(pos_tag(word_tokenize(t)))
print(ne)

(S
  President/NNP
  (PERSON Biden/NNP)
  stepped/VBD
  out/IN
  of/IN
  the/DT
  (FACILITY White/NNP House/NNP)
  and/CC
  gave/VBD
  a/DT
  speech/NN
  to/TO
  the/DT
  (ORGANIZATION US/NNP Navy/NNP)
  ./.)


## Chunking through nltk

In [93]:
# create a regular expression corresponds to the specific chunk you wish to find
re = r"NP: {<DT>?<JJ>*<NN>}"

# parse regular expression through regex parser
from nltk import RegexpParser
parsed = RegexpParser(re)

t = "The big cat ate the little mouse who was after fresh cheese."

# returns a syntex tree
chunks = parsed.parse(pos_tag(word_tokenize(t)))
chunks

# ignore the gs read error

The Ghostscript executable isn't found.
See http://web.mit.edu/ghostscript/www/Install.htm
If you're using a Mac, you can try installing
https://docs.brew.sh/Installation then `brew install ghostscript`


LookupError: 

Tree('S', [Tree('NP', [('The', 'DT'), ('big', 'JJ'), ('cat', 'NN')]), ('ate', 'VBD'), Tree('NP', [('the', 'DT'), ('little', 'JJ'), ('mouse', 'NN')]), ('who', 'WP'), ('was', 'VBD'), ('after', 'IN'), Tree('NP', [('fresh', 'JJ'), ('cheese', 'NN')]), ('.', '.')])