In [63]:
import math

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk

# Download every NLTK resource the notebook relies on *before* anything tries
# to load it. Packages that are already present are reported as up-to-date.
NLTK_PACKAGES = [
    # general-purpose resources
    'stopwords', 'wordnet', 'punkt', 'omw-1.4',
    # corpora behind the nltk.book example texts
    'gutenberg', 'genesis', 'inaugural', 'nps_chat', 'treebank', 'webtext',
    # sentiment scores used by the SentiWordNet cell
    'sentiwordnet',
]
for package in NLTK_PACKAGES:
    nltk.download(package)

# nltk.book reads its corpora at import time, so this import has to come after
# the downloads above; on a fresh environment it fails if it runs first.
# The star import is kept because later cells use names such as text4.
from nltk.book import *

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package genesis to /root/nltk_data...
[nltk_data]   Package genesis is already up-to-date!
[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package nps_chat to /root/nltk_data...
[nltk_data]   Package nps_chat is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_d

True

WordNet is a premade hierarchical organization of nouns, verbs, adjectives and adverbs. In Python, WordNet can be imported as a library and used to process text by making it easy to convert raw strings into their respective synsets, as well as find definitions, short examples of the word in context, and relations to other words in WordNet.

In [5]:
# look up every WordNet synset that the word 'light' belongs to and display the list
light_synsets = wn.synsets('light')
light_synsets

[Synset('light.n.01'),
 Synset('light.n.02'),
 Synset('light.n.03'),
 Synset('luminosity.n.01'),
 Synset('light.n.05'),
 Synset('light.n.06'),
 Synset('light.n.07'),
 Synset('light.n.08'),
 Synset('light.n.09'),
 Synset('light.n.10'),
 Synset('sparkle.n.01'),
 Synset('light.n.12'),
 Synset('inner_light.n.01'),
 Synset('light.n.14'),
 Synset('lighter.n.02'),
 Synset('light.v.01'),
 Synset('light_up.v.05'),
 Synset('alight.v.01'),
 Synset('ignite.v.01'),
 Synset('fall.v.20'),
 Synset('unhorse.v.01'),
 Synset('light.a.01'),
 Synset('light.a.02'),
 Synset('light.a.03'),
 Synset('light.a.04'),
 Synset('light.a.05'),
 Synset('light.a.06'),
 Synset('unaccented.s.02'),
 Synset('light.s.08'),
 Synset('light.s.09'),
 Synset('clean.s.03'),
 Synset('light.s.11'),
 Synset('light.s.12'),
 Synset('light.a.13'),
 Synset('light.a.14'),
 Synset('faint.s.04'),
 Synset('light.s.16'),
 Synset('abstemious.s.02'),
 Synset('light.s.18'),
 Synset('light.s.19'),
 Synset('light.s.20'),
 Synset('idle.s.04'),
 Syn

In [10]:
# take the 'light.n.01' synset and show its definition, usage examples, and lemmas
light = wn.synset('light.n.01')
for detail in (light.definition(), light.examples(), light.lemmas()):
    print(detail)


# step function for closure(): maps a synset to its direct hypernyms
def hyper(s):
    return s.hypernyms()


# follow hypernym links upward from 'light.n.01' and display every synset reached
list(light.closure(hyper))

(physics) electromagnetic radiation that can produce a visual sensation
['the light was filtered through a soft glass window']
[Lemma('light.n.01.light'), Lemma('light.n.01.visible_light'), Lemma('light.n.01.visible_radiation')]


[Synset('actinic_radiation.n.01'),
 Synset('electromagnetic_radiation.n.01'),
 Synset('radiation.n.01'),
 Synset('energy.n.01'),
 Synset('physical_phenomenon.n.01'),
 Synset('natural_phenomenon.n.01'),
 Synset('phenomenon.n.01'),
 Synset('process.n.06'),
 Synset('physical_entity.n.01'),
 Synset('entity.n.01')]

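WordNet organizes nouns in a single, massive hierarchy with a single root node, 'entity.n.01'. Each child is a more specific kind of its parent, e.g. 'radiation.n.01' is a child of 'energy.n.01'. Synsets become more specific toward the bottom of the hierarchy; 'light.n.01' sits in the middle, with the hypernyms listed above it and more specific hyponyms (such as 'candlelight.n.01') below it.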

In [14]:
# collect the related synsets of 'light.n.01' by relation type, then print one line per relation
# (antonyms are defined on lemmas, so they are looked up on the first lemma of the synset)
relations = {
    'hypernyms: ': light.hypernyms(),
    'hyponyms: ': light.hyponyms(),
    'meronyms: ': light.part_meronyms(),
    'holonyms: ': light.part_holonyms(),
    'antonyms: ': light.lemmas()[0].antonyms(),
}
for label, related in relations.items():
    print(label, related)

hypernyms:  [Synset('actinic_radiation.n.01')]
hyponyms:  [Synset('beam.n.04'), Synset('candlelight.n.01'), Synset('corona.n.04'), Synset('counterglow.n.01'), Synset('daylight.n.02'), Synset('firelight.n.01'), Synset('fluorescence.n.01'), Synset('friar's_lantern.n.01'), Synset('gaslight.n.01'), Synset('glow.n.05'), Synset('half-light.n.01'), Synset('incandescence.n.01'), Synset('lamplight.n.01'), Synset('luminescence.n.01'), Synset('meteor.n.02'), Synset('moonlight.n.01'), Synset('radiance.n.01'), Synset('scintillation.n.01'), Synset('starlight.n.01'), Synset('streamer.n.01'), Synset('sunlight.n.01'), Synset('torchlight.n.01'), Synset('twilight.n.02')]
meronyms:  []
holonyms:  [Synset('electromagnetic_spectrum.n.01')]
antonyms:  []


In [15]:
# look up every WordNet synset that the word 'explode' belongs to and display the list
explode_synsets = wn.synsets('explode')
explode_synsets

[Synset('explode.v.01'),
 Synset('explode.v.02'),
 Synset('explode.v.03'),
 Synset('explode.v.04'),
 Synset('explode.v.05'),
 Synset('explode.v.06'),
 Synset('explode.v.07'),
 Synset('explode.v.08'),
 Synset('detonate.v.02'),
 Synset('explode.v.10')]

In [19]:
# take the 'explode.v.01' synset and show its definition, usage examples, and lemmas
explode = wn.synset('explode.v.01')
for detail in (explode.definition(), explode.examples(), explode.lemmas()):
    print(detail)


# step function for closure(): maps a synset to its direct hypernyms
def hyper(s):
    return s.hypernyms()


# follow hypernym links upward from 'explode.v.01' and display every synset reached
list(explode.closure(hyper))

cause to burst with a violent release of energy
['We exploded the nuclear bomb']
[Lemma('explode.v.01.explode'), Lemma('explode.v.01.detonate'), Lemma('explode.v.01.blow_up'), Lemma('explode.v.01.set_off')]


[Synset('change_integrity.v.01'), Synset('change.v.02')]

WordNet organizes verbs similarly to nouns, but with a few key differences. Verbs do not have a top level synset, and are therefore organized into many smaller trees. Verbs also tend to be less well connected than nouns.

In [26]:
# run morphy on several inflected forms of 'explode' and print what each one reduces to
for inflected in ('exploded', 'explodes', 'exploding'):
    print(wn.morphy(inflected))

explode
explode
explode


In [32]:
# two related noun senses to compare
oven = wn.synset('oven.n.01')
stove = wn.synset('stove.n.01')

# Wu-Palmer similarity between the two synsets
similarity = oven.wup_similarity(stove)
print(similarity)

# Lesk word-sense disambiguation: pick a synset for 'oven' given the words around it
sent = 'Dinner is in the oven'.split()
chosen_sense = lesk(sent, 'oven')
print(chosen_sense)

0.9166666666666666
Synset('oven.n.01')


The Wu-Palmer similarity metric correctly reported a high similarity between the nouns 'oven' and 'stove'. The Lesk algorithm correctly selected the 'oven.n.01' synset from the context 'Dinner is in the oven'.

SentiWordNet is used to associate sentiment scores with words in natural language processing. SentiWordNet could be used for sentiment analysis in marketing analytics to judge the performance of a social media post by comparing the overall sentiment of its comments to a baseline.

In [57]:
# gather every SentiWordNet entry for the emotionally charged word 'lost'
lost_entries = list(swn.senti_synsets('lost'))

# each entry prints as its synset name followed by its positive and negative scores
for entry in lost_entries:
    print(entry)

print("")
# a made-up sentence to score word by word
sent = 'that kitten is adorable'

# for every whitespace-separated word, print the scores of its first SentiWordNet entry
tokens = sent.split()
for word in tokens:
    word_entries = list(swn.senti_synsets(word))
    if not word_entries:
        # words with no SentiWordNet entry are skipped
        continue
    print(word_entries[0])

<doomed.n.01: PosScore=0.0 NegScore=0.0>
<lose.v.01: PosScore=0.0 NegScore=0.5>
<lose.v.02: PosScore=0.0 NegScore=0.5>
<lose.v.03: PosScore=0.0 NegScore=0.5>
<misplace.v.01: PosScore=0.0 NegScore=0.125>
<lose.v.05: PosScore=0.0 NegScore=0.125>
<lose.v.06: PosScore=0.0 NegScore=0.0>
<lose.v.07: PosScore=0.0 NegScore=0.125>
<lose.v.08: PosScore=0.0 NegScore=0.125>
<fall_back.v.04: PosScore=0.0 NegScore=0.0>
<miss.v.01: PosScore=0.0 NegScore=0.25>
<suffer.v.11: PosScore=0.0 NegScore=0.25>
<lost.a.01: PosScore=0.0 NegScore=0.75>
<confused.s.03: PosScore=0.0 NegScore=0.0>
<lost.a.03: PosScore=0.0 NegScore=0.625>
<lost.a.04: PosScore=0.0 NegScore=0.625>
<lost.s.05: PosScore=0.0 NegScore=0.5>
<lost.s.06: PosScore=0.125 NegScore=0.625>
<bemused.s.01: PosScore=0.125 NegScore=0.0>
<baffled.s.01: PosScore=0.0 NegScore=0.5>
<helpless.s.02: PosScore=0.0 NegScore=0.875>

<kitten.n.01: PosScore=0.0 NegScore=0.0>
<be.v.01: PosScore=0.25 NegScore=0.125>
<adorable.s.01: PosScore=0.5 NegScore=0.0>


SentiWordNet reported plausible polarities for the synsets of 'lost'. Additionally, SentiWordNet reported overall positive polarity for the sentence 'That kitten is adorable', which is expected. Sentiment analysis is useful in NLP applications in marketing and human-computer interfaces. Alexa/Google devices may be able to simplify complex responses into simple yes/no answers based on sentiment analysis.

Collocations are occurrences of two or more words that combine to form a meaning in a somewhat unique way. Key to this concept is that individual words in collocations cannot be replaced with synonyms to achieve the same meaning. For example, 'strong' in 'strong tea' cannot be replaced with 'muscular'.

In [70]:
# output collocations for text4
# collocations() prints the collocation list itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
text4.collocations()

# calculate mutual information for 'Old World': log2(P(x, y) / (P(x) * P(y)))
# NOTE(review): the probabilities below are normalized by the vocabulary size
# (number of distinct tokens), not by the total number of tokens, and
# str.count() matches substrings, so 'Old' and 'World' are also counted inside
# longer tokens. Kept as-is so the reported value is unchanged; confirm this is
# the intended estimate.
vocab = len(set(text4))
text = ' '.join(text4.tokens)
old_world = text.count('Old World') / vocab
old = text.count('Old') / vocab
world = text.count('World') / vocab
math.log2(old_world / (old * world))


United States; fellow citizens; years ago; four years; Federal
Government; General Government; American people; Vice President; God
bless; Chief Justice; one another; fellow Americans; Old World;
Almighty God; Fellow citizens; Chief Magistrate; every citizen; Indian
tribes; public debt; foreign nations
None
0.000997506234413965


8.983886091037398

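The result of the mutual information formula for 'Old World' is 8.98. Since it is positive, 'Old' and 'World' appear together more often than would be expected if they occurred independently, so 'Old World' is likely a collocation.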