In [1]:
# open the new dataset

import codecs, nltk

article = codecs.open("../datasets/CleanedArticles/15.txt","r","utf-8")
article = article.read()

# split into sentences
sentences = nltk.sent_tokenize(article) 

# take one single sentence 

sentence = sentences[1]

#tokenize it

tokenized_sentence = nltk.word_tokenize(sentence)

# you use the pos-tagger (it gives you back a list of tuples (word,pos))
pos_sentence = nltk.pos_tag(tokenized_sentence)

print (pos_sentence)

[('31', 'CD'), (',', ','), ('2017', 'CD'), ('In', 'IN'), ('Sydney', 'NNP'), (',', ','), ('rainbow', 'NN'), ('fireworks', 'NNS'), ('sparkled', 'VBD'), ('off', 'RP'), ('the', 'DT'), ('Harbour', 'NNP'), ('Bridge', 'NNP'), ('in', 'IN'), ('celebration', 'NN'), ('of', 'IN'), ('Australia', 'NNP'), ('’', 'NNP'), ('s', 'VBD'), ('recent', 'JJ'), ('legalization', 'NN'), ('of', 'IN'), ('gay', 'JJ'), ('marriage', 'NN'), ('.', '.')]


In [2]:
# combining lemmatization and pos tagging
from nltk.stem.wordnet import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

lemma_words = []

for word,pos in pos_sentence:
    
    # if it's a verb - then we tell that to the lemmatizer
    if pos[0] == "V":
        lemma = wordnet_lemmatizer.lemmatize(word,"v")
    else:
    # otherwise, work as usual
        lemma = wordnet_lemmatizer.lemmatize(word)
    # we append the results
    lemma_words.append(lemma)
    
print (lemma_words)

['31', ',', '2017', 'In', 'Sydney', ',', 'rainbow', 'firework', 'sparkle', 'off', 'the', 'Harbour', 'Bridge', 'in', 'celebration', 'of', 'Australia', '’', 's', 'recent', 'legalization', 'of', 'gay', 'marriage', '.']


In [3]:
# let's now define a function that does all we need
from nltk.corpus import stopwords

stop_word_list = stopwords.words('english')

# input should be a string
def nlp_pipeline(text):
    
    # if you want you can split in sentences - i'm usually skipping this step
    # text = nltk.sent_tokenize(text) 
    
    #tokenize words for each sentence
    text = nltk.word_tokenize(text)
    
    # pos tagger
    text = nltk.pos_tag(text)

    # lemmatizer
    text = [wordnet_lemmatizer.lemmatize(token,"v") if pos[0] == "V" else wordnet_lemmatizer.lemmatize(token) for token,pos in text]
    
    # remove punctuation and numbers
    text = [token for token in text if token.isalpha()]
    
    # remove stopwords - be careful with this step    
    text = [token for token in text if token not in stop_word_list]

    return text

In [4]:
print (sentence)
clean_sentence = nlp_pipeline(sentence)
print (clean_sentence)

31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage.
['In', 'Sydney', 'rainbow', 'firework', 'sparkle', 'Harbour', 'Bridge', 'celebration', 'Australia', 'recent', 'legalization', 'gay', 'marriage']


In [5]:
# let's take an entire article and use our pipeline!

clean_article = nlp_pipeline(article)
print (clean_article)

['Advertisement', 'By', 'JACEY', 'FORTINDEC', 'In', 'Sydney', 'rainbow', 'firework', 'sparkle', 'Harbour', 'Bridge', 'celebration', 'Australia', 'recent', 'legalization', 'gay', 'marriage', 'Sydney', 'among', 'first', 'major', 'city', 'celebrate', 'firework', 'stroke', 'midnight', 'In', 'Japan', 'people', 'parade', 'fox', 'mask', 'attend', 'first', 'prayer', 'year', 'Shinto', 'shrine', 'Tokyo', 'In', 'Philippines', 'reveler', 'gather', 'phone', 'hand', 'Eastwood', 'Mall', 'Manila', 'watch', 'balloon', 'confetti', 'rain', 'midnight', 'Big', 'pot', 'tea', 'prepare', 'New', 'Year', 'Eve', 'celebration', 'Beijing', 'The', 'country', 'also', 'celebrate', 'Lunar', 'New', 'Year', 'February', 'It', 'rain', 'Singapore', 'New', 'Year', 'Eve', 'celebrant', 'shelter', 'umbrella', 'raincoat', 'firework', 'sparkle', 'overhead', 'Tourists', 'party', 'hat', 'watch', 'firework', 'front', 'famous', 'Petronas', 'Twin', 'Towers', 'Kuala', 'Lumpur', 'Malaysia', 'Hundreds', 'couple', 'get', 'marry', 'mass',

In [8]:
# word sense disambiguation

# check documentation: http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn

# let's isolate each word - you do this using a set (another type of object in python)

unique_words = set(clean_article)
# let's check how many senses each word has
for word in unique_words:
    print (word, len(wn.synsets(word)))


couple 9
The 0
Lumpur 0
Harbour 6
gay 7
feedback 2
also 1
celebration 3
confetti 1
Kuala 0
firework 1
Tourists 1
think 14
New 12
Malaysia 1
Go 35
We 0
people 6
attend 5
reveler 1
Twin 9
It 1
Beijing 1
sparkle 7
Eve 4
mass 11
hat 4
midnight 1
famous 1
city 3
Page 9
phone 4
among 0
hand 16
tea 5
Manila 2
Tell 9
legalization 1
Advertisement 1
shrine 2
get 37
Tokyo 1
Hundreds 1
rain 4
celebrate 3
gather 11
fox 10
Towers 4
Japan 5
stroke 16
shelter 7
Home 17
rainbow 2
pot 10
Bridge 12
balloon 4
major 13
first 16
Sydney 1
In 7
recent 3
Petronas 0
Year 4
Singapore 3
celebrant 2
page 9
wedding 5
prayer 5
marry 2
Australia 2
Philippines 3
u 4
raincoat 1
prepare 8
year 4
By 2
watch 13
Mall 2
umbrella 4
interested 5
overhead 9
Jakarta 1
Big 17
Lunar 1
country 5
February 1
Eastwood 0
JACEY 0
FORTINDEC 0
party 6
Shinto 3
marriage 4
parade 5
front 13
mask 9


In [9]:
word = "cell"

senses = wn.synsets(word)

for sense in senses:
    # get definition of sense
    print ("\ndefinition")
    print(sense.definition())
    
    # get a textual example
    print ("\nexample")
    print(sense.examples())
    
    # get hypernymy
    print ("\nhypernymy")
    print(sense.hypernyms())

    # get hyponyms
    print ("\nhyponyms")
    print(sense.hyponyms())
        
    # this is a way of getting synonyms - there are others
    print ("\nsynonyms")
    print (sense.lemma_names())
    
    # this is for getting antonyms - works especially with adjectives 
    print ("\nantonyms")
    print (sense.lemmas()[0].antonyms())
    
    print ("\n\n")



definition
any small compartment

example
['the cells of a honeycomb']

hypernymy
[Synset('compartment.n.01')]

hyponyms
[]

synonyms
['cell']

antonyms
[]




definition
(biology) the basic structural and functional unit of all organisms; they may exist as independent units of life (as in monads) or may form colonies or tissues as in higher plants and animals

example
[]

hypernymy
[Synset('living_thing.n.01')]

hyponyms
[Synset('akaryocyte.n.01'), Synset('archespore.n.01'), Synset('arthrospore.n.01'), Synset('arthrospore.n.02'), Synset('beta_cell.n.01'), Synset('blastema.n.01'), Synset('blastomere.n.01'), Synset('daughter_cell.n.01'), Synset('embryonic_cell.n.01'), Synset('fiber.n.03'), Synset('flagellated_cell.n.01'), Synset('gametocyte.n.01'), Synset('kupffer's_cell.n.01'), Synset('leydig_cell.n.01'), Synset('mother_cell.n.01'), Synset('parthenote.n.01'), Synset('plant_cell.n.01'), Synset('polar_body.n.01'), Synset('recombinant.n.01'), Synset('reproductive_cell.n.01'), Synset('ser

In [10]:
# let's consider two sentences where "cell" is mentioned

sent1 = "The terrorist cell was neutralized near the southern Russian city of Makhachkala, the capital of the Republic of Dagestan."

sent2 = "The molecule, which uses light energy to move protons across a somatic cell membrane, proved unsuitable for crystallography."

# you clean the sentences using our pipeline
clean_sent1 = nlp_pipeline(sent1)

clean_sent2 = nlp_pipeline(sent2)

print ("clean sent 1:", clean_sent1)
print ("clean sent 2:", clean_sent2)
print (" ")

# for each possible sense of "cell" you can, for instance, check the overlap between the definition and the sentence

clean sent 1: ['The', 'terrorist', 'cell', 'neutralize', 'near', 'southern', 'Russian', 'city', 'Makhachkala', 'capital', 'Republic', 'Dagestan']
clean sent 2: ['The', 'molecule', 'use', 'light', 'energy', 'move', 'proton', 'across', 'somatic', 'cell', 'membrane', 'prove', 'unsuitable', 'crystallography']
 


In [None]:
word = "cell"

senses = wn.synsets(word)
–
for sense in senses:


In [None]:
# homework: find the best sense - implement your version of the Lesk algorithm: https://en.wikipedia.org/wiki/Lesk_algorithm


homework 2: get the json file with the tweets from Donald Trump and improve his vocabulary by changing his poor choice of adjectives with more sophisticated synonyms (e.g. "bad ratings on the Emmys last night" -> "substandard ratings on the Emmys last night") 
 
or

make his tweets nicer by changing adjectives with related antonyms (e.g. "bad ratings on the Emmys last night" -> "excellent ratings on the Emmys last night") 

to do you need to combine:

- text processing (POS tagging + WordNet)
- and to find a solution for knowing if a word is "more sophisticated" than another one
