In [None]:
# homework from lesson 4

import codecs, nltk, string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# one lemmatizer instance, created once; nlp_pipeline further down reuses it
wordnet_lemmatizer = WordNetLemmatizer()

# read the whole file and split it into rows (one per line); the `with` block
# makes sure the file handle is closed as soon as the content has been read
with codecs.open("dataset.tsv", "r", "utf-8") as dataset_file:
    dataset = dataset_file.read().strip().split("\n")

# row 50, tab-separated field 3 is the text analysed in this cell
article = dataset[50].split("\t")[3]

# split into sentences
sentences = nltk.sent_tokenize(article)

# tokenize the sentence at index 4 into words
sentence = nltk.word_tokenize(sentences[4])

# the pos-tagger gives you back a list of tuples (word, pos)
pos_sentence = nltk.pos_tag(sentence)

# lemmatize the lower-cased tokens: as verbs ("v") when the tag contains "V",
# with the lemmatizer's default part of speech otherwise
lemma_word = [
    wordnet_lemmatizer.lemmatize(token.lower(), "v")
    if "V" in pos
    else wordnet_lemmatizer.lemmatize(token.lower())
    for token, pos in pos_sentence
]

print(lemma_word)

Instead of writing a series of separate commands, as we have done so far, we can wrap them in a single function.

In [None]:
# module-level resources read by the pipeline function defined right below

# every character of string.punctuation, stored as a set
exclude = {char for char in string.punctuation}

# NLTK's English stopwords
stop_word_list = stopwords.words("english")

# input should be a string
def nlp_pipeline(text):
    """Turn a raw string into a list of cleaned, lemmatized tokens.

    Steps, in order: word tokenization, POS tagging, lemmatization of the
    lower-cased tokens (as verbs when the tag contains "V", with the
    lemmatizer's default part of speech otherwise), removal of punctuation
    and of every token that is not purely alphabetic, removal of stopwords.

    Sentence splitting is deliberately skipped: the text is tokenized as a
    whole.

    text: the input string.
    Returns the surviving lemmas as a list, in their original order.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(text))

    cleaned = []
    for token, pos in tagged_tokens:
        lowered = token.lower()
        if "V" in pos:
            lemma = wordnet_lemmatizer.lemmatize(lowered, "v")
        else:
            lemma = wordnet_lemmatizer.lemmatize(lowered)

        # remove punctuation and numbers
        if lemma in exclude or not lemma.isalpha():
            continue

        # remove stopwords - be careful with this step
        if lemma in stop_word_list:
            continue

        cleaned.append(lemma)

    return cleaned

In [None]:
# pick another row of the dataset (index 261) and keep its tab-separated field at index 3
row_fields = dataset[261].split("\t")
article = row_fields[3]

print(article)

In [None]:
# run the whole cleaning pipeline on the selected article and show the resulting tokens
clean_article = nlp_pipeline(text=article)
print(clean_article)

In [None]:
# word sense disambiguation
# documentation: http://www.nltk.org/howto/wordnet.html

from nltk.corpus import wordnet as wn

# a set (another type of object in python) keeps each distinct token of the cleaned article once
unique_words = set(clean_article)

# print every distinct word next to the number of WordNet synsets (senses) found for it
for word in unique_words:
    sense_count = len(wn.synsets(word))
    print(word, sense_count)


In [None]:
word = "cell"

# all WordNet synsets (senses) returned for the word
senses = wn.synsets(word)

for sense in senses:
    # definition of the sense
    gloss = sense.definition()
    print(word, gloss)

    # textual examples
    usage_examples = sense.examples()
    print(usage_examples)

    # hypernyms, then hyponyms
    broader = sense.hypernyms()
    print(broader)
    narrower = sense.hyponyms()
    print(narrower)

    # lemma names are one way of getting synonyms - there are others
    lemma_names = sense.lemma_names()
    print(lemma_names)

    # antonyms of the first lemma only - works especially with adjectives
    first_lemma = sense.lemmas()[0]
    print(first_lemma.antonyms())

    print(" ")


In [None]:
# finding the best sense
# two sentences that both mention "cell"

sent1 = "The terrorist cell was neutralized near the southern Russian city of Makhachkala, the capital of the Republic of Dagestan."
sent2 = "The molecule, which uses light energy to move protons across a somatic cell membrane, proved unsuitable for crystallography."

# clean both sentences with our pipeline
clean_sent1, clean_sent2 = nlp_pipeline(sent1), nlp_pipeline(sent2)

print("clean sent 1:", clean_sent1)
print("clean sent 2:", clean_sent2)
print(" ")

# for each possible sense of "cell" you check the overlap between the definition and the sentence



In [None]:
# token sets of the two cleaned sentences, built once and reused for every sense
sent1_tokens = set(clean_sent1)
sent2_tokens = set(clean_sent2)

for sense in senses:
    # definition of the sense, raw and cleaned with our pipeline
    definition = sense.definition()
    clean_definition = nlp_pipeline(definition)

    # words shared between each sentence and the cleaned definition
    inters_1 = sent1_tokens.intersection(clean_definition)
    inters_2 = sent2_tokens.intersection(clean_definition)

    print(definition)
    print("clean definition:", clean_definition)
    print("intersection with sent 1:", inters_1)
    print("intersection with sent 2:", inters_2)
    print(len(inters_1), len(inters_2))
    print(" ")

In [None]:
# preview for one sense only (the second one, when it exists): definition plus hypernym names
for sense in senses[1:2]:
    definition = sense.definition()
    print(definition)

    # first lemma name of every hypernym synset, with underscores turned into spaces
    hypernyms = []
    for hyper in sense.hypernyms():
        hypernyms.append(hyper.lemmas()[0].name().replace("_", " "))
    print(hypernyms)

    hypernyms = " ".join(hypernyms)

    print(definition + " " + hypernyms)

In [None]:
for sense in senses:
    # get definition of sense
    definition = sense.definition()

    # take all hypernyms, hyponyms and synonyms, replacing underscores with spaces;
    # for hypernyms and hyponyms only the first lemma name of each synset is used
    hypernyms = [hyper.lemmas()[0].name().replace("_", " ") for hyper in sense.hypernyms()]

    hyponyms = [hypon.lemmas()[0].name().replace("_", " ") for hypon in sense.hyponyms()]

    synonyms = [synon.replace("_", " ") for synon in sense.lemma_names()]

    # Join every piece with a single space. Concatenating the pieces with "+" alone
    # fuses the last word of one piece with the first word of the next one, which
    # produces tokens that can never match a word of the sentences.
    pieces = [definition] + list(sense.examples()) + hypernyms + hyponyms + synonyms
    sense_text = " ".join(pieces)

    # now you have a very long definition, which uses all these pieces of information
    clean_definition = nlp_pipeline(sense_text)

    # words shared between each cleaned sentence and the extended definition
    inters_1 = set(clean_sent1).intersection(clean_definition)
    inters_2 = set(clean_sent2).intersection(clean_definition)

    print("clean definition:", clean_definition)
    print("intersection with sent 1:", inters_1)
    print("intersection with sent 2:", inters_2)
    print(len(inters_1), len(inters_2))
    print(" ")

Homework: scrape recent tweets from Donald Trump and improve his vocabulary by replacing his poor choice of adjectives with more sophisticated synonyms (e.g. "bad ratings on the Emmys last night" -> "substandard ratings on the Emmys last night")
 
or

make his tweets nicer by replacing adjectives with their antonyms (e.g. "bad ratings on the Emmys last night" -> "excellent ratings on the Emmys last night")

To do this, you need to combine:

- the Twitter API
- text processing (POS tagging + WordNet)
- a way to decide whether one word is "more sophisticated" than another
