In [7]:
import codecs, nltk

# Read the whole article as UTF-8 text. The context manager closes the file
# handle as soon as the content has been read, and `article` is only ever
# bound to the text (never to the file object).
with codecs.open("../datasets/CleanedArticles/15.txt", "r", "utf-8") as article_file:
    article = article_file.read()

# split into sentences
sentences = nltk.sent_tokenize(article)


In [14]:
# NLTK named-entity recognition, run sentence by sentence

for sent in sentences:

    print(sent)

    # the NE chunker takes POS-tagged input, i.e. a list of (word, pos) tuples,
    # so each sentence is tokenized first and then passed through the POS tagger
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sent))

    # recognised entities come back as nltk.tree.Tree nodes;
    # everything that is not such a node is dropped
    entity_trees = []
    for node in nltk.ne_chunk(tagged_tokens):
        if type(node) == nltk.tree.Tree:
            entity_trees.append(node)

    print(entity_trees)
    print(" ")
    

Advertisement By JACEY FORTINDEC.
[Tree('ORGANIZATION', [('JACEY', 'NNP')])]
 
31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage.
[Tree('GPE', [('Sydney', 'NNP')]), Tree('ORGANIZATION', [('Harbour', 'NNP'), ('Bridge', 'NNP')]), Tree('GPE', [('Australia', 'NNP')])]
 
(Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.)
[Tree('PERSON', [('Sydney', 'NNP')])]
 
In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo.
[Tree('GPE', [('Japan', 'NNP')]), Tree('ORGANIZATION', [('Shinto', 'NNP')]), Tree('GPE', [('Tokyo', 'NNP')])]
 
In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight.
[Tree('ORGANIZATION', [('Philippines', 'NNPS')]), Tree('ORGANIZATION', [('Eastwood', 'NNP'), ('Mall', 'NNP')]), Tree('GPE', [('Manila', 'NNP')])]
 
B

In [15]:
# named entity recognition with Spacy

# a different (new) library for doing most of the things we have seen so far
# to install it: !pip install spacy
# to download a model: !python -m spacy download en_core_web_sm

import spacy

# Load the English model by its full package name: the bare "en" shortcut
# only resolves through spaCy v2 shortcut links and is rejected by spaCy v3+,
# whereas the package name works on both.
spacy_nlp_pipeline = spacy.load("en_core_web_sm")

In [17]:


for sent in sentences:
    print(sent)
    doc = spacy_nlp_pipeline(sent)

    # per-token attributes: surface text, lemma and part-of-speech tag
    for token in doc:
        tok, lemma, pos = token.text, token.lemma_, token.pos_
        #print (tok, lemma,pos)

    # entities detected in this sentence, printed as "<text> <label>"
    for ent in doc.ents:
        print(ent.text, str(ent.label_))
    print(" ")


Advertisement By JACEY FORTINDEC.
JACEY FORTINDEC PERSON
 
31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage.
31 CARDINAL
2017 DATE
Sydney GPE
the Harbour Bridge FAC
Australia GPE
 
(Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.)
Sydney GPE
first ORDINAL
midnight TIME
 
In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo.
Japan GPE
first ORDINAL
the year DATE
Shinto PERSON
Tokyo GPE
 
In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight.
Philippines GPE
the Eastwood Mall FAC
Manila GPE
midnight TIME
 
Big pots of tea were prepared for New Year’s Eve celebrations in Beijing.
New Year’s Eve EVENT
Beijing GPE
 
The country will also celebrate the Lunar New Year, in February.
the Lunar New Year EVENT
February DATE
 
I

In [None]:
# for installing tagme:
# !pip install tagme

In [30]:
import getpass
import os

import tagme

# Set the authorization token for subsequent calls.
# You can get your own token here: https://sobigdata.d4science.org/web/tagme/tagme-help
# The token is a credential, so it is read from the TAGME_GCUBE_TOKEN
# environment variable (falling back to an interactive prompt) instead of
# being stored in the notebook.
tagme.GCUBE_TOKEN = os.environ.get("TAGME_GCUBE_TOKEN") or getpass.getpass("TagMe GCUBE token: ")

# The article is written in English, so it must be annotated as English text
# (the previous lang="de" asked the service to treat it as German).
annotated_article = tagme.annotate(article, lang="en")


In [23]:
# display the full text of the article (this prints its content, not its type)
print(article)

Advertisement By JACEY FORTINDEC. 31, 2017  In Sydney, rainbow fireworks sparkled off the Harbour Bridge in celebration of Australia’s recent legalization of gay marriage. (Sydney was among the first major cities to celebrate with fireworks at the stroke of midnight.) In Japan, people paraded in fox masks to attend the first prayer of the year at a Shinto shrine in Tokyo. In the Philippines, revelers gathered — phones in hand — at the Eastwood Mall in Manila to watch balloons and confetti rain down at midnight. Big pots of tea were prepared for New Year’s Eve celebrations in Beijing. The country will also celebrate the Lunar New Year, in February. It was raining in Singapore, but New Year’s Eve celebrants sheltered under umbrellas and raincoats as fireworks sparkled overhead. Tourists donned party hats to watch fireworks in front of the famous Petronas Twin Towers in Kuala Lumpur, Malaysia. Hundreds of couples got married at a mass wedding in Jakarta on New Year’s Eve. We’re interested

In [25]:
# List the article's annotations, using 0.3 as the score threshold.
# (read the documentation to learn other commands)
min_score = 0.3
for annotation in annotated_article.get_annotations(min_score):
    print(annotation)

fireworks -> Fireworks (score: 0.31419679522514343)
stroke of midnight -> A Stroke of Midnight (score: 0.5)
Shinto shrine -> Shinto shrine (score: 0.5452408194541931)
Tokyo -> Tokyo (score: 0.313284307718277)
Philippines -> Philippines (score: 0.44611889123916626)
Eastwood Mall -> Eastwood City (score: 0.7888547778129578)
Manila -> Manila (score: 0.32812315225601196)
confetti -> Confetti (score: 0.3147100806236267)
New Year’s Eve -> New Year's Eve (score: 0.3361709415912628)
Beijing -> Beijing (score: 0.3367815911769867)
Singapore -> Singapore (score: 0.3805890381336212)
raincoats -> The Raincoats (score: 0.31521740555763245)
Petronas Twin Towers -> Petronas Towers (score: 0.5)
Kuala Lumpur -> Kuala Lumpur (score: 0.4240269362926483)
Malaysia -> Malaysia (score: 0.5643056631088257)
Jakarta -> Jakarta (score: 0.4368698298931122)
Home Page -> Google Search (score: 0.47574180364608765)


In [26]:
# quick test on a single sentence
sent = tagme.annotate("Yesterday I watched the debate between Clinton and Sanders.")

# show every annotation with a score higher than 0.1
for annotation in sent.get_annotations(0.1):
    print(annotation)

# why is it still making mistakes?

debate -> United States presidential election debates (score: 0.1599879115819931)
Clinton -> Bill Clinton (score: 0.22683759033679962)
Sanders -> Bernie Sanders (score: 0.19481973350048065)


In [None]:
# computing entity relatedness for several pairs of entity titles;
# each entry pairs the message to print with the two titles to compare
relatedness_queries = [
    ("Hillary and Bernie have a semantic relation of", ("Hillary Clinton", "Bernie Sanders")),
    ("Bill and Bernie have a semantic relation of", ("Bill Clinton", "Bernie Sanders")),
    ("Bill and Hillary have a semantic relation of", ("Bill Clinton", "Hillary Clinton")),
]

for message, title_pair in relatedness_queries:
    rels = tagme.relatedness_title(title_pair)
    # only one pair is sent per request, so its result is the first entry
    print(message, rels.relatedness[0].rel)

In [None]:
#homework: extract the most popular NERs and entities from this new dataset
# use the library that you prefer
# the file is organized this way, each line contains an article
#each line has a date, a title, a topic and the content divided by tabs
#so you have to split over tabs and take the 4th element for getting the article

import codecs, nltk
from collections import Counter

# i'm skipping the first line, which is the header
# (the context manager closes the file once its content has been read)
with codecs.open("../datasets/dataset.tsv", "r", "utf-8") as dataset_file:
    dataset = dataset_file.read().strip().split("\n")[1:]


entities = []

# Peek at the first row only, to see what a line of the file looks like.
# NOTE: this rebinds `article`, the name that holds the single-article text
# in the cells above.
for article in dataset:
    print (article)
    break
