In [160]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
from bs4 import BeautifulSoup
from bs4.element import Comment
import requests
import re
from copy import deepcopy
import psycopg2
import uuid
from datetime import datetime
from getpass import getpass
import json
# nlp = en_core_web_sm.load()

In [145]:
def url_to_string(url):
    res = requests.get(url)
    html = res.text
    soup = BeautifulSoup(html, 'html.parser')
    for script in soup(["script", "style", 'aside']):
        script.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

In [286]:
keywords = [
    'seized',
    'confiscation',
    'wildlife AND confiscation',
    'ivory',
    'ivory AND confiscation'
    'rhino',
    'pangolin',
    'elephant AND tusk',
    'rhino AND horn',
    'pangolin',
    'pangolin AND scale',
    'illegal AND wildlife AND trade',
    'wildlife AND trafficking',
    'hunting AND trophies',
    'hunting AND trophy',
    'endangered AND species',
    'poaching',
    'poaching AND turtle',
    'poaching AND tiger',
    'poaching AND pangolin',
    'poaching AND elephant',
    'poaching AND rhino'
]

articles = []
for keyword in keywords:
    response = requests.get('https://newsapi.org/v2/everything?q={query}&from=2019-10-09&sortBy=popularity&apiKey=cf360c62f2484be5a525532545283cc7'.format(query=keyword))
    json_response = json.loads(response.text)
    articles.extend(json_response['articles'])

In [193]:
def get_connection(password=None):
    if not password:
        password=str(getpass())

    conn = psycopg2.connect(database='zoohackathon', user='postgres', password=password)
    return conn
    

In [276]:
def insert_article(article):
    conn = get_connection(password=getpass())
    cursor = conn.cursor()

    id = str(uuid.uuid1())
    values = (
        id, 
        article['source']['name'], 
        article['author'], 
        article['title'], 
        article['description'], 
        article['url'],
        article['urlToImage'], 
        article['publishedAt'],
        article['content'], 
        datetime.now().isoformat()
    )
    cursor.execute("""
        INSERT INTO zoohackathon.public.articles(
            id, 
            source,
            author,
            title,
            description,
            link,
            link_to_image,
            publishedAt,
            content,
            loaded_at) 
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """, 
   values)
    
    conn.commit()
    
    return id
    

In [197]:
def insert_words(words):
    conn = get_connection(password=getpass())
    cursor= conn.cursor()
    cursor.executemany('INSERT INTO zoohackathon.public.words(id, content, label, article_id) VALUES(%s, %s, %s, %s);', words)
    conn.commit()

In [289]:
text = url_to_string('https://people.com/pets/endangered-baby-pangolin-rescued-poacher-released/')

In [290]:
nlp_text = nlp(text)

In [292]:
[(token.text, token.label_) for token in nlp_text.ents]

[('Baby Pangolin Rescued', 'PERSON'),
 ('Poacher', 'PERSON'),
 ('Released', 'PRODUCT'),
 ('Stars', 'PRODUCT'),
 ('Mickey', 'PERSON'),
 ('Disney                                                                                                                                                     Mickey-Shaped Foods',
  'ORG'),
 ('Disney                                                                                                    Our',
  'ORG'),
 ('Catch', 'PERSON'),
 ('Instagram                                                                                                                                               Hot Celeb Men Who Bared',
  'ORG'),
 ('Instagram                                                                                                ',
  'ORG'),
 ('Lifestyle                                                The Best Photos from Celebrity Vacations',
  'WORK_OF_ART'),
 ('Profile Menu Subscribe', 'PERSON'),
 ('Baby Pangolin Kept', 'PERSON'),
 ('Baby Pangolin Kept'

In [288]:
for article in articles:
    text = url_to_string(article['url'])
    nlp_text = nlp(text)
    
    article_id = insert_article(article=article)
    words = []
    for word in nlp_text.ents:
        words.append((str(uuid.uuid1()), word.text, word.label_, article_id))
        
    insert_words(tuple(words))


In [64]:
text_and_labels = [(x.text, x.label_) for x in nlp_text.ents]
text_and_labels[:5]

[('Excise Department - Press', 'ORG'),
 ('Opens', 'ORG'),
 ('Opens', 'ORG'),
 ('JavaScript', 'ORG'),
 ('Javascript', 'PERSON')]

In [33]:
labels = [x.label_ for x in nlp_text.ents]
Counter(labels)

Counter({'ORG': 24,
         'PERSON': 7,
         'CARDINAL': 7,
         'DATE': 11,
         'LOC': 5,
         'MONEY': 5,
         'GPE': 3,
         'NORP': 1})

In [52]:
texts = [x.text for x in nlp_text.ents]
Counter(texts)

Counter({'Excise Department - Press': 1,
         'Opens': 3,
         'JavaScript': 1,
         'Javascript': 1,
         'Hong Kong Customs': 2,
         'FEHD': 5,
         'Yangcheng Lake': 8,
         '25': 2,
         'October': 1,
         '2019': 1,
         'Kong Customs': 1,
         'the Food and Environmental Hygiene Department': 2,
         'October 21': 2,
         '19': 3,
         'about $1,300': 2,
         'Causeway Bay': 2,
         'Hong Kong': 2,
         'Customs': 7,
         'the Trade Descriptions Ordinance': 1,
         'Japan': 1,
         'TDO': 1,
         '500,000': 1,
         'five years': 1,
         '2545 6182': 1,
         'crimereport@customs.gov.hk': 1,
         'two': 1,
         'this month': 1,
         'about 1 500': 1,
         'about $30,000': 1,
         'the Import and Export Ordinance': 1,
         '$2 million': 1,
         'seven years': 1,
         'October 25': 1,
         '2010': 1,
         'Last revision date': 1})

In [35]:
tokens = [(token.text, token.pos_, token.dep_) for token in nlp_text]
print(tokens)

[(' ', 'SPACE', ''), ('Hong', 'PROPN', 'compound'), ('Kong', 'PROPN', 'nmod'), ('Customs', 'PROPN', 'nmod'), ('and', 'CCONJ', 'cc'), ('Excise', 'PROPN', 'conj'), ('Department', 'PROPN', 'compound'), ('-', 'PUNCT', 'punct'), ('Press', 'PROPN', 'compound'), ('Releases', 'PROPN', 'nsubj'), ('Skip', 'PROPN', 'ROOT'), ('to', 'ADP', 'prep'), ('main', 'ADJ', 'amod'), ('content', 'NOUN', 'pobj'), (' ', 'SPACE', ''), ('(', 'PUNCT', 'punct'), ('Opens', 'VERB', 'parataxis'), ('a', 'DET', 'det'), ('new', 'ADJ', 'amod'), ('window', 'NOUN', 'dobj'), (')', 'PUNCT', 'punct'), ('  ', 'SPACE', ''), ('(', 'PUNCT', 'punct'), ('Opens', 'VERB', 'parataxis'), ('a', 'DET', 'det'), ('new', 'ADJ', 'amod'), ('window', 'NOUN', 'dobj'), (')', 'PUNCT', 'punct'), (' ', 'SPACE', ''), ('Your', 'PRON', 'poss'), ('browser', 'NOUN', 'nsubj'), ('does', 'AUX', 'aux'), ('not', 'PART', 'neg'), ('support', 'VERB', 'ROOT'), ('JavaScript', 'PROPN', 'nsubjpass'), ('or', 'CCONJ', 'cc'), ('Javascript', 'PROPN', 'conj'), ('is', 'AU

In [27]:
sentences = [x for x in nlp_text.sents]
print(sentences[11])

Investigation by Customs revealed that the food premises concerned earlier purchased a batch of hairy crabs, for which the country of origin was Japan and issued with relevant health certificates, from a local wholesaler.


In [47]:
for sentence in sentences:
    displacy.render(nlp(str(sentence)), jupyter=True, style='ent', options={})

  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


  "__main__", mod_spec)


In [62]:
set([e.label_ for e in nlp_text.ents])

{'CARDINAL', 'DATE', 'GPE', 'LOC', 'MONEY', 'NORP', 'ORG', 'PERSON'}

In [63]:
[(e.text, e.label_) for e in nlp_text.ents if e.label_ == 'NORP']

[('crimereport@customs.gov.hk', 'NORP')]

In [86]:
for word in sentences[11]:
    word_with_label = nlp(word)
    print(word_with_label.label_)

TypeError: Argument 'string' has incorrect type (expected str, got spacy.tokens.token.Token)