In [175]:
import numpy as np
import pandas as pd
import requests
import json
import seaborn as sns
from bs4 import BeautifulSoup
import random
import matplotlib.pyplot as plt
from collections import Counter
import ast
import sqlite3
import re
from urllib3.exceptions import MaxRetryError
import threading
import time
import timeit
import html2text
import traceback
import plotly.graph_objects as go
import tldextract
from urllib.parse import urlparse
import ssl
import importlib


from urllib import parse
from langdetect import detect
from requests.exceptions import ReadTimeout, TooManyRedirects, ConnectionError, ConnectTimeout,\
    InvalidSchema, InvalidURL
from langdetect.lang_detect_exception import LangDetectException
from urllib3.exceptions import ProtocolError
import urllib.request
import socket
import langid
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.sparql import return_sparql_query_results

import mechanicalsoup

In [176]:
wikidata_processed_db = sqlite3.connect('/mnt/data/group3/wikidata/part/wikidata_processed_apr.db')

sql_cursor = wikidata_processed_db.cursor()

def kill(conn):
    while True: 
        with open('iskill.txt','r') as f:
            time.sleep(1)
            if f.readline().strip() == 'yes':
                print('killed')
                conn.interrupt()
                break
                
th = threading.Thread(target=kill,args=[wikidata_processed_db])
th.start()

In [177]:
def get_entity(item_id):
    while True:
        try:
            entity = get_entity_dict_from_api(item_id)
            return entity
        except ConnectionError:
            #traceback.print_exc()
            continue
        except MaxRetryError:
            #traceback.print_exc()
            time.sleep(1)
        except LdiResponseNotOk:
            #traceback.print_exc()
            return 'deleted'

def get_label(item):
    if type(item) == str:        
        entity = get_entity(item)
        if entity == 'deleted':
            return entity
        labels = entity['labels']
    elif type(item) == dict:
        labels = item['labels']
    languages = ['en','fr','es','pt','pt-br','it','de']
    for l in languages:
        if l in labels:
            return labels[l]['value']
    return 'no-label'

def get_datatype(item):
    try:
        if type(item) == str:
            entity = get_entity(item)
            if entity == 'deleted':
                return entity
            datatype = entity['datatype']
        elif type(item) == dict:
            datatype = item['datatype']
        return datatype
    except KeyError:
        return 'none'

def get_claim_values_of(item, property_id):
    if type(item) == str:
        entity = get_entity(item)
        if entity == 'deleted':
            return entity
        claims = entity['claims']
    elif type(item) == dict:
        claims = item['claims']
    if property_id in claims:
        instance_of_claims = claims[property_id]
        return [i['mainsnak']['datavalue']['value']['id'] for i in instance_of_claims]
    else:
        return []
    
def aggregate_other(df, by, count_name='count', other_thr=1):
    df_c = df.copy()
    df_c = df_c[[by,count_name]]
    total_count = df_c[count_name].sum()
    df_c['per'] = df_c[count_name].apply(lambda x: 100*x/total_count)

    other_df_c = df_c[df_c['per'] < other_thr].sum()
    other_df_c[by] = 'other'

    df_c = df_c.append(
        other_df_c, ignore_index=True
    ).sort_values('per',ascending=False).reset_index(drop=True)
    df_c = df_c[df_c['per'] >= other_thr]

    return df_c

In [178]:
sql_cursor.execute(
    ''' select * from reference_nodes_to_urls; '''
)
reference_nodes_to_urls = pd.DataFrame(sql_cursor.fetchall())
reference_nodes_to_urls.columns = ['reference_id','url','url_type','is_inferred_from','stated_in','external_id_prop','external_id'
,'internal_urls','external_url','wikimedia_import_urls','retrieved','publication_date','ref_node']
reference_nodes_to_urls = reference_nodes_to_urls[['reference_id','url','url_type']]

reference_nodes_to_urls

Unnamed: 0,reference_id,url,url_type
0,fdf0b8f0849c8ebb730fdb94f346318d0e28c658,https://viaf.org/viaf/88147602427757640745,internal_url
1,8bc0812b630395efbc002320f3e413f0cd2d14c0,https://www.ebi.ac.uk/europepmc/webservices/re...,external_url
2,f2dc58329cd520af8bcafe68035e07dbcf90261a,,
3,b78281d9509c5d1ed955e612ad947d3ed54e5a1c,http://www.ebi.ac.uk/QuickGO/annotations?prote...,external_url
4,292ee644c9159a430884ec220199a3d4579ae846,https://echa.europa.eu/substance-information/-...,internal_url
...,...,...,...
599990,0e99b2e1281c49c195ea4efdcd12aebd02a903f1,https://maps.google.com/?cid=18437117778692745281,internal_url
599991,73b514bf01c621601385b9db56761f8e89dd5ac6,http://europepmc.org/abstract/MED/4883756,external_url
599992,589dc7e204d93b8a9abc51628db0abce86d1ad1a,https://viaf.org/viaf/15092262,internal_url
599993,d9a0abfb21b8b78a9efa5e439812978042589e86,https://www.ebi.ac.uk/europepmc/webservices/re...,external_url


In [179]:
from markdown import Markdown
from io import StringIO


def unmark_element(element, stream=None):
    if stream is None:
        stream = StringIO()
    if element.text:
        stream.write(element.text)
    for sub in element:
        unmark_element(sub, stream)
    if element.tail:
        stream.write(element.tail)
    return stream.getvalue()


# patching Markdown
Markdown.output_formats["plain"] = unmark_element
__md = Markdown(output_format="plain")
__md.stripTopLevelTags = False


def unmark(text):
    return __md.convert(text)

In [235]:
def html_to_markdown(html):
    text_maker = html2text.HTML2Text()
    text_maker.ignore_links = True
    text_maker.ignore_anchors = True
    text_maker.ignore_emphasis = True
    text_maker.images_to_alt = True
    text_maker.ignore_tables  = True
    text = text_maker.handle(html)
    return text

def clean_text_line_by_line(text, ch_join = '\n'):
    # text = soup.body.get_text()
    # break into lines and remove leading and trailing space on each
    lines = (line.strip() for line in text.splitlines())
    # break multi-headlines into a line each
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    # drop blank lines
    text = ch_join.join(chunk for chunk in chunks if chunk)
    return text

def get_text_p_tags(soup):
    p_tags = soup.find_all('p')
    text = [p.getText().strip() for p in p_tags if p.getText()]
    return '\n'.join(text)

In [542]:
def apply_manual_rules(text):
    # RULE: A line ending with a ':' followed by whitespaces and a newline is likely a continuing line and should be joined
    text = re.sub(
       r':\s*\n', 
       r': ', 
       text
    )
    # RULE: Remove [1] reference numbers
    text = re.sub(
        r'\[[0-9]+\]',
        '',
        text
    )
    # RULE: Correcting punctuation that is spaced to the left
    text = re.sub(
        r' ([.,:!?\\-])',
        r'\1',
        text
    )
    # RULE: Correcting punctuation that has no space to the right and next character is upper case
    # TODO: Include more cases, exclude emails and datetime
    text = re.sub(
        r'([.,:!?])([A-Z])',
        r'\1 \2',
        text
    )
    return text

In [1]:
from sentence_splitter import SentenceSplitter, split_text_into_sentences
splitter = SentenceSplitter(language='en')

from nltk.tokenize import sent_tokenize
import nltk

from boilerpy3 import extractors
# USING DIFFERENT EXTRACTORS MIGHT BE A GOOD THING FOR THE PAPER
#extractor = extractors.ArticleExtractor()# DOES NOT EXTRACT SMALL TEXT AREAS
#extractor = extractors.DefaultExtractor()# FAILED AT FINDAGRAVE
#extractor = extractors.CanolaExtractor() # MISSES SOME SMALL SENTENCES IN PARAGRAPHS
#extractor = extractors.ArticleSentencesExtractor() # DOES NOT EXTRACT ALL SENTENCES, FAILED AT WIKIPEDIA
#extractor = extractors.NumWordsRulesExtractor() # MISSES SOME SMALL SENTENCES IN PARAGRAPHS
#extractor = extractors.LargestContentExtractor()# DOES NOT EXTRACT ALL SENTENCES, FAILED AT WIKIPEDIA


import pysbd
seg = pysbd.Segmenter(language="en", clean=False)

import spacy
nlp = spacy.load("en_core_web_sm")

timeout = (10,60*2)
n=41
for i, row in reference_nodes_to_urls[:n].iterrows():
    print(i,end=' - ')
    reference_id = row[0]
    print(reference_id,end=' - ')
    url = row[1]
    if i == 0:
        reference_id = 'test'
        url = 'https://www.findagrave.com/memorial/221547219/larry-king'
    if url == 'None':
        print('no url')
    else:
        with requests.Session() as s:
            try:
                while 1:
                    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
                    resp = s.get(url=url, timeout=timeout, stream=True, verify=True, headers=headers)
                    if resp.status_code == 429:
                        time.sleep(int(resp.headers['Retry-After']))
                    else:
                        break
            except (TooManyRedirects) as e:
                print('TooManyRedirects')
                continue
            except (ReadTimeout, ConnectTimeout) as e:
                print('Timeout')
                continue
            except (ConnectionError) as e:
                print('ConnectionError')
                continue
            except InvalidSchema as e:                
                print('InvalidSchema')
                continue
            except (UnicodeError, InvalidURL) as e:                
                print('InvalidURL')
                continue
            except (Exception, KeyboardInterrupt) as e:
                #print(r,e)
                #traceback.print_exc()
                raise
        if 'application/json' in resp.headers['Content-Type']:
            print('JSON')
            continue
        html = resp.content
        soup = BeautifulSoup(html, "html")
        filter_element(soup.body)
        for script in soup(["script", "style"]):
            script.decompose()
        if soup.body == None:
            print('No body')
        else:
            #jusText
            #text = justext.justext(html, justext.get_stoplist("English"))
            #text = '\n'.join([paragraph.text for paragraph in text if not paragraph.is_boilerplate])
            
            #boilerpy3
            #html = soup.prettify()
            #text = extractor.get_content(html)
            
            #simple rules
            text = get_text_p_tags(soup)
            
            #all text
            #text = soup.body.getText() # NOT GETTING FROM THE WHOLE SOUP, JUST BODY TO AVOID TITLES
            
            text = clean_text_line_by_line(text, ch_join = ' ')
            text = apply_manual_rules(text)
            
            if not text:
                print('No extractable text')
                continue
            lang = langid.classify(text)
            if lang[0] != 'en':
                print('Not English, actually %s.' % lang[0])
                continue
            else:    
                with open('parsed_files/'+reference_id+'.txt','w+') as f:
                    # TODO Get statement nodes related to this reference here
                    # look at http://localhost:8889/notebooks/phd/Wikidata/Crowdsourcing/GenerateTaskSets.ipynb
                    
                    f.write('URL:'+url+'\n')
                    #f.write('Language:'+lang[0]+'\n')
                    f.write('ID:'+str(i)+'\n')

                    f.write('-'*20+'\n')
                    
                    #sents = sent_tokenize(text)
                    #sents = seg.segment(text)
                    #print(text+'\n')
                    text = nlp(text)
                    c = 0
                    #for sent in sents:
                    for sent in text.sents:
                        if sent.text:
                            #words = nltk.word_tokenize(sent)
                            #tags = nltk.pos_tag(words)
                            #verbs = [tag for tag in tags if tag[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']]
                            #print(tags)
                            #print('\n')
                            doc = nlp(sent.text)
                            good_words = []#[token for token in doc if token.pos_ in ['VERB']]
                            ents = [ent.text for ent in doc.ents]
                            verbs = [token for token in doc if token.tag_.startswith('V')]
                            subjs = [token for token in doc if token.dep_ in ['nsubj','nsubjpass']]
                            #print(doc.ents)
                            #print(sent.text)
                            #print([(token,token.pos_) for token in doc])
                            #print([(token,token.tag_) for token in doc])
                            #print([(token,token.dep_) for token in doc])
                            #print('\n')
                            if verbs and (ents or subjs): # TODO: MAYBE PUT AN 'and' FOR ALL?
                                f.write(sent.text.replace('\n',' ').strip()+'\n')
                                c+=1
                    if c == 0:
                        f.write('No good sentences')
                        print('No good sentences')
                    else:        
                        print('Written to file')

SyntaxError: invalid syntax (4014782792.py, line 83)

## TESTING BOILERPLATE REMOVAL THROUGH ENTITY/VERB DETECTION

In [321]:
import copy

In [551]:
with requests.Session() as s:
    url = 'http://www.ebi.ac.uk/QuickGO/annotations?protein=A0A2C9Y8V5&geneProductId=UniProtKB:A0A2C9Y8V5'
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}
    resp = s.get(url=url, timeout=(10,60*2), stream=True, verify=True, headers=headers)

soup = BeautifulSoup(resp.content)
for script in soup(["script", "style"]):
    script.decompose()

In [553]:
import bs4

def filter_string(s,debug):
    doc = nlp(s)
    ents = [ent.text for ent in doc.ents]
    verbs = [token for token in doc if token.tag_.startswith('V')]
    subjs = [token for token in doc if token.dep_ in ['nsubj','nsubjpass']]
    if verbs and (ents or subjs):
        if debug:
            print('OK:',s)
        return True
    else:
        if debug:
            print('NOT OK:',s)
        return False

def filter_element(e,pad=0,debug=False):
    if hasattr(e, 'children'):
        children = list(e.children)
        for ee in children:
            filter_element(ee,pad=pad+1,debug=debug)
    if debug:
        print('-'*pad,type(e),e.name,str(e))
    if isinstance(e,bs4.element.Comment):
        e.extract()
    elif isinstance(e,bs4.element.NavigableString):
        pass    
    elif e.name in ['a','span','i','b']:
        pass
    else:        
        if e.text is None or not filter_string(e.text.strip(), debug=debug):
            #if type(e) == bs4.element.Tag:
            if debug:
                print('decomposed!')
            e.decompose()
            #else:
            #    if debug:
            #        print('extracted!')
            #    e.extract()
    #if not hasattr(e, 'children') or len(list(e.children)) == 0:
        
soup1 = copy.copy(soup)
filter_element(soup1.body, pad=0, debug=True)
print(soup1.body.getText())

- <class 'bs4.element.NavigableString'> None  
-- <class 'bs4.element.NavigableString'> None  
--- <class 'bs4.element.NavigableString'> None  
----- <class 'bs4.element.NavigableString'> None Skip to main content
---- <class 'bs4.element.Tag'> a <a href="#content">Skip to main content</a>
--- <class 'bs4.element.Tag'> li <li><a href="#content">Skip to main content</a></li>
NOT OK: Skip to main content
decomposed!
--- <class 'bs4.element.NavigableString'> None  
----- <class 'bs4.element.NavigableString'> None Skip to local navigation
---- <class 'bs4.element.Tag'> a <a href="#local-nav">Skip to local navigation</a>
--- <class 'bs4.element.Tag'> li <li><a href="#local-nav">Skip to local navigation</a></li>
NOT OK: Skip to local navigation
decomposed!
--- <class 'bs4.element.NavigableString'> None  
----- <class 'bs4.element.NavigableString'> None Skip to EBI global navigation menu
---- <class 'bs4.element.Tag'> a <a href="#global-nav">Skip to EBI global navigation menu</a>
--- <class '

In [556]:
soup1.head.getText()

'     QuickGO                '

In [550]:
doc = nlp('Photo request sent successfully.')
print([(token,token.tag_) for token in doc])
print([(token,token.dep_) for token in doc])
print([ent for ent in doc.ents])

[(Photo, 'NN'), (request, 'NN'), (sent, 'VBN'), (successfully, 'RB'), (., '.')]
[(Photo, 'compound'), (request, 'nsubj'), (sent, 'ROOT'), (successfully, 'advmod'), (., 'punct')]
[]
