In [None]:
# Root folder of the pipeline: every stage reads from and writes to sub-folders of it
# (articles/, pre_processed/, spacy/, entities/, ...). Paths are built by plain string
# concatenation, so the trailing slash is required.
output_dir = './polar/'

In [None]:
import os, itertools, json

# Collect every raw article file: one sub-folder per entry of articles/, visited in sorted order.
article_paths = [
    folder + file_name
    for folder in [output_dir + 'articles/' + sub + '/' for sub in sorted(os.listdir(output_dir + 'articles/'))]
    for file_name in os.listdir(folder)
]

In [None]:
# Sanity check: how many article files were discovered under articles/.
print('Total number of articles:', len(article_paths))

In [None]:
def replace_special(text):
    """Normalise typographic quotes, backticks and dashes to plain ASCII.

    Backticks and curly single quotes become ', curly double quotes become "
    and en/em dashes become -. Every other character is returned unchanged.
    """
    ascii_equivalents = str.maketrans({
        '`': "'",
        '“': '"',
        '”': '"',
        '’': "'",
        '‘': "'",
        '–': "-",
        '—': "-",
    })

    return text.translate(ascii_equivalents)

def encode(text):
    """Return `text` encoded as ASCII bytes; characters outside ASCII are dropped."""
    ascii_bytes = text.encode("ascii", "ignore")
    return ascii_bytes
def decode(text):
    """Return the UTF-8 decoding of the bytes object `text`."""
    decoded = text.decode(encoding="utf-8")
    return decoded

def uncontract(text):
    """Expand common English contractions in `text` (e.g. "isn't" -> "is not").

    The rules are applied in the order listed below; each one is anchored on a
    word boundary and keeps the capitalisation of the matched word. Text that
    matches no rule is returned unchanged.

    Args:
        text: the string to expand.

    Returns:
        The string with the supported contractions written out in full.
    """
    # Imported locally: the notebook only imports `re` in a later cell, so a
    # module-level lookup would raise NameError if this function were called
    # before that cell has run.
    import re

    expansions = [
        (r"(\b)([Aa]re|[Cc]ould|[Dd]id|[Dd]oes|[Dd]o|[Hh]ad|[Hh]as|[Hh]ave|[Ii]s|[Mm]ight|[Mm]ust|[Ss]hould|[Ww]ere|[Ww]ould)n't", r"\1\2 not"),
        (r"(\b)([Hh]e|[Ii]|[Ss]he|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'ll", r"\1\2 will"),
        (r"(\b)([Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Yy]ou)'re", r"\1\2 are"),
        (r"(\b)([Ii]|[Ss]hould|[Tt]hey|[Ww]e|[Ww]hat|[Ww]ho|[Ww]ould|[Yy]ou)'ve", r"\1\2 have"),
        # Irregular forms that need their own rewrite.
        (r"(\b)([Cc]a)n't", r"\1\2n not"),
        (r"(\b)([Ii])'m", r"\1\2 am"),
        (r"(\b)([Ll]et)'s", r"\1\2 us"),
        (r"(\b)([Tt]here)'s", r"\1\2 is"),
        (r"(\b)([Ww])on't", r"\1\2ill not"),
        (r"(\b)([Ss])han't", r"\1\2hall not"),
        (r"(\b)([Yy])(?:'all|a'll)", r"\1\2ou all"),
    ]

    for pattern, replacement in expansions:
        text = re.sub(pattern, replacement, text)

    return text

def pipeline_func(text, func_list):
    """Feed `text` through every callable in `func_list`, in order, and return the final value."""
    result = text
    for step in func_list:
        result = step(result)
    return result

In [None]:
import gzip, json

def load_gzip(path, func=json.loads):
    """Read the gzip file at `path`, decode it as UTF-8 and return `func(text)`.

    `func` defaults to `json.loads`, i.e. the file is expected to hold JSON.
    """
    with gzip.open(path, 'r') as compressed:
        raw_bytes = compressed.read()

    return func(raw_bytes.decode('utf-8'))

def pre_process_article(path):
    """Clean one raw article and store it as gzip-compressed JSON.

    Reads the JSON article at `path`, normalises its 'text' (special
    characters replaced, contractions expanded, newlines turned into spaces)
    and writes {'uid', 'text'} to
    <output_dir>/pre_processed/<parent folder of path>/<uid>.json.gzip.

    Args:
        path: location of a raw article JSON file with 'uid' and 'text' keys.

    Returns:
        True once the file has been written.
    """
    with open(path, 'r') as f: article_obj = json.load(f)

    clean_text = pipeline_func(
        article_obj['text'],
        [replace_special, uncontract, lambda t: t.replace('\n', ' ')]
    )

    article_dict_str = json.dumps({
        'uid': article_obj['uid'],
        'text': clean_text
    })

    # Mirror the article's parent folder. A fixed index (previously [4]) depends
    # on how many components `output_dir` has and, with './polar/', selected the
    # file name instead of the folder; [-2] is what the later stages use too.
    output_folder = output_dir + 'pre_processed/' + path.split('/')[-2] + '/'
    output_file = output_folder + article_obj['uid'] + '.json.gzip'

    # exist_ok makes this safe when several pool workers create the same folder.
    os.makedirs(output_folder, exist_ok=True)
    with gzip.open(output_file, 'w') as f: f.write(article_dict_str.encode('utf-8'))

    return True

In [None]:
%%time

from tqdm import tqdm
import multiprocessing, re
from multiprocessing import Pool

# Clean every raw article in parallel, using one worker process per CPU core.
pool = Pool(multiprocessing.cpu_count())

# The loop body is empty on purpose: pre_process_article writes its result to disk,
# so iterating only drains the results and advances the tqdm progress bar.
for i in tqdm(
    pool.imap_unordered(
        pre_process_article,
        article_paths
    ),
    desc='Article Pre-processing',
    total=len(article_paths)
): pass

# No further tasks will be submitted; wait for the workers to exit.
pool.close()
pool.join()

In [None]:
import spacy 

import pickle, spacy
from spacy.vocab import Vocab

def save_spacy_obj(spacy_doc, store_path):
    """Serialise `spacy_doc` with its `to_bytes()` method and pickle those bytes to `store_path`."""
    serialized_doc = spacy_doc.to_bytes()
    with open(store_path, 'wb') as handle:
        pickle.dump(serialized_doc, handle, pickle.HIGHEST_PROTOCOL)
        
def save_spacy_vocab(store_path):
    """Pickle the byte serialisation of the global `spacy_nlp` pipeline's vocab to `store_path`."""
    serialized_vocab = spacy_nlp.vocab.to_bytes()
    with open(store_path, 'wb') as handle:
        pickle.dump(serialized_vocab, handle, pickle.HIGHEST_PROTOCOL)
    
def load_spacy_obj(load_path, vocab):
    """Rebuild a spaCy `Doc` from the pickled bytes stored at `load_path`, bound to `vocab`."""
    # NOTE: pickle.load executes arbitrary code; only load files this pipeline wrote itself.
    with open(load_path, 'rb') as handle:
        serialized_doc = pickle.load(handle)

    return Doc(vocab).from_bytes(serialized_doc)
    
def load_spacy_vocab(load_path):
    """Return a fresh `Vocab` populated from the pickled bytes stored at `load_path`."""
    with open(load_path, 'rb') as handle:
        serialized_vocab = pickle.load(handle)

    restored_vocab = Vocab()
    restored_vocab.from_bytes(serialized_vocab)

    return restored_vocab

In [None]:
# Ask spaCy to run on the GPU when one is available.
spacy.prefer_gpu()

# English pipeline shared as a global by the functions of this notebook
# (they read `spacy_nlp` and `spacy_nlp.vocab` directly).
spacy_nlp = spacy.load("en_core_web_trf")    

from spacy.tokens import Doc
from spacy.vocab import Vocab

# Print the pipeline components as a numbered list.
for i, c in enumerate(spacy_nlp.pipe_names): print('-', str(i + 1) + '.', c)

In [None]:
%%time

import os

# One inner list of file paths per directory visited by os.walk under pre_processed/.
pre_processed_paths = [
    [folder + '/' + file_name for file_name in file_names]
    for folder, _sub_folders, file_names in os.walk(output_dir + 'pre_processed')
]

In [None]:
# Flatten the per-folder lists into a single list of file paths.
# NOTE: run this cell only once per walk; flattening twice would split the paths into characters.
pre_processed_paths = [file_path for folder_files in pre_processed_paths for file_path in folder_files]

print('Number of pre-processed:', len(pre_processed_paths))

In [None]:
def spacy_to_disk(path):
    """Parse one pre-processed article with spaCy and pickle the resulting Doc.

    The Doc is written to <output_dir>/spacy/<parent folder of path>/<uid>.pckl.

    Args:
        path: location of a gzip-compressed JSON article with 'uid' and 'text'.

    Returns:
        None when the article text is empty (nothing is written), True otherwise.
    """
    article_obj = load_gzip(path, func=json.loads)
    if len(article_obj['text'].strip()) == 0: return None

    doc = spacy_nlp(article_obj['text'])
    # Clear the transformer data attached to the Doc before it is serialised
    # (presumably to keep the stored files small -- TODO confirm).
    doc._.trf_data = None

    torch.cuda.empty_cache()

    # Mirror the input's parent folder. A fixed index (previously [4]) depends on
    # the depth of `output_dir` and, with './polar/', selected the file name
    # instead of the folder; [-2] matches what the later stages use.
    output_folder = output_dir + 'spacy/' + path.split('/')[-2] + '/'

    os.makedirs(output_folder, exist_ok=True)
    save_spacy_obj(doc, output_folder + '{}.pckl'.format(article_obj['uid']))

    return True

In [None]:
%%time

import torch
from tqdm import tqdm

# Parse the pre-processed articles one after the other. A failing article is
# reported (path + exception) and the loop carries on with the next one.
for pp_path in tqdm(pre_processed_paths): 
    try: spacy_to_disk(pp_path)    
    except Exception as ex:
        print('Path:', pp_path)
        print('Exception:', ex)
        print()

In [None]:
import os, itertools, json

# Collect every pickled spaCy Doc: one sub-folder per entry of spacy/, visited in sorted order.
spacy_paths = [
    folder + file_name
    for folder in [output_dir + 'spacy/' + sub + '/' for sub in sorted(os.listdir(output_dir + 'spacy/'))]
    for file_name in os.listdir(folder)
]

In [None]:
# Sanity check: how many parsed documents were found under spacy/.
print('Number of SpaCy articles:', len(spacy_paths))

Before executing `get_dbpedia_entities`, start the containerized `Spotlight` service:

`$ docker run -tid --restart unless-stopped --name dbpedia-spotlight.en --mount source=spotlight-model,target=/opt/spotlight -p 2222:80 dbpedia/dbpedia-spotlight spotlight.sh en`

In [None]:
import requests, time, spotlight
from spotlight import SpotlightException

def get_dbpedia_entities(text, confidence = 0.45, throttle = 0.250):
    """Annotate `text` with the local DBpedia Spotlight service.

    Args:
        text: the text to annotate (converted with `str`).
        confidence: confidence value sent to the service.
        throttle: seconds to pause after the request so that repeated calls do
            not hammer the service; pass 0 to disable. The pause used to sit
            after the `return` statement and therefore never ran.

    Returns:
        A list with one dict per resource returned by the service ('begin',
        'end', 'title', 'score', 'text', 'types', 'wikid', 'dbpedia'), or an
        empty list when the response has no 'Resources' entry.
    """
    req_data = {'lang': 'en', 'text': str(text), 'confidence': confidence, 'types': ['']}
    response = requests.post('http://127.0.0.1:2222/rest/annotate', data=req_data, headers = {"Accept": "application/json"})

    # Parse the body once instead of once per access.
    payload = response.json()

    if throttle: time.sleep(throttle)

    if 'Resources' not in payload: return []

    # Drop the first character of every key (the code below reads 'offset',
    # 'URI', ... so the service evidently prefixes each key with one character).
    spot_entities = [{k[1:]:v for k,v in r.items()} for r in payload['Resources']]

    return [{
        'begin': int(e['offset']),
        'end': int(e['offset']) + len(e['surfaceForm']),
        'title': e['URI'],
        'score': float(e['similarityScore']),
        'text': e['surfaceForm'],
        # Keep only the Wikidata types, without their 'Wikidata:' prefix.
        'types': [t.replace('Wikidata:', '') for t in e['types'].split(',') if 'Wikidata' in t],
        'wikid': e['URI'],
        'dbpedia': e['URI']
    } for e in spot_entities]

For additional accuracy, use the d4science `WAT` service. Sign up to get your own `GCUBE_TOKEN` key.

In [None]:
import requests, time

# Read the WAT API token from the GCUBE_TOKEN environment variable so that the real
# key never has to be written into the notebook; the placeholder remains the fallback.
GCUBE_TOKEN = os.environ.get('GCUBE_TOKEN', 'GCUBE_TOKEN_API')

def get_wat_entities(text, confidence = 0.2, throttle = 0.750):
    """Annotate `text` with the d4science WAT entity-linking service.

    Args:
        text: the text to annotate.
        confidence: minimum 'rho' an annotation needs to be returned.
        throttle: seconds to pause after the request so that repeated calls do
            not flood the remote service; pass 0 to disable. The pause used to
            sit after the `return` statement and therefore never ran.

    Returns:
        A list with one dict per kept annotation ('begin', 'end', 'title',
        'score', 'text', 'wikid', 'dbpedia').

    Raises:
        Exception: when the response carries no 'annotations' entry.
    """
    response = requests.get('https://wat.d4science.org/wat/tag/tag', params={
        'lang': 'en',
        'gcube-token': GCUBE_TOKEN,
        'text': text
    })

    wat_return = response.json()

    if throttle: time.sleep(throttle)

    if 'annotations' not in wat_return: raise Exception('No annotations have been found.')

    return [{
        'begin': wats['start'],
        'end': wats['end'],
        'title': wats['title'],
        'score': wats['rho'],
        'text': wats['spot'],
        'wikid': wats['id'],
        'dbpedia': 'http://dbpedia.org/resource/' + wats['title']
    } for wats in wat_return['annotations'] if wats['rho'] >= confidence]

In [None]:
def get_mention_resources(e_list):
    """Link the mentions of one article with both WAT and DBpedia Spotlight.

    The mentions are joined with ' and ' and sent as a single text to each
    service. The WAT request is retried once after a ProxyError; when the retry
    fails as well, the WAT result is replaced by the marker
    {'message': 'ProxyError', 'input': <joined text>}.

    Returns:
        None when the joined text is empty, otherwise the tuple
        (wat, dbpedia) of dicts mapping mention text -> list of DBpedia URIs.
    """
    wat_mentions = defaultdict(lambda: [])
    dbpedia_mentions = defaultdict(lambda: [])

    joined_mentions = ' and '.join(e_list)
    if not joined_mentions: return None

    # Two attempts in total; pause before the second one.
    for attempts_left in (1, 0):
        try:
            for entity in get_wat_entities(joined_mentions, confidence=0.5):
                wat_mentions[entity['text']].append(entity['dbpedia'])
            break
        except ProxyError:
            if attempts_left:
                time.sleep(0.750)
            else:
                wat_mentions = {
                    'message': 'ProxyError',
                    'input': joined_mentions
                }

    for entity in get_dbpedia_entities(joined_mentions, confidence=0.45):
        dbpedia_mentions[entity['text']].append(entity['dbpedia'])

    return (dict(wat_mentions), dict(dbpedia_mentions))

In [None]:
# NER labels kept by export_entity_pairs and discover_entities; entities carrying any other label are ignored.
ner_types = ['PERSON', 'LOC', 'NORP', 'ORG', 'GPE', 'PRODUCT', 'EVENT', 'WORK_OF_ART', 'LAW']

In [None]:
def export_entity_pairs(path):
    """Return the text of every named entity found inside a noun chunk of the Doc stored at `path`.

    Only entities whose label is listed in `ner_types` are kept; the texts are
    returned in document order and may contain duplicates.
    """
    doc = load_spacy_obj(path, spacy_nlp.vocab)

    return [
        entity.text
        for chunk in doc.noun_chunks
        for entity in chunk.ents
        if entity.label_ in ner_types
    ]

In [None]:
# Switch tokenizer parallelism off before the worker pools below are created
# (presumably to avoid problems in forked worker processes -- TODO confirm).
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

In [None]:
import json, jsonpickle

from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool

# Keep 32 cores free on large machines, but never request fewer than one worker:
# Pool raises ValueError for a process count below 1, which is what
# `cpu_count() - 32` yields on any machine with 32 cores or fewer.
pool = Pool(max(1, multiprocessing.cpu_count() - 32))

# One list of entity mentions per article, in completion order.
ner_entries = []

for e_np_ents in tqdm(
    pool.imap_unordered(export_entity_pairs, spacy_paths),
    desc='Loading Entities',
    total=len(spacy_paths)
):
    ner_entries.append(e_np_ents)

pool.close()
pool.join()

In [None]:
import multiprocessing
from multiprocessing import Pool
from collections import defaultdict
from requests.exceptions import ProxyError

# Query the entity-linking services with 16 concurrent workers.
pool = Pool(16)

# Per-article results, kept as two parallel lists (same index = same article).
wat_mentions = []
dbpedia_mentions = []

for wd in tqdm(
    pool.imap_unordered(get_mention_resources, ner_entries),
    desc='Fetching Entity Mentions',
    total=len(ner_entries)
): 
    # get_mention_resources returns None for articles without any mention.
    if not wd: continue
    wat_mentions.append(wd[0])
    dbpedia_mentions.append(wd[1])

pool.close()
pool.join()

In [None]:
# Inputs whose WAT lookup failed: get_mention_resources marks those with a
# {'message': 'ProxyError', 'input': <text>} dict instead of a mention mapping.
# NOTE(review): a genuine mention whose text is 'message' would be mistaken for such a marker.
wat_fails = [w['input'] for w in wat_mentions if 'message' in w]

print('Proxy Errors:', len(wat_fails))

In [None]:
# mention text -> every URI any article resolved it to (duplicates kept: they act as votes later on).
wat_mentions_freq = defaultdict(lambda: [])
dbpedia_mentions_freq = defaultdict(lambda: [])

for wm in wat_mentions:
    for w, m in wm.items():
        # Failed WAT lookups are stored as {'message': 'ProxyError', 'input': <text>};
        # their values are strings, and `list += str` would add them character by
        # character under the keys 'message' and 'input'. Only merge real URI lists.
        if isinstance(m, list): wat_mentions_freq[w] += m

for dm in dbpedia_mentions:
    for d, m in dm.items():
        if isinstance(m, list): dbpedia_mentions_freq[d] += m

In [None]:
# Every mention of every article, flattened into one list ...
entity_list = [mention for article_mentions in ner_entries for mention in article_mentions]

# ... and the distinct mention texts (kept as a list, in arbitrary order).
entity_set = list(set(entity_list))

In [None]:
# mention text -> candidate URIs: the WAT candidates first (with 'dbpedia' rewritten
# to 'wat' in the URI), followed by the Spotlight candidates.
entity_linking_dict = {}

for e in tqdm(entity_set):
    entity_linking_dict[e] = [r.replace('dbpedia', 'wat') for r in wat_mentions_freq[e]] + dbpedia_mentions_freq[e]

In [None]:
# Persist the mention -> candidate URIs mapping next to the other pipeline outputs.
with open(output_dir + 'entity_linking_dict.pckl', 'wb') as f:
    pickle.dump(entity_linking_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from collections import Counter

def discover_entities(path):
    """Link the named entities of one parsed article and store them as JSON.

    For every entity of the Doc at `path` whose label is in `ner_types` and
    whose text has candidates in `entity_linking_dict`, the most frequent
    candidate URI is chosen; its frequency is stored as the 'score'. The result
    is written to <output_dir>/entities/<parent folder>/<uid>.json.

    Returns:
        None when the output file already exists (the article is skipped),
        True once the file has been written.
    """
    path_parts = path.split('/')
    domain_folder = path_parts[-2]
    uid = path_parts[-1].replace('.pckl', '')

    output_folder = output_dir + 'entities/' + domain_folder + '/'
    output_file = output_folder + uid + '.json'

    # Already processed in an earlier run.
    if os.path.exists(output_file): return None

    spacy_doc = load_spacy_obj(path, spacy_nlp.vocab)

    linked_entities = []

    for ent in spacy_doc.ents:

        if ent.label_ not in ner_types: continue

        candidate_uris = entity_linking_dict.get(ent.text)
        if not candidate_uris: continue

        top_uri, votes = Counter(candidate_uris).most_common(1)[0]

        linked_entities.append({
            'begin': int(ent.start_char),
            'end': int(ent.end_char),
            'title': top_uri,
            'score': votes,
            'text': ent.text,
            'types': [ent.label_],
            'wikid': top_uri,
            'dbpedia': top_uri
        })

    if not os.path.exists(output_folder): os.makedirs(output_folder, exist_ok=True)
    with open(output_file, 'w') as f:
        f.write(json.dumps({'uid': uid, 'entities': linked_entities}))

    return True

In [None]:
%%time

from tqdm import tqdm
from multiprocessing import Pool
import multiprocessing, re, os, json

# Leave two cores free, but never request fewer than one worker: Pool raises
# ValueError for a process count below 1 (machines with one or two cores).
pool = Pool(max(1, multiprocessing.cpu_count() - 2))

# The loop body is empty on purpose: discover_entities writes to disk and the
# iteration only drives the progress bar.
for i in tqdm(
    pool.imap_unordered(discover_entities, spacy_paths),
    desc='Bringing Articles to the Spotlight',
    total=len(spacy_paths)
): pass

pool.close()
pool.join()

In [None]:
# Collect every entity JSON file: one sub-folder per entry of entities/, visited in sorted order.
entity_paths = [
    folder + file_name
    for folder in [output_dir + 'entities/' + sub + '/' for sub in sorted(os.listdir(output_dir + 'entities/'))]
    for file_name in os.listdir(folder)
]

In [None]:
def sentence_token_indices(spacy_document):
    """Map each token's document-level index (`token.i`) to its 0-based position inside its sentence."""
    return {
        token.i: position
        for sentence in spacy_document.sents
        for position, token in enumerate(sentence)
    }

from collections import defaultdict

def assign_entity_2_span(span, entities):
    """Return the first entity whose [begin, end) range shares a position with `span`, else None.

    `span` is a (start, end) pair; both ranges are half-open.
    """
    span_positions = set(range(span[0], span[1]))

    for candidate in entities:
        if span_positions.intersection(range(candidate['begin'], candidate['end'])):
            return candidate

    return None

For `coreference resolution`, start the `Neuralcoref Service` from here: https://github.com/dpasch01/neuralcoref-service

To start it, execute `$ docker-compose up`.

In [None]:
import requests

def get_coreferences(text, host='http://127.0.0.1:8150/coref'):
    """POST `text` to the coreference service at `host` and return the decoded JSON response."""
    response = requests.post(host, data={'text': text})
    return response.json()

In [None]:
def get_coref_clusters(spacy_document, coref_annotations, entity_list):
    """Attach linked entities to the mentions of every coreference cluster.

    Each cluster of `coref_annotations['coref']` becomes a dict with a 'main'
    mention and a 'chain' of mentions; every mention carries its 'start',
    'end' and 'text' plus the first entity of `entity_list` that overlaps it
    (or None). `spacy_document` is accepted but not used.
    """
    def entity_for(mention):
        return assign_entity_2_span((mention['start'], mention['end']), entity_list)

    return [
        {
            'chain': [
                {'start': m['start'], 'end': m['end'], 'entity': entity_for(m), 'text': m['text']}
                for m in cluster['chain']
            ],
            'main': {
                'start': cluster['main']['start'],
                'end': cluster['main']['end'],
                'text': cluster['main']['text'],
                'entity': entity_for(cluster['main']),
            },
        }
        for cluster in coref_annotations['coref']
    ]

In [None]:
from collections import Counter
import numpy as np

def get_coref_main(coref_cluster):
    """Pick the entity title that best represents a coreference cluster.

    The title linked to the largest number of chain mentions wins. When
    several titles share that maximum, the one with the highest mean entity
    score among the tied titles is returned. The tie-break used to scan every
    title of the cluster, so a less frequent title with a higher score could
    win.

    Args:
        coref_cluster: dict with a 'chain' list whose items hold an 'entity'
            (a dict with 'title' and 'score', or a falsy value when unlinked).

    Returns:
        The chosen title, or None when no mention is linked to an entity (or,
        in a tie, when no tied title has a mean score above 0).
    """
    # title -> scores of the mentions linked to it; the frequency is the list length.
    scores_by_title = defaultdict(list)

    for mention in coref_cluster['chain']:
        entity = mention['entity']
        if not entity: continue
        scores_by_title[entity['title']].append(entity['score'])

    if not scores_by_title: return None

    max_freq = max(len(scores) for scores in scores_by_title.values())
    candidates = [title for title, scores in scores_by_title.items() if len(scores) == max_freq]

    if len(candidates) == 1: return candidates[0]

    # Tie on frequency: the highest mean score wins, the first title seen wins on equal means.
    max_score, max_title = 0, None

    for title in candidates:
        score = np.mean(scores_by_title[title])
        if score > max_score:
            max_score, max_title = score, title

    return max_title

In [None]:
from collections import Counter
import numpy as np

def _char_spans_overlap(a_begin, a_end, b_begin, b_end):
    """Return True when the half-open spans [a_begin, a_end) and [b_begin, b_end) share a position."""
    return max(a_begin, b_begin) < min(a_end, b_end)


def transform_to_core_nlp_obj(spacy_document, document_coreferences, sentence_aware_token_indices, article_entities, np_flag=True):
    """Convert a parsed spaCy document into a CoreNLP-like, per-sentence annotation dict.

    Args:
        spacy_document: the parsed document (sentences, tokens, dependencies).
        document_coreferences: coreference annotations, passed to `get_coref_clusters`.
        sentence_aware_token_indices: document-level token index -> position of
            the token inside its sentence (see `sentence_token_indices`).
        article_entities: linked entities with character offsets 'begin'/'end'
            and a 'title'.
        np_flag: when True, tokens of noun chunks that do not overlap any
            linked entity are tagged ('np', <chunk text>).

    Returns:
        dict: sentence index -> {'index', 'tokens', 'dep_root'}. Every token
        holds its text, character offsets, lemma, POS, 1-based 'index' and,
        where applicable, 'governor' / 'dependents' lists of
        (position in sentence, dependency label) and an 'entity_id' of
        ('np', text) or ('dbpedia', title).
    """
    spacy_annotation_obj, spacy_dependencies_obj = {}, {}

    for i, sentence in enumerate(spacy_document.sents):
        spacy_annotation_obj[i], spacy_dependencies_obj[i] = {'index': i, 'tokens': []}, []

        # noun-chunk text -> document-level token indices, for chunks that touch no linked entity.
        np_indices_dict = {}

        if np_flag:
            for chunk in sentence.noun_chunks:
                # Entities carry CHARACTER offsets, so the chunk is compared through
                # its own character span. The previous code intersected the entity's
                # character range with the chunk's token indices, i.e. mixed units.
                touches_entity = any(
                    _char_spans_overlap(chunk.start_char, chunk.end_char, e_obj['begin'], e_obj['end'])
                    for e_obj in article_entities
                )
                if not touches_entity:
                    np_indices_dict[chunk.text] = [chunk_token.i for chunk_token in chunk]

        for j, token in enumerate(sentence):

            spacy_annotation_obj[i]['tokens'].append({
                'originalText': token.text,
                'characterOffsetBegin': token.idx,
                'characterOffsetEnd': token.idx + len(token.text),
                'lemma': token.lemma_,
                'sentence_index': i,
                'index': j + 1,
                'pos': token.pos_
            })

            for np_str, np_is in np_indices_dict.items():
                if token.i in np_is: spacy_annotation_obj[i]['tokens'][j]['entity_id'] = ('np', np_str)

            # Governor 0 / 'ROOT' marks the sentence root; other tokens point to
            # their head. Indices are document-level and 1-based at this stage.
            if token.dep_.lower() == 'root': governor, governor_gloss = 0, 'ROOT'
            else: governor, governor_gloss = token.head.i + 1, token.head.text

            spacy_dependencies_obj[i].append({
                'dep': token.dep_,
                'governor': governor,
                'governorGloss': governor_gloss,
                'dependent': token.i + 1,
                'dependentGloss': token.text,
            })

    # Attach the dependency arcs to the tokens, translating document-level
    # indices into positions inside the sentence (-1 stands for ROOT).
    for i, sentence_dependencies in spacy_dependencies_obj.items():
        sentence_tokens = spacy_annotation_obj[i]['tokens']

        for dep in sentence_dependencies:
            gov_idx = sentence_aware_token_indices[dep['governor'] - 1] if dep['governor'] > 0 else -1
            dep_idx = sentence_aware_token_indices[dep['dependent'] - 1]
            dep_type = dep['dep'].lower()

            if dep_type == 'root': spacy_annotation_obj[i]['dep_root'] = dep['dependent']
            else: sentence_tokens[gov_idx].setdefault('dependents', set()).add((dep_idx, dep_type))

            sentence_tokens[dep_idx].setdefault('governor', set()).add((gov_idx, dep_type))

    # Tag every token that overlaps a linked entity; this overrides an 'np' tag,
    # and the last overlapping entity wins.
    for annotation in spacy_annotation_obj.values():
        for token in annotation['tokens']:
            for entity in article_entities:
                if _char_spans_overlap(token['characterOffsetBegin'], token['characterOffsetEnd'], entity['begin'], entity['end']):
                    token['entity_id'] = ('dbpedia', entity['title'])

    # The arc sets were only needed for de-duplication; expose them as lists.
    for annotation in spacy_annotation_obj.values():
        for token in annotation['tokens']:
            if 'governor' in token: token['governor'] = list(token['governor'])
            if 'dependents' in token: token['dependents'] = list(token['dependents'])

    # Propagate each cluster's representative entity to the tokens of its mentions.
    for coref_cluster in get_coref_clusters(spacy_document, document_coreferences, article_entities):
        main_title = get_coref_main(coref_cluster)
        if not main_title: continue

        for annotation in spacy_annotation_obj.values():
            for token in annotation['tokens']:
                for c_chain in coref_cluster['chain']:
                    # Compare against the MENTION span. The previous code tested the
                    # stale `entity_range` left over from the entity loop, which raised
                    # NameError for articles without entities and tagged the wrong
                    # tokens otherwise.
                    if _char_spans_overlap(token['characterOffsetBegin'], token['characterOffsetEnd'], c_chain['start'], c_chain['end']):
                        token['entity_id'] = ('dbpedia', main_title)

    return spacy_annotation_obj

In [None]:
import utilities, entity_extraction, sentiment_features

def generate_article_entity_obj(spacy_annotation_obj):
    """Register every run of consecutive tokens sharing an 'entity_id' in an EntityExtractor.

    Each sentence is scanned left to right; a run of tokens with the same
    'entity_id' forms one occurrence whose text is `utilities.get_text` of the
    run and whose position is (sentence index, start, end), with start/end
    being 0-based token positions inside the sentence (end exclusive).

    Args:
        spacy_annotation_obj: dict of sentences as produced by
            `transform_to_core_nlp_obj`.

    Returns:
        The populated `entity_extraction.EntityExtractor`.
    """
    entity_list = entity_extraction.EntityExtractor()

    for i, sentence in spacy_annotation_obj.items():

        start_idx = curr_eid = None
        for token in sentence['tokens']:

            token_eid = tuple(token['entity_id']) if 'entity_id' in token else None

            # The running entity ends in front of this token: register it.
            if curr_eid is not None and token_eid != curr_eid:

                end_idx = token['index'] - 1
                raw_text = utilities.get_text(sentence['tokens'][start_idx:end_idx])

                if curr_eid not in entity_list._occurances or raw_text not in entity_list._entity_to_id:
                    entity_list.create_entity(('BLANK', raw_text), curr_eid, dangermode=True)

                entity_list.add_occurance(('BLANK', raw_text), sentence['index'], start_idx, end_idx)
                start_idx = curr_eid = None

            # Open a new run AFTER closing the previous one. Checking in the other
            # order (as before) never opened a run on the token that ended the
            # previous entity, so the second of two adjacent entities lost its
            # first token, or was missed completely when it was one token long.
            if token_eid is not None and curr_eid is None:
                start_idx = token['index'] - 1
                curr_eid = token_eid

        # An entity that runs up to the end of the sentence.
        # NOTE(review): unlike the in-loop branch, this one never calls create_entity;
        # confirm against EntityExtractor whether that is intended.
        if curr_eid is not None:

            end_idx = token['index']
            raw_text = utilities.get_text(sentence['tokens'][start_idx:end_idx])
            entity_list.add_occurance(('BLANK', raw_text), sentence['index'], start_idx, end_idx)

    return entity_list

In [None]:
def make_annotation_object(spacy_document, coreferences, article_entities):
    """Build annotations, entity lists and dependency features, without and with noun phrases.

    Returns:
        The 6-tuple (annotations, np_annotations, entities, np_entities,
        dependency_features, np_dependency_features); the 'np_' items come from
        the run with `np_flag=True`.
    """
    token_positions = sentence_token_indices(spacy_document)

    # First item: plain run (np_flag=False); second item: noun-phrase run (np_flag=True).
    plain_annotations, np_annotations = [
        transform_to_core_nlp_obj(
            spacy_document,
            coreferences,
            token_positions,
            article_entities,
            np_flag=use_np
        )
        for use_np in (False, True)
    ]

    plain_entities, np_entities = [
        generate_article_entity_obj(annotations)
        for annotations in (plain_annotations, np_annotations)
    ]

    plain_features, np_features = [
        sentiment_features.dependency_features([], list(annotations.values()), entities)
        for annotations, entities in ((plain_annotations, plain_entities), (np_annotations, np_entities))
    ]

    return (
        plain_annotations,
        np_annotations,
        plain_entities,
        np_entities,
        plain_features,
        np_features
    )

In [None]:
import math

def split_to_p_ids(s=3, n=140):
    """Split the ids 0..n-1 into up to `s` contiguous partitions and interleave them.

    The partitions differ in size by at most one, so every id is assigned.
    Previously the `n % s` trailing ids were dropped (and `n < s` raised
    ValueError), which made `zip(paths, p_ids)` silently skip the last paths.
    When `n` is a multiple of `s` the result is unchanged.

    Args:
        s: number of partitions wanted.
        n: number of ids to distribute.

    Returns:
        dict with
          'p_ranges': one inclusive (first, last) pair per non-empty partition;
          'p_ids': all ids, taken round-robin from the partitions, so that
                   consecutive items fall into different partitions.
    """
    base_size, remainder = divmod(n, s)

    p_list, next_id = [], 0

    for k in range(s):
        # The first `remainder` partitions take one extra id.
        size = base_size + (1 if k < remainder else 0)
        if size == 0: continue

        p_list.append(list(range(next_id, next_id + size)))
        next_id += size

    # zip_longest pads the shorter partitions with None; drop the padding.
    interleaved = itertools.chain.from_iterable(itertools.zip_longest(*p_list))

    return {
        'p_ranges': [(p[0], p[-1]) for p in p_list],
        'p_ids': [p_id for p_id in interleaved if p_id is not None]
    }

In [None]:
# Spread the documents over 5 partitions; resolve_spacy_coref maps each partition
# to its own service port (8150 + partition position).
load_balance = split_to_p_ids(5, len(spacy_paths))

# Pair every path with a partition id. zip stops at the shorter sequence, so a
# path without an id is silently left out of the queue.
spacy_queue = list(zip(spacy_paths, load_balance['p_ids']))

In [None]:
def resolve_spacy_coref(path):
    """Resolve the coreferences of one parsed article and pickle them.

    Args:
        path: a (spaCy pickle path, partition id) pair from `spacy_queue`. The
            partition id selects the service instance: the position of the
            matching range in `load_balance['p_ranges']` is added to port 8150.

    Returns:
        None when the output file already exists (the article is skipped),
        True once <output_dir>/coreferences/<parent folder>/<uid>.pckl is written.
    """
    path, p_id = path[0], path[1]

    host = 'http://127.0.0.1:'
    port = 8150

    for i, r in enumerate(load_balance['p_ranges']):
        if r[0] <= p_id <= r[1]:
            host += str(port + i) + '/coref'
            break

    uid = path.split('/')[-1].replace('.pckl', '')
    daily_folder = path.split('/')[-2]

    output_folder = output_dir + 'coreferences/' + daily_folder + '/'
    output_file = output_folder + uid + '.pckl'

    if os.path.exists(output_file): return None

    spacy_article = load_spacy_obj(path, spacy_nlp.vocab)

    # Query the service BEFORE the output file is created. Opening the file first
    # left an empty pickle behind whenever the request failed, and the existence
    # check above then skipped that article on every later run.
    coref = get_coreferences(spacy_article.text, host=host)['coref']

    os.makedirs(output_folder, exist_ok=True)
    with open(output_file, 'wb') as f: pickle.dump({
        'uid': uid,
        'coref': coref
    }, f)

    return True

In [None]:
from tqdm import tqdm
from multiprocessing import Pool
import multiprocessing, re, os, json, pickle

# Five workers, matching the five partitions requested from split_to_p_ids(5, ...).
pool = Pool(5)

# The loop body is empty on purpose: resolve_spacy_coref writes to disk and the
# iteration only drives the progress bar.
for i in tqdm(
    pool.imap_unordered(resolve_spacy_coref, spacy_queue),
    desc='Article Coreference',
    total=len(spacy_queue)
): pass

pool.close()
pool.join()    

In [None]:
import jsonpickle

def _write_artifact(folder_name, daily_folder, file_name, payload, as_pickle=False):
    """Write `payload` to <output_dir>/<folder_name>/<daily_folder>/<file_name> as JSON (default) or pickle."""
    output_folder = output_dir + folder_name + '/' + daily_folder + '/'
    os.makedirs(output_folder, exist_ok=True)

    if as_pickle:
        with open(output_folder + file_name, 'wb') as f: pickle.dump(payload, f)
    else:
        with open(output_folder + file_name, 'w') as f: json.dump(payload, f)


def annotate_articles(path):
    """Build and store every annotation artifact of one parsed article.

    Needs the article's entity file (entities/) and coreference file
    (coreferences/); writes one file to each of annotations/, np_annotations/,
    annotated_entities/, np_annotated_entities/, dependency_features/ and
    np_dependency_features/, under the article's parent folder name.

    Args:
        path: location of the pickled spaCy Doc, '<...>/<folder>/<uid>.pckl'.

    Returns:
        True when all artifacts were written; None when an input is missing or
        unreadable, or when building the annotations failed (the error is
        printed).
    """
    uid = path.split('/')[-1].replace('.pckl', '')
    daily_folder = path.split('/')[-2]

    entities_file = output_dir + 'entities/' + daily_folder + '/' + uid + '.json'
    if not os.path.exists(entities_file): return None

    spacy_article = load_spacy_obj(path, spacy_nlp.vocab)
    spacy_entities = [e for e in load_entities(entities_file)['entities'] if e]

    try:
        with open(output_dir + 'coreferences/' + daily_folder + '/' + uid + '.pckl', 'rb') as f: spacy_corefs = pickle.load(f)
    except FileNotFoundError:
        # The coreferences of this article have not been resolved: nothing to do.
        return None
    except Exception as ex:
        # A corrupt or truncated file is still skipped, but no longer silently.
        print('Unreadable coreference file for:', path)
        print(ex)
        print()

        return None

    try:
        spacy_annotation_obj, \
        spacy_np_annotation_obj, \
        entity_list, \
        entity_np_list, \
        dependency_feature_list, \
        dependency_np_feature_list = make_annotation_object(spacy_article, spacy_corefs, spacy_entities)
    except Exception as ex:
        print(ex)
        print(path)
        print()

        return None

    # Every payload is built before its file is opened, so a failing encode does
    # not leave a truncated file behind.
    _write_artifact('annotations', daily_folder, uid + '.json', {
        'uid': uid,
        'annotations': spacy_annotation_obj
    })

    _write_artifact('np_annotations', daily_folder, uid + '.json', {
        'uid': uid,
        'annotations': spacy_np_annotation_obj
    })

    _write_artifact('annotated_entities', daily_folder, uid + '.json', {
        'uid': uid,
        'entities': jsonpickle.encode(entity_list)
    })

    _write_artifact('np_annotated_entities', daily_folder, uid + '.json', {
        'uid': uid,
        'entities': jsonpickle.encode(entity_np_list)
    })

    _write_artifact('dependency_features', daily_folder, uid + '.pckl', {
        'uid': uid,
        'dependency_features': dependency_feature_list
    }, as_pickle=True)

    _write_artifact('np_dependency_features', daily_folder, uid + '.pckl', {
        'uid': uid,
        'dependency_features': dependency_np_feature_list
    }, as_pickle=True)

    return True

In [None]:
import os, itertools

def load_entities(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path, 'r') as handle:
        return json.load(handle)

In [None]:
%%time

from tqdm import tqdm
from multiprocessing import Pool
import multiprocessing, re, os, json, pickle

# Leave eight cores free, but never request fewer than one worker: Pool raises
# ValueError for a process count below 1 (machines with eight cores or fewer).
pool = Pool(max(1, multiprocessing.cpu_count() - 8))

# The loop body is empty on purpose: annotate_articles writes to disk and the
# iteration only drives the progress bar.
for i in tqdm(
    pool.imap_unordered(annotate_articles, spacy_paths),
    desc='Article Dependency Annotation',
    total=len(spacy_paths)
): pass

pool.close()
pool.join()

In [None]:
from utilities import DepDirection, find_dep_path
import sentiment_features, itertools
from collections import defaultdict

def _write_article_json(subdir, daily_folder, uid, payload_key, payload):
    """Write `{'uid': uid, payload_key: payload}` to `<output_dir><subdir>/<daily_folder>/<uid>.json`."""
    output_folder = output_dir + subdir + '/' + daily_folder + '/'
    os.makedirs(output_folder, exist_ok=True)
    with open(output_folder + uid + '.json', 'w') as f:
        f.write(json.dumps({'uid': uid, payload_key: payload}))


def _pair_dependency_features(tokens, first_entity, second_entity):
    """Collect dependency-path features between all mentions of two entities in one sentence.

    Every (first mention, second mention) index combination is traversed in
    both directions; the second element of each item returned by
    `sentiment_features.dep_path_features` is kept.
    """
    first_indices = [i for i, t in enumerate(tokens) if 'entity_id' in t and t['entity_id'][1] == first_entity]
    second_indices = [i for i, t in enumerate(tokens) if 'entity_id' in t and t['entity_id'][1] == second_entity]

    features = []

    for first_i, second_i in itertools.product(first_indices, second_indices):
        # Forward path first, then the reverse path, for each mention combination.
        for start, end in ((first_i, second_i), (second_i, first_i)):
            # Swap the direction flag of every step (GOV -> DEP, anything else -> GOV)
            # before handing the path to the feature extractor.
            dep_path = [
                ((DepDirection.DEP if dep_dir == DepDirection.GOV else DepDirection.GOV, dep_type), dep_idx)
                for (dep_dir, dep_type), dep_idx in find_dep_path(tokens, start, end)
            ]
            features.extend(dpf[1] for dpf in sentiment_features.dep_path_features([], tokens, dep_path))

    return features


def extract_pair_annotations(path):
    """Export per-entity and per-entity-pair sentence data for one article.

    `path` is the article's serialized spaCy file
    (`.../spacy/<daily_folder>/<name>.pckl`). It is used only to derive the
    locations of the sibling JSON files; the spaCy document itself is not
    loaded, because everything needed is in the `annotations` JSON.

    Files written under `output_dir` (each `<subdir>/<daily_folder>/<uid>.json`):
      * `entity_indices`           - entity id -> sentence keys mentioning it
      * `entity_annotations`       - entity id -> annotation objects of those sentences
      * `pair_annotations`         - sorted entity pair -> annotation objects of shared sentences
      * `pair_dependency_features` - sorted entity pair -> dependency-path features
      * `pair_indices`             - sorted entity pair -> sentence keys mentioning both

    `pair_indices` is the file the skip check looks for, so it is written
    last: an interrupted run then leaves no marker and the article is
    processed again next time instead of staying half-exported.

    Only tokens whose `entity_id[0] == 'dbpedia'` count as entity mentions.

    Returns:
        None when the `annotations` file is missing or `pair_indices`
        already exists; True after all outputs have been written.
    """
    daily_folder = path.split('/')[-2]

    # '.../spacy/<day>/<name>.pckl' -> '.../{}/<day>/<name>{}' (folder and extension slots).
    path = path.replace('/spacy/', '/{}/').replace('.pckl', '{}')

    if not os.path.exists(path.format('annotations', '.json')): return None
    if os.path.exists(path.format('pair_indices', '.json')): return None

    with open(path.format('annotations', '.json'), 'r') as f: spacy_annotation_obj = json.load(f)

    uid = spacy_annotation_obj['uid']
    spacy_annotation_obj = spacy_annotation_obj['annotations']

    entity_sentence_indices = defaultdict(list)
    pair_sentence_indices = defaultdict(list)

    for sentence_i, annotations in spacy_annotation_obj.items():
        sentence_entities = [
            t['entity_id'][1] for t in annotations['tokens']
            if 'entity_id' in t and t['entity_id'][0] == 'dbpedia'
        ]

        for entity in sentence_entities:
            if sentence_i not in entity_sentence_indices[entity]:
                entity_sentence_indices[entity].append(sentence_i)

        for pair in itertools.combinations(sentence_entities, 2):
            # Repeated mentions of one entity do not form a pair.
            if pair[0] == pair[1]: continue

            # Sort so (a, b) and (b, a) share a single key.
            pair = tuple(sorted(pair))

            if sentence_i not in pair_sentence_indices[pair]:
                pair_sentence_indices[pair].append(sentence_i)

    entity_sentence_indices, pair_sentence_indices = dict(entity_sentence_indices), dict(pair_sentence_indices)

    _write_article_json('entity_indices', daily_folder, uid, 'indices', jsonpickle.encode(entity_sentence_indices))

    entity_annotation_dict = {
        entity: [spacy_annotation_obj[sentence_i] for sentence_i in sentence_indices]
        for entity, sentence_indices in entity_sentence_indices.items()
    }
    entity_pair_annotation_dict = {
        pair: [spacy_annotation_obj[sentence_i] for sentence_i in sentence_indices]
        for pair, sentence_indices in pair_sentence_indices.items()
    }

    # Both annotation files store their payload under the key 'indices'.
    _write_article_json('entity_annotations', daily_folder, uid, 'indices', jsonpickle.encode(entity_annotation_dict))
    _write_article_json('pair_annotations', daily_folder, uid, 'indices', jsonpickle.encode(entity_pair_annotation_dict))

    pair_annotation_dependency_features = {}

    for pair, pair_annotations in entity_pair_annotation_dict.items():
        pair_features = []
        for annotation in pair_annotations:
            pair_features += _pair_dependency_features(annotation['tokens'], pair[0], pair[1])
        pair_annotation_dependency_features[pair] = pair_features

    _write_article_json(
        'pair_dependency_features', daily_folder, uid,
        'dependency_features', jsonpickle.encode(pair_annotation_dependency_features)
    )

    # Completion marker: written only once every other output is on disk.
    _write_article_json('pair_indices', daily_folder, uid, 'indices', jsonpickle.encode(pair_sentence_indices))

    return True

In [None]:
import json, os, jsonpickle
import multiprocessing
from multiprocessing import Pool
from tqdm import tqdm

# Use all but 8 of the reported CPUs, but never fewer than one worker:
# Pool raises ValueError for a process count below 1, which is what
# `cpu_count() - 8` produces on machines with 8 or fewer CPUs.
# The context manager terminates the workers if a task raises, so a failed
# run does not leave orphaned processes behind.
with Pool(max(1, multiprocessing.cpu_count() - 8)) as pool:

    # Results are only consumed to drive the progress bar.
    for i in tqdm(
        pool.imap_unordered(
            extract_pair_annotations,
            spacy_paths
        ),
        desc='Entity Pair Extraction',
        total=len(spacy_paths)
    ): pass

    pool.close()
    pool.join()

In [None]:
import gzip, json

def load_gzip(path, func=json.loads):
    """Read a gzip-compressed UTF-8 text file and parse its content.

    Args:
        path: location of the gzip file.
        func: callable applied to the decoded text; defaults to `json.loads`.

    Returns:
        The value returned by `func` for the file's full decoded content.
    """
    with gzip.open(path, 'rb') as handle:
        raw_bytes = handle.read()
    return func(raw_bytes.decode('utf-8'))

In [None]:
import json, pickle, gzip, gc

def export_entity_np_pairs(path):
    """Export, per entity, the noun chunks that co-occur with it in an article.

    `path` is the article's serialized spaCy file
    (`.../spacy/<daily_folder>/<uid>.pckl`). For every sentence listed for an
    entity in the article's `entity_indices` file, each noun chunk of that
    sentence is recorded under `entity -> noun chunk text` together with

      * the sentence text             -> `entity_np_sentences/<daily_folder>/<uid>.json`
      * the sentence's np-annotation  -> `entity_np_annotations/<daily_folder>/<uid>.json`

    Both outputs are gzip-compressed JSON even though the file names end in
    `.json`.

    Returns:
        None when an input file (`entity_indices` or `np_annotations`) is
        missing or both outputs already exist; True after both outputs have
        been written.
    """
    uid = path.split('/')[-1].replace('.pckl', '')
    daily_folder = path.split('/')[-2]

    # '.../spacy/<day>/<uid>.pckl' -> '.../{}/<day>/<uid>{}' (folder and extension slots).
    _path = path.replace('/spacy/', '/{}/').replace('.pckl', '{}')
    entity_indices_file = _path.format('entity_indices', '.json')
    np_annotations_file = _path.format('np_annotations', '.json')

    sentences_folder = output_dir + 'entity_np_sentences/' + daily_folder + '/'
    annotations_folder = output_dir + 'entity_np_annotations/' + daily_folder + '/'
    sentences_file = sentences_folder + uid + '.json'
    annotations_file = annotations_folder + uid + '.json'

    # Skip articles whose inputs are incomplete. A missing np_annotations file
    # would otherwise raise inside the worker and abort the whole pool run.
    if not os.path.exists(entity_indices_file): return None
    if not os.path.exists(np_annotations_file): return None
    if os.path.exists(sentences_file) and os.path.exists(annotations_file): return None

    # Loaded only after the cheap checks above have passed.
    spacy_doc = load_spacy_obj(path, spacy_nlp.vocab)

    with open(entity_indices_file, 'r') as f: entity_sentence_indices = json.load(f)
    entity_sentence_indices = jsonpickle.decode(entity_sentence_indices['indices'])

    with open(np_annotations_file, 'r') as f: np_annotation_objects = json.load(f)
    np_annotation_objects = np_annotation_objects['annotations']

    # entity -> noun chunk text -> list of values, one per occurrence.
    entity_np_occuring_sentences = defaultdict(lambda: defaultdict(list))
    entity_np_occuring_annotations = defaultdict(lambda: defaultdict(list))

    article_sents = list(spacy_doc.sents)

    for entity, sentence_indices in entity_sentence_indices.items():

        for sentence_i in sentence_indices:
            sentence = article_sents[int(sentence_i)]

            # `chunk`, not `np`, so the conventional numpy alias is not shadowed.
            for chunk in sentence.noun_chunks:
                entity_np_occuring_sentences[entity][chunk.text].append(sentence.text)
                entity_np_occuring_annotations[entity][chunk.text].append(np_annotation_objects[str(sentence_i)])

    # Convert the nested defaultdicts to plain dicts, copying each inner mapping once.
    entity_np_occuring_sentences = {
        entity: dict(by_chunk) for entity, by_chunk in entity_np_occuring_sentences.items()
    }
    entity_np_occuring_annotations = {
        entity: dict(by_chunk) for entity, by_chunk in entity_np_occuring_annotations.items()
    }

    os.makedirs(sentences_folder, exist_ok=True)
    with gzip.open(sentences_file, 'wt', encoding='UTF-8') as f: json.dump({
        'uid': uid,
        'entity_np_sentences': entity_np_occuring_sentences
    }, f)

    os.makedirs(annotations_folder, exist_ok=True)
    with gzip.open(annotations_file, 'wt', encoding='UTF-8') as f: json.dump({
        'uid': uid,
        'entity_np_annotations': entity_np_occuring_annotations
    }, f)

    # Drop every reference to the document (including the sentence spans)
    # before forcing a collection, so the collection can actually reclaim it.
    del entity_np_occuring_annotations, entity_np_occuring_sentences, article_sents, spacy_doc

    gc.collect()

    return True

In [None]:
%%time

from spacy.tokens import Doc
from spacy.vocab import Vocab

from collections import defaultdict

from tqdm import tqdm
import json, jsonpickle, multiprocessing, os
from multiprocessing import Pool

# Use all but 16 of the reported CPUs, but never fewer than one worker:
# Pool raises ValueError for a process count below 1, which is what
# `cpu_count() - 16` produces on machines with 16 or fewer CPUs.
# The context manager terminates the workers if a task raises, so a failed
# run does not leave orphaned processes behind.
with Pool(max(1, multiprocessing.cpu_count() - 16)) as pool:

    # Each result is dropped as soon as it arrives; only progress is tracked.
    for d in tqdm(
        pool.imap_unordered(export_entity_np_pairs, spacy_paths),
        desc='Export Entity-NP',
        total=len(spacy_paths)
    ): del d

    pool.close()
    pool.join()