# Tabula Data Extraction API

In [522]:
import pandas as pd
import numpy as np
import spacy
import re

In [523]:
nlp = spacy.load("en_core_web_sm")

In [527]:
#https://www.cnbc.com/2019/05/03/hsbc-reports-2019-first-quarter-earnings.html
txt = """\
The bank said its reported profit before tax in the first quarter \
was $6.213 billion, a 30.7% jump from last year’s $4.755 billion. \
Analyst forecasts compiled by Refinitiv showed that the \
bank’s reported profit before tax was expected to come in \
at $5.399 billion for the January to March period. \
"""
doc = nlp(txt)

In [518]:
txt = """\
For example, U.S. imports from China almost doubled within five years \
from $51.5 billion ($84.2 billion in 2019 dollars) in 1996 to $102 billion \
($148 billion in 2019 dollars) in 2001.\
"""
doc = nlp(txt)

In [549]:
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

the first quarter 48 65 DATE
$6.213 billion 70 84 MONEY
30.7% 88 93 PERCENT
last year’s 104 115 DATE
$4.755 billion 116 130 MONEY
Refinitiv 162 171 ORG
’s 192 194 ORG
$5.399 billion 249 263 MONEY
the January to March period 268 295 DATE


In [541]:
# def self_with_ancestor(token):
#     return [token] + list(token.ancestors)

# def n_common_tail(lst1, lst2):
#     L = min(len(lst1), len(lst2))
#     for i in range(1, L+1):
#         if lst1[-i] != lst2[-i]:
#             return i-1
#     return L

# def closest_entry_in_tree(token_ancestor, other_tokens_ancestors, default=None):
#     # token_ancestor is (token, ancestor_list)
#     # other_tokens_ancestors is list of (token, ancestor_list)
#     if not other_tokens_ancestors:
#         return default
#     tok_ancestors = list(token_ancestor[1])
#     # restrict to be in the same tree, i.e. at least one common ancestor
#     n_commons = [(x, n_common_tail(tok_ancestors, x[1])) for x in other_tokens_ancestors]
#     n_commons_in_tree = [x for x in n_commons if x[1] > 0]
#     if not n_commons_in_tree:
#         return default
#     closest = max(n_commons_in_tree, key=lambda x: x[1])
#     return closest[0]

# def closest_token_in_tree(token_ancestor, other_tokens_ancestors, default=None):
#     entry = closest_entry_in_tree(token_ancestor, other_tokens_ancestors, default)
#     if entry:
#         return entry[0]
#     return entry

# def tabular_data_extraction_1(ent):
#     ents_with_ancestors = sorted([(ent, self_with_ancestor(ent.root))
#                                   for ent in doc.ents],
#                                  key=lambda x: x[0].start_char)
#     values_with_ancestors = [x for x in ents_with_ancestors if x[0].label_ in ['MONEY', 'PERCENT', 'CARDINAL', 'QUANTITY']]
#     dates_with_ancestors = [x for x in ents_with_ancestors if x[0].label_ in ['DATE', 'TIME']]
#     return([(x[0], closest_token_in_tree(x, dates_with_ancestors)) for x in values_with_ancestors])

In [598]:
def get_date(dates, sent, ent_order):
    if (len(dates) == 0):
        ent_date = []
    else:
        ent_date = [d['value'] for d in date_ents if d['sent'] == sent]
        if (len(ent_date) == 0):
            ent_date = [d['value'] for d in dates if d['order'] < ent_order]
    return([ent_date[0] if len(ent_date) != 0 else ''][0])

def tabular_data_extraction(doc):
    sents = [i for l in [str(s).split(',') for s in list(doc.sents)] for i in l]
    sents_start = np.array([txt.find(sents[i]) for i in range(len(sents))])
    sents_end = sents_start + np.array([len(sents[i]) for i in range(len(sents))])
    
    sents_idx = []
    for end in [e.end_char for e in doc.ents]:
        sents_idx.append([i for i, x in enumerate(end <= sents_end) if x][0])
        
    ents_data = []
    for i in range(len(doc.ents)):
        ent = doc.ents[i]
        sent = sents_idx[i]
        if ent.label_ in ['DATE', 'TIME', 'MONEY', 'PERCENT', 'CARDINAL', 'QUANTITY']:
            ents_data.append({
                'value' : ent.text,
                'item' : ent.label_,
                'start' : ent.start_char,
                'end' : ent.end_char,
                'sent' : sent,
                'order' : i
            })
            
    dates = [d for d in ents_data if d['item'] in ['DATE', 'TIME']]
    data = [d for d in ents_data if d['item'] in ['MONEY', 'PERCENT', 'CARDINAL', 'QUANTITY']]
    for v in data:
        v['date'] = get_date(dates, v['sent'], v['order'])
    data = [{'value': d['value'], 'item': d['item'], 'date': d['date']} for d in outputs]
    data = [list(d.values()) for d in data]
    data.insert(0, ['Date', 'Value', 'Item'])
    output = {
        'value' : data,
        'type' : 'table'
    }
    return(output)

In [599]:
tabular_data_extraction(doc)

{'value': [['Date', 'Value', 'Item'],
  ['$6.213 billion', 'MONEY', 'the first quarter'],
  ['30.7%', 'PERCENT', 'last year’s'],
  ['$4.755 billion', 'MONEY', 'last year’s'],
  ['$5.399 billion', 'MONEY', 'the January to March period']],
 'type': 'table'}