In [1]:
import sys
import pandas as pd
import torch 
import os 

sys.path.append("../..")
# For retrieval

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import CrossEncoder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chunkers import TableExtractor

from financerag.tasks import TATQA

  from tqdm.autonotebook import tqdm, trange


## Read and Process Data 

In [2]:
task = TATQA()

A Hugging Face repository is provided. This will override the data_folder, prefix, and *_file arguments.


In [3]:
queries = task.queries
query_df = pd.DataFrame(queries.values(), index=queries.keys(), columns=["query"])

In [4]:
documents = task.corpus
documents_df = pd.DataFrame(documents.values(), index=documents.keys(), columns=["title", "text"])

In [5]:
def extract_paragraph(text):
    array = text.split("\n")
    return "\n".join([i for i in array if "|" not in i])

def extract_table(text):
    array = text.split("\n")
    array = [i for i in array if "-" in i or "|" in i]
    array = [i for i in array if "|" in i]
    table = "\n".join(array)
    return table.split("\n")

In [6]:
def clean(row):

    return_list = []

    for cell in row:

        if any(keyword in cell.strip().lower().split(" ")for keyword in ["(in", "millions", "thousands", "dollars", "note", "thousands)", "millions)", "dollars)", "by", "ended", 
                                                                         "(millions), (thousands)", "(dollars)", "millions), (dollars)", "(unaudited)"]):
            continue

        elif any(keyword in cell.strip().lower().split(" ")for keyword in ["december","january", "february", "march", "april", "june", "july", "august", "september", "october", "november"]):
            matched_keywords = [keyword for keyword in cell.strip().lower().split(" ") if keyword in ["2016", "2017", "2018", "2019", "2020", "2021", "2022", "2023", "2024", "2025", "2026", "2027", "2028", "2029", "2030"]]
            return_list.extend(matched_keywords)
                 
        else:
            return_list.append(cell)
    
    return return_list

In [7]:
def parse_table(table):
    
    header = table[0].split("|")
    header = [column.strip() for column in clean(header) if column.strip() != ""]
        
    first_row =  table[2].split("|")
    first_row = [column.strip() for column in clean(first_row) if column.strip() != ""]

    first_row = [ cell.strip() for cell in first_row if cell.strip() != ""]
    first_row = [ cell.strip() for cell in first_row if  "--" not in cell]
    
    header = [ cell.strip() for cell in header if cell.strip() != ""]
    header = [ cell.strip() for cell in header if "--" not in cell]

    num_cols = len(table[-1].split("|")) - 1

    start = 2

    print(header, first_row)
    print(num_cols  )
    
    if len(header) == num_cols or len(header) == num_cols + 1:
        columns = header
    
    elif len(first_row) == num_cols or len(first_row) == num_cols + 1:
        columns = first_row
        start = 3

    else:
        i = 3
        columns = clean(table[3].split("|"))
        while columns == []:
            columns = [column.strip() for column in clean(table[i].split("|")) if column != ""]
            columns = [column.strip() for column in clean(table[i].split("|")) if "--" not in column]
            i += 1
        
        start = i+1

    rows = []

    for row in table[start:]:
        row = row.split("|")
        row = [ cell.strip() for cell in row  ]
        row[1:] = clean(row[1:])
        rows.append(row)
    
    columns = [i for i in range(num_cols - len(columns) + 1)] + columns
    table = pd.DataFrame(rows, columns=columns)
    table.replace("", pd.NA, inplace=True)
    table.set_index(columns[0], inplace=True)
    table.index.name = None
    return table

In [8]:
import random

id_ = random.randint(0, len(documents_df) - 1)
text = documents_df.iloc[id_]["text"]
doc_id = documents_df.index[id_]

table = extract_table(text)
print("\n".join(table))
table = parse_table(table)
table

                                           |         | Years Ended December 31, |        
------------------------------------------ | ------- | ------------------------ | -------
                                           | 2019    | 2018                     | 2017   
Balances at beginning of period            | $2,084  | $2,312                   | $2,329 
Warranty acquired in business combinations | 4,818   | 305                      | 118    
Increases to accruals                      | 1,752   | 1,606                    | 2,029  
Warranty expenditures                      | (2,249) | (2,127)                  | (2,184)
Effect of changes in exchange rates        | 8       | (12)                     | 20     
Balances at end of period                  | $6,413  | $2,084                   | $2,312 
[] ['2019', '2018', '2017']
3


Unnamed: 0,2019,2018,2017
Balances at beginning of period,"$2,084","$2,312","$2,329"
Warranty acquired in business combinations,4818,305,118
Increases to accruals,1752,1606,2029
Warranty expenditures,"(2,249)","(2,127)","(2,184)"
Effect of changes in exchange rates,8,(12),20
Balances at end of period,"$6,413","$2,084","$2,312"
