In [1]:
import sys
import pandas as pd
import torch 
import os 

sys.path.append("../..")
# For retrieval

from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.docstore.document import Document
from sentence_transformers import CrossEncoder
from langchain.text_splitter import RecursiveCharacterTextSplitter
from chunkers import TableExtractor

from financerag.tasks import FinQA, ConvFinQA

  from tqdm.autonotebook import tqdm, trange


## Read Data

In [2]:
task = FinQA()

A Hugging Face repository is provided. This will override the data_folder, prefix, and *_file arguments.


In [3]:
queries = task.queries
query_df = pd.DataFrame(queries.values(), index=queries.keys(), columns=["query"])

In [4]:
documents = task.corpus
documents_df = pd.DataFrame(documents.values(), index=documents.keys(), columns=["title", "text"])

text = documents_df.iloc[120]["text"]

with open("text.txt", "w") as f:
    f.write(text)

In [5]:
def extract_paragraph(text):
    array = text.split("\n")
    return "\n".join([i for i in array if "|" not in i])

def extract_table(text):
    array = text.split("\n")
    array = [i for i in array if "-" in i or "|" in i]
    array = [i for i in array if "|" in i]
    table = "\n".join(array)
    return table.split("\n")

In [6]:
def clean(row):

    return_list = []

    for cell in row:

        if any(keyword in cell.strip().lower().split(" ")for keyword in ["(in", "millions", "thousands", "dollars", "note", "thousands)", "millions)", "dollars)",
                                                                         "(millions), (thousands)", "(dollars)", "millions), (dollars)", "(unaudited)"]):
            continue

        elif any(keyword in cell.strip().lower().split(" ")for keyword in ["december","january", "february", "march", "april", "june", "july", "august", "september", "october", "november"]):
            matched_keywords = [keyword for keyword in cell.strip().lower().split(" ") if keyword in [str(year) for year in range(2000, 2050)]]
            return_list.extend(matched_keywords)
                 
        else:
            return_list.append(cell)
    
    return return_list

In [7]:
def isfloat(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

def parse_table(table):

    for row in table:
        if "." == row.strip()[-1]:
            table.remove(row)
        
        if "---" in row:
            table.remove(row)

    columns = table[0].split("|")

    rows = []

    for row in table[1:]:
        row = row.split("|")
        row = [ cell.strip() for cell in row  ]
        row[1:] = clean(row[1:])
        rows.append(row)

    temp = columns.copy()

    for column in columns:
        if "$" in column:
            column = column.replace("$", "")
            if isfloat(column.strip()) or column.strip().isdigit():
                columns = ["Column " + str(i) for i in range(1, len(columns)+1)]
                rows = [temp] + rows

        elif (isfloat(column.strip()) or column.strip().isdigit()) and int(column.strip()) not in range(2000, 2050):
            columns = ["Column " + str(i) for i in range(1, len(columns)+1)]
            rows = [temp] + rows
            
    table = pd.DataFrame(rows, columns=columns)
    table.set_index(columns[0], inplace=True)
    
    return table

In [12]:
document_tables = {}

for id, text in documents_df.text.items():
    try:
        try:
            table = extract_table(text)
            table = parse_table(table)
            document_tables[id] = table
        except:
            table = extract_table(text)
            table = parse_table(table[:-1])
            document_tables[id] = table
    except Exception as e:
        print(id)
        print(e)

d61ea0b7a
2 columns passed, passed data had 5 columns
d61de83e0
2 columns passed, passed data had 3 columns
d61d02d2c
5 columns passed, passed data had 3 columns


## Initiliaze Database