In [None]:
import os
import re
from collections import Counter

import pandas as pd



# Folder holding one OCR text file per document (presumably a Colab
# Google Drive mount -- confirm before running elsewhere).
input_folder = '/content/drive/My Drive/ocr_output_read'
text_files = os.listdir(input_folder)

# [file name, cleaned text] pairs; read later by prepare_text_data_for_inference.
documents = []

# One row per file for which extract_document_info returned a result.
recording_details_df = pd.DataFrame(columns=["filename", "header", "book_num", "page_num", "recording_date", "doc_date", "signature_date","effective_date", "document_number", "consideration", "taxes"])

# NOTE: despite the name, this counts files whose recording date was NOT found.
num_matches = 0
count = 0  # not used in this cell; kept in case another cell reads it

for i, text_file in enumerate(text_files):
  file_path = os.path.join(input_folder, text_file)

  # os.listdir also returns sub-directories; open() would raise on them.
  if not os.path.isfile(file_path):
    continue

  print(text_file)
  with open(file_path, "r", encoding="utf-8") as file:
    content = file.read()

  valid_matches = extract_document_info(text_file, content)

  # Check for None BEFORE unpacking: the original unpacked first, so a None
  # result raised TypeError and the later `is not None` guard never ran.
  if valid_matches is not None:
    header, book_num, page_num, recording_date, all_doc_num_matches, doc_date, signature_date, effective_date = valid_matches

    # Dates are stored as MM/DD/YYYY strings; missing dates stay as they are.
    if doc_date:
      doc_date = doc_date.strftime("%m/%d/%Y")
    if signature_date:
      signature_date = signature_date.strftime("%m/%d/%Y")
    if effective_date:
      effective_date = effective_date.strftime("%m/%d/%Y")

    if recording_date is None:
      print("date not found")
      num_matches += 1

    valid_doc_num_match = [] if all_doc_num_matches is None else list(all_doc_num_matches)

    # Strip all whitespace, then de-duplicate keeping the most frequent first.
    doc_num_matches = [re.sub(r"\s+", "", s) for s in valid_doc_num_match]
    counter = Counter(doc_num_matches)
    doc_num_matches = [item for item, _ in counter.most_common()]

    # If exactly one candidate starts with "202" keep only that one
    # (presumably a 202x year prefix -- confirm); otherwise keep them all.
    matches = [x for x in doc_num_matches if str(x).startswith("202")]
    result = [matches[0]] if len(matches) == 1 else doc_num_matches

    consideration = find_sales_amount(content)
    taxes = find_tax_fee(content)
    print(taxes)

    # Row label is the listing position, so skipped files leave gaps.
    recording_details_df.loc[i] = [text_file, header, book_num, page_num, recording_date, doc_date, signature_date, effective_date, result, consideration, taxes]

  # Every readable file is queued for inference, with or without details.
  content = clean_text(content, patterns)
  documents.append([text_file, content])

print("total matches: ", num_matches)


def calculate_attention_mask(seq_length, input_len):
  """Build a 0/1 mask: `seq_length` ones followed by zeros up to `input_len`.

  When `seq_length` is at least `input_len` no zeros are appended, so the
  result is simply `seq_length` ones.
  """
  attended = [1 for _ in range(seq_length)]
  padding = [0 for _ in range(input_len - seq_length)]
  return attended + padding


def prepare_text_data_for_inference(documents, input_len=512, overlap_window=50):
  """Tokenize each document and split it into overlapping model-sized chunks.

  Args:
    documents: iterable of (file name, text) pairs.
    input_len: maximum number of token ids per chunk.
    overlap_window: number of token ids shared by consecutive chunks.

  Returns:
    A list of `[input_ids, attention_mask, is_last_chunk, file_name]` entries.
    `input_ids` is NOT padded here; `attention_mask` always has `input_len`
    positions. `is_last_chunk` is 1 on exactly one entry per document (the
    final one) and 0 on the others.

  Raises:
    ValueError: if the overlap leaves no room for the window to advance.

  Notes:
    Relies on the module-level `tokenizer`. The chunking assumes its output
    already starts with the start token (101) and ends with the end token
    (102): the first chunk does not prepend 101 and the last chunk does not
    append 102 -- TODO confirm against the tokenizer in use.

    Two fixes relative to the previous version:
      * a document that fits in one window now gets a mask of exactly
        `len(input_ids)` ones (it used to be two longer, so two padding
        positions were attended to);
      * a document is only chunked when it is longer than `input_len`
        (it used to be chunked from `input_len - 1` ids, which produced a
        redundant second chunk and, at `input_len - 1`, a doubled end token).
  """
  start_token_id = [101]
  end_token_id = [102]

  # Each middle chunk carries input_len - 2 document ids (two slots go to the
  # start/end tokens) and advances by that amount minus the overlap.
  stride = input_len - overlap_window - 2
  if overlap_window < 0 or stride <= 0:
    raise ValueError("overlap_window must be >= 0 and smaller than input_len - 2")

  inference_text = []

  for fname, document in documents:
    tokenized_text = tokenizer(document, truncation=False, return_offsets_mapping=False)
    input_id = tokenized_text["input_ids"]

    # Whole document fits in a single window: emit it as the only chunk.
    if len(input_id) <= input_len:
      inference_text.append([input_id,
                  calculate_attention_mask(len(input_id), input_len),
                  1,
                  fname
                  ])
      continue

    # First chunk: leading ids (start token included) closed by an end token.
    inference_text.append([input_id[0:input_len - 1] + end_token_id,
                calculate_attention_mask(input_len, input_len),
                0,
                fname
                ])

    # Middle chunks: full windows wrapped in start/end tokens.
    start = input_len - overlap_window - 1
    while start < len(input_id) - input_len + 1:
      inference_text.append([start_token_id + input_id[start:start + input_len - 2] + end_token_id,
                  calculate_attention_mask(input_len, input_len),
                  0,
                  fname
                  ])
      start = start + stride

    # Last chunk: the remaining tail, which already ends with the end token.
    inference_text.append([start_token_id + input_id[start:len(input_id)],
                calculate_attention_mask(len(input_id) + 1 - start, input_len),
                1,
                fname
                ])

  return inference_text


def prepare_df_for_inference(data, input_len=512, pad_token_id=0):
  """Turn chunk entries into a DataFrame with right-padded `input_ids`.

  Args:
    data: sequence of `[input_ids, attention_mask, is_last_chunk, file_name]`
      entries, as produced by `prepare_text_data_for_inference`.
    input_len: length every `input_ids` list is padded to.
    pad_token_id: id appended as padding.

  Returns:
    DataFrame with columns `input_ids`, `attention_mask`, `is_last_chunk`
    and `file_name`, one row per entry. Lists already at least `input_len`
    long are left unchanged (nothing is truncated).
  """
  df = pd.DataFrame({
    "input_ids": [entry[0] for entry in data],
    "attention_mask": [entry[1] for entry in data],
    "is_last_chunk": [entry[2] for entry in data],
    "file_name": [entry[3] for entry in data],
  })

  # A negative repeat count yields [], so over-long rows pass through as-is.
  df["input_ids"] = df["input_ids"].apply(
    lambda ids: ids + [pad_token_id] * (input_len - len(ids))
  )

  return df


# Chunk every cleaned document, then tabulate the chunks (one row per chunk).
data = prepare_text_data_for_inference(documents)
df = prepare_df_for_inference(data)
df.columns  # not the last expression in the cell, so nothing is displayed

# Lift the row limit pandas applies when displaying a DataFrame.
pd.set_option('display.max_rows', None)



In [None]:
import torch
import numpy as np

def perform_inference_on_processed_text(df, batch_size=8):
  """Run the module-level `model` over every row of `df` and store the result.

  Args:
    df: DataFrame with `input_ids` and `attention_mask` columns holding
      equal-length lists of ints (one list per row).
    batch_size: rows sent to the model per forward pass; lower it when
      memory is tight.

  Returns:
    The same `df` object, modified in place with a `predictions` column that
    holds, per row, the argmax over the last dimension of `outputs.logits`.

  Raises:
    ValueError: if `batch_size` is smaller than 1.
  """
  if batch_size < 1:
    raise ValueError("batch_size must be a positive integer")

  model.eval()

  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
  model.to(device)

  input_ids = torch.tensor(df["input_ids"].tolist())
  attention_mask = torch.tensor(df["attention_mask"].tolist())

  predictions = []

  # Gradients are never needed here, so the whole loop runs under no_grad.
  with torch.no_grad():
    for begin in range(0, len(input_ids), batch_size):
      end = begin + batch_size
      outputs = model(
        input_ids=input_ids[begin:end].to(device),
        attention_mask=attention_mask[begin:end].to(device),
      )
      batch_predictions = torch.argmax(outputs.logits, dim=-1)
      # Move back to CPU so the results can live in the DataFrame as lists.
      predictions.extend(batch_predictions.cpu().tolist())

  df["predictions"] = predictions
  return df

# Each document contributes exactly one chunk flagged is_last_chunk == 1,
# so this sum is the number of documents about to be processed.
print(np.sum(df["is_last_chunk"]))
# Adds a "predictions" column to df (the function modifies and returns it).
df = perform_inference_on_processed_text(df)



In [None]:
import numpy as np


def aggregate_predictions(predictions):
  """Group (entity, text) pairs by entity, in a fixed entity order.

  Returns a list of `[entity, texts]` items, one per known entity, where
  `texts` lists the second element of every pair whose first element equals
  that entity, in their original order. Entities with no match get an empty
  list; pairs naming any other entity are ignored.
  """
  entity_order = (
    'BUYER ORG',
    'SELLER ORG',
    'APN',
    'BUYER NAME',
    'SELLER NAME',
    'BUYER ADDRESS',
    'SELLER ADDRESS',
  )
  return [
    [label, [pair[1] for pair in predictions if pair[0] == label]]
    for label in entity_order
  ]


def view_predictions(df):
  """Collapse per-chunk predictions into one row per document.

  Args:
    df: DataFrame with `predictions`, `input_ids`, `is_last_chunk` and
      `file_name` columns, one row per chunk, with the chunks of a document
      stored consecutively and the last one flagged `is_last_chunk == 1`.

  Returns:
    DataFrame with an `IMAGENAME` column (the chunk's `file_name`) plus one
    column per entity that was found in at least one document. Multiple
    values for an entity are joined with " ^ ". Entities not found for a
    document are left missing. Chunks after the final `is_last_chunk == 1`
    row, if any, are dropped.

  Notes:
    Uses the module-level `reverse_entity_mapping`, `find_entities_in_text`,
    `show_sample` and `aggregate_predictions`. Compared with the previous
    version, the unused per-chunk token-to-text reconstruction was removed
    and the result is built once from a list of records instead of calling
    `pd.concat` for every document.
  """
  records = []
  document_level_predictions = []

  chunk_rows = zip(df["predictions"], df["input_ids"], df["is_last_chunk"], df["file_name"])
  for predictions, chunk_input_ids, is_last_chunk, file_name in chunk_rows:
    # Label ids -> label names for every token position of this chunk.
    predictions_mapped = [reverse_entity_mapping[x] for x in np.array(predictions)]
    predicted_masks, predicted_entity_names = find_entities_in_text(predictions_mapped)

    predicted_entities, predicted_text = show_sample(
        np.array(chunk_input_ids), predicted_masks, predicted_entity_names
    )

    # Accumulate (entity, text) pairs until the document's last chunk.
    document_level_predictions.extend(
        (e[0], p) for e, p in zip(predicted_entities, predicted_text)
    )

    if is_last_chunk != 1:
      continue

    entry = {"IMAGENAME": file_name}
    for e, p in aggregate_predictions(document_level_predictions):
      if len(p) > 0:
        entry[e] = " ^ ".join(map(str, p))

    records.append(entry)
    document_level_predictions = []

  return pd.DataFrame(records)

# One row per document with the entities the model found.
predictions_df = view_predictions(df)
# Attach the recording details by file name. The inner join drops any document
# that has no row in recording_details_df; the duplicate key column is removed.
predictions_df = predictions_df.merge(recording_details_df, left_on="IMAGENAME", right_on="filename", how="inner").drop(columns=["filename"])

# File names of every merged document, in row order.
all_files = []
for index, row in predictions_df.iterrows():
    row_dict = row.dropna().to_dict()  # Convert row to dict after dropping NaNs
    imagename = row_dict.pop("IMAGENAME", "Unknown")  # Extract IMAGENAME ("Unknown" if it was dropped as NaN)

    all_files.append(imagename)
    print(f"{imagename}")
    print(f"{row_dict}")  # 'filename' field is no longer here
    # Each print("\n") emits two line breaks, leaving a visible gap between documents.
    print("\n")
    print("\n")


In [None]:
def _longest_candidate(joined):
    """Return the longest "^"-separated piece of `joined`, stripped of outer whitespace."""
    return max(joined.split("^"), key=len).strip()


# Reduce each address cell to its longest candidate. Missing cells stay
# missing: they are dropped before apply and re-aligned by index on assignment.
for _address_column in ("SELLER ADDRESS", "BUYER ADDRESS"):
    # view_predictions only creates an entity column when at least one
    # document produced that entity, so the column may legitimately be absent;
    # indexing it unconditionally raised KeyError.
    if _address_column not in predictions_df.columns:
        continue
    predictions_df[_address_column] = (
        predictions_df[_address_column].dropna().apply(_longest_candidate)
    )



