<a href="https://colab.research.google.com/github/gwc4github/VectorFormRead/blob/main/VectorFormRead.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import boto3
import json
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Set up AWS credentials and S3 bucket information.
# Credentials are read from the environment rather than hardcoded, so they are
# not leaked when the notebook is shared or committed. When a variable is
# unset the value is None, which is what boto3 uses by default and lets it
# fall back to its own credential lookup.
import os

ACCESS_KEY = os.environ.get('AWS_ACCESS_KEY_ID')
SECRET_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
# The placeholder is kept as the default so the name is always defined.
BUCKET_NAME = os.environ.get('S3_BUCKET_NAME', 'your_bucket_name')

# Set up S3 client and resource
s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3_resource = boto3.resource('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)

# Load the tokenizer and token-classification model used for SER.
# NOTE(review): the checkpoint name looks like a base (not fine-tuned) model, in
# which case the classification head would be untrained — confirm.
model_name = "microsoft/layoutlmv2-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Function to get S3 object
def get_object_s3(bucket, key):
    """Download one S3 object and return the full contents of its body.

    Args:
        bucket: Name of the S3 bucket to read from.
        key: Key of the object inside ``bucket``.

    Returns:
        Whatever ``read()`` on the response body yields (the caller in this
        notebook decodes it as UTF-8).
    """
    response = s3_client.get_object(Bucket=bucket, Key=key)
    body = response['Body']
    return body.read()

# Function to preprocess text
def preprocess_text(text):
    """Run the module-level ``tokenizer`` over ``text``.

    Args:
        text: The document text to encode.

    Returns:
        The tokenizer's encoding, requested as PyTorch tensors
        (``return_tensors='pt'``).
    """
    return tokenizer(text, return_tensors='pt')

# Function to run LayoutLMFT SER model and return results and embeddings
def run_layoutlmft_ser(s3_path):
    """Run the token-classification model on a document stored in S3.

    Args:
        s3_path: Key of the document inside ``BUCKET_NAME``; the object is
            decoded as UTF-8 text.

    Returns:
        A JSON string with two keys: ``'predicted_tags'`` (one label per
        position of the first sequence in the batch) and ``'embeddings'``
        (the model's final hidden states as nested lists).
    """
    # Load document from S3
    document_text = get_object_s3(BUCKET_NAME, s3_path).decode('utf-8')

    # Tokenize the document into model inputs
    encoded_input = preprocess_text(document_text)

    # Inference only: skip autograd bookkeeping. Hidden states must be
    # requested explicitly because a token-classification output exposes
    # `hidden_states`, not `last_hidden_state`.
    with torch.no_grad():
        outputs = model(**encoded_input, output_hidden_states=True)

    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    predicted_tags = [model.config.id2label[tag] for tag in predictions]

    # The last entry of `hidden_states` is the output of the final layer.
    all_embeddings = outputs.hidden_states[-1].tolist()

    # Return results and embeddings as JSON
    results = {'predicted_tags': predicted_tags, 'embeddings': all_embeddings}
    return json.dumps(results)


In [1]:
!pip install torch==1.10
!python -m pip install detectron2 -f \
  https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html

!pip install boto3
!pip install transformers
!pip install urllib3


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.10/index.html
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


NameError: ignored

In [2]:
# This is the new version that can use either an S3 file OR a local file
###
# In the modified code, the get_file_contents() function checks if the input path starts with 's3://'. 
# If it does, the function reads the contents of the S3 object specified by the path. If not, the function assumes 
# the input path is a local file path and reads the contents of the file using Python's built-in open() function.
# In this program, the run_layoutlmft_ser() function takes a local or S3 path to a document as input and
# returns the predicted tags and all embeddings as a JSON object. 
# https://guillaumejaume.github.io/FUNSD/download/
# https://github.com/facebookresearch/detectron2/blob/master/INSTALL.md
###

import boto3
import json
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

print(torch.__version__)
! nvcc --version
! python --version

# Set up AWS credentials and S3 bucket information
ACCESS_KEY = 'your_access_key'
SECRET_KEY = 'your_secret_key'
BUCKET_NAME = 'your_bucket_name'

# Set up S3 client and resource
s3_client = boto3.client('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)
s3_resource = boto3.resource('s3', aws_access_key_id=ACCESS_KEY, aws_secret_access_key=SECRET_KEY)

# Load LayoutLMFT SER model and tokenizer
model_name = "microsoft/layoutlmv2-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Function to get S3 object or local file contents
def get_file_contents(path):
    """Return the text of a document stored in S3 or on the local disk.

    Args:
        path: Either a string starting with ``'s3://'`` or a local file path.

    Returns:
        The document contents as a ``str``, decoded as UTF-8 in both cases.

    Note:
        For S3 paths, everything after the ``'s3://'`` prefix is used as the
        object key inside ``BUCKET_NAME``; no bucket name is parsed out of
        the path.
    """
    s3_prefix = 's3://'
    if path.startswith(s3_prefix):
        obj = s3_client.get_object(Bucket=BUCKET_NAME, Key=path[len(s3_prefix):])
        return obj['Body'].read().decode('utf-8')

    # Read local files as UTF-8 explicitly so the result matches the S3 branch
    # instead of depending on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return f.read()

# Function to preprocess text
def preprocess_text(text):
    """Run the module-level ``tokenizer`` over ``text``.

    Args:
        text: The document text to encode.

    Returns:
        The tokenizer's encoding, requested as PyTorch tensors
        (``return_tensors='pt'``).
    """
    return tokenizer(text, return_tensors='pt')

# Function to run LayoutLMFT SER model and return results and embeddings
def run_layoutlmft_ser(path):
    """Run the token-classification model on a document from S3 or disk.

    Args:
        path: Location of the document, as accepted by
            ``get_file_contents`` (an ``'s3://'`` path or a local file path).

    Returns:
        A JSON string with two keys: ``'predicted_tags'`` (one label per
        position of the first sequence in the batch) and ``'embeddings'``
        (the model's final hidden states as nested lists).
    """
    # Load document from S3 or local file
    document_text = get_file_contents(path)

    # Tokenize the document into model inputs
    encoded_input = preprocess_text(document_text)

    # Inference only: skip autograd bookkeeping. Hidden states must be
    # requested explicitly because a token-classification output exposes
    # `hidden_states`, not `last_hidden_state`.
    with torch.no_grad():
        outputs = model(**encoded_input, output_hidden_states=True)

    predictions = torch.argmax(outputs.logits, dim=2)[0].tolist()
    predicted_tags = [model.config.id2label[tag] for tag in predictions]

    # The last entry of `hidden_states` is the output of the final layer.
    all_embeddings = outputs.hidden_states[-1].tolist()

    # Return results and embeddings as JSON
    results = {'predicted_tags': predicted_tags, 'embeddings': all_embeddings}
    return json.dumps(results)


1.10.0+cu102
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Tue_Mar__8_18:18:20_PST_2022
Cuda compilation tools, release 11.6, V11.6.124
Build cuda_11.6.r11.6/compiler.31057947_0
Python 3.8.10


  warn(f"Failed to load image Python extension: {e}")


RuntimeError: ignored

In [None]:
# Sample input for a manual run of the pipeline.
# NOTE(review): this is an absolute path under a local user directory, so it
# will not resolve on other machines or on Colab — consider a configurable path.
# NOTE(review): the path points at a .png file, while get_file_contents() opens
# local paths in text mode — confirm this is the intended input.
test_file = '/Users/greggwcasey/Google Drive/PycharmProjectsLocal/FUNSD_layoutlmv2/dataset/training_data/images/00040534.png'
# Holds the JSON string returned by run_layoutlmft_ser (predicted tags and embeddings).
embedings = run_layoutlmft_ser(test_file)