In [None]:
!pip install -U flash_attn==2.5.8 numpy==1.24.4 Pillow==10.3.0 Requests==2.31.0 torch==2.3.0 torchvision==0.18.0 transformers==4.40.2 accelerate

In [None]:
from PIL import Image
import requests
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor

In [None]:
# Hugging Face Hub identifier of the checkpoint used by every cell below.
model_id = "microsoft/Phi-3-vision-128k-instruct"

# `trust_remote_code=True` lets transformers execute the modelling code that
# ships with the checkpoint repository; only keep it for sources you trust.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
    device_map="cuda",
)

# The processor is loaded from the same repository as the model and is used
# below both for building prompts (via its tokenizer) and for decoding output.
processor = AutoProcessor.from_pretrained(
    model_id,
    trust_remote_code=True,
)


In [None]:
def extract_applicant(image, max_new_tokens=500):
    """Ask the vision model for the Applicant's details on a verification form.

    The chat is seeded with a canned first exchange that describes the layout
    of the Authorized Representative Identity Verification Form, followed by a
    request for the Applicant's fields in JSON.

    Uses the notebook-level ``model`` and ``processor`` objects.

    Args:
        image: The form image. It is handed to the processor as a one-element
            image list and matched to the ``<|image_1|>`` tag in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 500, the value that was previously hard-coded.

    Returns:
        str: The decoded model reply with the prompt tokens stripped. The text
        is returned as-is; it is not parsed or validated as JSON here.
    """
    messages = [
        {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
        {"role": "assistant", "content": """The image shows a completed Authorized Representative Identity Verification Form.\n
                                      Near the top, you have Authorized Representative's name, address, city, state, zip code and telephone number \n
                                      they are followed by the Applicant's name, address, city, state, zip code, birthdate,\n
                                      Social Security Number and phone number\n"""},
        {"role": "user", "content": "Provide the Applicant's name, address, birthdate, phone number and Social Security Number in a JSON format."},
    ]

    prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Move the inputs to the device the model actually lives on instead of
    # hard-coding "cuda:0".
    inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

    # Greedy decoding. `temperature` is deliberately not passed: it is only
    # used when sampling, and supplying it together with do_sample=False makes
    # transformers emit a configuration warning on every call.
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # generate() returns prompt + completion; keep only the newly generated part.
    prompt_length = inputs['input_ids'].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    return response

In [None]:
url = "https://raw.githubusercontent.com/chandc/Transformer_OCR/6346c0ceab6477939b3022dd4299354a6ab2c7e9/data/form_1.jpeg" # Modified URL to point directly to the raw image file

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. 404) directly instead of
# letting PIL fail later on an HTML error page.
http_response = requests.get(url, stream=True, timeout=30)
http_response.raise_for_status()
# The raw stream is not content-decoded by default; enable it so a gzip/deflate
# encoded body is still readable as an image.
http_response.raw.decode_content = True
image = Image.open(http_response.raw)
image

In [None]:
# Run the Applicant extraction on the current notebook-level `image` and show
# the model's raw text reply.
applicant_reply = extract_applicant(image)
print(applicant_reply)

In [None]:

url = "https://raw.githubusercontent.com/chandc/Transformer_OCR/6346c0ceab6477939b3022dd4299354a6ab2c7e9/data/form_2.jpg"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. 404) directly instead of
# letting PIL fail later on an HTML error page.
http_response = requests.get(url, stream=True, timeout=30)
http_response.raise_for_status()
# The raw stream is not content-decoded by default; enable it so a gzip/deflate
# encoded body is still readable as an image.
http_response.raw.decode_content = True
image = Image.open(http_response.raw)
image

In [None]:
# Run the Applicant extraction on the current notebook-level `image` and show
# the model's raw text reply.
applicant_reply = extract_applicant(image)
print(applicant_reply)

In [None]:
def extract_AR(image, max_new_tokens=500):
    """Ask the vision model for the Authorized Representative's details.

    The chat is seeded with a canned first exchange that describes the layout
    of the Authorized Representative Identity Verification Form, followed by a
    request for the Authorized Representative's fields in JSON.

    Uses the notebook-level ``model`` and ``processor`` objects.

    Args:
        image: The form image. It is handed to the processor as a one-element
            image list and matched to the ``<|image_1|>`` tag in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 500, the value that was previously hard-coded.

    Returns:
        str: The decoded model reply with the prompt tokens stripped. The text
        is returned as-is; it is not parsed or validated as JSON here.
    """
    messages = [
        {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
        {"role": "assistant", "content": """The image shows a completed Authorized Representative Identity Verification Form.\n
                                      Near the top, you have Authorized Representative's name, address, city, state, zip code and telephone number \n
                                      they are followed by the Applicant's name, address, city, state, zip code, birthdate,\n
                                      Social Security Number and phone number\n"""},
        {"role": "user", "content": "Provide the Authorized Representative's name, street address, city, state, zip and phone number in a JSON format."},
    ]

    prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # Move the inputs to the device the model actually lives on instead of
    # hard-coding "cuda:0".
    inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

    # Greedy decoding. `temperature` is deliberately not passed: it is only
    # used when sampling, and supplying it together with do_sample=False makes
    # transformers emit a configuration warning on every call.
    generate_ids = model.generate(
        **inputs,
        eos_token_id=processor.tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )

    # generate() returns prompt + completion; keep only the newly generated part.
    prompt_length = inputs['input_ids'].shape[1]
    generate_ids = generate_ids[:, prompt_length:]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

    return response


In [None]:
# Run the Authorized Representative extraction on the current notebook-level
# `image` and show the model's raw text reply.
representative_reply = extract_AR(image)
print(representative_reply)

In [None]:


url = "https://raw.githubusercontent.com/chandc/Transformer_OCR/6346c0ceab6477939b3022dd4299354a6ab2c7e9/data/nyDL.png"

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. 404) directly instead of
# letting PIL fail later on an HTML error page.
http_response = requests.get(url, stream=True, timeout=30)
http_response.raise_for_status()
# The raw stream is not content-decoded by default; enable it so a gzip/deflate
# encoded body is still readable as an image.
http_response.raw.decode_content = True
image = Image.open(http_response.raw)
image


In [None]:
# Ask the model for the main driver-license fields as JSON, using the current
# notebook-level `image`. The first user/assistant exchange is a canned primer
# describing the document; only the last user turn is the real question.
messages = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": """The image shows a Driver License with an individual's name, street address, city, state, zip code
    and other attributes"""},
    {"role": "user", "content": "Provide the individual's name, street address, city, state, zip, DOB and Height in a JSON format."}
]

prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Move the inputs to the device the model actually lives on instead of
# hard-coding "cuda:0".
inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

# Greedy decoding. `temperature` is deliberately omitted: it is only used when
# sampling, and passing it together with do_sample=False makes transformers
# emit a configuration warning.
generation_args = {
    "max_new_tokens": 500,
    "do_sample": False,
}

generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

# generate() returns prompt + completion; keep only the newly generated part.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(response)

In [None]:
# Ask the model who issued the driver license, using the current notebook-level
# `image`. The first user/assistant exchange is a canned primer describing the
# document; only the last user turn is the real question.
messages = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": """The image shows a Driver License with an individual's name, street address, city, state, zip code
    and other attributes"""},
    {"role": "user", "content": "Who issued the Driver License?."}
]

prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Move the inputs to the device the model actually lives on instead of
# hard-coding "cuda:0".
inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

# Greedy decoding. `temperature` is deliberately omitted: it is only used when
# sampling, and passing it together with do_sample=False makes transformers
# emit a configuration warning.
generation_args = {
    "max_new_tokens": 500,
    "do_sample": False,
}

generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

# generate() returns prompt + completion; keep only the newly generated part.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(response)

In [None]:
# Ask the model for the driver-license ID, using the current notebook-level
# `image`. The first user/assistant exchange is a canned primer describing the
# document; only the last user turn is the real question.
messages = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": """The image shows a Driver License with an individual's name, street address, city, state, zip code
    and other attributes"""},
    {"role": "user", "content": "What is the Driver License ID?."}
]

prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Move the inputs to the device the model actually lives on instead of
# hard-coding "cuda:0".
inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

# Greedy decoding. `temperature` is deliberately omitted: it is only used when
# sampling, and passing it together with do_sample=False makes transformers
# emit a configuration warning.
generation_args = {
    "max_new_tokens": 500,
    "do_sample": False,
}

generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

# generate() returns prompt + completion; keep only the newly generated part.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(response)

In [None]:
# Ask the model whom the driver license was issued to, using the current
# notebook-level `image`. The first user/assistant exchange is a canned primer
# describing the document; only the last user turn is the real question.
messages = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": """The image shows a Driver License with an individual's name, street address, city, state, zip code
    and other attributes"""},
    {"role": "user", "content": "Whom is the Driver License issued to?."}
]

prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Move the inputs to the device the model actually lives on instead of
# hard-coding "cuda:0".
inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

# Greedy decoding. `temperature` is deliberately omitted: it is only used when
# sampling, and passing it together with do_sample=False makes transformers
# emit a configuration warning.
generation_args = {
    "max_new_tokens": 500,
    "do_sample": False,
}

generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

# generate() returns prompt + completion; keep only the newly generated part.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(response)


In [None]:
# Ask the model for an extended set of driver-license fields (split name plus
# ID number) as JSON, using the current notebook-level `image`. The first
# user/assistant exchange is a canned primer describing the document; only the
# last user turn is the real question.
messages = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": """The image shows a Driver License with an individual's name, street address, city, state, zip code
    and other attributes"""},
    {"role": "user", "content": "Provide the individual's last name, first name, ID number, street address, city, state, zip, DOB and Height in a JSON format."}
]

prompt = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Move the inputs to the device the model actually lives on instead of
# hard-coding "cuda:0".
inputs = processor(prompt, [image], return_tensors="pt").to(model.device)

# Greedy decoding. `temperature` is deliberately omitted: it is only used when
# sampling, and passing it together with do_sample=False makes transformers
# emit a configuration warning.
generation_args = {
    "max_new_tokens": 500,
    "do_sample": False,
}

generate_ids = model.generate(**inputs, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)

# generate() returns prompt + completion; keep only the newly generated part.
generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

print(response)