In [2]:
!pip install -Uqqq pip --progress-bar off
!pip install -qqq torch==2.1 --progress-bar off
!pip install -qqq transformers==4.34.0 --progress-bar off
!pip install -qqq accelerate==0.23.0 --progress-bar off
!pip install -qqq bitsandbytes==0.41.1 --progress-bar off

[0m

In [1]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    GenerationConfig,
    TextStreamer,
    pipeline,
)

MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto", torch_dtype=torch.float16, load_in_8bit=True
)

generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 4000
generation_config.temperature = 0.0001
generation_config.do_sample = True

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)fetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [2]:
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

In [3]:
llm = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.eos_token_id,
    streamer=streamer,
)

In [29]:
ocr_text="shreeji clinic\\ndr. nirav k. gandhi\\nb.h.m.s.\\n2, honey park apartment,\\nregd. no. g-4163\\nadajan, surat.\\nshivaniben\\nno. 245\\npt. name\\nbill no .:\\nsaini .\\n24/03/28\\ndescription\\namount\\nconsulting charge\\n1000/-\\nvisit charge\\ndressing charge\\nmedicine charge\\n200\\n-\\ntotal\\n1200/-\\nfever\\ndr. nin ---\\nconsultation\\nregd.\\nh.m.s.\\nsigh\\nhoney park, adajan, surat-2\\npagebreak\\n"

In [30]:
text = """<|system|> Given the following raw text, identify and extract the following details in JSON format:

        Data to be Extracted:
        1. Extract the invoice date (in dd-mm-yyyy format).
        2. Extract the name of the healthcare provider (hospital, clinic, lab, etc.).
        3. Extract the name of the doctor who provided the consultation or service.
        4. Extract the name of the customer who took the consultation or service.
        5. Extract the invoice number.
        6. Extract the total bill amount(Amount Paid).
        7. Extract the address of the service provider.
        8. Return a JSON of bill items mentioned in the document text along with the amount.</s>

        ONLY RETURN THE ASSISTANT RESPONSE.

        <|user|> "maatoshree dental clinic\\ndr. siddant jajoo bds, mds\\nperiodontist & implantologist\\nshop no. 8, hari vitthal plaza\\nemail : siddhantjajoo@gmail.com\\nabove federal bank\\nweb : www.maatoshreedentalclinic.com\\nnear ahire gate, shivane\\npune - 23. @ 9765 41 9539\\nname\\nmera\\nupadlyay\\nreceipt\\naddress\\nno.\\n336\\nmob.\\ndate\\n28/3/23\\ntreatment\\n..\\namount\\nroot canal treatment\\n65001\\nmaatoshree dental clinic\\ndr. siddh/psd ez( 45)\\nregistration no. a 18609\\nshivano punc:411023 (03\\n10\\ntotal\\n6500-\\npagebreak\\n"
        </s>
        <|assistant|> {{  "date": "28-03-2023",
                "provider": "Maatoshree Dental Clinic",
                "doctor": "Dr. Siddhant Jajoo",
                "customer": "Mera Upadhyay",
                "invoice": "336",
                "amount": "6500",
                "address": "Shop no. 8, Hari Vitthal Plaza, above federal bank, near Ahire gate, Shivane, Pune- 23",
                "bill_items": "not found",

            }}
        </s>
        <|user|> {ocr_text}
        </s>
        <|assistant|>

          """





In [31]:
%%time
prompt=text.format(ocr_text=ocr_text)
result = llm(prompt)



{
              "date": "24/03/28",
              "provider": "Shreeji Clinic",
              "doctor": "Dr. Nirav K. Gandhi",
              "customer": "Shivaniben",
              "invoice": "245",
              "amount": "1200",
              "address": "2, Honey Park Apartment, Regd. No. G-4163, Adajan, Surat.",
              "bill_items": {
                 "Consulting Charge": "1000/-",
                 "Visit Charge": "200/-",
                 "Dressing Charge": "0/-",
                 "Medicine Charge": "0/-"
              }
          }
       

CPU times: user 59.1 s, sys: 197 ms, total: 59.3 s
Wall time: 1min
