## Get Files From Storage & Convert to png

#### Install Dependencies

###### Note: pdf2image requires poppler-utils to be installed.
###### Run 'sudo apt-get install poppler-utils'
###### You can verify the installation path by running 'which pdfinfo'


In [None]:
%pip install azure-storage-blob
%pip install pdf2image
%pip install python-dotenv

In [None]:
## Import packages
from dotenv import load_dotenv
import os
from azure.storage.blob import BlobServiceClient
import tempfile
from pdf2image import convert_from_path
from pathlib import Path

#### Get config details

In [None]:
# Read the settings file so its entries become environment variables
load_dotenv('../config.env')

# Storage connection settings, blob path and local image output directory
stg_connection_string = os.environ.get('AZURE_STORAGE_CONNECTION_STRING')
stg_container_name = os.environ.get('CONTAINER_NAME')
stg_blob_path = os.environ.get('BLOB_PATH')
img_path = os.environ.get('IMAGE_PATH')

# Echo the non-secret values as a sanity check (the connection string is not printed)
print(f'Storage container name: {stg_container_name}')
print(f'Storage blob path: {stg_blob_path}')
print(f'Image path: {img_path}')

#### Helper functions to get blob content & Convert pdf to png

In [None]:
# Download the full content of a single blob
def get_blob_content(blob_service_client, container_name, blob_name):
    """Return the entire content of one blob.

    Args:
        blob_service_client: Client object providing ``get_blob_client``.
        container_name: Name of the container that holds the blob.
        blob_name: Name of the blob inside the container.

    Returns:
        The value produced by ``download_blob().readall()`` for that blob.
    """
    # Resolve the blob, start the download, then read everything at once
    downloader = blob_service_client.get_blob_client(container_name, blob_name).download_blob()
    return downloader.readall()

# Convert pdf to png
def convert_pdf_to_image(data, name, output_dir=None):
    """Render every page of a PDF to its own PNG file.

    Pages are written as ``<page index>_<name>.png``, with the page index
    starting at 0.

    Args:
        data: Raw bytes of the PDF document.
        name: Base name used in the output file names.
        output_dir: Directory to write the PNG files to. When omitted, the
            module-level ``img_path`` is used.
    """
    target_dir = img_path if output_dir is None else output_dir

    # exist_ok=True replaces the separate exists() check, so there is no
    # window between checking for the directory and creating it
    os.makedirs(target_dir, exist_ok=True)

    # The converter opens the PDF by path. A file still held open by
    # NamedTemporaryFile cannot be reopened by name on every platform
    # (notably Windows), so write a regular file inside a temporary
    # directory and close it before converting.
    with tempfile.TemporaryDirectory() as temp_dir:
        pdf_path = os.path.join(temp_dir, 'input.pdf')
        with open(pdf_path, 'wb') as temp_pdf:
            temp_pdf.write(data)

        images = convert_from_path(pdf_path)

        # Save each page as a PNG
        for i, image in enumerate(images):
            image.save(os.path.join(target_dir, f"{i}_{name}.png"), "PNG")

#### Pre-Process Invoice files

###### Convert pdf files to images and save them

In [None]:
# Create a BlobServiceClient and a client for the configured container
blob_service_client = BlobServiceClient.from_connection_string(stg_connection_string)
container_client = blob_service_client.get_container_client(stg_container_name)

# Get list of blobs, filtered by the configured blob path
blob_list = container_client.list_blobs(stg_blob_path)

for blob in blob_list:
    print(blob.name)

    # Compare the extension case-insensitively so files such as
    # 'INVOICE.PDF' are converted instead of being silently skipped
    if not blob.name.lower().endswith('.pdf'):
        continue

    # Get blob content
    blob_content = get_blob_content(blob_service_client, stg_container_name, blob.name)

    # Use pathlib.Path to get the file name without directories or extension
    # NOTE(review): blobs in different folders that share a stem would write
    # to the same PNG file names - confirm this cannot happen for the input data
    blob_name = Path(blob.name).stem

    # Save as png
    convert_pdf_to_image(blob_content, blob_name)