In [2]:
import sys
!conda install --yes --prefix {sys.prefix} -c conda-forge poppler
!pip install pdf2image

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

[0m

In [119]:
import boto3
import pandas as pd
import re 
import requests 
import logging 
import time
from pdf2image import convert_from_path
import io
import json
import numpy as np
import os
import pdf2image
from concurrent import futures
import base64

# Disabled import, kept for reference: from PIL import Image
# Passing None removes the column limit, so wide DataFrames are displayed without eliding columns.
pd.set_option('display.max_columns', None)

In [120]:
# --- Model endpoint -------------------------------------------------------
ENDPOINT_NAME = 'n2-data-certification-model-deploy-dev-endpoint'
MODEL_VERSION = '13'

# --- Rekognition request parameters (passed as MaxLabels / MinConfidence) --
MAX_LABELS = 200
MIN_CONFIDENCE = 5

# Label names used as model input columns. The order matters: the columns are
# sent to the endpoint in exactly this order, without a header row.
FEATURES = [
    'Text',
    'Handwriting',
    'Symbol',
    'Number',
    'Paper',
    'Document',
    'Apparel',
    'Clothing',
    'Signature',
    'Autograph',
    'Animal',
    'Letter',
    'Alphabet',
    'Page',
    'Plant',
    'Accessories',
    'Accessory',
    'Food',
    'Envelope',
    'Beverage',
    'Drink',
    'Electronics',
    'Musical Instrument',
    'Word',
    'File Binder',
    'Alcohol',
    'Mail',
    'Footwear',
    'Invertebrate',
    'Furniture',
    'QR Code',
    'Gray',
    'Calligraphy',
    'Jewelry',
    'Broom',
]

# --- AWS handles ----------------------------------------------------------
client_rekognition = boto3.client("rekognition", region_name='us-east-1')
client_sagemaker = boto3.client("runtime.sagemaker")
# Note: despite the name, this is a boto3 *resource*, not a low-level client.
client_s3 = boto3.resource('s3')

In [126]:
# URL of a PDF that is stored as base64 text rather than as raw PDF bytes.
url_s3='https://stage-upload-management-service.s3.amazonaws.com/384ca7bc-728f-4bfb-9cbf-99e70c69d8ea-AUTOREGISTRO.docx.pdf'
# Bound the wait (same 30 s used by detect_labels_from_pdf) and fail fast on an HTTP
# error, so later cells never try to base64-decode an error page.
response = requests.get(url_s3, timeout=30)
response.raise_for_status()

<Response [200]>


In [127]:
"""Decodificacion en base 64 a formatos: PDF/PNG/JPG/BMP"""
# Output file for each supported type marker, listed in the order the markers are checked.
OUTPUT_FILE_BY_TYPE = {
    'pdf;': 'pdf_file.pdf',
    'png;': 'file_png.png',
    'jpeg;': 'file_jpg.jpg',
    'bmp;': 'file_bmp.bmp',
}

in_64 = response.content.decode("utf-8")  # the download is TEXT, not binary

# Split the short header (e.g. "data:application/pdf;") from the base64 payload.
# Everything after the first "base64," is the payload to decode.
header, separator, result = in_64.partition('base64,')
if not separator:
    raise ValueError('The downloaded content has no "base64," header, so it cannot be decoded')

# Find the original file type so the content can be written back in that format.
# Only the header is searched (not the whole payload), and the trailing ';' ensures
# the three/four letters really are the type marker of the header.
x = re.findall(r"(pdf;|png;|jpeg;|bmp;)", header)  # findall returns a list with the markers found
print(x)

output_file = next((name for marker, name in OUTPUT_FILE_BY_TYPE.items() if marker in x), None)
if output_file is None:
    # Previously an unsupported type wrote nothing and later cells failed far from the cause.
    raise ValueError(f'Unsupported file type in header: {header!r}')

with open(output_file, 'wb') as theFile:
    theFile.write(base64.b64decode(result))



['pdf;']


In [81]:
def detect_labels_from_pdf(client, pdf_file, max_labels, min_confidence):
    """Download a PDF, render its first page as JPEG and run label detection on it.

    Args:
        client: Object exposing ``detect_labels`` (the Rekognition client in this notebook).
        pdf_file: URL of the PDF to download.
        max_labels: Forwarded as ``MaxLabels``.
        min_confidence: Forwarded as ``MinConfidence``.

    Returns:
        The value returned by ``client.detect_labels``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
        IndexError: If no page could be rendered from the PDF.
    """
    response = requests.get(pdf_file, timeout=30)
    # Do not hand an HTTP error page to the PDF renderer.
    response.raise_for_status()
    # Only the first page is used, so only the first page is rasterized;
    # rendering every page at 300 dpi is wasted time and memory.
    pages = pdf2image.convert_from_bytes(response.content, dpi=300, first_page=1, last_page=1)
    img = pages[0].convert('RGB')  # JPEG has no alpha channel
    buf = io.BytesIO()
    img.save(buf, format='JPEG')
    byte_im = buf.getvalue()
    return client.detect_labels(Image={'Bytes': byte_im}, MaxLabels=max_labels, MinConfidence=min_confidence)

def detect_labels_from_bytes(client, file, max_labels, min_confidence):
    """Download a file and run label detection on its raw bytes.

    Args:
        client: Object exposing ``detect_labels`` (the Rekognition client in this notebook).
        file: URL of the file to download.
        max_labels: Forwarded as ``MaxLabels``.
        min_confidence: Forwarded as ``MinConfidence``.

    Returns:
        The value returned by ``client.detect_labels``.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    # Same timeout as detect_labels_from_pdf, so a stalled download cannot hang the caller.
    response = requests.get(file, timeout=30)
    # Do not send an HTTP error page to label detection as if it were an image.
    response.raise_for_status()
    return client.detect_labels(Image={'Bytes': response.content}, MaxLabels=max_labels, MinConfidence=min_confidence)

def detect_labels(client, file, max_labels, min_confidence):
    """Run label detection on a remote file, choosing the PDF or raw-bytes path by extension.

    Args:
        client: Object exposing ``detect_labels`` (the Rekognition client in this notebook).
        file: URL of the file to analyse.
        max_labels: Forwarded to the detection call.
        min_confidence: Forwarded to the detection call.

    Returns:
        The detection response, or an empty dict if anything failed (the error is
        printed and logged, never raised).
    """
    from urllib.parse import urlparse  # local import keeps this function self-contained

    try:
        # Read the extension from the URL *path* only, so a query string or fragment
        # (e.g. "file.pdf?X-Amz-Signature=...") does not hide the ".pdf".
        file_extension = urlparse(file).path.split('.')[-1].lower()
        if file_extension == 'pdf':
            return detect_labels_from_pdf(client, file, max_labels, min_confidence)
        return detect_labels_from_bytes(client, file, max_labels, min_confidence)
    except Exception:
        # `Exception` instead of a bare except: KeyboardInterrupt / SystemExit must
        # still be able to stop the kernel.
        print('Error al procesar la imagen en rekognition')
        logging.error("Exception occurred", exc_info=True)
        return {}
    
def query_endpoint(encoded_tabular_data, endpoint_name, content_type='text/csv'):
    """Invoke an endpoint through ``client_sagemaker`` and return the raw response.

    Args:
        encoded_tabular_data: Request body, sent as-is.
        endpoint_name: Name of the endpoint to invoke.
        content_type: Content type of the body; defaults to ``'text/csv'``.

    Returns:
        The unmodified result of ``invoke_endpoint``.
    """
    return client_sagemaker.invoke_endpoint(
        Body=encoded_tabular_data,
        ContentType=content_type,
        EndpointName=endpoint_name,
    )

def parse_response(query_response):
    """Extract the predicted probabilities from an endpoint response.

    Args:
        query_response: Mapping whose ``"Body"`` entry has a ``read()`` method
            returning a JSON document with a ``"probabilities"`` field.

    Returns:
        A NumPy array built from the ``"probabilities"`` field.
    """
    raw_body = query_response["Body"].read()
    payload = json.loads(raw_body)
    return np.array(payload["probabilities"])

def save_rekognition_labels(id_image, response):
    """Store a detection response as JSON in S3, if a destination is configured.

    The destination comes from the environment variables ``BUCKET_REKOGNITION_LABELS``
    and ``FOLDER_REKOGNITION_LABELS``; when either is unset nothing is written.
    The object key is ``<folder>/image_<id_image>.json``.

    Args:
        id_image: Identifier used to build the object key.
        response: JSON-serializable object to store.

    Returns:
        None. Failures while writing are logged, never raised (best effort).
    """
    # Read each variable once so the check and the use cannot disagree.
    bucket = os.getenv('BUCKET_REKOGNITION_LABELS')
    folder = os.getenv('FOLDER_REKOGNITION_LABELS')
    if bucket is None or folder is None:
        return
    try:
        s3object = client_s3.Object(bucket, '{}/image_{}.json'.format(folder, id_image))
        # str.encode already returns bytes; no extra bytes() wrapper is needed.
        s3object.put(Body=json.dumps(response).encode('UTF-8'))
    except Exception:
        # Saving is best effort, but KeyboardInterrupt / SystemExit must not be swallowed.
        logging.error("Exception occurred", exc_info=True)

def _log_step(step, id_image):
    """Report a pipeline step both to the log and to the notebook output."""
    message = f'{step} id_image: {id_image}'
    logging.info(message)
    print(message)


def get_predictions(data_image):
    """Classify one image/PDF: label detection -> feature row -> model endpoint.

    Args:
        data_image: Dict with the keys ``"id_image"`` and ``"url_image"``.

    Returns:
        A dict with ``id_image``, ``url_image``, ``score`` (the probability at
        index 1, i.e. the one paired with the "Aprobado" label),
        ``prediction_label``, ``model_version_cla``, ``end_point`` and
        ``prediction_status``. On success the status is ``"Ok"``; if processing
        fails the score is 0, the label is ``"None"`` and the status carries the
        error message.
    """
    from urllib.parse import urlparse  # local import keeps this function self-contained

    labels = ["Rechazado", "Aprobado"]
    try:
        id_image = data_image["id_image"]
        url_image = data_image["url_image"]
        _log_step('get_predictions', id_image)

        # Call rekognition
        _log_step('call rekognition', id_image)
        response = detect_labels(client_rekognition, url_image, max_labels=MAX_LABELS, min_confidence=MIN_CONFIDENCE)

        # Save rekognition features
        _log_step('save rekognition features', id_image)
        save_rekognition_labels(id_image=id_image, response=response)

        # Create dataframe
        _log_step('create dataframe', id_image)
        if 'Labels' not in response:
            # detect_labels returns {} on failure; report that explicitly instead of
            # surfacing an opaque KeyError message as the prediction status.
            raise ValueError('Rekognition returned no labels for the image')
        df = pd.DataFrame([{i['Name']: i['Confidence'] for i in response['Labels']}])
        # Features that were not detected count as confidence 0.
        for column in FEATURES:
            if column not in df.columns:
                df[column] = 0
        # Read the extension from the URL path only, so a query string cannot hide ".pdf".
        df['file_is_pdf'] = int(urlparse(url_image).path.split('.')[-1].lower() == 'pdf')

        # Endpoint request: one CSV row, columns in the fixed order file_is_pdf + FEATURES.
        _log_step('call sagemaker', id_image)
        query_response_batch = query_endpoint(
            df[['file_is_pdf'] + FEATURES].to_csv(header=False, index=False).encode("utf-8"),
            endpoint_name=ENDPOINT_NAME
        )
        predict_prob = np.concatenate(parse_response(query_response_batch), axis=0)

        _log_step('response', id_image)
        # Build the result once so what is printed is exactly what is returned.
        result = {
            "id_image": id_image,
            "url_image": url_image,
            "score": predict_prob[1],
            "prediction_label": labels[np.argmax(predict_prob)],
            "model_version_cla": MODEL_VERSION,
            "end_point": ENDPOINT_NAME,
            "prediction_status": "Ok"
        }
        print(result)
        return result
    except Exception as err:
        print(f'Error al procesar imagen {data_image["id_image"]}')
        logging.error("Exception occurred", exc_info=True)
        return {
            "id_image": data_image["id_image"],
            "url_image": data_image["url_image"],
            "score": 0,
            "prediction_label": "None",
            "model_version_cla": MODEL_VERSION,
            "end_point": ENDPOINT_NAME,
            "prediction_status": str(err)
        }

In [128]:
# Score one certificate PDF directly from its URL.
get_predictions(data_image={'id_image':2879,
                            "url_image": "https://prod-gcba-us-east-1-upload.s3.amazonaws.com/us-east-1%3Aa55bc7d5-0df0-4f0d-a6ea-8657da1ebb1b/us-east-1%3Aa28bffa3-7707-461b-9265-cfacc6829d9a/certificates/1617395950/UMA.PDF"})

# Unfinished experiment (the original note was cut off mid-sentence). Presumably it checks
# whether the locally decoded 'pdf_file.pdf' could be fed to get_predictions -- TODO confirm.

#abspath = os.path.abspath("pdf_file.pdf")
#print(abspath)

# NOTE(review): this rebinds `in_64` to the first byte of the file, replacing whatever
# an earlier cell stored under that name, and the decoded value on the following line
# is discarded.
with open('pdf_file.pdf', 'rb') as file_64:
    in_64 = file_64.read(1)
    in_64.decode('utf-8')
    #print(in_64.decode('utf-8'))
    #print(type(in_64))
    #print(in_64)
#print(in_64)
#    result = re.sub(r"data.+base64.\b",  ' ',  in_64)

#with open('pdf_file.pdf', 'b') as theFile:
#    theFile.write(base64.b64decode(result))
#print(type(theFile))

#temp = open('pdf_file.pdf', 'r')
#print(type(temp))

# Disabled: start from the converted PDF.
#get_predictions(data_image={'id_image':2879,
#                            "url_image":in_64})



get_predictions id_image: 2879
call rekognition id_image: 2879
<class 'requests.models.Response'>
save rekognition features id_image: 2879
create dataframe id_image: 2879
call sagemaker id_image: 2879
response id_image: 2879
{'id_image': 2879, 'url_image': 'https://prod-gcba-us-east-1-upload.s3.amazonaws.com/us-east-1%3Aa55bc7d5-0df0-4f0d-a6ea-8657da1ebb1b/us-east-1%3Aa28bffa3-7707-461b-9265-cfacc6829d9a/certificates/1617395950/UMA.PDF', 'score': 0.9307692733315528, 'prediction_label': 'Aprobado', 'model_version_cla': '13', 'end_point': 'n2-data-certification-model-deploy-dev-endpoint', 'prediction_status': 'Ok'}
%
<class 'bytes'>


In [131]:
# Sample request (image id plus URL of the file to score).
data_image = {
    'id_image': 2879,
    'url_image': (
        "https://prod-gcba-us-east-1-upload.s3.amazonaws.com/"
        "us-east-1%3Aa55bc7d5-0df0-4f0d-a6ea-8657da1ebb1b/"
        "us-east-1%3Aa28bffa3-7707-461b-9265-cfacc6829d9a/"
        "certificates/1617395950/UMA.PDF"
    ),
}


In [135]:
#print(data_image)

In [136]:
# Run only the label-detection step for the sample request.
response = detect_labels(
    client_rekognition,
    data_image['url_image'],
    max_labels=MAX_LABELS,
    min_confidence=MIN_CONFIDENCE,
)

<class 'requests.models.Response'>


In [137]:
# Display the raw value returned by detect_labels.
response

{'Labels': [{'Name': 'QR Code',
   'Confidence': 91.76808166503906,
   'Instances': [],
   'Parents': []},
  {'Name': 'Text',
   'Confidence': 48.97624206542969,
   'Instances': [],
   'Parents': []},
  {'Name': 'Page',
   'Confidence': 39.8924560546875,
   'Instances': [],
   'Parents': [{'Name': 'Text'}]},
  {'Name': 'File',
   'Confidence': 37.031089782714844,
   'Instances': [],
   'Parents': []},
  {'Name': 'Webpage',
   'Confidence': 29.667753219604492,
   'Instances': [],
   'Parents': [{'Name': 'File'}]},
  {'Name': 'Hoop',
   'Confidence': 27.35634422302246,
   'Instances': [],
   'Parents': []},
  {'Name': 'Coffee Table',
   'Confidence': 26.658458709716797,
   'Instances': [],
   'Parents': [{'Name': 'Table'}, {'Name': 'Furniture'}]},
  {'Name': 'Furniture',
   'Confidence': 26.658458709716797,
   'Instances': [],
   'Parents': []},
  {'Name': 'Table',
   'Confidence': 26.658458709716797,
   'Instances': [],
   'Parents': [{'Name': 'Furniture'}]},
  {'Name': 'Racket',
   'Co

In [139]:
# One-row frame: one column per detected label name, holding that label's confidence.
df = pd.DataFrame(
    [dict((entry['Name'], entry['Confidence']) for entry in response['Labels'])]
)
df

Unnamed: 0,QR Code,Text,Page,File,Webpage,Hoop,Coffee Table,Furniture,Table,Racket,Ceiling Light,Super Mario,Stain,Ticket,Paper,Clinic,Kiosk,Puddle,Id Cards,Document,Crossword Puzzle,Game,Hospital,Bucket,Glasses,Accessories,Accessory,Snowflake,Safe,Gun,Weapon,Weaponry,French Door,Cassette,Oval,Buckle,Piercing,Adapter,Magazine,Watercraft,Transportation,Vehicle,Vessel,Suburb,Building,Urban,File Binder,Scorpion,Animal,Invertebrate,Hanger,Percussion,Musical Instrument,Digital Clock,Clock,Knot,Salamander,Wildlife,Amphibian,Crib,Box,Gift,Light Fixture,Red Wine,Beverage,Alcohol,Wine,Drink,Cylinder,Wire,Mailbox,Letterbox,Pebble,Graphics,Art,Earring,Jewelry,Handwriting,Wood,Crystal,Hearth,Maroon,Cone,Reptile,Tile,Tub,Heart,Necklace,Plastic,Rubble,Tartan,Plaid,Plywood,White,Texture,College,Fork,Cutlery,Electrical Device,Sun Hat,Clothing,Hat,Apparel,Hole,Floor,T-Shirt,Black Hair,Hair,Tree,Plant,Head,Ground,Bench,Number,Symbol,Label,Tabletop,Indoors,Logo,Trademark,Gray,Purple,Brick
0,91.768082,48.976242,39.892456,37.03109,29.667753,27.356344,26.658459,26.658459,26.658459,26.033447,23.949528,23.770763,23.689709,23.018785,23.018785,22.949526,22.624613,22.386595,22.341145,22.341145,22.012026,22.012026,20.758144,20.354124,20.028334,20.028334,20.028334,20.026018,19.885622,19.711262,19.711262,19.711262,19.664825,19.633532,19.528339,19.124769,18.940889,18.499119,18.446083,18.114428,18.114428,18.114428,18.114428,18.108994,18.108994,18.108994,18.004301,17.739925,17.739925,17.739925,17.666777,17.626299,17.626299,17.610298,17.610298,17.401936,17.308025,17.308025,17.308025,17.166578,16.805376,16.603376,16.167921,16.00482,16.00482,16.00482,16.00482,16.00482,15.818394,15.801585,15.735049,15.735049,15.683432,15.54816,15.54816,15.510712,15.510712,15.469807,15.346185,15.280615,14.946071,14.703887,14.570193,14.483565,14.320212,14.200002,14.166963,14.138896,14.000862,13.598641,13.49549,13.49549,13.484771,12.970341,12.970341,12.881462,12.381587,12.381587,12.319483,12.291475,12.291475,12.291475,12.291475,11.419221,10.325684,10.23169,10.229004,10.229004,9.885695,9.885695,9.723143,9.692065,9.593167,9.558621,9.558621,8.975556,8.949109,8.565667,8.255448,8.255448,8.119462,5.959358,5.840192
