In [1]:
#!pip install google-cloud
#!pip install google-cloud-speech
#!pip install google-cloud-storage
#!pip install google-cloud-vision

In [2]:
import urllib.request
from urllib.request import urlretrieve
from google.cloud import storage
from google.cloud import speech
from google.cloud import vision
from google.cloud import language_v1
import yaml
import json
import os
import re

In [3]:
def format_url(document_id):
    """Build the download URL of a document's PDF.

    The path is made of the download host, one segment for each of the
    first four characters of ``document_id``, the full id, and finally
    ``<document_id>.pdf``.

    Raises:
        IndexError: if ``document_id`` has fewer than four characters.
    """
    host = 'https://download.industrydocuments.ucsf.edu'
    # One path segment per leading character, e.g. 'ytxk...' -> y/t/x/k.
    leading_segments = [document_id[position] for position in range(4)]
    segments = [host, *leading_segments, document_id, document_id + '.pdf']
    return '/'.join(segments)

In [4]:
# Download the document's PDF into the local pdfs/ directory.
document_id = 'ytxk0091'
download_url = format_url(document_id)

# urlretrieve does not create missing parent directories, so make sure the
# target folder exists first; otherwise a fresh checkout fails here.
os.makedirs('pdfs', exist_ok=True)

print('downloading', download_url)
urlretrieve(download_url, 'pdfs/' + document_id + ".pdf")

downloading https://download.industrydocuments.ucsf.edu/y/t/x/k/ytxk0091/ytxk0091.pdf


('pdfs/ytxk0091.pdf', <http.client.HTTPMessage at 0x7fcae84eaf50>)

In [5]:
# Read the project settings (credentials path and bucket name) from YAML.
# NOTE(review): yaml.safe_load is probably sufficient if properties.yaml only
# holds plain strings -- confirm before switching.
with open('properties.yaml') as properties_file:
    properties = yaml.full_load(properties_file)

# Export the credentials path before any Google Cloud client is created.
credentials_path = properties['google_application_credentials']
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

# Handle to the storage bucket named in the settings.
bucket_name = properties['bucket_name']
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)

In [6]:
# Upload the downloaded PDF to the bucket under the pdf_files/ folder.
source_file_name = document_id
pdf_file_name = source_file_name + '.pdf'

# A single blob handle is enough: the object name carries the folder prefix.
# (An earlier handle created from the bare document id was never used.)
blob = bucket.blob("pdf_files/" + pdf_file_name)
blob.upload_from_filename('pdfs/' + pdf_file_name)

In [None]:
# Run asynchronous OCR (document text detection) on the uploaded PDF and load
# the first JSON result file written back to the bucket.

# Build the GCS locations from the configured bucket_name rather than a
# hard-coded bucket, so this cell reads the same bucket the upload cell wrote
# to. The destination is used as an object-name prefix when listing results.
prefix = 'pdf_files/' + source_file_name + '.json'
gcs_source_uri = 'gs://' + bucket_name + '/pdf_files/' + source_file_name + '.pdf'
gcs_destination_uri = 'gs://' + bucket_name + '/' + prefix

# Supported mime_types are: 'application/pdf' and 'image/tiff'
mime_type = "application/pdf"

# How many pages should be grouped into each json output file.
batch_size = 10

client = vision.ImageAnnotatorClient()

feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

gcs_source = vision.GcsSource(uri=gcs_source_uri)
input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)

gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
output_config = vision.OutputConfig(
    gcs_destination=gcs_destination, batch_size=batch_size
)

async_request = vision.AsyncAnnotateFileRequest(
    features=[feature], input_config=input_config, output_config=output_config
)

operation = client.async_batch_annotate_files(requests=[async_request])

# Block until the OCR job is done (gives up after 420 seconds).
print("Waiting for the operation to finish.")
operation.result(timeout=420)

# Once the request has completed and the output has been
# written to GCS, we can list all the output files.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)

# List objects with the given prefix, filtering out folders.
blob_list = [
    blob
    for blob in bucket.list_blobs(prefix=prefix)
    if not blob.name.endswith("/")
]

# Fail with a readable message instead of an IndexError further down.
if not blob_list:
    raise RuntimeError("No OCR output files found under " + gcs_destination_uri)

print("Output files:")
for blob in blob_list:
    print(blob.name)

# Process the first output file from GCS.
# The first output file contains the responses for
# the first batch_size pages of the input file.
output = blob_list[0]

json_string = output.download_as_bytes().decode("utf-8")
response = json.loads(json_string)

# The actual response for the first page of the input file.
first_page_response = response["responses"][0]
annotation = first_page_response["fullTextAnnotation"]

In [21]:
# Show the full OCR text of the first page.
# Besides "text", the annotation also nests
# pages/blocks/paragraphs/words/symbols,
# each carrying confidence scores and bounding boxes.

first_page_text = annotation["text"]
print(first_page_text)

To All Jobacco Cos.
Oct. 20/999
I
In my own handwriting,
swear people
are not taking
responsibility for their own
actions. No tone forced any
ong to buy ous that to blame
childish
a co
to smoke or not smoke.
company for cree shove
I quit alcohol 24 year
II was a beer drinkers
ago
I shaved sue Lucky Lagers
for years dost & a devance.
think not! I also had
jaundiect hadn't
solid food for 2.40 3.
Let us get realt
tape response blaming
Yeater
our
others - People or Com-
panies.
52279 1744
Source: https://www.industrydocuments.ucsf.edu/docs/ytxk0091


In [8]:
# Display the parsed JSON of the first OCR output file (large output).
response

{'inputConfig': {'gcsSource': {'uri': 'gs://idl-dsi-bucket/pdf_files/ytxk0091.pdf'},
  'mimeType': 'application/pdf'},
 'responses': [{'fullTextAnnotation': {'pages': [{'property': {'detectedLanguages': [{'languageCode': 'en',
         'confidence': 0.95961183}]},
      'width': 609,
      'height': 808,
      'blocks': [{'boundingBox': {'normalizedVertices': [{'x': 0.28735632,
           'y': 0.22153465},
          {'x': 0.73399013, 'y': 0.18688118},
          {'x': 0.7389163, 'y': 0.22277227},
          {'x': 0.29228243, 'y': 0.25742576}]},
        'paragraphs': [{'boundingBox': {'normalizedVertices': [{'x': 0.28735632,
             'y': 0.22153465},
            {'x': 0.73399013, 'y': 0.18688118},
            {'x': 0.7389163, 'y': 0.22277227},
            {'x': 0.29228243, 'y': 0.25742576}]},
          'words': [{'property': {'detectedLanguages': [{'languageCode': 'en',
               'confidence': 1}]},
            'boundingBox': {'normalizedVertices': [{'x': 0.28735632,
           

In [9]:
# Number of per-page responses in the first output file
# (each output file groups up to batch_size pages).
len(response['responses'])

4

In [10]:
# Inspect which top-level keys the second page's response provides.
response["responses"][1].keys()

dict_keys(['fullTextAnnotation', 'context'])

In [26]:
# Pull the recognised text and the page-level confidence for page 1.
first_page_annotation = response["responses"][0]["fullTextAnnotation"]
page_1_text = first_page_annotation['text']
page_1_confidence = first_page_annotation['pages'][0]['confidence']

In [27]:
# Print the OCR text extracted for page 1.
print(page_1_text)

To All Jobacco Cos.
Oct. 20/999
I
In my own handwriting,
swear people
are not taking
responsibility for their own
actions. No tone forced any
ong to buy ous that to blame
childish
a co
to smoke or not smoke.
company for cree shove
I quit alcohol 24 year
II was a beer drinkers
ago
I shaved sue Lucky Lagers
for years dost & a devance.
think not! I also had
jaundiect hadn't
solid food for 2.40 3.
Let us get realt
tape response blaming
Yeater
our
others - People or Com-
panies.
52279 1744
Source: https://www.industrydocuments.ucsf.edu/docs/ytxk0091


In [28]:
# Print the page-level OCR confidence reported for page 1.
print(page_1_confidence)

0.760178


In [23]:
# Run document-level sentiment analysis on the page-1 OCR text.
client = language_v1.LanguageServiceClient()

# Describe the input: plain text, declared as English.
language = "en"
type_ = language_v1.Document.Type.PLAIN_TEXT
document = dict(content=page_1_text, type_=type_, language=language)

sentiment_request = dict(document=document)
sentiment_response = client.analyze_sentiment(request=sentiment_request)

In [24]:
# Report the document-level sentiment: score first, then magnitude.
document_sentiment = sentiment_response.document_sentiment
print(document_sentiment.score)
print(document_sentiment.magnitude)

-0.6000000238418579
3.9000000953674316
