In [1]:
!pip3 install nltk



In [2]:
import os
import warnings
import pandas as pd
import numpy as np
import time

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

import string
import re
import json
from google.cloud import vision
from google.cloud import storage

import traceback
from joblib import Parallel, delayed

# Notebook display limits: wide text columns and up to 500 rows.
pd.set_option('max_colwidth', 500)
pd.set_option('display.max_rows', 500)

# Fetch the NLTK resources used by the tokenizer, stopword list and lemmatizer.
for _nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# NOTE(review): words() is called without a language argument, which
# presumably loads the stopword lists of every language -- confirm that an
# English-only list is not what is wanted here.
stopwords = set(nltk.corpus.stopwords.words())
# Shared WordNet lemmatizer instance.
lemma = nltk.WordNetLemmatizer()

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/jupyter/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [3]:
def async_detect_document(gcs_source_uri, gcs_destination_uri):
    """Run asynchronous Vision OCR on a PDF stored in GCS and return its text.

    The OCR result is written by the Vision API as JSON files under
    ``gcs_destination_uri``; those files are downloaded, their page texts are
    concatenated, and each JSON file is deleted after it has been read.

    Args:
        gcs_source_uri: ``gs://bucket/path/file.pdf`` of the document to OCR.
        gcs_destination_uri: ``gs://bucket/prefix/`` under which the JSON
            output is written (and later deleted).

    Returns:
        A single string with the text of all responses joined by spaces.
        A response without a ``fullTextAnnotation`` contributes the literal
        ``'nan'`` placeholder. Returns ``''`` when no output file is found.

    Raises:
        ValueError: if ``gcs_destination_uri`` is not of the form
            ``gs://<bucket>/<prefix>``.
    """
    # Validate the destination before paying for the OCR job, instead of
    # failing on ``None.group`` after the operation has already run.
    match = re.match(r'gs://([^/]+)/(.+)', gcs_destination_uri)
    if match is None:
        raise ValueError(
            'gcs_destination_uri must look like gs://<bucket>/<prefix>, got: '
            '{!r}'.format(gcs_destination_uri))
    bucket_name, prefix = match.groups()

    # Supported mime_types are: 'application/pdf' and 'image/tiff'
    mime_type = 'application/pdf'

    # How many pages should be grouped into each json output file.
    batch_size = 2

    client = vision.ImageAnnotatorClient()

    feature = vision.Feature(
        type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)

    gcs_source = vision.GcsSource(uri=gcs_source_uri)
    input_config = vision.InputConfig(
        gcs_source=gcs_source, mime_type=mime_type)

    gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)
    output_config = vision.OutputConfig(
        gcs_destination=gcs_destination, batch_size=batch_size)

    async_request = vision.AsyncAnnotateFileRequest(
        features=[feature], input_config=input_config,
        output_config=output_config)

    operation = client.async_batch_annotate_files(
        requests=[async_request])

    print('Waiting for the operation to finish.')
    operation.result(timeout=420)

    # Once the request has completed and the output has been
    # written to GCS, we can list all the output files.
    storage_client = storage.Client()
    bucket = storage_client.get_bucket(bucket_name)

    def _natural_key(blob):
        # Compare digit runs numerically so that a name containing "11"
        # sorts after one containing "3" (plain string order would not).
        return [int(part) if part.isdigit() else part
                for part in re.split(r'(\d+)', blob.name)]

    # List objects with the given prefix, filtering out folders, and read
    # them in numeric (page) order rather than raw listing order.
    blob_list = sorted(
        (blob for blob in bucket.list_blobs(prefix=prefix)
         if not blob.name.endswith('/')),
        key=_natural_key)
    print('Output files:')
    text = []
    for blob in blob_list:
        data = json.loads(blob.download_as_string())
        for page in data['responses']:
            try:
                text.append(page['fullTextAnnotation']['text'])
            except (KeyError, TypeError):
                # Response carries no OCR text: log it and keep a placeholder.
                print(traceback.format_exc())
                text.append('{}'.format(np.nan))
                print(blob.name)
        # The JSON output is only an intermediate artefact; remove it.
        blob.delete()
    # Join once, after the loop; also well-defined when blob_list is empty.
    return " ".join(text)

In [15]:
def remove_url(text):
    """Flatten ``text`` to one line and strip URLs from it.

    Words hyphenated across a line break (``'-\\n'``) are re-joined, the
    remaining newlines become spaces, and anything matching ``http(s)://...``
    or ``www....`` up to the next whitespace is deleted.
    """
    rejoined = text.replace('-\n', '')
    # Encode/decode round-trip with the default codec.
    rejoined = rejoined.encode().decode()
    single_line = rejoined.replace('\n', ' ')
    return re.sub(r'https?://\S+|www\.\S+', r'', single_line)

def remove_html(text):
    """Delete every ``<...>`` span (shortest match, within one line) from ``text``."""
    return re.sub(r'<.*?>', r'', text)

def remove_emoji(text):
    """Strip characters that fall in the listed emoji/symbol code-point ranges.

    Note that the last range (U+24C2 to U+1F251) is very wide, so many
    non-emoji characters inside it are removed as well.
    """
    codepoint_ranges = (
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
    )
    matcher = re.compile("[" + codepoint_ranges + "]+", flags=re.UNICODE)
    return matcher.sub(r'', text)

def clean_stopwords(text, stopwords):
    """Return the items of ``text`` that are not in ``stopwords``, in order."""
    return [token for token in text if token not in stopwords]

def lemmatize(tokens_no_sw, lemma):
    """Apply ``lemma.lemmatize`` to each token and return the results as a list."""
    return list(map(lemma.lemmatize, tokens_no_sw))

def bigram(text):
    """Return the result of ``nltk.bigrams`` over ``text`` as a list."""
    pairs = nltk.bigrams(text)
    return list(pairs)

def cleaning(text):
    """Reduce ``text`` to lowercase letters, single spaces and newlines.

    Steps, in order:
      1. drop every period;
      2. drop bracketed spans ``(...)`` / ``[...]`` (shortest match, one line);
      3. drop every character that is not ``a-z``, a space or a newline
         (this covers digits, thousands separators and punctuation);
      4. collapse runs of spaces into a single space.

    Uppercase letters are removed by step 3, so callers are expected to
    lowercase the text beforehand.

    Args:
        text: input string.

    Returns:
        The cleaned string (not stripped at either end).
    """
    text = text.replace('.', '')
    # Raw strings avoid invalid-escape warnings for \( \[ \) \].
    text = re.sub(r'[\(\[].*?[\)\]]', '', text)
    # Periods are already gone, so the keep-set is just letters, space, newline.
    text = re.sub(r'[^a-z \n]', '', text)
    # Collapse spaces last: removing characters above can leave space runs.
    return re.sub(r' +', ' ', text)

In [16]:
def func_blob(blob, bucket_uri, root, destination_folder):
    """OCR one bucket object if it is a PDF and return a one-row DataFrame.

    Args:
        blob: object exposing a ``name`` attribute (full object path in the
            bucket).
        bucket_uri: ``gs://<bucket>/`` prefix of the bucket holding ``blob``.
        root: listing prefix; it is stripped from ``blob.name`` to build the
            per-document output prefix.
        destination_folder: folder (with trailing ``/``) under which the OCR
            JSON output for each document is written.

    Returns:
        ``DataFrame`` with columns ``name`` (file base name) and ``text``
        (OCR text). For a non-PDF object both cells are ``NaN``.
    """
    # Only objects whose name *ends* in .pdf are documents; a plain substring
    # test would also pick up names such as "x.pdf.bak".
    if not blob.name.lower().endswith('.pdf'):
        return pd.DataFrame({'name': [np.nan], 'text': [np.nan]})

    file_path = blob.name.split('/')[-1]
    print(file_path)

    # Path of the object relative to ``root``; equals the base name for
    # objects directly under ``root`` and keeps sub-folders otherwise.
    if blob.name.startswith(root):
        relative_path = blob.name[len(root):].lstrip('/')
    else:
        relative_path = file_path

    # Address the object by its real path so nested objects resolve correctly.
    gcs_source_uri = bucket_uri + blob.name
    # splitext removes only the final extension, so dotted names such as
    # "my.report.pdf" keep a unique output prefix ("my.report/").
    gcs_destination_uri = (
        bucket_uri + destination_folder
        + os.path.splitext(relative_path)[0] + '/')

    all_text = async_detect_document(gcs_source_uri, gcs_destination_uri)
    return pd.DataFrame({'name': [file_path], 'text': [all_text]})
def main(output_bucket="bucket_name", root='raw_data/',
         destination_folder='vision_ocr_json/'):
    """OCR every PDF under ``root`` in the bucket and build the cleaned frame.

    Each object is processed by ``func_blob`` in a thread pool; the per-object
    frames are concatenated, rows with missing values (non-PDF objects) are
    dropped, and the text is lowercased, cleaned, tokenized, stripped of
    stopwords, lemmatized and turned into bigrams.

    Args:
        output_bucket: name of the GCS bucket to read from and write to.
        root: object-name prefix to list.
        destination_folder: folder under which OCR JSON output is written.

    Returns:
        ``DataFrame`` with columns ``name``, ``text``, ``cleaned``,
        ``word_token``, ``no_stopwords``, ``lemmatize`` and ``bigram``.
    """
    bucket_uri = 'gs://{}/'.format(output_bucket)

    # Instantiates a client
    client = storage.Client()
    # Get GCS bucket
    bucket = client.get_bucket(output_bucket)
    # Get blobs in bucket (including all subdirectories) under the prefix
    blobs = bucket.list_blobs(prefix=root)

    # OCR is network-bound, so threads are used to overlap the waits.
    all_data = Parallel(n_jobs=-1, prefer="threads")(
        delayed(func_blob)(blob, bucket_uri, root, destination_folder)
        for blob in blobs)

    # Concatenate once instead of growing a frame inside a loop.
    frames = list(all_data)
    if frames:
        df = pd.concat(frames, axis=0)
    else:
        df = pd.DataFrame(columns=['name', 'text'])

    # Non-PDF objects come back as all-NaN rows; drop them.
    data = df.dropna().reset_index(drop=True)
    data['text'] = data['text'].map(str.lower)
    data['cleaned'] = (
        data['text']
        .map(remove_url)
        .map(remove_html)
        .map(remove_emoji)
        .map(cleaning)
    )
    data['word_token'] = data['cleaned'].map(word_tokenize)
    data['no_stopwords'] = data['word_token'].map(
        lambda tokens: clean_stopwords(tokens, stopwords))
    data['lemmatize'] = data['no_stopwords'].map(
        lambda tokens: lemmatize(tokens, lemma))
    data['bigram'] = data['lemmatize'].map(bigram)
    return data

In [None]:
# Run the whole OCR + cleaning pipeline and print the elapsed wall-clock seconds.
start = time.time()
data = main()
print('total_time', time.time() - start)
# Write the result to a local CSV, then copy that file to the bucket with gsutil.
data.to_csv('cleaned_data.csv')
!gsutil cp "cleaned_data.csv" "gs://bucket_name/csv/cleaned_data.csv"

AU2002320466B2.pdf
AU2003249081B2.pdf
AU2003262600B2.pdf
AU2004216397B2.pdf
AU2004226599B2.pdf
AU2004290052B2.pdf
AU2004290076B2.pdf
AU2004291896B2.pdfAU2005232297A1.pdf
AU2005245407B9.pdf

AU2005250155B2.pdf
AU2005265323B2.pdf
AU2005265323C1.pdf
AU2005265434B2.pdf
AU2005267202A1.pdf
AU2006203837B2.pdf
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Waiting for the operation to finish.
Output files:
Output files:
Output files:
AU2008217190B2.pdf
Waiting for the operation to finish.
Output

In [None]:
# Show the last rows of the frame returned by main().
data.tail()