In [1]:
import requests
import json
import pandas
import os
import time

# Loading credentials

## Step 1: Load the client credentials and request an access token

In [2]:
# Load client_id and client_secret from the local credentials file.
path = "adobeservice/credentials.json"
with open(path) as con_file:  # closes the file even if json.load raises
    config = json.load(con_file)

# Exchange the client credentials for an access token.
client_id = config["client_credentials"]["client_id"]
client_secret = config["client_credentials"]["client_secret"]

url = 'https://pdf-services.adobe.io/token'
headers = {
    'Content-Type': 'application/x-www-form-urlencoded'
}
data = {
    'client_id': client_id,
    'client_secret': client_secret
}

response = requests.post(url, headers=headers, data=data)
# Fail with the HTTP error itself rather than a KeyError on "access_token"
response.raise_for_status()
token = response.json()["access_token"]

## Step 2: Uploading an asset

### Generating the upload URI

In [3]:
def upload_asset(url):
    """Ask the service at ``url`` to create a PDF asset.

    Sends a POST request authenticated with the module-level ``client_id``
    and ``token`` and declares the media type ``application/pdf``.

    Args:
        url: Endpoint the asset request is posted to.

    Returns:
        The decoded JSON body of the response. Later cells read the
        ``uploadUri`` and ``assetID`` keys from it.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    headers = {
        'X-API-key': client_id,
        'Authorization': f"Bearer {token}",
        'Content-Type': 'application/json',
    }
    data = {"mediaType": "application/pdf"}

    response = requests.post(url, headers=headers, json=data)
    # Surface HTTP failures here instead of returning an error payload
    # that only breaks later when "uploadUri" is looked up.
    response.raise_for_status()
    return response.json()

In [4]:
# Create the asset; the returned dict is read by later cells ("uploadUri", "assetID")
asset = upload_asset("https://pdf-services.adobe.io/assets")

### Loading file

In [5]:
def load_file(file_name):
    """Upload a local file to the upload URI of the module-level ``asset``.

    Args:
        file_name: Path of the file to send; it is opened in binary mode.

    Returns:
        The ``requests`` response of the PUT request. The status is not
        checked here; callers inspect ``status_code`` themselves.
    """
    upload_uri = asset["uploadUri"]
    headers = {'Content-Type': 'application/pdf'}

    # The context manager closes the handle once the upload finishes or fails.
    with open(file_name, 'rb') as file:
        response = requests.put(upload_uri, headers=headers, data=file)

    return response

In [6]:
# Upload the local PDF and keep the HTTP status code of the PUT request
loading = load_file("test.pdf")
loading_status = loading.status_code

## Step 3: Creating the job

In [7]:
def extract_job(url):
    """Submit a text-extraction job for the module-level ``asset``.

    Args:
        url: Endpoint the job request is posted to.

    Returns:
        The response headers. A later cell reads the ``location`` entry
        from them to poll the job.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    headers = {
        'X-API-key': client_id,
        'Authorization': f"Bearer {token}",
    }

    # NOTE(review): the flags are sent as the strings "false", not JSON
    # booleans - confirm against the API reference that this is accepted.
    params = {
        "assetID": asset["assetID"],
        "getCharBounds": "false",
        "includeStyling": "false",
        "elementsToExtract": ["text"],
        "includeHeaderFooter": "false",
    }

    response = requests.post(url, headers=headers, json=params)
    # A rejected job has no usable "location" header; raise the HTTP error
    # here rather than letting the polling cell fail on a missing key.
    response.raise_for_status()
    return response.headers

In [8]:
# Start the extract job; `job` holds the response headers, whose "location" entry is polled below
job = extract_job("https://pdf-services-ue1.adobe.io/operation/extractpdf")

## Step 4: Fetching the status

In [9]:
def fetch_status(location):
    """Fetch the current state of a job from its status URL.

    Args:
        location: URL to query, authenticated with the module-level
            ``client_id`` and ``token``.

    Returns:
        The decoded JSON body of the response. The polling cell reads its
        ``status`` key and, once finished, ``content.downloadUri``.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    headers = {
        'X-API-key': client_id,
        'Authorization': f"Bearer {token}",
    }

    response = requests.get(location, headers=headers)
    # Raise on HTTP errors instead of decoding an error body as job state
    response.raise_for_status()
    return response.json()

In [13]:
# Poll the job until it leaves the "in progress" state.
status = "in progress"

while status == "in progress":
    fetch = fetch_status(job["location"])
    status = fetch["status"]
    if status == "in progress":
        # Report and wait only while the job is really still running
        print("job in progress")
        time.sleep(5)

if status != "done":
    # The download URI is only read from a "done" response; stop with a
    # clear error instead of a KeyError on "content".
    raise RuntimeError(f"extract job ended with status {status!r}: {fetch}")

urin = fetch["content"]["downloadUri"]

job in progress


## Step 5: Downloading the asset

In [14]:
def download(urin):
    """Download the JSON document behind ``urin``.

    Args:
        urin: URL to fetch; the request is sent without extra headers.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    response = requests.get(urin)
    # Raise on HTTP errors instead of trying to decode an error body as JSON
    response.raise_for_status()
    return response.json()

In [15]:
# Download the extraction result and keep its "elements" entry for the next step
extracted_pdf = download(urin)
pdf = extracted_pdf["elements"]

## Retrieving text elements

In [18]:
# Collect the "Text" value of every element that has one. Entries without
# a "Text" key (or that are not dicts) are skipped explicitly, so unrelated
# errors are no longer hidden by a bare except.
text_elements = [
    element["Text"]
    for element in pdf
    if isinstance(element, dict) and "Text" in element
]

In [19]:
# Show the collected text strings as the cell output
text_elements

['Contents lists available at (<https://www.elsevier.com/locate/egyr>)ScienceDirect ',
 'Energy Reports ',
 'journal homepage: (<http://www.elsevier.com/locate/egyr>)www.elsevier.com/locate/egyr ',
 'Review article ',
 'Energetics Systems and artificial intelligence: Applications of industry 4.0 ',
 '∗',
 'Tanveer Ahmad b (<>), Hongyu Zhu a (<>), Dongdong Zhang a(<>),, Rasikh Tariq c (<>), A. Bassam c (<>), Fasee Ullah d (<>), Ahmed S AlGhamdi e (<>), Sultan S. Alshamrani f (<>)',
 'a School of Electrical Engineering, Guangxi University, Nanning, 530004, China ',
 'b Energy and Electricity Research Center, International Energy College, Jinan University, Zhuhai, Guangdong Province, 519070, China ',
 'c Facultad de Ingeniería, Universidad Autónoma de Yucatán, Av. Industrias No Contaminantes por Anillo Periférico Norte, Apdo. Postal 150, Cordemex, Mérida 97203, Yucatán, Mexico ',
 'd Department of Computer Science and IT, Sarhad University of Science and Information Technology, Pakistan '