In [1]:
import boto3
import pandas as pd
from dotenv import load_dotenv
import os
import openai
import json

# Load settings via python-dotenv so that credentials are looked up from the
# process environment instead of being hard-coded in the notebook.
load_dotenv()

# Each lookup yields None when the corresponding variable is not set.
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
AWS_SESSION_TOKEN = os.environ.get('AWS_SESSION_TOKEN')



In [3]:
# Textract client pinned to us-east-1 and authenticated with the AWS
# credentials (key id, secret key, session token) read from the environment.
textract = boto3.client(
    'textract',
    region_name='us-east-1',
    aws_access_key_id=AWS_ACCESS_KEY_ID,
    aws_secret_access_key=AWS_SECRET_ACCESS_KEY,
    aws_session_token=AWS_SESSION_TOKEN,
)

In [4]:
# Run Textract text detection on the resume PDF stored in S3. The returned
# mapping's 'Blocks' list is what the following cells inspect and flatten.
response = textract.detect_document_text(
    Document=dict(
        S3Object=dict(
            Bucket='jf-test-general-bucket',
            Name='test-textract/documents/Data Science Resume.pdf',
        )
    )
)

In [5]:
# Show how many blocks the Textract response contains.
len(response['Blocks'])

347

In [6]:
# Display one block to see its structure.
# NOTE(review): index 16 looks like an arbitrary sample — confirm it has no special meaning.
response['Blocks'][16]

{'BlockType': 'LINE',
 'Confidence': 99.8028793334961,
 'Text': 'AWS (Step',
 'Geometry': {'BoundingBox': {'Width': 0.07312901318073273,
   'Height': 0.012267205864191055,
   'Left': 0.5098416805267334,
   'Top': 0.23090672492980957},
  'Polygon': [{'X': 0.5098416805267334, 'Y': 0.23090672492980957},
   {'X': 0.5829640626907349, 'Y': 0.23090963065624237},
   {'X': 0.5829706788063049, 'Y': 0.24317392706871033},
   {'X': 0.5098479986190796, 'Y': 0.24317088723182678}]},
 'Id': '29e0516e-89e9-4942-8b7f-426895faaf7a',
 'Relationships': [{'Type': 'CHILD',
   'Ids': ['807f9ad1-8a16-4289-8d75-bdaae2d21c3b',
    '0a578188-6a7a-4783-964b-f262f496e2e8']}]}

In [7]:
# Flatten the Textract output into plain text: take the 'Text' of every LINE
# block, in response order, and separate the lines with '\n'.
# str.join only places the separator *between* items, so there is no trailing
# newline to slice off afterwards, and a response without LINE blocks yields ''.
full_text = '\n'.join(
    block['Text']
    for block in response['Blocks']
    if block['BlockType'] == 'LINE'
)

In [8]:
# Preview the first 25 characters of the extracted text.
full_text[:25]

'John Funk\nData Engineer\nf'

In [9]:
# Configure the openai module with the API key read from the environment.
openai.api_key = OPENAI_API_KEY

In [10]:
# System prompt for the Person-extraction request. It tells the model to return
# exactly one Person entity (label, id, role, description) as JSON only, and to
# ignore Position/Company/Education/Skill information.
# NOTE(review): the 100-character cap on `description` (rule 2) is only an
# instruction to the model; nothing in this notebook validates the reply length.
person_prompt_system = """From the Resume text for a job aspirant below, extract Entities strictly as instructed below
1. First, look for the Person Entity type in the text and extract the needed information defined below:
`id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring to this property to define the relationship between entities. NEVER create new entity types that aren't mentioned below. Document must be summarized and stored inside the Person entity under `description` property
    Entity Types:
    label:'Person',id:string,role:string,description:string //Person Node
2. Description property should be a crisp text summary and MUST NOT be more than 100 characters
3. If you cannot find any information on the entities & relationships above, it is okay to return empty value. DO NOT create fictitious data
4. Do NOT create duplicate entities
5. Restrict yourself to extract only Person information. No Position, Company, Education or Skill information should be focussed.
6. NEVER Impute missing values
7. Respond ONLY with output JSON and nothing else
8. Each resume should only contain one person entity
Example Output JSON:
{"entities": [{"label":"Person","id":"person1","role":"Prompt Developer","description":"Prompt Developer with more than 30 years of LLM experience"}]}
"""

# User message: a fixed question header, then the resume text extracted by
# Textract, then an "Answer" cue for the model to complete.
person_prompt_user = (
    "Question: Now, extract the Person for the text below -\n"
    + full_text
    + "\nAnswer\n"
)

# Echo the system prompt as the cell output for a visual check.
person_prompt_system

'From the Resume text for a job aspirant below, extract Entities strictly as instructed below\n1. First, look for the Person Entity type in the text and extract the needed information defined below:\n`id` property of each entity must be alphanumeric and must be unique among the entities. You will be referring this property to define the relationship between entities. NEVER create new entity types that aren\'t mentioned below. Document must be summarized and stored inside Person entity under `description` property\n    Entity Types:\n    label:\'Person\',id:string,role:string,description:string //Person Node\n2. Description property should be a crisp text summary and MUST NOT be more than 100 characters\n3. If you cannot find any information on the entities & relationships above, it is okay to return empty value. DO NOT create fictious data\n4. Do NOT create duplicate entities\n5. Restrict yourself to extract only Person information. No Position, Company, Education or Skill information 

In [11]:
# Single chat-completion request: the system message carries the extraction
# rules and the user message carries the resume text.
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo-16k",
    messages=[
        dict(role="system", content=person_prompt_system),
        dict(role="user", content=person_prompt_user),
    ],
)

In [12]:
# Parse the first choice's message content as JSON and display its 'entities'
# value. This cell raises if the reply is not valid JSON or if the parsed
# object has no 'entities' key.
json.loads(completion.choices[0].message['content'])['entities']

[{'label': 'Person',
  'id': 'person1',
  'role': 'Data Engineer',
  'description': 'Data Engineer at Travelers Insurance with experience in developing streaming data processing pipeline using pySpark for ingesting real-time policy data into a graph structure in Neo4j, developing a POC for a Feature Store solution and machine learning feature engineering pipeline for Data Scientists, and developing an aerial image processing pipeline for ingestion into solar roof classification models'}]