In [1]:
from PyPDF2 import PdfReader
import os
import string
from google.cloud import storage
import json

In [2]:
def read_pdf(file_location):
    """Return the text of every page of a PDF, joined by newlines.

    Args:
        file_location: Passed straight to ``PdfReader``.

    Returns:
        A single string made of each page's ``extract_text()`` result,
        in page order, separated by ``'\\n'``.
    """
    pdf = PdfReader(file_location)
    return '\n'.join(page.extract_text() for page in pdf.pages)

In [3]:
import string

# Scratch check: see how string.Formatter tokenizes a template that uses an
# unnamed ``{}`` placeholder. Each printed tuple is
# (literal_text, field_name, format_spec, conversion).
unnamed_field_template = """
    In the following resume, summarize the candidate's experience. Keep the response to five or less concise sentences.
    resume: {}
    """
test = string.Formatter().parse(unnamed_field_template)
for parsed_field in test:
    print(parsed_field)

("\n    In the following resume, summarize the candidate's experience. Keep the response to five or less concise sentences.\n    resume: ", '', '', None)
('\n    ', None, None, None)


In [3]:
class DocumentExtractionPromptGenerator:
    """Assemble a text prompt from a persona, a template, and optional trailers.

    The template is a ``str.format`` style string that must contain a
    ``{document}`` field. Every other field must be named and must be supplied
    as a keyword argument to :meth:`generate_prompt`.

    Args:
        persona: Text placed first in the prompt; skipped when falsy.
        prompt_template: Format string containing ``{document}``.
        output_fields: Optional text appended after the rendered template.
        examples: Optional text appended last.
    """

    def __init__(
        self,
        persona,
        prompt_template,
        output_fields=None,
        examples=None
    ):
        self.persona = persona
        self.prompt_template = prompt_template

        self.format_parser = string.Formatter()
        self.output_fields = output_fields
        self.examples = examples

    def _check_all_args_present(self, kwargs):
        """Validate ``kwargs`` against the template's fields.

        Args:
            kwargs: Mapping of template field name to value (without
                ``document``).

        Returns:
            A dict holding only the entries of ``kwargs`` that the template
            actually references.

        Raises:
            ValueError: If ``kwargs`` contains ``document``, the template has
                an unnamed ``{}`` field, a referenced field is missing from
                ``kwargs``, or the template has no ``{document}`` field.
        """
        if 'document' in kwargs:
            raise ValueError('prompt args has document, possible duplication')

        filtered_kwargs = {}
        document_present = False
        for _, field_name, _, _ in self.format_parser.parse(self.prompt_template):
            # ``field_name`` is None for pure-literal chunks. Those occur not
            # only at the end of the template but also after every escaped
            # brace (``{{`` / ``}}``), so keep scanning instead of stopping.
            if field_name is None:
                continue
            if field_name == '':
                raise ValueError('prompt arguments must be named')
            if field_name == 'document':
                document_present = True
                continue
            if field_name not in kwargs:
                raise ValueError('prompt argument {} unspecified'.format(field_name))
            filtered_kwargs[field_name] = kwargs[field_name]

        if not document_present:
            raise ValueError('document must be an argument')
        return filtered_kwargs

    def generate_prompt(self, document, **kwargs):
        """Render the full prompt for one document.

        Args:
            document: Value substituted for ``{document}`` in the template.
            **kwargs: Values for the template's other named fields; extra
                keys that the template does not reference are ignored.

        Returns:
            The persona (if any), the rendered template, the output fields
            (if any) and the examples (if any), joined by blank lines.

        Raises:
            ValueError: See :meth:`_check_all_args_present`.
        """
        filtered_kwargs = self._check_all_args_present(kwargs)

        accum = []
        if self.persona:
            accum.append(self.persona)

        accum.append(self.prompt_template.format(document=document, **filtered_kwargs))

        if self.output_fields:
            accum.append(self.output_fields)
        if self.examples:
            accum.append(self.examples)

        return '\n\n'.join(accum)
    
    

In [5]:
# Persona text placed at the start of every prompt built in this cell.
default_persona = "you are a highly trained social sciences researcher. you strive to be objective but discerning without extrapolating too much. You use precise language and are not verbose"

# Experience prompt: asks for a short summary of the candidate's experience.
resume_experience_prompt_template = """
    In the following resume, summarize the candidate's experience. Keep the response to five or less concise sentences.
    resume: {document}
    """
resume_experience_output_fields = "experiences:"

resume_experience_prompt = DocumentExtractionPromptGenerator(
    persona=default_persona,
    prompt_template=resume_experience_prompt_template,
    output_fields=resume_experience_output_fields,
)


# Industry prompt: asks for the candidate's industries, ordered by experience.
resume_industry_prompt_template = """
    List the industries the candidate has experience in and order them from most to least experience.
    resume: {document}
    """
resume_industry_output_fields = "industries:"

resume_industry_prompt = DocumentExtractionPromptGenerator(
    persona=default_persona,
    prompt_template=resume_industry_prompt_template,
    output_fields=resume_industry_output_fields,
)

# Skills prompt: asks for up to ten skills, ordered by experience.
# The placeholder must be the named ``{document}`` field: the generator rejects
# unnamed ``{}`` fields with "prompt arguments must be named".
resume_skills_prompt_template = """
    List the top 10 skills the candidate has and order them from most to least experience based on what they have written in their resume. If the candidate has less than 10 discernable technical skills, truncate the list.
    resume: {document}
    """
resume_skills_output_fields = "skills:"
resume_skills_prompt = DocumentExtractionPromptGenerator(
    default_persona,
    resume_skills_prompt_template,
    # Use the skills trailer ("skills:"), not the industry one.
    resume_skills_output_fields
)

In [80]:
# Preview the rendered experience prompt. (`resume_summary_prompt` is not
# defined anywhere in this notebook; the generator defined above is
# `resume_experience_prompt`.)
resume_experience_prompt.generate_prompt('my resume')

"you are a highly trained social sciences researcher. you strive to be objective but discerning without extrapolating too much. You use precise language and are not verbose\n\n\n    In the following resume, summarize the candidate's experience. Keep the response to five or less concise sentences.\n    resume: my resume\n    \n\nexperiences:"

In [82]:
def write_batch_job_file(input_document_paths, write_batch_blob, prompt_generator):
    """Write one JSON line per input PDF to an open, writable text handle.

    Args:
        input_document_paths: Iterable of paths handed to ``read_pdf``.
        write_batch_blob: Object with a ``write(str)`` method.
        prompt_generator: Object whose ``generate_prompt(document)`` returns
            the prompt text for a document.

    Each line written is ``{"prompt": <prompt>}`` serialized with
    ``json.dumps`` and terminated by a newline.
    """
    for pdf_path in input_document_paths:
        record = {'prompt': prompt_generator.generate_prompt(read_pdf(pdf_path))}
        write_batch_blob.write(json.dumps(record) + '\n')

In [87]:
# Cloud Storage client used by the bucket/blob cells below.
storage_client = storage.Client()

In [99]:
# A Cloud Storage bucket name cannot contain '/', so the folder path belongs in
# the object (blob) name rather than in the bucket name.
blob = storage_client.bucket('career-catalyst-standard-resume').blob('experience/input/test')

In [102]:
# Smoke test: write the literal text 'test' to a relative local path.
# NOTE(review): presumably this path is the bucket mounted into the notebook's
# filesystem — confirm.
filepath = 'career-catalyst-standard-resume/experience/input/test'
with open(filepath, 'w') as write_file:
    write_file.write('test')

# Bucket/blob access to Cloud Storage is not working from the Vertex AI notebook.
Instead, I rely on the mounted directory, as shown further below.

# A bucket name cannot contain '/': keep only the bucket here and put the
# "<dimension>/input/" folder path into the object name instead.
bucket_name = 'career-catalyst-standard-resume'
bucket = storage_client.bucket(bucket_name)

blob_name = '{dimension}/input/data_analyst_resumes_{dimension}.jsonl'.format(dimension='experience')
blob = bucket.blob(blob_name)

# List the inputs before opening the blob so a listing failure does not leave
# a partially written object; sort because os.listdir order is arbitrary.
input_docs = sorted('Data Analyst/' + file for file in os.listdir('Data Analyst') if file.endswith('.pdf'))
with blob.open("w") as write_file:
    write_batch_job_file(input_docs, write_file, resume_experience_prompt)

In [103]:
# Write the experience batch file to a relative local path, one JSON line per
# PDF found in the 'Data Analyst' directory.
filepath = 'career-catalyst-standard-resume/{dimension}/input/data_analyst_resumes_{dimension}'.format(
    dimension='experience'
)
with open(filepath, 'w') as write_file:
    input_docs = []
    for file in os.listdir('Data Analyst'):
        # Keep only names whose last dot-separated component is exactly 'pdf'.
        if file.split('.')[-1] == 'pdf':
            input_docs.append('Data Analyst/' + file)
    write_batch_job_file(input_docs, write_file, resume_experience_prompt)

incorrect startxref pointer(3)
