### DESCRIPTION:
    This example shows how to extract information from employees' CVs into a predefined JSON format using GPT-3.5.  
    The target JSON format can be found here:   
        https://github.com/denisa-ms/azure-data-and-ai-examples/blob/master/openai/data/CV/expected_format.json

### REQUIREMENTS:
    Create a `.env` file containing your OpenAI API key and save it in the root directory of this project.


In [1]:
from dotenv import load_dotenv
import pandas as pd
import utils
import openai
import json 
from langchain.text_splitter import TokenTextSplitter
from langchain.document_loaders import TextLoader
from langchain.document_loaders import DirectoryLoader

# Load the variables defined in the project's .env file into the process
# environment before any client is created.
load_dotenv()
# Helpers from the project's `utils` module; presumably they configure the
# OpenAI client and an LLM wrapper from the environment — TODO confirm in utils.
# NOTE(review): neither `openai_object` nor `llm` is referenced in the cells
# visible in this part of the notebook.
openai_object = utils.init_OpenAI()
llm = utils.init_llm()

In [3]:
def cleanUpJsonString(text):
    """Return ``text`` with every line-feed character removed.

    Only line feeds are dropped; all other characters, including carriage
    returns and spaces, are kept as they are.
    """
    line_feed = "\n"
    # Splitting on the line feed and re-joining with nothing deletes it.
    pieces = text.split(line_feed)
    return "".join(pieces)

def validateJSON(text):
    """Parse ``text`` as JSON and return the decoded Python value.

    There is no boolean "valid / invalid" result: when ``text`` is not valid
    JSON, the exception raised by ``json.loads`` propagates to the caller.
    """
    return json.loads(text)


def read_file(file_name, encoding="utf-8"):
    """Read a text file from the ``./data/CV/`` directory and return its contents.

    Args:
        file_name: Name of the file, appended to ``./data/CV/``.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding,
            which ``open`` falls back to when no encoding is given.

    Returns:
        The full contents of the file as a string.

    Raises:
        OSError: If the file cannot be opened (for example, it does not exist).
        UnicodeDecodeError: If the file content is not valid in ``encoding``.
    """
    path = "./data/CV/" + file_name
    # An explicit encoding keeps non-ASCII names and characters in CVs intact
    # on systems whose default encoding is not UTF-8.
    with open(path, "r", encoding=encoding) as f:
        return f.read()

In [4]:
# Load the example output format that classifyDocument appends to its system
# prompt.
text = read_file("expected_format.json")
# json.dumps on a str yields a double-quoted JSON string literal with the
# inner quotes and newlines backslash-escaped, so the prompt receives the
# example in escaped form.
# NOTE(review): confirm the escaped form (rather than the raw JSON text) is
# what the model is meant to see.
json_escaped = json.dumps(text)

In [8]:
def classifyDocument(file_name):
    """Extract entities from a CV file and return the model's answer as text.

    The CV is read with ``read_file`` and wrapped in a ChatML-style prompt:
    a system turn (instructions plus the module-level ``json_escaped``
    example), the CV as the user turn, and an opened assistant turn. The
    prompt is sent to ``openai.Completion.create`` and the text of the first
    choice is post-processed with the ``utils`` string helpers.

    Args:
        file_name: Name of the CV file, passed through to ``read_file``.

    Returns:
        The post-processed completion text. Presumably a JSON string in the
        example format, but nothing in this function validates that.
    """
    cv_text = read_file(file_name)

    # The continuation lines of these literals carry four leading spaces that
    # are part of the prompt sent to the model.
    system_turn = """<|im_start|>system
    You are an assistant designed to extract entities from text. Users will input text and you will respond with entities you\'ve extracted from the text as a JSON object. 
    If there are entities you can extract but are not specified in the JSON object, you should add them to the JSON object in the other_info field as key value pairs with the info type and value.
    Here\'s an example of your output format:"""
    user_turn_open = """
    <|im_end|>
    <|im_start|>user
    """
    assistant_turn_open = """<|im_end|>
    <|im_start|>assistant"""
    full_prompt = (
        system_turn + json_escaped + user_turn_open + cv_text + assistant_turn_open
    )

    # NOTE(review): max_tokens is the completion budget only; confirm that the
    # deployment's context window can hold the prompt plus 4096 tokens.
    request_args = {
        "engine": utils.OPENAI_DEPLOYMENT_NAME,
        "prompt": full_prompt,
        "temperature": 0,
        "max_tokens": 4096,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": ["<|im_end|>"],
    }
    completion = openai.Completion.create(**request_args)

    answer = completion['choices'][0]['text']
    answer = utils.remove_chars("\n", answer)
    # NOTE(review): the prompt never asks the model to emit "Answer:"; confirm
    # what start_after_string does when the marker is absent.
    answer = utils.start_after_string("Answer:", answer)
    return utils.remove_tail_tags("<|im_end|>", answer)

In [9]:
# Run the extraction on cv1.txt and hand the returned string to the project's
# JSON pretty-printer.
utils.pretty_print_json_string(classifyDocument("cv1.txt"))

{
  "person_name": "Jordan Zhu",
  "email": "",
  "phone": "",
  "address": "",
  "linkedin_profile": "linkedin.com/in/jordanzhu",
  "education": [
    {
      "university": "University of Michigan",
      "degree": "B.S.E. in Computer Science",
      "start_date": "Sep 2014",
      "end_date": "Dec 2018"
    },
    {
      "university": "Coursera",
      "degree": "Machine Learning Specialization (3 courses)",
      "start_date": "Nov 2022",
      "end_date": "Jan 2023"
    }
  ],
  "work_experience": [
    {
      "company": "Stripe",
      "position": "Solutions Architect",
      "start_date": "Jun 2021",
      "end_date": "Nov 2022"
    },
    {
      "company": "Amazon",
      "position": "Software Engineer",
      "start_date": "Mar 2019",
      "end_date": "Apr 2021"
    }
  ],
  "other_info": [
    {
      "info_type": "Skills",
      "value": "AWS, Python, C++, Java, SQL, React, Node.js, Javascript, HTML, CSS, Ruby"
    }
  ]
}


In [10]:
# Run the extraction on cv17.txt and hand the returned string to the project's
# JSON pretty-printer.
utils.pretty_print_json_string(classifyDocument("cv17.txt"))

{
  "person_name": "Christopher L. Hall",
  "email": "christopher.hall@gmail.com",
  "phone": "(951) 756-5600",
  "address": "",
  "linkedin_profile": "linkedin.com/in/christopherhall",
  "education": [
    {
      "university": "Stanford University",
      "degree": "Bachelor of Science in Computer Science",
      "start_date": "",
      "end_date": "2014"
    }
  ],
  "work_experience": [
    {
      "company": "Jamma Technologies",
      "position": "Cloud Solution Architect",
      "start_date": "May 2017",
      "end_date": "July 2019"
    },
    {
      "company": "Leesing Square Capital Group",
      "position": "Enterprise Solution Architect",
      "start_date": "February 2015",
      "end_date": "May 2017"
    }
  ],
  "other_info": [
    {
      "info_type": "Summary of Qualifications",
      "value": "AWS-certified big data solution architect with 4+ years of experience driving information management strategy. Seeking to leverage high-level understanding of Amazon Web Servi