In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import json
import os
import openai
import asyncio
from tqdm.notebook import tqdm
from openai import AsyncOpenAI

<img src="https://images.unsplash.com/photo-1517836357463-d25dfeac3438?q=80&w=2940&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" width="500" height="300"/>

### 프로젝트 목적 : 호텔의 fitness program에 투입될 신규 인원 선발!

In [None]:
# initialize openai
# SECURITY: an API key must never be hard-coded in the notebook. Any key that
# was ever committed in this cell has to be treated as compromised and revoked.
# Supply the key through the OPENAI_API_KEY environment variable instead
# (e.g. `export OPENAI_API_KEY=...` before starting Jupyter).
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this notebook."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

# Module-level synchronous client built from the key configured above.
client = openai.OpenAI()

In [None]:
df = pd.read_csv("resume/Resume.csv")
df.shape

In [None]:
df.Category.unique()

In [None]:
df = df.loc[df.Category.isin(['CHEF', 'FITNESS'])].reset_index(drop=True)

In [None]:
df.shape

In [None]:
print(df.loc[10, 'Resume_str'])

---

### 정보 추출하기 step 1 : 필요한 정보 명확히 정의하기
- 우리가 원하는 후보자를 search하기 위해 필요한 정보
- 모든 이력서에 공통적으로 확보할 수 있는 정보

### 정보 추출하기 step 2 : 필요한 정보 추출을 위한 방법 구상하기
- chat completion을 활용할 수 있기 때문에, 생각의 틀을 넓혀서 추출을 위한 최적의 방법을 생각
- chat completion 이외에도 regex나 NER 등 다양한 방법을 활용해도 됨

In [None]:
# Extraction prompt sent once per resume. The single `{}` placeholder is filled
# with the raw resume text via `prompt.format(...)`. The requested keys
# ('skills', 'work experience (years)', 'summary') are the ones the later
# embedding loop checks for, so they must not be renamed here.
prompt = """Given the following resume text, 
extract and categorize the information into the specified categories: skills, work experience in years, and summary of each project. 
Please provide the extracted information in a dictionary format with the keys as 'skills', 'work experience (years)' and 'summary'.

Instructions:

    Skills: Identify and list all professional skills mentioned in the resume. Each element should be a word such as 'Python' or 'CSS'
    Work Experience (years): Total years of experience. It should be a number such as '7' or '10'. Leave it empty if there is no related information.
    Summary : Each career should be summarized in one sentence. 
              Each sentence should be in a format of 'Worked as <job_title> from <start_date> to <end_date>, doing <work description> and accomplishing <accomplishments>'.
              Put in 'empty' for each blank if there is no relevant information.
    
Ensure that the information is accurately extracted and categorized according to the instructions. If certain information is not available or cannot be accurately determined, please indicate so appropriately.

Resume Text:
{}
"""


이 프롬프트는 주어진 이력서 텍스트에서 특정 정보를 추출하고 카테고리화하는 작업을 설명하고 있습니다. 사용자에게 이력서로부터 정보를 분류하고 사전 형식으로 제공하도록 지시하고 있으며, 이때 주요 카테고리는 '기술(skills)', '근무 경험 연수(work experience in years)', 그리고 '프로젝트 요약(summary)'입니다.

카테고리 별 상세 설명:
기술 (Skills):
이력서에서 언급된 모든 전문 기술을 식별하고 나열하라고 지시하고 있습니다.
각 요소는 'Python', 'CSS'와 같은 단어로 표현되어야 합니다.
근무 경험 연수 (Work Experience in years):
전체 근무 경험 연수를 숫자로 나타내야 합니다 (예: '7', '10').
관련 정보가 없을 경우 이 항목은 비워두라고 합니다.
프로젝트 요약 (Summary):
각 경력에 대해 한 문장으로 요약하라고 지시하고 있습니다.
요약 문장은 'Worked as <직무명> from <시작 날짜> to <종료 날짜>, doing <업무 설명> and accomplishing <성과>' 형식을 따라야 합니다.
관련 정보가 없는 경우 'empty'로 채워 넣으라고 합니다.
프롬프트의 목적:
이 프롬프트는 사용자가 이력서 내용을 정확하고 체계적으로 분석하고, 추출된 정보를 명시된 형식에 맞게 정리하도록 요구합니다. 만약 특정 정보를 정확히 파악하기 어렵거나 이용할 수 없는 경우, 이를 적절히 표시하도록 지시하고 있습니다. 이는 데이터 추출 및 처리 작업에서 정확성과 투명성을 확보하기 위함입니다.

In [None]:
async def chat_completion(input_prompt, model='gpt-4-turbo-preview'):
    """Send one prompt to the chat-completions API asynchronously.

    The conversation is primed with a system prompt plus one fixed
    user/assistant exchange before the actual prompt, and the request asks
    for a JSON-object response format.

    Args:
        input_prompt: Fully formatted user prompt to send.
        model: Name of the chat model to call.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    SYSTEM_PROMPT = "You are a smart and intelligent program that understands information inside a resume, designed to output JSON"
    USER_PROMPT_1 = """Are you clear about your role?"""
    ASSISTANT_PROMPT_1 = """Sure, I'm ready to help you with your NER task. Please provide me with the necessary information to get started."""

    # A client is created per call; scoping it with `async with` closes its
    # underlying HTTP resources as soon as the request finishes instead of
    # leaving one open client behind for every resume processed.
    async with AsyncOpenAI() as client:
        response = await client.chat.completions.create(
            model=model,
            response_format={ "type": "json_object" },
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": USER_PROMPT_1},
                {"role": "assistant", "content": ASSISTANT_PROMPT_1},
                {"role": "user", "content":input_prompt}
            ]
            )
    return response

async def run_async(main_prompt, information):
    """Run ``chat_completion`` concurrently for every item in ``information``.

    Args:
        main_prompt: Template string; each item is inserted with ``.format``.
        information: Iterable of values to substitute into the template.

    Returns:
        List of responses, in the same order as ``information``.
    """
    pending = []
    for item in information:
        pending.append(chat_completion(main_prompt.format(item)))
    return await asyncio.gather(*pending)

def normal_chat_completion(input_prompt, model='gpt-4-turbo-preview'):
    """Send one prompt to the chat-completions API synchronously.

    Args:
        input_prompt: Fully formatted user prompt to send.
        model: Name of the chat model to call.

    Returns:
        The response object returned by ``client.chat.completions.create``.
    """
    # Scope the per-call client with `with` so its HTTP resources are closed
    # when the request is done rather than lingering until garbage collection.
    with openai.OpenAI() as client:
        response = client.chat.completions.create(
            model=model,
            response_format={ "type": "json_object" },
            messages=[
                {"role": "system", "content": 'You are a smart and intelligent program that understands information and provides output in JSON format'},
                {"role": "user", "content":input_prompt}
            ]
            )
    return response

async (13분 이내)

- with async : 15개당 40~50초
- without async : 15개당 ~180초 (1개당 ~12초)

In [None]:
# Split the resume texts into consecutive chunks of 15; each chunk is sent as
# one group of concurrent requests.
batches = [df.Resume_str[i : i+15].values.tolist() for i in range(0, len(df.Resume_str), 15)]
# Flat list of raw API responses, in the same order as the rows of `df`.
outputs = list()

for batch in tqdm(batches):
    # Top-level `await` works here because this runs as a notebook cell.
    # Chunks are processed one after another; requests inside a chunk run
    # concurrently via run_async.
    output = await run_async(prompt, batch)
    outputs.extend(output)

In [None]:
# Take the message text of the first choice of every response and decode it
# from JSON into a Python object.
extracted_info = [
    json.loads(response.choices[0].message.content)
    for response in outputs
]

In [None]:
extracted_info[0]

In [None]:
# for i,info in enumerate(extracted_info):
#     extracted_info[i]['ID'] = str(df.loc[i, 'ID'])
#     extracted_info[i]['title'] = df.loc[i, 'title']

In [None]:
# with open("resume/resume_info_extracted.json", 'w') as file:
#     json.dump(extracted_info, file)

In [None]:
with open("resume/resume_info_extracted.json", 'r') as file:
    data = json.load(file)

In [None]:
data[0]

---

### Convert into embeddings

In [None]:
from utils import create_embeddings

In [None]:
# Build one dict per extracted resume record in which the text fields are
# replaced by whatever `create_embeddings` returns for them.
emb_data = list()

for d in tqdm(data):
    emb_d = dict()
    for k, v in d.items():
        if k in ['skills', 'summary', 'title']:
            # Text fields: the whole value is passed to create_embeddings at once.
            # NOTE(review): create_embeddings comes from utils; its return shape
            # is not visible here — confirm before relying on it.
            emb_ = create_embeddings(v) # embed the whole list in one call
            emb_d[k] = emb_
        elif k in ['work experience (years)', 'ID']:
            # Non-text fields are copied through unchanged.
            emb_d[k] = v
        else:
            # Any other key means the extracted record has an unexpected schema.
            assert False, "Incorrect key"
    emb_data.append(emb_d)

In [None]:
# with open("resume/resume_info_extracted_emb.json", 'w') as file:
#     json.dump(emb_data, file)

In [None]:
with open("resume/resume_info_extracted_emb.json", 'r') as file:
    emb_data = json.load(file)

In [None]:
emb_data[0].keys()

In [None]:
emb_data[0]

### Example search

- skill
    - threshold를 정해서, 가장 유사하다고 판단되는 것을 가져온다. 점수는 유사도의 평균 또는 threshold를 넘는 항목이 몇 개 일치하는지로 계산한다.
- summary of project
    - 어떤 프로젝트를 했는지, description을 기준으로

In [None]:
input_dict = {'skills':['Flexibility Training', 'Nutrition', 'Anatomy', 'Strength Training'],
              'summary':"Extensive experience in designing and implementing personalized training programs for muscle growth, with a proven track record of helping clients achieve their fitness goals"}

In [None]:
df = pd.DataFrame(data)
emb_df = pd.DataFrame(emb_data)

In [None]:
df.head(3)

In [None]:
emb_df.head(3)

In [None]:
def batch_cosine_similarity(list1, list2, threshold):
    """Find the vectors in ``list2`` that are similar to any vector in ``list1``.

    Args:
        list1: Sequence of query vectors, one vector per entry.
        list2: Sequence of candidate vectors, one vector per entry.
        threshold: A candidate matches when its cosine similarity to at least
            one query vector is strictly greater than this value.

    Returns:
        Tuple ``(column_indices, count)``: the indices into ``list2`` of the
        matching vectors, and how many of them matched.
    """
    # Nothing to compare when either side is empty. sklearn's pairwise
    # validation rejects zero-sample input, so report "no matches" instead.
    if len(list1) == 0 or len(list2) == 0:
        return np.array([], dtype=np.intp), 0

    # Compute the pairwise cosine similarity with sklearn's cosine_similarity;
    # rows correspond to list1, columns to list2.
    similarities = cosine_similarity(list1, list2)
    columns_over_threshold = (similarities > threshold).any(axis=0)

    # Each list2 entry counts once if any list1 entry exceeds the threshold.
    count = columns_over_threshold.sum()
    column_indices = np.where(columns_over_threshold)[0]

    return column_indices, count

def candidate_search(input_list, nested_lists, top_k, search_type='skill', threshold=0.5):
    """Rank candidates by the share of their embeddings that match the query.

    For every entry of ``nested_lists`` the score is the number of its
    embeddings whose cosine similarity to any query embedding exceeds
    ``threshold``, divided by the number of embeddings in that entry. Scores
    therefore lie between 0 and 1; higher means more matches.

    Args:
        input_list: Query embeddings.
        nested_lists: One list of embeddings per candidate.
        top_k: Number of best-scoring candidates to return.
        search_type: Either 'skill' or 'experience'. It is only validated;
            the scoring is the same for both values.
        threshold: Similarity cut-off passed to ``batch_cosine_similarity``.

    Returns:
        List of ``(candidate_index, score)`` tuples, best score first,
        truncated to ``top_k`` entries.

    Raises:
        AssertionError: If ``search_type`` is not supported.
    """
    if search_type not in ('experience', 'skill'):
        # Raised explicitly (same exception type as the former `assert`) so the
        # check is not stripped when Python runs with -O.
        raise AssertionError("Unsupported search type")

    scores = list()

    for i, nested_list in enumerate(nested_lists):
        # Length of the candidate's list, used to normalize the match count.
        possible_matches = len(nested_list)
        if possible_matches == 0:
            # A candidate without embeddings scores 0. Decide this before the
            # similarity computation, which cannot handle an empty list.
            scores.append((i, 0))
            continue

        # Count the candidate embeddings whose similarity to any query
        # embedding exceeds the threshold.
        _, common_elements_count = batch_cosine_similarity(input_list, nested_list, threshold)
        scores.append((i, common_elements_count / possible_matches))

    # sorted() is stable, so candidates with equal scores keep their input order.
    top_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:top_k]

    return top_scores

#### skill based search

In [None]:
db = emb_df['skills'].values.tolist()
input = create_embeddings(input_dict['skills'])

In [None]:
input_dict['skills']

In [None]:
skill_based_findings = candidate_search(input, db, 10, threshold=0.5)
skill_based_findings

#### summary based search

In [None]:
summary_db = emb_df['summary'].values.tolist()
input = create_embeddings(input_dict['summary'])

In [None]:
input_dict['summary']

In [None]:
summary_based_findings = candidate_search(input, summary_db, 10, 'experience')
summary_based_findings

In [None]:
df.loc[29, 'summary']

In [None]:
df.loc[34, 'summary']

### 그렇다면 Input parsing은?

#### 1. 정형화된 input field
```python
input_dict = {'skills':['Flexibility Training', 'Nutrition', 'Anatomy', 'Strength Training'],
              'summary':"Extensive experience in designing and implementing personalized training programs for muscle growth, with a proven track record of helping clients achieve their fitness goals"}
```

#### 2. 비정형 input field (free text)
```python
???
```
- step 1 : Input을 분석하여 search를 trigger하는 layer
- step 2 : 요구사항을 분석하여 필요한 skill과 경험을 생성해주는 layer
- step 3 : search

Step 1 : trigger layer 생성

In [None]:
job_req = "I want to grow muscle mass considering nutrient intake as well as various muscle training drills."

In [None]:
job_search_queries = ["Improving physical fitness through a combination of general physical education activities, balanced exercise routines, and nutritional awareness.",
                      "Enhancing overall health with a mix of diverse physical education exercises, targeted workouts, and mindful eating habits.",
                      "Increasing muscle volume by integrating nutritional strategies with multifaceted workout routines.",
                      "Building muscle density by focusing on nutrient-rich diets and comprehensive resistance training programs.",
                      "Streamlining trainer scheduling and client management to optimize the efficiency and effectiveness of a fitness facility.",
                      "Implementing cutting-edge fitness technology and equipment maintenance protocols to ensure a state-of-the-art workout environment.",
                      "Developing comprehensive staff training programs to elevate the expertise and service quality of personal trainers and fitness instructors.",
                      "Enforcing health and safety standards to provide a secure and hygienic environment for members and staff alike.",
                      "Cultivating a community-focused atmosphere through member engagement initiatives and personalized fitness guidance to enhance client retention and satisfaction."]

In [None]:
job_search_query_embs = create_embeddings(job_search_queries)

In [None]:
input_emb = create_embeddings(input_dict['summary'])

In [None]:
def route_selection(query_emb, emb_list, threshold=0.5):
    """Decide whether the query is close enough to any reference embedding.

    Args:
        query_emb: Embedding(s) of the incoming query.
        emb_list: Embeddings of the reference search queries.
        threshold: Cosine similarity that must be strictly exceeded.

    Returns:
        True if at least one query/reference pair exceeds ``threshold``,
        otherwise False.
    """
    similarity_matrix = cosine_similarity(query_emb, emb_list)
    hits = (similarity_matrix > threshold).sum()
    return bool(hits > 0)

In [None]:
route_selection(input_emb, job_search_query_embs)

Step 2 : 요구 사항을 분석하여 skill과 경력을 생성하는 layer (input dataset 생성)

In [None]:
# Prompt that turns a free-text job requirement into the structured
# {'skills': [...], 'summary': "..."} form; the single `{}` placeholder is
# filled with the task description via `prompt.format(...)`.
# NOTE: this rebinds the global name `prompt` that the resume-extraction cells
# earlier in the notebook also use; re-run that earlier prompt cell before
# re-running the extraction.
prompt = """
Analyze the provided task description to identify and categorize the essential qualifications and expertise required for the job. 
The analysis should focus on extracting relevant skills and summarizing the job capabilities necessary for achieving the specified goal.
Organize this information into a structured dictionary format.

Categories: Skills and Summary.

Instructions:
- Skills: Enumerate the critical skills necessary for someone to effectively fulfill the job requirements. These should be simple words such as 'Anatomy' or 'Strength Training'
- Summary: Draft a concise job description that encapsulates the professional experience and competencies needed to successfully execute the job responsibilities. 
            One example would be : "Extensive experience in designing and implementing personalized training programs for muscle growth, with a proven track record of helping clients achieve their fitness goals"

Please provide the extracted information in a dictionary format with the keys as 'skills' and 'summary'.

Task description:
{}
"""

In [None]:
output = normal_chat_completion(prompt.format(job_req))

In [None]:
json.loads(output.choices[0].message.content)

Step 3 : Search!

--END--