In [None]:
import pandas as pd
import numpy as np
import json
import os
import openai
from tqdm.notebook import tqdm

In [None]:
# initialize openai
# The API key must be supplied through the environment (e.g. exported in the
# shell that starts Jupyter). Credentials are never written into the notebook,
# because notebooks are routinely shared and committed; any key that was ever
# saved in this file should be treated as leaked and revoked.
if not os.environ.get("OPENAI_API_KEY"):
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before starting the notebook kernel."
    )
openai.api_key = os.environ["OPENAI_API_KEY"]

client = openai.OpenAI()

In [None]:
# Load the resume CSV into a DataFrame; the cell output is its (rows, columns) shape.
df = pd.read_csv("04.VectorDB_data/Resume.csv")
df.shape

In [None]:
df.Category.unique()

In [None]:
df.Category.unique()

In [None]:
df.loc[df['Category']=='CHEF']

In [None]:
# Load the extracted resume records and the matching embeddings file.
# JSON is UTF-8 by specification, so the encoding is passed explicitly instead
# of relying on the platform's locale default (which is not UTF-8 everywhere).
with open("04.VectorDB_data/resume_info_extracted.json", 'r', encoding="utf-8") as file:
    data = json.load(file)

with open("04.VectorDB_data/resume_info_extracted_emb.json", 'r', encoding="utf-8") as file:
    emb_data = json.load(file)

In [None]:
data[0]

In [None]:
emb_data[0]['ID']

### Connect to Pinecone

In [1]:
from pinecone import Pinecone

# Read the Pinecone key from the environment instead of embedding it in the
# notebook; a key that has been saved in a shared file should be rotated.
import os

pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not pinecone_api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it before starting the notebook kernel."
    )

pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index("fastcampus")

In [None]:
pc.describe_index("fastcampus")

### Upsert data

In [None]:
data[0]

- 필요한 데이터:
    - skills & work summary
- 문제점:
    - pinecone에 저장을 하기 위해서는 한 row당 index 하나 밖에 만들지 못 함
- 해결 방법:
    - meta data에 field를 추가하여 이력서 ID 등과 같은 데이터를 저장
- 기존 dataframe을 사용할 때와 다른 점 :
    - 각 data point가 하나의 embedding vector가 되어야 함

Desired input format : 
```json
work_experience = {
    "id" : "1234-work0",
    "values" : [0.23432, 0.22149, ...],
    "metadata" : {
        "summary" : "Worked as a head chef in ...",
        "ID" : 1234
    }
}

skill = {
    "id" : "1234-skill0",
    "values" : [0.92372, 0.678234, ...],
    "metadata" : {
        "summary" : "Cooking",
        "ID" : 1234
    }
}

```

![](images/indexing.png)

In [None]:
data[0]

In [None]:
def _to_vectors(resume_id, texts, embeddings, suffix, text_key):
    """Build one upsert record per embedding of a single resume.

    Args:
        resume_id: The resume's ID; stored in the metadata and used as the
            prefix of each record id.
        texts: Source texts, indexed in parallel with ``embeddings``.
        embeddings: Embedding vectors, one per text.
        suffix: Tag inserted between the resume ID and the position
            (e.g. "-work" gives ids like "<ID>-work0").
        text_key: Metadata key under which the source text is stored.

    Returns:
        A list of dicts with "id", "values" and "metadata" keys.
    """
    return [
        {
            "id": str(resume_id) + suffix + str(position),
            "values": embedding,
            "metadata": {text_key: texts[position], "ID": resume_id},
        }
        for position, embedding in enumerate(embeddings)
    ]


# zip() stops at the shorter input, so a length mismatch between the two files
# would silently drop records instead of being reported.
assert len(data) == len(emb_data), "data and emb_data differ in length; check your embeddings again"

exp_vectors = list()
skill_vectors = list()

for d, emb in zip(data, emb_data):
    # Both files must describe the same resume at the same position.
    assert d['ID']==emb['ID'], "You should check your embeddings again"

    exp_vectors.extend(_to_vectors(d['ID'], d['summary'], emb['summary'], "-work", "summary"))
    skill_vectors.extend(_to_vectors(d['ID'], d['skills'], emb['skills'], "-skill", "skill"))

In [None]:
def create_batches(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    Args:
        lst: A sequence supporting ``len()`` and slicing (e.g. a list).
        n: Maximum batch size; must be a positive integer.

    Yields:
        Slices of ``lst`` in order; the last one may be shorter than ``n``.

    Raises:
        ValueError: If ``n`` is smaller than 1. Because this is a generator,
            the error surfaces when iteration starts.
    """
    # range() already rejects a step of 0, but a negative step would produce
    # no batches at all and silently drop every item.
    if n < 1:
        raise ValueError(f"batch size must be a positive integer, got {n}")
    for start in range(0, len(lst), n):
        yield lst[start:start + n]

upsert work experience

추천 사항을 읽어볼 것 : https://docs.pinecone.io/docs/limits

In [None]:
# Send the work-experience vectors to the "work_exp" namespace, at most 50 per call.
exp_batches = [chunk for chunk in create_batches(exp_vectors, 50)]

for chunk in tqdm(exp_batches):
    index.upsert(namespace="work_exp", vectors=chunk)

In [None]:
len(exp_vectors)

upsert skills

In [None]:
# Send the skill vectors to the "skill" namespace, at most 50 per call.
skill_batches = [chunk for chunk in create_batches(skill_vectors, 50)]

for chunk in tqdm(skill_batches):
    index.upsert(namespace="skill", vectors=chunk)

In [2]:
# index.delete(delete_all=True, namespace='work_exp')
# index.delete(delete_all=True, namespace='skill')

{}

---

### Search & retrieval (test)

In [None]:
from text_utils import create_embeddings

현재 예시에 hybrid search가 적합하지 않은 이유 : 
- 우리의 'skills'들이 사전에 정해진 단어들로만 이루어져 있으면 가능하겠지만, 그렇지 않은 상황이다.
- 사용자의 input을 통해 얻는 query가 우리가 갖고 있는 skill의 리스트에 국한되지 않는다.
- hybrid search에 많이 쓰이는, ranking function 중 하나인 BM25는 training 과정에서 보지 못 한 단어는 처리하지 못 한다 (sparse vector 상으로 표현 X)

hybrid search가 적합한 경우:
- Skill의 list를 활용하여 후보군들을 선정
- 만약 이력서 데이터셋에 있는 skill들의 variation이 정해져 있었다면 아주 적합한 선택이었을 것임
- 전문 용어들을 활용하여 document search를 하는 경우 (semantic representation으로 나타내기 어려운 전문용어의 경우에도 활용 가능)

In [None]:
data[0]

In [None]:
# Use the first skill of the first resume as a smoke-test query.
search_skill = data[0]['skills'][0]
# Elsewhere in this notebook the result of create_embeddings for a single
# string is unwrapped with [0]; the same is done here so that the query
# receives one flat vector rather than a list wrapping it.
search_emb = create_embeddings(search_skill)[0]
print(search_skill)

In [None]:
# Look up the 50 nearest entries in the "skill" namespace, with metadata included.
index.query(
    namespace='skill',
    vector=search_emb,
    top_k=50,
    include_metadata=True,
)

---

### Search & retrieval + postprocessing

In [None]:
data[0]

In [None]:
# Sample query: a list of skill phrases plus one free-text work-experience description.
skills = ['Menu Development', 'Catering', 'Inventory Management']
exp = 'As the Executive Chef at Le Gourmet Quatre, a Michelin-starred fine dining restaurant, I led a team of 20 chefs in developing innovative French-Asian fusion menus, while also managing kitchen operations efficiently to uphold the highest standards of food safety and cost control'

search & retrieval

![](images/search&retrieval.png)

In [None]:
# Embed the skill list; the result is later zipped with `skills`, so it is
# presumably one vector per skill (create_embeddings lives in text_utils — confirm).
emb_skills = create_embeddings(skills)
# For the single experience string, element 0 of the result is taken.
emb_exp = create_embeddings(exp)[0]

In [None]:
def search_vdb(vdb_index, query_emb, top_k, namespace):
    """Query a vector index and return only the list of matches.

    Args:
        vdb_index: Object exposing a ``query(...)`` method (here, the Pinecone index).
        query_emb: Query vector, forwarded as ``vector``.
        top_k: Number of matches to request.
        namespace: Namespace to search in.

    Returns:
        The ``'matches'`` entry of the query response.
    """
    query_kwargs = dict(
        namespace=namespace,
        top_k=top_k,
        vector=query_emb,
        include_metadata=True,
    )
    response = vdb_index.query(**query_kwargs)
    return response['matches']

In [None]:
# search
# One query per skill (results keyed by the skill text) and one for the experience text.
skill_outputs = dict(
    (skill_text, search_vdb(index, skill_vec, 10, 'skill'))
    for skill_text, skill_vec in zip(skills, emb_skills)
)
exp_outputs = search_vdb(index, emb_exp, 10, 'work_exp')

In [None]:
skill_outputs.keys()

In [None]:
skill_outputs['Catering']

In [None]:
exp_outputs

postprocessing

![](images/postprocessing.png)

In [None]:
# Build one small frame per query skill and concatenate once at the end;
# calling pd.concat inside the loop re-copies all earlier rows on every pass.
skill_frames = []

for k, v in skill_outputs.items():
    # score thresholding
    v = [i for i in v if i['score']>0.5]
    # keep only the top 3; everything needed later is stored in the metadata
    v = [i['metadata'] for i in v][:3]
    # convert to a DataFrame (easier to work with)
    v_df = pd.DataFrame(v)
    v_df['query_skill'] = k
    skill_frames.append(v_df)

# pd.concat raises on an empty list, so fall back to an empty frame
skill_df = pd.concat(skill_frames, axis=0) if skill_frames else pd.DataFrame()

In [None]:
skill_df

In [None]:
# Keep the metadata of the first three matches scoring above 0.5.
# NOTE: this overwrites `exp_outputs` with metadata dicts, so the cell cannot be
# re-run without first re-running the search cell that produced the raw matches.
exp_outputs = [match['metadata'] for match in exp_outputs if match['score'] > 0.5][:3]

In [None]:
exp_outputs

In [None]:
pd.DataFrame(exp_outputs)

하나의 function으로 변환

In [None]:
def _top_metadata(matches, threshold, limit):
    """Return the metadata of the first ``limit`` matches scoring above ``threshold``."""
    kept = [match['metadata'] for match in matches if match['score'] > threshold]
    return kept[:limit]


def search(index, skills, exp, top_k=10, threshold=0.5, max_results=3):
    """Retrieve resume entries matching a set of skills and one experience text.

    Each skill and the experience text are embedded with ``create_embeddings``
    and looked up through ``search_vdb`` in the 'skill' and 'work_exp'
    namespaces respectively.

    Args:
        index: Vector index handed to ``search_vdb``.
        skills: List of skill strings; one query is issued per skill.
        exp: Work-experience description used as a single query.
        top_k: Number of matches requested from the index per query.
        threshold: Matches must score strictly above this value to be kept.
        max_results: Maximum number of kept matches per query (default 3,
            the value that was previously hard-coded).

    Returns:
        A ``(skill_df, exp_df)`` tuple of DataFrames built from the matches'
        metadata. ``skill_df`` has an extra 'query_skill' column naming the
        skill that produced each row, and keeps a per-query row index.
    """
    # Nothing to embed for an empty skill list; skip the embedding call.
    emb_skills = create_embeddings(skills) if skills else []
    emb_exp = create_embeddings(exp)[0]

    skill_outputs = {
        skill: search_vdb(index, skill_emb, top_k, 'skill')
        for skill, skill_emb in zip(skills, emb_skills)
    }
    exp_outputs = search_vdb(index, emb_exp, top_k, 'work_exp')

    # One frame per query skill, concatenated once instead of inside the loop.
    skill_frames = []
    for query_skill, matches in skill_outputs.items():
        frame = pd.DataFrame(_top_metadata(matches, threshold, max_results))
        frame['query_skill'] = query_skill
        skill_frames.append(frame)
    # pd.concat raises on an empty list, so fall back to an empty frame
    skill_df = pd.concat(skill_frames, axis=0) if skill_frames else pd.DataFrame()

    exp_df = pd.DataFrame(_top_metadata(exp_outputs, threshold, max_results))

    return skill_df, exp_df

In [None]:
# s: skill-match DataFrame, e: experience-match DataFrame (the two values returned by search).
s, e = search(index, skills, exp)

In [None]:
s

In [None]:
e

---

### Query transformation

![](images/querying.png)

In [None]:
from text_utils import normal_chat_completion

In [None]:
# Same sample skills and experience text as in the search section, assigned again here.
skills = ['Menu Development', 'Catering', 'Inventory Management']
exp = 'As the Executive Chef at Le Gourmet Quatre, a Michelin-starred fine dining restaurant, I led a team of 20 chefs in developing innovative French-Asian fusion menus, while also managing kitchen operations efficiently to uphold the highest standards of food safety and cost control'

1. Query rewriting
- 사용자의 쿼리를 rewrite하여 search에 최적화된 형태로 변형

In [None]:
# Prompt template for query rewriting: the single `{}` placeholder is filled
# with the user's request via str.format. The model is asked for JSON with an
# 'output' sentence and a 'skills' list.
rewriting_prompt = """Convert the [user input] into a format as if it was written in a resume.
Example: 
    - [user input] : 내가 벤치 프레스, 스쿼트, 그리고 데드리프트를 총 500kg을 들 수 있는데 도와줄 수 있는 사람을 추천해줘.
    - [output] : Worked as a personal trainer, helping people to achieve their personal fitness goals in various fields such as weight lifting and losing weight.
    - [skills] : [Strength training, weight lifting, coaching, anatomy]

Desired output format:
    - json format with 'output' and 'skills'
    - the value of 'output' should be a sentence string in a format of 'Worked as a <job title>, <job description>'
    - the value of 'skills' should be a list of 5 strings. Each element in a list should be a realistic skill that matches with the job description
    
The [user input] : {}
[output] : 
"""

In [None]:
# Named `user_query` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
user_query = "3대 500을 할 수 있도록 도와줄 수 있는 사람을 찾아줘"

a = normal_chat_completion(rewriting_prompt.format(user_query))

In [None]:
json.loads(a.choices[0].message.content)

2. Breakdown multiple requests
- 사용자의 쿼리 내에 여러개의 요구사항이 있다면, 해당 요구사항들을 개별적으로 분리
- 분리된 요구사항들을 개별적으로 처리

In [None]:
# Prompt template for splitting a request into sub-queries: the single `{}`
# placeholder is filled with the user's text via str.format. It asks the model
# for JSON with 'original_input' and 'output' keys; per the examples, 'output'
# is a list of sentences, or [False] when the talents are not clearly described.
sub_query_prompt = """The user will request a talent recommendation.
If the user's request contains mentions of multiple talent recommendation,
divide them into a separate but full sentences.

Example 1 :
    - [user input] : 내가 벤치 프레스, 스쿼트, 그리고 데드리프트를 총 500kg을 들 수 있는데 도와줄 수 있는 사람을 추천해주고, 그에 맞는 식단을 만들어줄 수 있는 사람도 추천해줘.
    - [output] : [내가 벤치 프레스, 스쿼트, 그리고 데드리프트를 총 500kg을 들 수 있는데 도와줄 수 있는 사람을 추천해줘, 
                          내가 벤치 프레스, 스쿼트, 그리고 데드리프트를 총 500kg을 드는데 도움을 줄 식단을 만들어 줄 수 있는 사람을 추천해줘]

If the user's request only mentions of one talent recommendation,
provide the exact same input as output.

Example 2 :
    - [user input] : 내가 벤치 프레스, 스쿼트, 그리고 데드리프트를 총 500kg을 들 수 있는데 도와줄 수 있는 사람을 추천해줘
    - [output] : [내가 벤치 프레스, 스쿼트, 그리고 데드리프트를 총 500kg을 들 수 있는데 도와줄 수 있는 사람을 추천해줘]

If the user's request does not mention clear descriptions about each talent,
provide 'False' as output.

Example 3 :
    - [user input] : 내 이사를 도와줄 사람 5명을 추천해줘
    - [output] : [False]

Desired output format :
    - json format with 'original_input' and 'output' as keys.
    - the 'original_input' should be the input I provide you.
    - the 'output' is the rewritten input by you.

[user input] : {}
"""

In [None]:
# Named `user_query` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
user_query = "파인다이닝 음식점에서 음식을 구상할 수 있는 사람 한 명, 그리고 그 음식을 서빙할 수 있는 사람을 추천해줘"

a = normal_chat_completion(sub_query_prompt.format(user_query))

json.loads(a.choices[0].message.content)

In [None]:
# Named `user_query` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
user_query = "내가 이번에 음식점을 오픈하는데, 거기에 들어갈 인원 5명을 추천해줘"

a = normal_chat_completion(sub_query_prompt.format(user_query))

json.loads(a.choices[0].message.content)

In [58]:
def _is_unclear_marker(value):
    """Return True if ``value`` is the 'request too vague' marker asked for by the sub-query prompt."""
    # The prompt asks for False; the string spelling is accepted as well
    # because the value comes from model-generated JSON.
    return value is False or (isinstance(value, str) and value.strip().lower() == "false")


def query_transformation(input, sub_query_prompt, rewriting_prompt):
    """Split a user request into sub-queries and rewrite each as resume-style text.

    Args:
        input: The raw user request. (The name is kept so existing keyword
            calls still work; it shadows the built-in only inside this function.)
        sub_query_prompt: Template with one ``{}`` placeholder, used to split
            the request into separate sub-queries.
        rewriting_prompt: Template with one ``{}`` placeholder, used to
            rewrite each sub-query.

    Returns:
        A list with one parsed JSON object per sub-query, or the fixed Korean
        message asking for a more specific request when the split step
        returns the False marker or no sub-query at all.

    Raises:
        json.JSONDecodeError: If a model reply is not valid JSON.
        KeyError: If the split reply has no 'output' key.
    """
    split_reply = normal_chat_completion(sub_query_prompt.format(input))
    transform_output = json.loads(split_reply.choices[0].message.content)['output']

    # A bare (non-list) value is treated as a one-element list so both shapes
    # share a single code path.
    sub_queries = transform_output if isinstance(transform_output, list) else [transform_output]

    # An empty list would otherwise fail on the [0] lookup below.
    if not sub_queries or _is_unclear_marker(sub_queries[0]):
        return "조금 더 구체적으로 인풋을 작성해주세요"

    # Issue every rewrite request first, then parse the replies.
    rewritten = [normal_chat_completion(rewriting_prompt.format(query)) for query in sub_queries]
    return [json.loads(reply.choices[0].message.content) for reply in rewritten]

In [59]:
# Named `user_query` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
user_query = "내가 이번에 음식점을 오픈하는데, 거기에 들어갈 인원 5명을 추천해줘"

query_transformation(user_query, sub_query_prompt, rewriting_prompt)

'조금 더 구체적으로 인풋을 작성해주세요'

In [60]:
# Named `user_query` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
user_query = "내 사업을 성공시켜줄 사람을 추천해줘"

query_transformation(user_query, sub_query_prompt, rewriting_prompt)

[{'output': 'Worked as a Business Development Consultant, successfully implementing strategies for business growth and sustainability.',
  'skills': ['Strategic planning',
   'Market research',
   'Financial analysis',
   'Networking',
   'Project management']}]

---

In [61]:
# Named `user_query` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
user_query = "파인다이닝 음식점에서 음식을 구상할 수 있는 사람 한 명, 그리고 그 음식을 서빙할 수 있는 사람을 추천해줘"

queries = query_transformation(user_query, sub_query_prompt, rewriting_prompt)

In [62]:
queries

[{'output': 'Worked as a Fine Dining Chef, responsible for conceptualizing dishes and menus, ensuring each dish meets the highest standards of quality and creativity.',
  'skills': ['Menu development',
   'Culinary arts',
   'Food presentation',
   'Creativity in food preparation',
   'Knowledge of international cuisines']},
 {'output': 'Worked as a Fine Dining Server, providing high-quality service by serving meals, understanding detailed menu items, and ensuring customer satisfaction in a fine dining restaurant setting.',
  'skills': ['High-quality customer service',
   'Detailed knowledge of menu items',
   'Wine pairing and presentation',
   'Effective communication and interpersonal skills',
   'Ability to work in a fast-paced environment']}]

In [63]:
# One (skill_df, exp_df) pair per rewritten query; expects `queries` to be the
# list form returned by query_transformation, not its plain-string message.
retrieved = [search(index, query['skills'], query['output']) for query in queries]

In [64]:
retrieved[0]

(         ID                                     skill  \
 0  34452806                          Menu development   
 1  86551046                          Menu development   
 2  25128608                          Menu development   
 0  16924102                                  Culinary   
 1  34452806                        Culinary education   
 2  53265899                          Culinary Science   
 0  34452806                         Food presentation   
 1  25128608                         Food presentation   
 2  14663897                         food presentation   
 0  35157762                          Food preparation   
 1  61322296                          Food preparation   
 2  29211359                                Creativity   
 0  16924102  Knowledge of different styles of cooking   
 1  18825446             International culinary skills   
 2  35157762                          Gourmet Cuisines   
 
                            query_skill  
 0                     Menu 

In [65]:
# Named `user_query` rather than `input` so the built-in input() is not
# shadowed for the rest of the kernel session.
user_query = "보디빌딩 대회에서 우승을 할 수 있게끔 도와줄 수 있는 사람을 추천해줘"

queries = query_transformation(user_query, sub_query_prompt, rewriting_prompt)
retrieved = [search(index, query['skills'], query['output']) for query in queries]

In [68]:
retrieved[0][0]

Unnamed: 0,ID,skill,query_skill
0,13367322,Physically strong,Physique enhancement
1,61322296,Physically fit,Physique enhancement
0,28321954,Strength Trainer,Strength training
1,24994145,Fitness Training,Strength training
2,32517106,Weight training expertise,Strength training
0,28321954,Nutrition Plans,Diet planning
1,29449419,Menu planning,Diet planning
2,20321582,Menu planning,Diet planning
0,12938389,Process development,Routine development
1,23477199,Development and training,Routine development


In [69]:
retrieved[0][1]

Unnamed: 0,ID,summary
0,20565486,Worked as Fitness Coach from September 2015 to...
1,29425788,Worked as Wellness Coach and Fitness Specialis...
2,16474898,Worked as Program Co-coordinator and Coach; Fi...
