In [31]:
import os
import openai
# Point the openai client at an Azure OpenAI endpoint rather than the default API.
openai.api_type = "azure"
# Placeholder endpoint: replace <instance_name> with the real resource name before running.
openai.api_base = "https://<instance_name>.openai.azure.com/"
openai.api_version = "2023-07-01-preview"
# The key is read from the environment so it is never written into the notebook.
# os.getenv returns None when the variable is unset, so a missing key is not reported here.
# NOTE(review): the variable name is mixed-case ("Azure_..."); environment variable names are
# case-sensitive on Linux/macOS — confirm this matches how the variable is exported.
openai.api_key = os.getenv("Azure_OPENAI_API_KEY")

In [16]:
import numpy as np
import pandas as pd
import pickle
import tiktoken
import time

# Model identifiers used by this notebook.
# NOTE(review): with api_type "azure" these are presumably deployment names — confirm
# against the Azure resource.
COMPLETIONS_MODEL = "gpt4"
EMBEDDING_MODEL = "text-embedding-ada-002"  # embeddings from this model have 1536 dimensions

In [17]:
# Widen pandas' display limits so the wide job-postings frame and its long
# text columns stay readable in cell output.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 100)

In [18]:
# Load the NYC job-postings CSV; its first row holds the column names.
df = pd.read_csv("./data/Jobs_NYC_Postings.csv", header=0)
# Preview the first two rows.
df.head(2)

Unnamed: 0,Job ID,Agency,Posting Type,# Of Positions,Business Title,Civil Service Title,Title Classification,Title Code No,Level,Job Category,Full-Time/Part-Time indicator,Career Level,Salary Range From,Salary Range To,Salary Frequency,Work Location,Division/Work Unit,Job Description,Minimum Qual Requirements,Preferred Skills,Additional Information,To Apply,Hours/Shift,Work Location 1,Recruitment Contact,Residency Requirement,Posting Date,Post Until,Posting Updated,Process Date
0,606346,DEPARTMENT FOR THE AGING,External,1,Business Operations Analyst,ASSOCIATE STAFF ANALYST,Competitive-1,12627,00,"Administration & Human Resources Technology, Data & Innovation Policy, Research & Analysis",F,Experienced (non-manager),70611.0,81203.0,Annual,"2 Lafayette St., N.Y.",Human Resources,The Office of Human Resources (OHR) seeks a highly motivated individual to serve as a Business O...,"1. A masterâs degree from an accredited college or university, accredited by regional, nationa...",â¢ Project management experience and expertise is a plus. â¢ Self-starter; works independently...,,Please be sure to submit a resume & cover letter when applying. All current City Employees may a...,,,,"New York City residency is generally required within 90 days of appointment. However, City Emplo...",09/28/2023,27-DEC-2023,09/29/2023,11/20/2023
1,571361,DEPT OF ENVIRONMENT PROTECTION,External,1,Executive Program Manager,ADMINISTRATIVE CONSTRUCTION PR,Competitive-1,82991,M3,"Communications & Intergovernmental Affairs Engineering, Architecture, & Planning",F,Manager,72038.0,192152.0,Annual,96-05 Horace Harding Expway,BEDC EXEC / ADMINISTRATION,The NYC Department of Environmental Protection (DEP) provides more than a billion gallons of hig...,At least six years of full-time satisfactory experience in construction management work on capit...,"â¢\tPrior project management, construction management and/or engineering coursework or experien...",DEP is an equal opportunity employer with a strong commitment to the diversity of our organizati...,To apply click Apply Now,,,,New York City Residency is not required for this position,01/30/2023,,01/30/2023,11/20/2023


In [22]:
# Keep the first 50 postings and rename the two columns used below to
# identifiers that work with attribute access (row.jobid, row.jobdescription).
df = df.iloc[:50].rename(
    columns={"Job ID": "jobid", "Job Description": "jobdescription"}
)
print(df.shape)
df.dtypes

(50, 30)


jobid                              int64
Agency                            object
Posting Type                      object
# Of Positions                     int64
Business Title                    object
Civil Service Title               object
Title Classification              object
Title Code No                     object
Level                             object
Job Category                      object
Full-Time/Part-Time indicator     object
Career Level                      object
Salary Range From                float64
Salary Range To                  float64
Salary Frequency                  object
Work Location                     object
Division/Work Unit                object
jobdescription                    object
Minimum Qual Requirements         object
Preferred Skills                  object
Additional Information            object
To Apply                          object
Hours/Shift                       object
Work Location 1                   object
Recruitment Cont

In [23]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """Return the embedding vector for ``text``.

    Sends one request to ``openai.Embedding.create`` and returns the
    ``"embedding"`` field of the first item in the response's ``"data"`` list.

    Args:
        text: The text to embed.
        model: Passed as ``engine`` to the API call; defaults to EMBEDDING_MODEL.

    Returns:
        The embedding taken from the response (annotated as a list of floats).
    """
    response = openai.Embedding.create(engine=model, input=text)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[int, int], list[float]]:
    """Embed the ``jobdescription`` of every row of ``df``.

    Args:
        df: Frame that provides ``jobid`` and ``jobdescription`` columns.

    Returns:
        A dict mapping ``(row index label, jobid)`` to the embedding returned
        by ``get_embedding`` for that row; one call is made per row.
    """
    embeddings = {}
    for idx, row in df.iterrows():
        # The column names are tied to this frame's schema; change them when
        # a differently shaped dataframe is passed in.
        key = (idx, row.jobid)
        embeddings[key] = get_embedding(row.jobdescription)
    return embeddings

In [None]:
# Embed every job description (one get_embedding call per row of df).
document_embeddings = compute_doc_embeddings(df)
# Show only the entry count: echoing the dict itself would print every full
# embedding vector into the cell output.
len(document_embeddings)

In [25]:
# Write the embeddings dict to a pickle file.
with open('./data/document_embeddings.pkl', 'wb') as f:
    pickle.dump(document_embeddings, f)

In [26]:
# Read the embeddings dict back from the pickle file.
# pickle.load can execute arbitrary code, so only load a file this notebook
# wrote itself, never one from an untrusted source.
with open('./data/document_embeddings.pkl', 'rb') as f:
    document_embeddings = pickle.load(f)

In [27]:
# Sanity check: show the type of the object read back from the pickle file.
print(type(document_embeddings))

<class 'dict'>


In [28]:
# An example embedding: the key, the first five components and the vector length.
# next(iter(...)) takes the first item without copying every item into a list.
example_entry = next(iter(document_embeddings.items()))
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

(0, 606346) : [-0.03465575724840164, -0.008060249499976635, 8.603404421592131e-05, -0.03531914949417114, -0.02357705868780613]... (1536 entries)


In [29]:
def vector_similarity(x: list[float], y: list[float]) -> float:
    """Return the cosine similarity between two vectors.

    Args:
        x: First vector.
        y: Second vector, with the same length as ``x``.

    Returns:
        ``dot(x, y) / (norm(x) * norm(y))`` as a plain ``float``, or ``0.0``
        when either vector has zero norm and the ratio is undefined.
    """
    norm_product = np.linalg.norm(x) * np.linalg.norm(y)
    if norm_product == 0:
        # Dividing here would produce NaN (with a RuntimeWarning), and NaN
        # scores break the ordering when results are sorted by similarity.
        return 0.0
    return float(np.dot(x, y) / norm_product)

def order_document_sections_by_query_similarity(
    query: str,
    contexts: dict[tuple[int, int], list[float]],
) -> list[tuple[float, tuple[int, int]]]:
    """Rank stored document embeddings by similarity to ``query``.

    The query is embedded with ``get_embedding`` and compared against every
    embedding in ``contexts`` using ``vector_similarity``.

    Args:
        query: Text to search for.
        contexts: Mapping from a document key to that document's embedding.

    Returns:
        A list of ``(similarity, document key)`` tuples, most similar first.
    """
    query_embedding = get_embedding(query)

    scored = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]
    # Tuples compare by similarity first; equal scores fall back to the key.
    scored.sort(reverse=True)
    return scored

In [32]:
# Rank all stored embeddings against an example query and show the 20 best matches.
most_relevant_document_sections = order_document_sections_by_query_similarity(
    query="system engineer is ...",
    contexts=document_embeddings,
)
most_relevant_document_sections[:20]

[(0.7955112260655106, (48, 552760)),
 (0.7919317851392924, (9, 552150)),
 (0.785408478935627, (20, 586358)),
 (0.7719230240141587, (42, 592944)),
 (0.7686674075723999, (14, 580562)),
 (0.7626958212180587, (46, 540899)),
 (0.7615506327336289, (6, 607141)),
 (0.759859156472747, (45, 527822)),
 (0.7565934921278077, (49, 602605)),
 (0.7497808357080238, (1, 571361)),
 (0.7463601386311924, (21, 610644)),
 (0.7460265763199324, (11, 567454)),
 (0.7397287681060001, (18, 615111)),
 (0.7394017546697347, (0, 606346)),
 (0.7381278053608447, (39, 571499)),
 (0.7369675596521301, (8, 572055)),
 (0.7364282673347158, (29, 527762)),
 (0.7364000826289051, (47, 534244)),
 (0.736113721229946, (15, 582062)),
 (0.7346991727005303, (40, 573794))]

In [33]:
# Sanity check: show the type of the ranking result.
print(type(most_relevant_document_sections))

<class 'list'>


In [34]:
# Each result is (similarity, (row index, jobid)); print the key of the top hit...
print(most_relevant_document_sections[0][1])
# ...and then just the row index stored inside that key.
print(most_relevant_document_sections[0][1][0])

(48, 552760)
48


In [35]:
# Look up a job description by row label; 48 is hard-coded to the top hit printed above.
df.loc[48, "jobdescription"]

'***IMPORTANT NOTE: Only those currently serving as a permanent Mechanical Engineer Intern will be considered.  The New York City Department of Environmental Protection (DEP) protects public health and the environment by supplying clean drinking water, collecting and treating wastewater, and reducing air, noise, and hazardous materials pollution. DEP is the largest combined municipal water and wastewater utility in the country, with nearly 6,000 employees. We deliver 1.1 billion gallons of high quality drinking water per day to 8.5 million New York City residents and more than 1 million people in Upstate New York, and we collect and treat an average of 1.3 billion gallons of wastewater per day.  The Bureau of Wastewater Treatment is responsible for the operation and maintenance of all facilities related to the treatment of sewage, including wastewater treatment plants, collections facilities (pumping stations, combined sewer overflow retention facilities, regulators, tide gates), waste

In [36]:
# Look up the top hit's job description, taking the row label from the ranking itself.
df.loc[most_relevant_document_sections[0][1][0], "jobdescription"]

'***IMPORTANT NOTE: Only those currently serving as a permanent Mechanical Engineer Intern will be considered.  The New York City Department of Environmental Protection (DEP) protects public health and the environment by supplying clean drinking water, collecting and treating wastewater, and reducing air, noise, and hazardous materials pollution. DEP is the largest combined municipal water and wastewater utility in the country, with nearly 6,000 employees. We deliver 1.1 billion gallons of high quality drinking water per day to 8.5 million New York City residents and more than 1 million people in Upstate New York, and we collect and treat an average of 1.3 billion gallons of wastewater per day.  The Bureau of Wastewater Treatment is responsible for the operation and maintenance of all facilities related to the treatment of sewage, including wastewater treatment plants, collections facilities (pumping stations, combined sewer overflow retention facilities, regulators, tide gates), waste

In [37]:
# For each of the 30 best matches, print the jobid followed by the matching posting row.
for _similarity, (_row_label, job_id) in most_relevant_document_sections[0:30]:
    print(job_id)
    print(df.loc[df["jobid"].eq(job_id), ["jobid", "jobdescription"]])
    

552760
     jobid  \
48  552760   

                                                                                         jobdescription  
48  ***IMPORTANT NOTE: Only those currently serving as a permanent Mechanical Engineer Intern will b...  
552150
    jobid  \
9  552150   

                                                                                        jobdescription  
9  Duties included, but are not limited, to: Deliver service and support to end-users. Interact wit...  
586358
     jobid  \
20  586358   

                                                                                         jobdescription  
20  ***IMPORTANT NOTE: Only those currently serving as a permanent Asst. Environmental Engineer will...  
592944
     jobid  \
42  592944   

                                                                                         jobdescription  
42  The NYC Department of Environmental Protection (DEP) enriches the environment and protects publi...  
580562
     