In [1]:
import pandas as pd

In [2]:
from langchain_community.document_loaders import JSONLoader
# -*- coding: utf-8 -*-
from langchain_community.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import json

In [3]:
with open('./IT_data.json', 'r') as f:
    dx = json.load(f)
dx

{'data': [{'id': 112,
   'title': 'IT Manager',
   'company': 'The Yoga Institute',
   'City': 'Mumbai',
   'State_code': 'MH',
   'Latitude': 19.076,
   'Longitude': 72.8777,
   'employmentType': 'Full-Time',
   'description': "The Yoga Institute is seeking a results-oriented IT Manager to manage our company's computer infrastructure, software applications, and supervise our IT department. Responsibilities include managing end-user requests, coordinating with IT vendors, managing cloud systems, ensuring digital security, upgrading IT systems with the latest tools and technologies, troubleshooting IT issues, and maintaining our networks and servers to ensure a state-of-the-art experience for end users.",
   'Edu_required': "Bachelor's degree in Computer Science, Information Technology, or similar; MBA in IT systems is a plus",
   'salaryRange': 75000.0,
   'datePosted': '2025-01-28 10:00:00 UTC',
   'page_content': ' IT Manager, Full-Time, Salary (₹ per month) 75,000, in Mumbai, MH, ed

In [8]:
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:

    if "source" in metadata:
        metadata["source"] = record.get("link")
    metadata["id"] = record.get("id")
    metadata["title"] = record.get("title")
    metadata["branch"] = record.get("branch")
    metadata["company"] = record.get("company")
    metadata["location"] = record.get("location")
    metadata["salaryRange"] = record.get("salaryRange")
    metadata["employmentType"] = record.get("employmentType")
    metadata["datePosted"] = record.get("datePosted")
    # metadata["timestamp_ms"] = record.get("timestamp_ms")

    # if "source" in metadata:
    #     source = metadata["source"].split("/")
    #     source = source[source.index("langchain"):]
    #     metadata["source"] = "/".join(source)

    return metadata


loader = JSONLoader(
    file_path='./IT_data.json',
    jq_schema='.data[]',
    content_key="page_content",
    metadata_func=metadata_func
)

data = loader.load()
data

[Document(metadata={'source': None, 'seq_num': 1, 'id': 101, 'title': 'IT Manager', 'branch': None, 'company': 'The Yoga Institute', 'location': 'Mumbai, MH', 'salaryRange': 75000, 'employmentType': 'Full-Time', 'datePosted': '2025-01-28 10:00:00 UTC'}, page_content=' IT Manager, Full-Time, Salary (₹ per month) 75,000, in Mumbai, MH, education required: Bachelors degree in Computer Science, Information Technology, or similar; MBA in IT systems is a plus, Company: The Yoga Institute\n\n The Yoga Institute is seeking a results-oriented IT Manager to manage our companys computer infrastructure, software applications, and supervise our IT department. Responsibilities include managing end-user requests, coordinating with IT vendors, managing cloud systems, ensuring digital security, upgrading IT systems with the latest tools and technologies, troubleshooting IT issues, and maintaining our networks and servers to ensure a state-of-the-art experience for end users.'),
 Document(metadata={'sou

In [None]:
import getpass
import os
os.environ["OPENAI_API_KEY"] = getpass.getpass()

from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

In [5]:
from langchain_huggingface import HuggingFaceEmbeddings
embed_model = HuggingFaceEmbeddings(model_name="BAAI/bge-small-en-v1.5")

  from .autonotebook import tqdm as notebook_tqdm


In [122]:
data[0]

Document(metadata={'source': None, 'seq_num': 1, 'id': 111, 'title': 'Server', 'branch': None, 'company': 'Tacolicious', 'location': 'Palo Alto, CA', 'salaryRange': 8, 'employmentType': 'Part-Time', 'datePosted': '2014-08-16 15:35:36 UTC'}, page_content="Server, Part-Time, Salary ($ per hour/ day) 8.0, in Palo Alto, CA, education requirednan, Company Tacolicious\n\n Tacolicious' first Palo Alto store just opened recently, and we are hiring! If you love tacos, you will love working at our restaurant! \r\n\r\n ● Serve food/drinks to customers in a professional manner \r\n ● Act as a cashier when needed \r\n ● Clean up the dining space \r\n ● Train the new staff \r\n")

In [None]:
x = embed_model.embed_documents(data[:10])
x

In [9]:
db = FAISS.load_local(
    folder_path='./job_index_3k',
    embeddings=embed_model,
    allow_dangerous_deserialization=True,
)

In [10]:
from uuid import uuid4

uuids = [str(uuid4()) for _ in range(len(data))]

db.add_documents(documents=data, ids=uuids)
vector_db_path = "./job_index_3k"
db.save_local(vector_db_path)

In [11]:
db = FAISS.load_local(
    folder_path='./job_index_3k',
    embeddings=embed_model,
    allow_dangerous_deserialization=True,
)

In [61]:
results = db.similarity_search(
    "Jobs related to data science",
    k=10,
)
for res in results:
    if 'score' in res.metadata:
        print(f"* {res.page_content} [{res.metadata['score']}]")

* Data Scientist, Full-Time, in New York, NY, education required: Bachelor’s degree in Computer Science, Statistics, or related field, Company: Amazon.com Services LLC

As a Data Scientist at Amazon, you will work closely with various teams to analyze large datasets, develop machine learning models, and provide insights to drive business decisions. [0.4680019021034241]
* Staff Data Scientist - Monetization Science, Full-Time, Salary ($ per year) $164,000 - $238,000, Remote, education required: Not specified, Company: Indeed

The ideal candidate will bring a deep understanding of data science, machine learning, and optimization methodologies to develop and implement robust solutions. [0.5352334976196289]
* Staff Data Scientist - Monetization Science, Full-Time, Salary ($ per year) $164,000 - $238,000, Remote, education required: Master’s degree in Data Science, Economics, or related field, Company: Indeed

Develop and implement robust data science solutions to optimize monetization stra

In [53]:

retrieve = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 10, "score_threshold": 0.1, "fetch_k": 15},
)

In [54]:
results = retrieve.invoke("Jobs related to data science")
# for res in results:
#     print(f"* {res.page_content} {}")

In [55]:
results[0].metadata

{'source': None,
 'seq_num': 10,
 'id': 110,
 'title': 'Data Scientist',
 'branch': None,
 'company': 'Amazon.com Services LLC',
 'location': 'New York, NY',
 'salaryRange': 'Not specified',
 'employmentType': 'Full-Time',
 'datePosted': '2025-01-29',
 'score': np.float32(0.4680019)}

In [67]:
results = [doc for doc in results if 'score' in doc.metadata]
results = sorted(results, key=lambda x: x.metadata['score'], reverse=True)
for doc in results:
    print(f"Content: {doc.page_content}, Score: {doc.metadata['score']}")

Content: Data Scientist, Full-Time, in Tallahassee, FL, education required: Not specified, Company: HNTB Corporation

Develops custom data models and algorithms to apply to data sets. Presents information using data visualization techniques., Score: 0.5433675050735474
Content: Staff Data Scientist - Monetization Science, Full-Time, Salary ($ per year) $164,000 - $238,000, Remote, education required: Master’s degree in Data Science, Economics, or related field, Company: Indeed

Develop and implement robust data science solutions to optimize monetization strategies., Score: 0.5367386341094971
Content: Staff Data Scientist - Monetization Science, Full-Time, Salary ($ per year) $164,000 - $238,000, Remote, education required: Not specified, Company: Indeed

The ideal candidate will bring a deep understanding of data science, machine learning, and optimization methodologies to develop and implement robust solutions., Score: 0.5352334976196289
Content: Data Scientist, Full-Time, in New York,

In [57]:
from typing import List
from langchain_core.documents import Document
from langchain_core.runnables import chain

@chain
def retriever(query: str) -> List[Document]:
    docs, scores = zip(*db.similarity_search_with_score(query, score_threshold=0.05, k=10))
    for doc, score in zip(docs, scores):
        doc.metadata["score"] = score

    return docs

In [58]:
result = retriever.invoke("Jobs related to data science")
for doc in result:
    print(f"Content: {doc.page_content}, Score: {doc.metadata['score']}")

ValueError: not enough values to unpack (expected 2, got 0)

In [39]:
result[0].metadata['score']

np.float32(0.4680019)