In [None]:
! pip install -qU langchain-openai langchain langchain-community sentence-transformers unstructured[all-docs]

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━[0m [32m665.6/981.5 kB[0m [31m19.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.5/80.5 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.2/106.2 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m67.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m61.7 MB/s

In [None]:
import docx
from unstructured.partition.docx import partition_docx
from pathlib import Path

import getpass
import os

from langchain_openai import ChatOpenAI
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser

from unstructured.cleaners.core import bytes_string_to_string
from unstructured.cleaners.core import clean
from unstructured.cleaners.core import clean_bullets
from unstructured.cleaners.core import clean_dashes
from unstructured.cleaners.core import clean_non_ascii_chars
from unstructured.cleaners.core import clean_ordered_bullets
from unstructured.cleaners.core import clean_trailing_punctuation
from unstructured.cleaners.core import remove_punctuation

# Load JD

In [None]:
from typing import List
from pathlib import Path
from docx import Document

def load_jd(folder: str | Path, pattern: str = "*.docx") -> List[str]:
    folder = Path(folder)
    if not folder.exists():
        raise FileNotFoundError(f"Folder not found: {folder.resolve()}")

    jd_list = []
    for file in sorted(folder.glob(pattern)):
        if file.is_file():
          file_path = str(file.absolute())
          doc = Document(file_path)
          jd_text = ""
          for para in doc.paragraphs:
            jd_text += para.text + "\n"
        jd_list.append(jd_text)
    return jd_list

In [None]:
# Load every job description from the JD folder using load_jd's default "*.docx" pattern.
jd = load_jd("/content/jd")

In [None]:
# Fail with a clear message instead of a bare IndexError when nothing was loaded.
if not jd:
    raise ValueError("No job descriptions were loaded; check the JD folder and file pattern.")

# Work with the first job description and print it as a quick sanity check.
jd_text = jd[0]
print(jd_text)

Job Title:
Solution Architect – SAP Analytics & BI
Job Summary:
We are seeking a highly experienced Solution Architect – Analytics with 14 years of proven expertise in SAP BW/BI, BW/4HANA, ABAP, AMDP, Native HANA, and Power BI. The ideal candidate will architect and implement end-to-end enterprise analytics solutions, integrating SAP and non-SAP systems to deliver scalable, high-performance reporting and business intelligence platforms. This role requires a deep understanding of HANA data modeling, Power BI dashboarding, and modern analytics frameworks, with strong experience in leading upgrades, migrations, and performance optimization projects.
Key Responsibilities:
Lead the design and implementation of enterprise analytics solutions using SAP BW/4HANA, SAP HANA, and Power BI.
Architect and solution large-scale reporting platforms, including WEBI to Power BI conversions and BO 4.3 upgrades.
Design, develop, and optimize HANA Views, ADSO, Composite Providers, Open ODS Views, and CDS V

# Building JD Parser using LLM - Prompting

In [None]:
import getpass
import os

# Ask for the key (input is not echoed) when it is missing or blank, instead of
# storing an empty string, which is not a usable key.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [None]:
# Prompt for the JD-extraction step. `{jd_text}` is the only placeholder in the
# template; the schema section is written without curly braces.
# NOTE(review): presumably because literal braces would be read as template
# variables by ChatPromptTemplate — confirm before adding any.
prompt_template = """
You are a world-class information extraction system. Extract fields from a Job Description (JD) and return STRICT JSON only.

### JSON SCHEMA (MANDATORY KEYS) - Below are the valid key and value of the JSON Object

  "job_title": string,
  "job_summary": string,
  "key_responsibilities": string[],        // clean bullet points; one idea per item
  "required_skills": string[],             // normalize tech/tool names; dedupe
  "qualifications": string[],              // degrees, certifications, nice-to-have
  "experience":                           // parse if present; else keep strings or nulls
    "total_years_min": number|null,
    "total_years_max": number|null,
    "notes": string                        // original phrasing if ranges/mixed
  ,
  "location": string                       // City/State/Country + work mode if present


### NORMALIZATION RULES
- Preserve meaning, but clean text: remove leading bullets, numbering, and emojis.
- Keep **lists** as arrays of short, declarative lines (no trailing punctuation).
- Dedupe semantically identical items; prefer canonical tool names (e.g., "PyTorch", "TensorFlow").
- If a field is missing in the JD, return an empty string `""` or empty array `[]` as appropriate; keep keys.
- Experience parsing:
  - Extract numeric years where possible. If “12–15 years”, set `total_years_min=12`, `total_years_max=15`.
  - If only one value is present (“14+ years”), set `min=14`, `max=null` and put the original phrase in `notes`.
  - Always fill `notes` with the closest original text (e.g., “12–15 years total; 5+ years AI/ML; 2+ years GenAI”).
- Location: include work mode if present (e.g., “Bangalore, India — Hybrid/Remote”).

### OUTPUT
- Return **ONLY** valid JSON (double quotes for all strings, no comments, no markdown fences).
- Do not add extra keys.

### INPUT JD
{jd_text}

 """

In [None]:
# Turn the raw template string into a chat prompt; `jd_text` is supplied when the chain is invoked.
prompt = ChatPromptTemplate.from_template(prompt_template)

In [None]:
# GPT-4o chat model with a low temperature; the completion is capped at 700 tokens.
# NOTE(review): a long JD could need more than 700 tokens of JSON and come back
# truncated — confirm the cap is sufficient.
llm = ChatOpenAI(model="gpt-4o", temperature=0.2,max_tokens=700)
# Pipeline: fill the prompt -> call the model -> parse the reply as JSON.
chain = prompt | llm | JsonOutputParser()

# Run the extraction on the selected JD and display the parsed result.
llm_response_json = chain.invoke({"jd_text":jd_text})
llm_response_json

{'job_title': 'Solution Architect – SAP Analytics & BI',
 'job_summary': 'We are seeking a highly experienced Solution Architect – Analytics with 14 years of proven expertise in SAP BW/BI, BW/4HANA, ABAP, AMDP, Native HANA, and Power BI. The ideal candidate will architect and implement end-to-end enterprise analytics solutions, integrating SAP and non-SAP systems to deliver scalable, high-performance reporting and business intelligence platforms. This role requires a deep understanding of HANA data modeling, Power BI dashboarding, and modern analytics frameworks, with strong experience in leading upgrades, migrations, and performance optimization projects.',
 'key_responsibilities': ['Lead the design and implementation of enterprise analytics solutions using SAP BW/4HANA, SAP HANA, and Power BI',
  'Architect and solution large-scale reporting platforms, including WEBI to Power BI conversions and BO 4.3 upgrades',
  'Design, develop, and optimize HANA Views, ADSO, Composite Providers, 

In [None]:
# Inspect the extracted list of responsibilities.
llm_response_json['key_responsibilities']

['Lead the design and implementation of enterprise analytics solutions using SAP BW/4HANA, SAP HANA, and Power BI',
 'Architect and solution large-scale reporting platforms, including WEBI to Power BI conversions and BO 4.3 upgrades',
 'Design, develop, and optimize HANA Views, ADSO, Composite Providers, Open ODS Views, and CDS Views to support analytical requirements',
 'Build and automate Power BI dataflows, datasets, dashboards, and paginated reports using data from HANA, Azure Synapse, and SharePoint',
 'Apply best practices in code pushdown and performance tuning to optimize data models and query execution',
 'Work closely with business stakeholders to gather requirements, prepare technical specifications, and ensure successful delivery of BI solutions',
 'Oversee data migrations, upgrades, and cut-over activities during SAP landscape transitions (e.g., BW to BW/4HANA, HANA upgrades)',
 'Manage incidents, track deliverables through ServiceNow/JIRA, and lead offshore teams through 

In [None]:
# Credentials come from the environment (or an interactive prompt) rather than
# being committed with the notebook. The API key that used to be hardcoded here
# should be treated as leaked and rotated.
WEAVIATE_URL = os.environ.get(
    "WEAVIATE_URL", "eq9xrs3jtoep5ksrao86bw.c0.asia-southeast1.gcp.weaviate.cloud"
)
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY") or getpass.getpass(
    "Enter your Weaviate API key: "
)
# Reuse the OpenAI key already configured in the environment (empty string if unset).
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# Name of the Weaviate collection that is queried.
CollectionName = "resume"

In [None]:
# Reranker settings: the cross-encoder model name passed to CrossEncoder, and the
# number of chunks rerank() keeps when no explicit top_k is given.
RERANKER_MODEL="BAAI/bge-reranker-base"
RERANK_TOP_K=3

# Create Weaviate Client

In [None]:
! pip install weaviate-client==4.16.6

Collecting protobuf<7.0.0,>=6.31.1 (from grpcio-health-checking<1.80.0,>=1.59.5->weaviate-client==4.16.6)
  Using cached protobuf-6.33.0-cp39-abi3-manylinux2014_x86_64.whl.metadata (593 bytes)
Using cached protobuf-6.33.0-cp39-abi3-manylinux2014_x86_64.whl (323 kB)
Installing collected packages: protobuf
  Attempting uninstall: protobuf
    Found existing installation: protobuf 3.20.3
    Uninstalling protobuf-3.20.3:
      Successfully uninstalled protobuf-3.20.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.19.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.3, but you have protobuf 6.33.0 which is incompatible.
google-ai-generativelanguage 0.6.15 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<6.0.0dev,>=3.20.2, but you have protobuf 6.33.0 which is incompatible.
grpcio-stat

In [None]:
import weaviate
import os

# Send the OpenAI key to Weaviate as a request header.
# NOTE(review): presumably needed so the cluster can embed queries with OpenAI — confirm.
headers = {
    "X-Openai-Api-Key": OPENAI_API_KEY
}

# Connect to the Weaviate Cloud cluster with the URL and API key defined earlier in the notebook.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,  # Replace with your WCD URL
    auth_credentials=WEAVIATE_API_KEY, # Replace with your WCD key
    headers=headers,
)

# Retrieval by Key Responsibilities, Skills, and Job Summary - Hybrid Search -> Rerank

In [None]:
from weaviate.classes.query import Filter

# The LLM output is not guaranteed to contain a well-formed "experience" object
# (the prompt allows strings or nulls), so read it defensively.
experience = llm_response_json.get("experience")
if not isinstance(experience, dict):
    experience = {}


def _years_or_none(value):
    """Return ``value`` when it is a real number, otherwise ``None``."""
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    return value


min_exp = _years_or_none(experience.get("total_years_min"))
max_exp = _years_or_none(experience.get("total_years_max"))

# Compare against None rather than truthiness so that a bound of 0 years still counts.
# "year_of_experince" (sic) is spelled the way the property is returned by the collection.
filters = None
if min_exp is not None and max_exp is not None:
    filters = (
        Filter.by_property("year_of_experince").greater_or_equal(min_exp)
        & Filter.by_property("year_of_experince").less_or_equal(max_exp)
    )
elif min_exp is not None:
    filters = Filter.by_property("year_of_experince").greater_or_equal(min_exp)
elif max_exp is not None:
    filters = Filter.by_property("year_of_experince").less_or_equal(max_exp)

**1. Create the queries used to find the right chunks**

In [None]:
# Lower-case every extracted field so the three search queries are normalised the same way.
query_by_job_summary = llm_response_json['job_summary'].lower()

query1 = [item.lower() for item in llm_response_json['key_responsibilities']]
query2 = [item.lower() for item in llm_response_json['required_skills']]

# Every entry is prefixed with a single space, so a non-empty query starts with a space.
query_by_responsibility = "".join(f" {q}" for q in query1)
query_by_skills = "".join(f" {q}" for q in query2)


In [None]:
# Inspect the lower-cased job-summary query.
query_by_job_summary

'we are seeking a highly experienced solution architect – analytics with 14 years of proven expertise in sap bw/bi, bw/4hana, abap, amdp, native hana, and power bi. the ideal candidate will architect and implement end-to-end enterprise analytics solutions, integrating sap and non-sap systems to deliver scalable, high-performance reporting and business intelligence platforms. this role requires a deep understanding of hana data modeling, power bi dashboarding, and modern analytics frameworks, with strong experience in leading upgrades, migrations, and performance optimization projects.'

**2. Perform hybrid search against the Weaviate DB**

In [None]:
resume = client.collections.use(CollectionName)


def _hybrid_search(query, target_vector):
    """Run one hybrid query on the resume collection with the settings shared by all three searches."""
    return resume.query.hybrid(
        query=query,
        filters=filters,
        alpha=0.5,
        limit=10,
        target_vector=target_vector,
    )


# One search per JD facet, each against its own target vector.
response_for_responsibility = _hybrid_search(query_by_responsibility, "key_responsibilities")
response_for_skill = _hybrid_search(query_by_skills, "skills")
response_for_job_summary = _hybrid_search(query_by_job_summary, "resume")

In [None]:
# Close the Weaviate connection; the following cells only use the response objects already fetched.
client.close()

In [None]:
# Inspect the raw objects returned for the job-summary query.
response_for_job_summary.objects

[Object(uuid=_WeaviateUUIDInt('c634ce31-530a-5165-855a-69a84d857092'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=None, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'year_of_experince': 14.0, 'certifications': [], 'key_responsibilities': ['solution architect in analytics practice', 'solutioned webi to powerbi conversion project', 'created dataflows, datasets, reports, paginated reports', 'architected power bi dashboard design', 'worked on bo 4.3 upgrade project', 'worked on bw/4hana 1.0 green field implementation', 'worked on bw/4hana 2.0 upgrade projects', 'worked extensively on sap bi in various domains', 'worked with abap cds views and native hana modeling', 'provided key user training and proposed system improvements', 'involved in performance tuning techniques', 'worked on sap bo security setup and broadcasting', 'involved in hana upgradation project', 'worked through different ticketing tools'

**3. Create candidate chunks for reranking**

In [None]:
# Materialise each response's objects as a plain list of rerank candidates.
all_chunks_for_responsibility = list(response_for_responsibility.objects)
all_chunks_for_skills = list(response_for_skill.objects)
all_chunks_for_job_summary = list(response_for_job_summary.objects)

In [None]:
#all_chunks

**Rerank utility function**

In [None]:
!pip install protobuf==3.20.3 --force-reinstall --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-health-checking 1.75.1 requires protobuf<7.0.0,>=6.31.1, but you have protobuf 3.20.3 which is incompatible.
onnx 1.19.1 requires protobuf>=4.25.1, but you have protobuf 3.20.3 which is incompatible.
ydf 0.13.0 requires protobuf<7.0.0,>=5.29.1, but you have protobuf 3.20.3 which is incompatible.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.
tensorflow-metadata 1.17.2 requires protobuf>=4.25.2; python_version >= "3.11", but you have protobuf 3.20.3 which is incompatible.
grpcio-status 1.71.2 requires protobuf<6.0dev,>=5.26.1, but you have protobuf 3.20.3 which is incompatible.[0m[31m
[0m

In [None]:
from typing import List, Dict
from sentence_transformers import CrossEncoder

In [None]:
from typing import List, Dict
from sentence_transformers import CrossEncoder

def rerank(query: str, candidates: List, top_k: int = None) -> List[Dict]:
   _reranker = CrossEncoder(RERANKER_MODEL, trust_remote_code=True)

   if top_k is None:
        top_k = RERANK_TOP_K
   if not candidates:
        return []

   # Convert Weaviate Objects to dictionaries for mutability
   mutable_candidates = [c.properties for c in candidates]
   pairs = [(query, c["resume_chunk"]) for c in mutable_candidates]
   scores = _reranker.predict(pairs)

   for c, s in zip(mutable_candidates, scores):
        c["rerank_score"] = float(s)
   ranked = sorted(mutable_candidates, key=lambda x: x["rerank_score"], reverse=True)
   return ranked[:top_k]

# Call the rerank function for each query and its list of chunks

In [None]:
# Rerank each result set against the query that produced it; top_k is left at rerank()'s default.
# The "rarank_" spelling is kept because later cells reference these names.
rarank_chunks_for_responsibility = rerank(query_by_responsibility, all_chunks_for_responsibility)
rarank_chunks_for_skill = rerank(query_by_skills, all_chunks_for_skills)
rarank_chunks_for_job_summary = rerank(query_by_job_summary, all_chunks_for_job_summary)

In [None]:
# Check how many chunks were kept for the job-summary query.
len(rarank_chunks_for_job_summary)

3

**Combine all chunks and collect the unique resume links - this is the desired output/result**

In [None]:
# Collect the distinct resume links across the three reranked result lists.
resume_link_set = {
    chunk['resume_link']
    for ranked_chunks in (
        rarank_chunks_for_responsibility,
        rarank_chunks_for_skill,
        rarank_chunks_for_job_summary,
    )
    for chunk in ranked_chunks
}

In [None]:
# Display the unique resume links that were collected.
resume_link_set

{'/content/resume_db/Candidate 105.docx'}

In [None]:
# Print the score, text and source file of each reranked skills chunk for manual inspection.
for chunk in rarank_chunks_for_skill:
  print(chunk['rerank_score'])
  print(chunk['resume_chunk'])
  print(chunk['resume_link'])
  print("---------------------------------------------------")

0.7962647080421448
developed finance and insurance web apps using java stack.

education

mca (master of computer applications), university of calcutta, 2007–2010

skills

python, pandas, numpy, scikit-learn, mlflow, tensorflow, pytorch, langchain, generative ai, prompt engineering, vector databases (pinecone), fastapi, docker, kubernetes, aws, gcp, sql, mongodb

key strengths

architectural mindset | emerging tech enthusiast | mentor & leader | innovation-driven mindset
/content/resume_db/Rajesh_Kumar_Kushwaha_Resume.docx
---------------------------------------------------
0.5987187623977661
built rag, langchain, vector db (pinecone) solution improving retrieval performance by 30%.

data scientist | coforge | 08/2023 – 06/2024

automated claim processing with nlp and ml models.

fine-tuned bert for ner to extract policy information from claim forms.

deployed ai pipeline using fastapi.

data scientist | mercedes-benz | 11/2022 – 05/2023

led dealer retention projects, improving engage