In [7]:
from qdrant_client import QdrantClient
from qdrant_client.models import VectorParams, Distance
from dotenv import load_dotenv
import os

load_dotenv()  # Load environment variables from .env file

cloud_client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY")
)

if not cloud_client.collection_exists("resumes"):
    cloud_client.create_collection(
        collection_name="resumes",
        vectors_config=VectorParams(
            size=384,   # SAME AS LOCAL
            distance=Distance.COSINE
        )
    )
    print("‚úÖ Cloud Collection created")
else:
    print("‚ÑπÔ∏è Cloud Collection already exists")
info = cloud_client.get_collection("resumes")
print(info)

‚ÑπÔ∏è Cloud Collection already exists


In [6]:
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from dotenv import load_dotenv

import os 

load_dotenv()  # Load environment variables from .env file

# Local Docker
local_client = QdrantClient(url="http://localhost:6333")

# Cloud
cloud_client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
    timeout=120
)

collection_name = "resumes"
batch_size = 50
offset = None
total = 0

print("üöÄ Starting migration...")

while True:
    records, offset = local_client.scroll(
        collection_name=collection_name,
        limit=batch_size,
        offset=offset,
        with_vectors=True,
        with_payload=True
    )

    if not records:
        break

    points = [
        PointStruct(
            id=record.id,
            vector=record.vector,
            payload=record.payload
        )
        for record in records
    ]

    cloud_client.upsert(
        collection_name=collection_name,
        points=points
    )

    total += len(points)
    print(f"Transferred: {total}")

    if offset is None:
        break

print("‚úÖ Migration Complete")


üöÄ Starting migration...
Transferred: 50
Transferred: 100
Transferred: 150
Transferred: 200
Transferred: 243
‚úÖ Migration Complete


In [10]:
import io
import os
import time
import uuid
import boto3
import fitz  # PyMuPDF
from dotenv import load_dotenv
from pymongo import MongoClient
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer
from PIL import Image
from pdf2image import convert_from_bytes
import pytesseract

# ================= TESSERACT CONFIG =================
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
os.environ["TESSDATA_PREFIX"] = r"C:\Program Files (x86)\Tesseract-OCR"

# ================= LOAD ENV =================
load_dotenv()

AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")
AWS_REGION = os.getenv("AWS_REGION")
S3_BUCKET = os.getenv("AWS_S3_BUCKET")

MONGO_URI = os.getenv("MONGODB_URI")
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")  # <--- new

# ================= CHECK ENV =================
for var_name, var in [
    ("AWS_ACCESS_KEY", AWS_ACCESS_KEY),
    ("AWS_SECRET_KEY", AWS_SECRET_KEY),
    ("AWS_REGION", AWS_REGION),
    ("S3_BUCKET", S3_BUCKET),
    ("MONGO_URI", MONGO_URI),
    ("QDRANT_URL", QDRANT_URL),
    ("QDRANT_API_KEY", QDRANT_API_KEY)
]:
    if not var:
        raise RuntimeError(f"‚ùå Environment variable {var_name} missing")

DB_NAME = "ats"
COLLECTION_NAME = "applications"
QDRANT_COLLECTION = "resumes"
BATCH_SIZE = 10
MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"

# ================= START =================
print("\nü§ñ AGENT STARTED: RESUME INDEXING")
print("üìä MongoDB + AWS S3 + Qdrant")

# ================= AWS =================
s3 = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
    region_name=AWS_REGION
)
s3.list_buckets()
print("‚úÖ AWS verified")

# ================= MongoDB =================
mongo = MongoClient(MONGO_URI)
db = mongo[DB_NAME]
applications = db[COLLECTION_NAME]
print("‚úÖ MongoDB connected")

# ================= QDRANT =================
qdrant = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=False,  # REST API (required for Cloud)
    timeout=120
)

# Ensure collection exists
if not qdrant.collection_exists(QDRANT_COLLECTION):
    qdrant.create_collection(
        collection_name=QDRANT_COLLECTION,
        vectors_config=VectorParams(
            size=384,
            distance=Distance.COSINE
        )
    )
    print("‚úÖ Qdrant collection created")
else:
    print("‚ÑπÔ∏è Qdrant collection exists")

# ================= MODEL =================
model = SentenceTransformer(MODEL_NAME)
print("‚úÖ Embedding model loaded")

# ================= HELPERS =================
def mongo_id_to_uuid(mongo_id: str) -> str:
    return str(uuid.uuid5(uuid.NAMESPACE_DNS, mongo_id))

def extract_s3_key(url: str) -> str:
    return url.split(".amazonaws.com/")[-1]

def extract_text_from_s3(url: str) -> str:
    key = extract_s3_key(url)
    print(f"   üì• Downloading: {key}")

    obj = s3.get_object(Bucket=S3_BUCKET, Key=key)
    file_bytes = obj["Body"].read()
    text = ""

    # ---------- IMAGE ----------
    if key.lower().endswith((".jpg", ".jpeg", ".png")):
        image = Image.open(io.BytesIO(file_bytes))
        text = pytesseract.image_to_string(image)

    # ---------- PDF ----------
    else:
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            for page in doc:
                text += page.get_text()
        # OCR fallback if PDF has no extractable text
        if not text.strip():
            images = convert_from_bytes(file_bytes)
            for img in images:
                text += pytesseract.image_to_string(img)

    if not text.strip():
        raise ValueError("No text extracted")

    print(f"   üìÑ Extracted {len(text)} chars")
    return text


# ================= AGENT =================
def resume_indexing_agent():
    """
    Only index resumes whose resume_status is 'open'.
    """
    while True:
        # ----------------- QUERY -----------------
        query = {
            "resume": {"$exists": True, "$regex": "^http"},
            "resume_status": "open",  # ONLY open resumes
            "$or": [
                {"rag_uploaded": False},
                {"rag_uploaded": "False"},  # in case it is stored as string
                {"rag_uploaded": {"$exists": False}}
            ]
        }

        pending_count = applications.count_documents(query)
        print(f"üìå Pending/open resumes count: {pending_count}")

        batch = list(applications.find(query).limit(BATCH_SIZE))
        if not batch:
            print("‚úÖ No open resumes pending")
            break

        points = []

        for app in batch:
            app_id = str(app["_id"])
            job_id = str(app.get("jobID", ""))
            print(f"\nüìÑ Processing {app_id} | Job: {job_id}")

            try:
                text = extract_text_from_s3(app["resume"])
                embedding = model.encode(text).tolist()

                points.append(
                    PointStruct(
                        id=mongo_id_to_uuid(app_id),
                        vector=embedding,
                        payload={
                            "application_id": app_id,
                            "job_id": job_id,
                            "resume_text": text[:1500]
                        }
                    )
                )

                # Update only open resumes
                applications.update_one(
                    {"_id": app["_id"]},
                    {"$set": {
                        "resume_status": "indexed",
                        "rag_uploaded": True,
                        "indexed_at": time.time()
                    }}
                )
                print("‚úÖ Indexed")

            except Exception as e:
                applications.update_one(
                    {"_id": app["_id"]},
                    {"$set": {
                        "resume_status": "failed",
                        "error": str(e)
                    }}
                )
                print(f"‚ùå Failed ‚Üí {e}")

        if points:
            qdrant.upsert(
                collection_name=QDRANT_COLLECTION,
                points=points
            )
            print(f"üöÄ {len(points)} vectors pushed to Qdrant")

        time.sleep(1)

# ================= RUN =================
if __name__ == "__main__":
    try:
        resume_indexing_agent()  # ‚úÖ no arguments needed
        print("\nüéØ Agent finished successfully")
    finally:
        mongo.close()
        print("‚úÖ MongoDB closed")




ü§ñ AGENT STARTED: RESUME INDEXING
üìä MongoDB + AWS S3 + Qdrant
‚úÖ AWS verified
‚úÖ MongoDB connected
‚ÑπÔ∏è Qdrant collection exists


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 335.24it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


‚úÖ Embedding model loaded
üìå Pending/open resumes count: 15

üìÑ Processing 684d489a7d783615f6da851e | Job: 684d43cb7d783615f6da82c7
   üì• Downloading: uploads/1749895321436-Khushi singh resume  (1).pdf
‚ùå Failed ‚Üí Unable to get page count. Is poppler installed and in PATH?

üìÑ Processing 684d4d617d783615f6da8706 | Job: 684d43cb7d783615f6da82c7
   üì• Downloading: uploads/1749896543506-WhatsApp Image 2025-06-14 at 3.45.00 PM.pdf
‚ùå Failed ‚Üí Unable to get page count. Is poppler installed and in PATH?

üìÑ Processing 684d57808c77325205caf83b | Job: 684d448a7d783615f6da8313
   üì• Downloading: uploads/1749899135124-WhatsApp Image 2025-06-14 at 4.26.09 PM.pdf
‚ùå Failed ‚Üí Unable to get page count. Is poppler installed and in PATH?

üìÑ Processing 6852adcdbcf10b931eb7c420 | Job: 6852ad6bbcf10b931eb7c3af
   üì• Downloading: uploads/1750248908595-infosis.jfif
‚ùå Failed ‚Üí Unable to get page count. Is poppler installed and in PATH?

üìÑ Processing 6868b4048a09e3fb00a486

In [18]:
import os
from dotenv import load_dotenv
from pymongo import MongoClient
from bson import ObjectId
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer
from bs4 import BeautifulSoup
import re

# ================= LOAD ENV =================
load_dotenv()

MONGO_URI = os.getenv("MONGODB_URI")
DB_NAME = "ats"

QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
QDRANT_COLLECTION = "resumes"

print("\nüöÄ JD MATCHING AGENT STARTED (Best Score Relative Mode)")

# ================= MongoDB =================
mongo = MongoClient(MONGO_URI)
db = mongo[DB_NAME]
companies = db["companies"]
applications = db["applications"]
jobs = db["jobs"]
job_statuses = db["job-statuses"]

print("‚úÖ MongoDB connected")

# ================= QDRANT =================
qdrant = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    prefer_grpc=False
)
print("‚úÖ Qdrant connected")

# ================= MODEL =================
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
print("‚úÖ Embedding model loaded")

# ================= CLEAN HTML =================
def clean_html(raw_html):
    soup = BeautifulSoup(raw_html, "html.parser")
    text = soup.get_text(separator=" ")
    text = re.sub(r"\s+", " ", text).strip()
    return text

# ================= SEMANTIC JD MATCHING =================
def jd_matching_agent(company):
    company_id = company["_id"]
    print(f"\nüè¢ Running JD Matching For Company: {company.get('name')}")
    print(f"üîé Processing company: {company_id}")

    # 1Ô∏è‚É£ Get OPEN status
    open_status_doc = list(job_statuses.find({
        "company_id": ObjectId(company_id),
        "jobStatus": "Open"
    }))

    if not open_status_doc:
        print(f"‚ùå No OPEN status found for company {company.get('name')}")
        return

    open_status_id = open_status_doc[0]["_id"]

    # 2Ô∏è‚É£ Get OPEN jobs
    open_jobs = list(jobs.find({
        "company_id": ObjectId(company_id),
        "status": str(open_status_id)
    }))

    if not open_jobs:
        print("‚ùå No OPEN jobs found")
        return

    # 3Ô∏è‚É£ Process each open job
    for job in open_jobs:
        job_id = str(job["_id"])
        job_description = job.get("description", "")

        if not job_description.strip():
            print(f"‚ö†Ô∏è Job {job_id} has no description")
            continue

        print(f"\nüîé Processing Job: {job_id}")

        cleaned_description = clean_html(job_description)
        jd_vector = model.encode(cleaned_description).tolist()

        try:
            # Fetch all resumes and filter in Python
            search_results = qdrant.query_points(
                collection_name=QDRANT_COLLECTION,
                query=jd_vector,
                limit=100000,  # Fetch all
                with_payload=True,
                with_vectors=False
            ).points

            # Filter only resumes for this job
            search_results = [
                r for r in search_results
                if r.payload.get("job_id") == job_id
            ]

            print(f"üìä Found {len(search_results)} resumes for this job")

            if not search_results:
                continue

            scores = [r.score for r in search_results if r.score is not None]

            if not scores:
                print("‚ùå No similarity scores found")
                continue

            # üéØ Best-score based cutoff
            best_score = max(scores)
            cutoff = best_score * 0.63

            print(f"üèÜ Best Score: {best_score:.4f}")
            print(f"üéØ Selection Cutoff (63% of Best): {cutoff:.4f}")

            # 4Ô∏è‚É£ Update application status
            for result in search_results:
                payload = result.payload or {}
                application_id = payload.get("application_id")
                score = result.score

                if not application_id or score is None:
                    continue

                status = "selected" if score >= cutoff else "rejected"

                applications.update_one(
                    {"_id": ObjectId(application_id)},
                    {"$set": {"resume_status": status}}
                )

                print(f"   ‚ûú {application_id} | {score:.4f} ‚Üí {status}")

        except Exception as e:
            print(f"‚ùå Qdrant query failed: {e}")

    print("‚úÖ Company JD Matching Done")

# ================= RUN =================
if __name__ == "__main__":
    try:
        ai_companies = list(companies.find({"aiFeaturesEnabled": True}))

        if not ai_companies:
            print("‚ùå No AI enabled companies found")
        else:
            print(f"üè¢ Found {len(ai_companies)} AI enabled companies")

            for company in ai_companies:
                jd_matching_agent(company)

        print("\nüéØ JD Matching Completed Successfully")
    finally:
        mongo.close()
        print("‚úÖ MongoDB closed")



üöÄ JD MATCHING AGENT STARTED (Best Score Relative Mode)
‚úÖ MongoDB connected
‚úÖ Qdrant connected


Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 103/103 [00:00<00:00, 153.95it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


‚úÖ Embedding model loaded
üè¢ Found 3 AI enabled companies

üè¢ Running JD Matching For Company: f2fintech
üîé Processing company: 682858bb96c2ed0759146648

üîé Processing Job: 684920beae8bd1780f520b09
üìä Found 0 resumes for this job

üîé Processing Job: 684d43cb7d783615f6da82c7
üìä Found 161 resumes for this job
üèÜ Best Score: 0.5407
üéØ Selection Cutoff (63% of Best): 0.3406
   ‚ûú 684d489c7d783615f6da8521 | 0.5407 ‚Üí selected
   ‚ûú 68e76cac72a5ec1432f67f96 | 0.4242 ‚Üí selected
   ‚ûú 68662f868e27a08a98339a69 | 0.4138 ‚Üí selected
   ‚ûú 6958d29212dc988d45a83653 | 0.4052 ‚Üí selected
   ‚ûú 690c3c73450fd808275d0ef3 | 0.3896 ‚Üí selected
   ‚ûú 68a990b40f7a9b9b9c5c9059 | 0.3778 ‚Üí selected
   ‚ûú 69391342483e6754327e9e82 | 0.3711 ‚Üí selected
   ‚ûú 68d3be5380642c2dbf4d5b64 | 0.3679 ‚Üí selected
   ‚ûú 68b68135dbdbc7d4e8742ed6 | 0.3607 ‚Üí selected
   ‚ûú 69391281483e6754327e9e4b | 0.3597 ‚Üí selected
   ‚ûú 693914c7483e6754327e9ee9 | 0.3512 ‚Üí selected
   ‚ûú 68b9273