In [None]:
!pip install google-genai pandas scikit-learn pydantic




In [None]:
import os
import pandas as pd
import json
import numpy as np
from pydantic import BaseModel, Field
from sklearn.metrics.pairwise import cosine_similarity

# --- 1. SET UP GEMINI API KEY ---
# NOTE: Do not hardcode the key in this notebook. It is read below from a Colab
# secret via `userdata.get(...)`; store your Gemini API key as a Colab 'Secret'
# and make sure the secret name passed to `userdata.get` matches it.
from google import genai
from google.genai.errors import APIError
from google.colab import userdata

# The Gemini API key is read from the Colab secret named 'SHL_LABS'.
# Change the name below if your secret is stored under a different name.
GEMINI_API_KEY = userdata.get('SHL_LABS')


try:
    client = genai.Client(api_key=GEMINI_API_KEY)
except Exception as e:
    print(f"Error initializing Gemini client: {e}")
    # Re-raise instead of continuing: every later cell needs `client`, and
    # carrying on would only surface as a confusing NameError further down.
    raise

# Define the embedding model to use
EMBEDDING_MODEL = 'text-embedding-004'

# Only reached when the client was actually created.
print("Setup complete. Client initialized.")

Setup complete. Client initialized.


In [None]:
# Lookup table from the catalog's single-letter test-type codes to their full
# category names. translate_test_types() looks codes up here (upper-cased) and
# silently drops any code that has no entry.
TEST_TYPE_MAPPING = {
    'A': 'Ability & Aptitude',
    'B': 'Biodata & Situational Judgement',
    'C': 'Competencies',
    'D': 'Development & 360',
    'E': 'Assessment Exercise',
    'K': 'Knowledge & Skills',
    'P': 'Personality & Behaviour',
    'S': 'Simulation'
    # Add any other single-letter codes you extracted
}
import re

def extract_duration(text: str) -> int:
    """
    Extracts a duration in minutes from a text string.

    The last integer that appears in the text is returned, so a range written
    as '25-35' yields its second number (35).

    Args:
        text: Free-form duration text, e.g.
            'Approximate Completion Time in minutes = 17'. Missing values
            (None, float NaN, or any other non-string) are accepted.

    Returns:
        The last integer found in the text, or -1 when the input is empty,
        not a string, or contains no digits.
    """
    # Missing catalog fields can surface as None or float NaN once loaded into
    # pandas; NaN is truthy, so a plain `not text` check would let it through
    # and re.findall would raise TypeError.
    if not isinstance(text, str) or not text:
        return -1

    # Find all numbers in the text
    numbers = re.findall(r'\d+', text)

    if not numbers:
        return -1

    # For a range like "25-35" the last number is the upper bound
    return int(numbers[-1])

# ✅ Test cases
# examples = [
#     "Approximate Completion Time in minutes = 17",
#     "Approximate Completion Time in minutes = 25-35",
#     "Completion time: about 50 mins",
#     "This test takes approximately half an hour",
#     ""
# ]

# for e in examples:
#     print(f"{e!r} → {extract_duration(e)}")


def translate_test_types(symbols_list):
    """
    Translates single-letter test-type codes into full test category names.

    Args:
        symbols_list: Either a list of codes (e.g. ['K', 'P']) or a string
            holding one or more codes separated by whitespace and/or commas
            (e.g. 'K' or 'A, K'). Any other input type is treated as empty.

    Returns:
        A list of category names from TEST_TYPE_MAPPING, in input order.
        Codes are matched case-insensitively after trimming whitespace;
        unknown codes and non-string list items are skipped.
    """
    if isinstance(symbols_list, str):
        # A single string may carry several codes; split it into tokens so each
        # one is looked up on its own instead of as one unknown symbol.
        symbols_list = symbols_list.replace(',', ' ').split()
    elif not isinstance(symbols_list, list):
        symbols_list = []

    full_names = []
    for symbol in symbols_list:
        # Skip None/NaN/other non-string entries rather than failing on .upper()
        if not isinstance(symbol, str):
            continue
        full_name = TEST_TYPE_MAPPING.get(symbol.strip().upper())
        if full_name:
            full_names.append(full_name)

    return full_names

def safe_join(data, is_test_type=False):
    """
    Render `data` as a single comma-separated string.

    When `is_test_type` is true the input is first passed through
    translate_test_types(). A list is then joined with ', ' keeping only its
    string items; anything that is not a list is returned as str(data).
    """
    values = translate_test_types(data) if is_test_type else data

    # Non-list values are stringified as-is
    if not isinstance(values, list):
        return str(values)

    # Keep only string entries when joining
    return ', '.join(item for item in values if isinstance(item, str))

import json

def join_description(data: dict) -> str:
    """
    Flattens the 'description' section of a constraints dict into one string.

    Each key/value pair becomes 'Title Cased Key: value'; list (or tuple)
    values are joined with ', '. Pairs are separated by ' | '.

    Args:
        data: A dict expected to hold a 'description' dict.

    Returns:
        The flattened text, or an empty string when `data` is not a dict or
        its 'description' entry is missing or not a dict.
    """
    desc = data.get("description") if isinstance(data, dict) else None
    if not isinstance(desc, dict):
        return ""

    parts = []
    for key, value in desc.items():
        label = str(key).replace('_', ' ').title()
        if isinstance(value, (list, tuple)):
            # str() each item so numeric or None entries do not break the join
            value = ", ".join(str(item) for item in value)
        parts.append(f"{label}: {value}")

    return " | ".join(parts)


In [None]:
# --- 2. LOAD & PREPARE DATA ---

CATALOG_PATH = 'shl_assessment_details1.json'

# Load the JSON catalog. On failure fall back to an empty dict so the cell
# still finishes and reports the problem instead of stopping the notebook.
try:
    with open(CATALOG_PATH, 'r', encoding='utf-8') as f:
        catalog_data = json.load(f)
except FileNotFoundError:
    print(f"Error: File not found at {CATALOG_PATH}")
    catalog_data = {}
except json.JSONDecodeError:
    print(f"Error: Could not decode JSON from {CATALOG_PATH}")
    catalog_data = {}


if catalog_data:
    # One row per catalog entry; the dict keys end up in the 'index' column.
    assessment_df = pd.DataFrame.from_dict(catalog_data, orient='index').reset_index()

    # Text that gets embedded for semantic search. Missing name/description
    # values are replaced by '' first: concatenating a string with NaN would
    # turn the whole source_text of that row into NaN.
    assessment_df['source_text'] = (
        "Assessment Name: " + assessment_df['name'].fillna('').astype(str) + ". " +
        "Description: " + assessment_df['description'].fillna('').astype(str) + ". " +
        "Languages: " + assessment_df['language'].apply(lambda x: safe_join(x)) + ". " +
        "Job Levels: " + assessment_df['joblevel'].apply(lambda x: safe_join(x)) + ". " +
        "Test Types: " + assessment_df['test_type'].apply(lambda x: safe_join(x, is_test_type=True))
    )

    print(f"Catalog loaded with {len(assessment_df)} assessments.")

    # Numeric duration in minutes (-1 when the catalog gives none). Computed
    # only here because an empty DataFrame has no 'assessment_length' column.
    assessment_df["test_duration"] = assessment_df["assessment_length"].apply(extract_duration)
else:
    print("No data loaded from catalog.")
    assessment_df = pd.DataFrame()  # Empty frame when loading failed
assessment_df.head()

Catalog loaded with 376 assessments.


Unnamed: 0,index,name,description,joblevel,assessment_length,language,test_type,combined,url,source_text,test_duration
0,Global Skills Development Report,Global Skills Development Report,This report is designed to be given to individ...,"[Director, Entry-Level, Executive, General Pop...",,[],,Description\nThis report is designed to be giv...,https://www.shl.com/products/product-catalog/v...,Assessment Name: Global Skills Development Rep...,-1
1,.NET Framework 4.5,.NET Framework 4.5,The.NET Framework 4.5 test measures knowledge ...,"[Professional Individual Contributor, Mid-Prof...",Approximate Completion Time in minutes = 30,[English (USA)],K,Description\nThe.NET Framework 4.5 test measur...,https://www.shl.com/products/product-catalog/v...,Assessment Name: .NET Framework 4.5. Descripti...,30
2,.NET MVC (New),.NET MVC (New),Multi-choice test that measures the knowledge ...,"[Mid-Professional, Professional Individual Con...",Approximate Completion Time in minutes = 17,[English (USA)],K,Description\nMulti-choice test that measures t...,https://www.shl.com/products/product-catalog/v...,Assessment Name: .NET MVC (New). Description: ...,17
3,.NET MVVM (New),.NET MVVM (New),Multi-choice test that measures the knowledge ...,"[Mid-Professional, Professional Individual Con...",Approximate Completion Time in minutes = 5,[English (USA)],K,Description\nMulti-choice test that measures t...,https://www.shl.com/products/product-catalog/v...,Assessment Name: .NET MVVM (New). Description:...,5
4,.NET WCF (New),.NET WCF (New),Multi-choice test that measures the knowledge ...,"[Mid-Professional, Professional Individual Con...",Approximate Completion Time in minutes = 11,[English (USA)],K,Description\nMulti-choice test that measures t...,https://www.shl.com/products/product-catalog/v...,Assessment Name: .NET WCF (New). Description: ...,11


In [None]:
# --- 3. VECTOR INDEXING (EMBEDDING GENERATION) ---
from google import genai
from google.genai import types
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def generate_embeddings(texts, model_name=EMBEDDING_MODEL, task_type="SEMANTIC_SIMILARITY"):
    """
    Generates embeddings for a list of texts using the Gemini API.

    Args:
        texts: List of strings to embed; one `embed_content` request is made
            per text.
        model_name: Embedding model identifier.
        task_type: Task type passed to `EmbedContentConfig`. The default keeps
            the existing behaviour.
            NOTE(review): queries are embedded elsewhere in this notebook with
            "RETRIEVAL_QUERY"; consider "RETRIEVAL_DOCUMENT" here - confirm
            against the embedding model documentation.

    Returns:
        A NumPy array with one embedding row per input text, or None if any
        request fails (the error is printed).
    """
    print(f"Generating embeddings for {len(texts)} assessments...")

    try:
        embeddings = []

        for text in texts:
            response = client.models.embed_content(
                model=model_name,
                contents=text,
                config=types.EmbedContentConfig(task_type=task_type)
            )
            # One content in, so the single embedding is at index 0
            embeddings.append(response.embeddings[0].values)

        return np.array(embeddings)

    # APIError is the name imported from google.genai.errors at the top of the
    # notebook. Referencing a non-existent attribute in an except clause would
    # itself raise while handling and hide the real error.
    except APIError as e:
        print(f"An API error occurred during embedding generation: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during embedding generation: {e}")
        return None


In [None]:
def _empty_constraints() -> dict:
    """Returns a fresh fallback constraints dict with every field empty."""
    return {
        "description": {
            "job_level": [],
            "technical_skills": [],
            "test_types": [],
            "job_family": [],
            "experience": "",
            "industry": [],
            "language": ""
        },
        "test_duration": 0
    }


def extract_constraints_with_gemini(query: str):
    """
    Uses Gemini to extract structured constraints like job level, technical skills, test types, etc.
    Returns JSON directly (no Pydantic schema).

    Args:
        query: Free-text hiring query or job description.

    Returns:
        A dict with a 'description' dict and a 'test_duration' entry. Keys the
        model omitted are filled with empty defaults. If the API call fails,
        or the reply is empty, not valid JSON, or not a JSON object, the empty
        fallback structure (with 'test_duration' 0) is returned instead.
    """

    print("Extracting structured constraints from user query using Gemini...")

    # NOTE(review): the rules below mention "soft_skills", but the JSON format
    # shown to the model has no such key - decide whether to add or drop it.
    prompt = f"""
    You are an AI that extracts hiring-related information from text.

    Given the following query or job description, extract and return structured information as a JSON object.
    Make sure your response is a valid JSON — no explanations or extra text. STRICTLY FOLLOE ALL THE RULES.

    JSON format:
    {{
      "description": {{
        "job_level": [],
        "technical_skills": [],
        "test_types": [],
        "job_family": [],
        "experience": "",
        "industry": [],
        "language": ""
      }},
      "test_duration": 0
    }}

    Extraction Rules:
    - "job_level": choose atleast one (the most relevant) [Director, Entry-Level, Graduate, Manager, Mid-Professional, Supervisor, Executive, Frontline Manager, General Population, Professional Individual Contributor].
    - "technical_skills": choose atleast or can add skill not listed here(the most relevant) (Python, SQL, Excel, JavaScript, Web Development, Engineering Fields).
    - "soft_skills": choose at least 2–3 from ['Communication', 'Teamwork', 'Problem Solving', 'Adaptability', 'Critical Thinking', 'Leadership', 'Time Management', 'Creativity', 'Conflict Resolution'] or add other most relevant ones if not listed here.
    - "test_types": choose atleast one (the most relevant):
      ['Ability & Aptitude', 'Biodata & Situational Judgement', 'Competencies',
       'Development & 360', 'Assessment Exercise', 'Knowledge & Skills',
       'Personality & Behaviour', 'Simulation'].
    - "job_family": choose atleast one (the most relevant):
      ['Business', 'Clerical', 'Contact Center', 'Customer Service',
       'Information Technology', 'Safety', 'Sales'].
    - "experience": extract numeric or qualitative experience terms ('2-5 years', 'Entry-level', 'Senior').
    - "industry": choose atleast one (the most relevant) from:
      ['Banking/Finance', 'Healthcare', 'Hospitality', 'Insurance',
       'Manufacturing', 'Oil & Gas', 'Retail', 'Telecommunications'].
    - "language": Specify the language preference mentioned in the query. If no language is explicitly stated, infer it from the country mentioned (use the country's national language). If neither is provided, default to English.
    - "test_duration": extract total time in minutes.
        • If duration is in range (e.g., 25–35), pick the higher value.
        • If no number is mentioned, return -1.

    QUERY: {query}
    """

    try:
        response = client.models.generate_content(
            model="gemini-2.5-flash",
            contents=prompt,
            config={"response_mime_type": "application/json"}
        )

        # response.text may be None; json.loads then raises TypeError, which
        # is handled below together with malformed JSON.
        parsed = json.loads(response.text)

    except APIError as e:
        print(f"⚠️ Gemini API Error: {e}")
        return _empty_constraints()

    except (json.JSONDecodeError, TypeError) as e:
        print(f"⚠️ JSON Decode Error: {e}")
        return _empty_constraints()

    # Valid JSON that is not an object (e.g. a list) cannot be used downstream.
    if not isinstance(parsed, dict):
        print(f"⚠️ Unexpected JSON structure: expected an object, got {type(parsed).__name__}")
        return _empty_constraints()

    # Overlay the model's answer on the defaults so callers can rely on every
    # documented key being present.
    constraints = _empty_constraints()
    description = parsed.get("description")
    if isinstance(description, dict):
        constraints["description"].update(description)
    for key, value in parsed.items():
        if key != "description":
            constraints[key] = value

    return constraints


# # --- Example usage ---
# if __name__ == "__main__":
#     query = "Hiring a graduate software developer skilled in Python and Java for the IT industry, expected experience 0-2 years, with an aptitude and coding test under 40 minutes."
#     result = extract_constraints_with_gemini(query)
#     print(json.dumps(result, indent=2))


In [None]:
# --- 5. HYBRID RECOMMENDATION ENGINE ---

def recommend_assessments(query: str, df: pd.DataFrame, embeddings: np.ndarray, top_k_retrieval: int = 20, final_k: int = 10):
    """
    Performs hybrid recommendation: Vector Search (Retrieval) + LLM Constraint Filtering (Re-ranking).

    Args:
        query: Free-text hiring query or job description.
        df: Assessment catalog; row i must correspond to `embeddings[i]`. Needs
            a 'test_duration' column when a duration limit is extracted.
        embeddings: Array with one embedding row per catalog row.
        top_k_retrieval: Number of candidates kept from the semantic search.
        final_k: Maximum number of recommendations returned.

    Returns:
        A dict {"recommended assessments": [...]} whose items hold url, name,
        adaptive_support, description, duration, remote support and test type.
        The list is empty when no embeddings are available.
    """
    if embeddings is None or len(embeddings) == 0:
        print("Error: Embeddings not available.")
        # Same shape as the normal result so callers can always index by key.
        return {"recommended assessments": []}

    # 1. LLM Constraint Extraction
    constraints = extract_constraints_with_gemini(query)

    # 2. Vector Search (Retrieval) - the raw query text is embedded
    query_embedding_result = client.models.embed_content(
        model=EMBEDDING_MODEL,
        contents=[query],
        config=types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
    )
    query_embedding = np.array(query_embedding_result.embeddings[0].values)

    # Cosine similarity of the query against every catalog embedding
    similarities = cosine_similarity(query_embedding.reshape(1, -1), embeddings)[0]

    # Top-k retrieval, best match first
    top_indices = np.argsort(similarities)[::-1][:top_k_retrieval]
    candidates_df = df.iloc[top_indices].copy()
    candidates_df['similarity_score'] = similarities[top_indices]

    print(f"Retrieval done. Found {len(candidates_df)} candidates via semantic search.")

    # 3. Constraint Filtering (Re-ranking)
    filtered_df = candidates_df

    # Filter 1: Max Duration.
    # The extractor reports -1 when the query names no duration and 0 in its
    # error fallback; neither is a real limit, and applying them would discard
    # every assessment with a known duration. Only a positive number filters.
    max_duration = constraints.get('test_duration') if isinstance(constraints, dict) else None
    has_duration_limit = (
        isinstance(max_duration, (int, float))
        and not isinstance(max_duration, bool)
        and max_duration > 0
    )
    if has_duration_limit:
        # Rows whose duration is unknown (-1) pass this comparison and are kept.
        filtered_df = filtered_df[filtered_df['test_duration'] <= max_duration]
        print(f"After Duration Filter: {len(filtered_df)} remaining.")

    # Sort by similarity and select final
    filtered_df = filtered_df.sort_values(by='similarity_score', ascending=False)
    final_recommendations = filtered_df.head(final_k)

    # Format output
    results = []
    for _, row in final_recommendations.iterrows():
        results.append({
            'url': row.get('url', ''),
            'name': row.get('name', ''),
            'adaptive_support': row.get('adaptive_support', ''),
            'description': row.get('description', ''),
            # Prefer 'duration_minutes' if the catalog has it, otherwise use
            # the 'test_duration' column that this function filters on.
            'duration': row.get('duration_minutes', row.get('test_duration', '')),
            'remote support': row.get('remote_support', ''),
            'test type': row.get('test_type', '')
        })

    return {"recommended assessments": results}

In [None]:
# extract_constraints_with_gemini(test_queries)

In [None]:
# -- generate embeddings
# Embed the `source_text` of every catalog row. generate_embeddings() returns
# None when a request fails, so the shape is only reported on success.
assessment_embeddings = generate_embeddings(assessment_df['source_text'].tolist())
if assessment_embeddings is not None:
    print(f"✅ Embeddings shape: {assessment_embeddings.shape}")

Generating embeddings for 376 assessments...
✅ Embeddings shape: (376, 768)


In [None]:
# --- SUBMISSION FILE GENERATION ---
# NOTE: `query` must already exist in the kernel. In this notebook it is
# assigned in a later cell, so run that cell first (or move it above this one)
# for a clean Restart & Run All.

submission_records = []

print(f"Processing query: {query[:50]}...")

recommendations = recommend_assessments(
    query=query,
    df=assessment_df,
    embeddings=assessment_embeddings,
)

# The recommender may hand back a non-dict (e.g. an empty list) when it has no
# embeddings; treat that as "no recommendations" instead of failing on the key.
if isinstance(recommendations, dict):
    recommended_items = recommendations.get('recommended assessments', [])
else:
    recommended_items = []

# One output row per recommended assessment
for item in recommended_items:
    submission_records.append({
        "Query": query,
        "Assessment_url": item['url']
    })

# File MUST be in the exact format: Query,Assessment_url
# Passing `columns` keeps that header even when there are no records.
submission_df = pd.DataFrame(submission_records, columns=["Query", "Assessment_url"])
submission_df.to_csv('SHL_Submission_Data.csv', index=False)

print("\n✅ Final submission CSV created successfully!")
print("You must submit 'SHL_Submission_Data.csv' for evaluation.")

Processing query: KEY RESPONSIBITILES:

Manage the sound-scape of th...
Extracting structured constraints from user query using Gemini...
Retrieval done. Found 20 candidates via semantic search.
After Duration Filter: 11 remaining.

✅ Final submission CSV created successfully!
You must submit 'SHL_Submission_Data.csv' for evaluation.


In [None]:
query = '''KEY RESPONSIBITILES:

Manage the sound-scape of the station through appropriate creative and marketing interventions to Increase or Maintain the listenership
Acts as an interface between Programming & sales team, thereby supporting the sales team by providing creative inputs in order to increase the overall ad spends by clients
Build brand Mirchi by ideating fresh programming initiatives on air campaigns, programming led on-ground events & new properties to ensure brand differentiation & thus increase brand recall at station level
Invest time in local RJs to grow & develop them as local celebrities
Through strong networking, must focus on identifying the best of local talent and ensure to bring the creative minds from the market on board with Mirchi
Build radio as a category for both listeners & advertisers
People Management
Identifying the right talent and investing time in developing them by frequent feedback on their performance
Monitor, Coach and mentor team members on a regular basis
Development of Jocks as per guidelines
Must have an eye to spot the local talent to fill up vacancies locally




TECHNICAL SKILLS & QUALIFICATION REQUIRED:

Graduation / Post Graduation (Any specialisation) with 8 -12 years of relevant experience
Experience in digital content conceptualisation
Strong branding focus
Must be well-read in variety of areas and must keep up with the latest events in the city / cluster / country
Must know to read, write & speak English


PERSONAL ATTRIBUTES:

Excellent communication skills
Good interpersonal skills
People management


Suggest me some tests for the above jd. The duration should be at most 90 mins'''

In [None]:
# # --- 5. HYBRID RECOMMENDATION ENGINE --- extracted content

# def recommend_assessments(query: str, df: pd.DataFrame, embeddings: np.ndarray, top_k_retrieval: int = 20, final_k: int = 10):
#     """
#     Performs hybrid recommendation: Vector Search (Retrieval) + LLM Constraint Filtering (Re-ranking).
#     """
#     if embeddings is None or len(embeddings) == 0:
#         print("Error: Embeddings not available.")
#         return []

#     # 1. LLM Constraint Extraction
#     constraints = extract_constraints_with_gemini(query)
#     print(f"\nExtracted Constraints: Max Duration={constraints.test_duration} min, Skills={constraints.description}")
# # -------------------------------------- LLM output for search
#     model_desc = join_description(constraints)
#     # 2. Vector Search (Retrieval)
#     query_embedding_result = client.models.embed_content(
#         model=EMBEDDING_MODEL,
#         contents=[model_desc],
#         config=types.EmbedContentConfig(task_type="RETRIEVAL_QUERY")
#     )

#     # ✅ FIX: Access the embedding attribute properly
#     query_embedding = np.array(query_embedding_result.embeddings[0].values)

#     # Compute cosine similarity
#     similarities = cosine_similarity([query_embedding], embeddings)[0]

#     # Top-k retrieval
#     top_indices = np.argsort(similarities)[::-1][:top_k_retrieval]
#     candidates_df = df.iloc[top_indices].copy()
#     candidates_df['similarity_score'] = similarities[top_indices]

#     print(f"Retrieval done. Found {len(candidates_df)} candidates via semantic search.")

#     # 3. Constraint Filtering (Re-ranking)
#     filtered_df = candidates_df.copy()

#     # Filter 1: Max Duration
#     if constraints.max_duration_minutes is not None:
#         filtered_df = filtered_df[filtered_df['test_duration'] <= constraints.test_duration]
#         print(f"After Duration Filter: {len(filtered_df)} remaining.")

#     # Sort by similarity and select final
#     filtered_df = filtered_df.sort_values(by='similarity_score', ascending=False)
#     final_recommendations = filtered_df.head(final_k)

#     # Format output
#     results = []
#     for _, row in final_recommendations.iterrows():
#         results.append({
#             'url': row.get('url', ''),
#             'name': row.get('name', ''),
#             'adaptive_support': row.get('adaptive_support', ''),
#             'description': row.get('description', ''),
#             'duration': row.get('duration_minutes', ''),
#             'remote support': row.get('remote_support', ''),
#             'test type': row.get('test_type', '')
#         })

#     return {"recommended assessments": results}