In [44]:
import os
import json
import chromadb
from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction

from dotenv import load_dotenv, find_dotenv

In [45]:
# Locate a .env file and load its variables into the process environment.
# The keys read later through os.environ / os.getenv (OPENAI_API_KEY,
# GROQ_API_KEY, LANGCHAIN_API_KEY) presumably come from that file - TODO confirm.
load_dotenv(find_dotenv())

True

In [3]:
# Open the on-disk Chroma database. The path is relative, so it resolves
# against the working directory the notebook kernel was started in.
client = chromadb.PersistentClient(path="./vector_database2")

In [4]:
# Fetch the profile collection by name and attach the OpenAI embedding function
# that will embed the text queries issued below.
# os.environ[...] raises KeyError if OPENAI_API_KEY is not set.
# NOTE(review): the embedding function presumably has to match the one used when
# the collection was populated - confirm against the indexing notebook.
collection = client.get_collection(
    name="linkedin_people_profiles_4",
    embedding_function=OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"])
)

In [5]:
# Unfiltered semantic search used as a first sanity check.
# n_results is not passed, so the library default applies (the output below
# lists 10 ids). Only documents and metadatas are requested, which is why
# 'distances' comes back as None in the result.
collection.query(
    query_texts="People with experience of 5 years",
    include=["documents", "metadatas"]
)

{'ids': [['ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
   'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
   'ACoAABuJAnYBOub-HVz9rxZx0YCscxv0KDx7be0',
   'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
   'ACoAABIURlEB0IaIlQL8yLwChwLnQFDTIiBtFbY',
   'ACoAACzqAZ8BNYEJjGPpk_l6DlwoAzZJUXVYd0Y',
   'ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
   'ACoAABBPpTkBE83UntVzca-RXL5ggcsfztv9RRs',
   'ACoAACbYB28BGp07ec7sQFVsqdcNnzxVsBhwScc',
   'ACoAACE9Q2MBe_i_ol-tkC8L5lAv12dRAZez-n0']],
 'distances': None,
 'metadatas': [[{'Agile Methodologies': 'Agile Methodologies',
    'Artificial Intelligence (AI)': 'Artificial Intelligence (AI)',
    'Artificial Neural Networks': 'Artificial Neural Networks',
    'Character User Interface,Progress': 'Character User Interface,Progress',
    'Computer Vision': 'Computer Vision',
    'Core Java': 'Core Java',
    'Data Analysis': 'Data Analysis',
    'Data Science': 'Data Science',
    'Deep Learning': 'Deep Learning',
    'Linux': 'Linux',
    'Machine Learning': 'Machine

In [93]:
# Same free-text query, now limited to profiles whose metadata contains at least
# one of the listed skills. In this collection a skill is stored as a metadata
# key whose value repeats the skill name, hence the {skill: skill} filters.
retrieved_documents = collection.query(
    query_texts="People with experience of 5 years",
    n_results=7,
    include=["documents"],
    where={
        "$or": [
            {skill: skill}
            for skill in (
                "Data Science",
                "Machine Learning",
                "Python",
                "Statistical Techniques",
                "Software Engineering",
            )
        ]
    },
)["documents"]

In [95]:
# Results are grouped per query text; index 0 is the list of documents for the
# single query string passed above.
retrieved_documents[0]

['{"summary": "Experience in Cognitive Computing with leverage of Machine Learning, Deep Learning, Artificial Intelligence, Natural Language Processing skills . Data driven analyst with the ability to apply ML/DL/NLP/Computer Vision Techniques and leverage algorithms to solve real world problems. Established ability in deploying effective predictive/NLP/Computer Vision models across banking/HealthCare industries to accurately forecast and deliver proven results. Strong understanding of Oracle DB and PLSQL with data migration exp\\u00e9rience. Domain Knowledge of SupplyChain/finance/banking/Retail domain with communication skills, excellent product and process design knowledge. Domain HealthCare,Mortgage Banking, Consumer, Commercial Lending, Escrow, Loan processing and servicing, Deposits. Onsite experience (US/UK) with business domain experience in Retail and supply chain with special focus on Customer engagement and managing end-user expectations. Experience in Analysis, Design, Deve

# Some Other Retrieval Techniques

### 1. Normal Technique

In [65]:
# Setting up Langsmith
#
# The API key is only read here, never written back: assigning
# os.getenv("LANGCHAIN_API_KEY") to os.environ is a no-op when the key exists
# and raises "TypeError: str expected, not NoneType" when it does not.
if os.getenv("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_TRACING_V2"] = "true"
    os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
else:
    # Without a key, traces cannot be uploaded; switch tracing off explicitly so
    # the rest of the notebook still runs instead of failing in this cell.
    os.environ["LANGCHAIN_TRACING_V2"] = "false"
    print("LANGCHAIN_API_KEY is not set; LangSmith tracing is disabled.")

In [66]:
# Sample job posting (Netflix, Data Scientist - India) used as input for the
# retrieval and ranking experiments below. The text is pasted as-is from the
# posting, including its typos and line breaks; it is runtime data, so do not
# "clean it up" without re-running the cells that depend on it.
JOB_DESCRIPTION = """
Save Data Scientist - India  at Netflix
About the job
Netflix is one of the world leading entertainment services with 278 million paid memberships in over 190 countries enjoying TV series, films and games across a wide variety of genres and languages. Members can play, pause and resume watching as much as they want, anytime, anywhere, and can change their plans at any time.

The Role

Netflix is one of the world's leading entertainment services, with 270 million paid memberships in over 190 countries enjoying TV series, films and games across a wide variety of genres and languages. Members can play, pause and resume watching as much as the want, anytime, anywhere, and can change their plans at any time. Our mission to entertain the world is anchored in our content, and data is a crucial component in shaping our comprehensive content strategy. We focus on creating analytical products that support our content partners in their complex and nuanced decision-making processes. We are a highly collaborative team that partners across Netflix to drive impact. We are seeking a talented Senior Data Scientist to provide key insights to our content decision makers in India. You will generate insights by scoping and executing deep dive analysis with local partner analytics teams, Consumer Insights and Finance & Strategy. In success, you will collaborate on existing priorities but also will propose and execute on new opportunities. This role features ample opportunity for project ownership and will support impactful decision-making. In this role you will

Be a strategic thought partner with business stakeholders to define high impact analytical problems and innovative ways to solve them with data.
Develop statistical models explaining viewership, content engagement, and other key behavioral patterns
Build dashboards and visualization that enables stakeholders to self-serve metrics and trends effectively. 
Actively socialize and educate, aiding on interpretation
Translate analytic insights into actionable recommendations for business or content improvement, and communicate these findings clearly to a broad audienceIdentify and proactively socialize regional insights, including those that may generalize to opportunities in other markets

What you will bring

Exceptional interpersonal and communication skills to influence stakeholders using clear insights and recommendations
Exceptional thought partnership to build credibility and relationships with stakeholders
Strong statistical knowledge: understanding of predictive modeling; ability to tease out incrementality vs. correlations, confounder identification and amelioration, etc.
Basic understanding of experimentation including power calculations and interpretation of results
Strong SQL skills and experience with distributed analytic processing technologies (S3, Presto, Hive & Spark)
Strong skills in Python or R
Highly effective in engaging with diverse stakeholders and adept at cultivating strong partnerships. P
assionate about communicating difficult concepts to non-technical audiencesSelf-starter who thrives under a high level of autonomy.
Exceptional interpersonal and communication skills.
Enthusiastic about Netflix culture

We are an equal opportunity employer and celebrate diversity, recognizing that diversity of thought and background builds stronger teams. We approach diversity and inclusion seriously and thoughtfully. We do not discriminate on the basis of race, ethnicity, religion, color, place of birth, sex, gender identity or expression, sexual orientation, age, marital status, military service status or disability status.
"""

In [67]:
from langchain_core.prompts.chat import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_groq import ChatGroq

In [117]:
def get_candidates_profiles(
    relevant_docs: list[str],
    job_description: str,
    model_name: str = "llama-3.1-70b-versatile",
) -> str:
    """Ask a Groq-hosted chat model to shortlist three candidates for a job.

    Args:
        relevant_docs: Candidate profile documents, one string per profile
            (for example the inner list of ``collection.query(...)["documents"]``).
        job_description: Full text of the job posting.
        model_name: Groq model identifier. Defaults to the model this notebook
            was originally run with.

    Returns:
        The model's reply as plain text.

    Raises:
        ValueError: If ``relevant_docs`` is empty; the prompt forbids answers
            that do not come from the supplied profiles, so there is nothing
            meaningful to ask the model.
        KeyError: If ``GROQ_API_KEY`` is not set in the environment.
    """
    # A bare string would otherwise be joined character by character below.
    if isinstance(relevant_docs, str):
        relevant_docs = [relevant_docs]
    if not relevant_docs:
        raise ValueError("relevant_docs must contain at least one candidate profile")

    prompt_messages = [
        (
            "system",
            "Act as a HR of a company who is looking for some candidates to fill a job position",
        ),
        (
            "ai",
            """
            You will be given a job description and some candidates profiles. You first need to analyse the\
            job description step-by-step. After that analyse each job profile. Once all these done\
            then you need to compare each job profile with the job description and then report out 3 potential\
            candidate which is most suitable for the job.\
            
            ###
            The output should contain the candidate name, his years of experience, his job skills and his linkedin profile url, his present company (NO PREAMBLE)
            ###
            
            ***YOUR ANASWER SHOULD COMES FROM THE GIVEN JOB PROFILES ONLY AND NOTHING ELSE.***
            """,
        ),
        (
            "human",
            (
                "Job Description: {job_description} \n\n Candidates Profiles: {candidate_profiles}"
            ),
        ),
    ]

    job_search_prompt = ChatPromptTemplate.from_messages(prompt_messages)

    llama_model = ChatGroq(api_key=os.environ["GROQ_API_KEY"], model=model_name)

    # Creating Chain for retrieval
    chain = job_search_prompt | llama_model | StrOutputParser()

    # Render the profiles as one text block. Passing the list itself would put
    # its Python repr (brackets, quotes, doubled backslashes) into the prompt.
    candidate_profiles = "\n\n".join(str(doc) for doc in relevant_docs)

    return chain.invoke(
        {"candidate_profiles": candidate_profiles, "job_description": job_description}
    )


In [119]:
# retrieved_documents holds one list of documents per query text; [0] is the
# list for the single query that was issued.
# NOTE(review): two cells in this notebook assign `retrieved_documents` (the
# hard-coded skill filter and the keyword-based filter), so which result set is
# ranked here depends on the order the cells were executed in - confirm.
result = get_candidates_profiles(
    relevant_docs=retrieved_documents[0],
    job_description=JOB_DESCRIPTION
)

In [120]:
# print() so the reply's line breaks are rendered rather than shown as an
# escaped string repr.
print(result)

After analyzing the job description and the candidates' profiles, I've identified the top three candidates who are most suitable for the Senior Data Scientist position at Netflix India. Here are their profiles:

**Candidate 1: Sakshi Narwani**

* Years of experience: 4+
* Job skills: Data Science, Machine Learning, Python, R, SQL, Predictive Modeling, Data Visualization
* LinkedIn profile: https://www.linkedin.com/in/sakshinarwani/
* Current company: EXL
* Education: PG Diploma in Data Science from International Institute of Information Technology Bangalore

**Candidate 2: Sumeet Kharbanda**

* Years of experience: 7+
* Job skills: Data Science, Machine Learning, Python, SQL, Predictive Modeling, Natural Language Processing
* LinkedIn profile: https://www.linkedin.com/in/sumeet-data-scientist/
* Current company: EXL
* Education: PG Diploma in Data Science from International Institute of Information Technology Bangalore

**Candidate 3: Rakesh Nain**

* Years of experience: 4+
* Job skil

In [107]:
def extract_keywords(
    job_description: str, model_name: str = "llama-3.1-70b-versatile"
) -> str:
    """Ask a Groq-hosted chat model for the top five keywords of a job posting.

    Args:
        job_description: Full text of the job posting.
        model_name: Groq model identifier. Defaults to the model used by
            ``get_candidates_profiles`` in this notebook.

    Returns:
        The model's raw reply. The prompt asks for a comma-separated list of
        five keywords, but the format is not validated here.

    Raises:
        KeyError: If ``GROQ_API_KEY`` is not set in the environment.
    """
    simple_prompt = """
    You will be given a job description. You need to analyse the job description completely first step by step
    and after that you need to extract  TOP 5 IMPORTANT keywords which are essential for this job description.

    <Output Format>
    Print the final output in comma separated list (NO PREAMBLE). NOTHING ELSE NEEDED IN THE OUTPUT.
    </Output Format>

    *** YOUR OUTPUT SHOULD COME ONLY FROM THE PROVIDED DATA AND NOTHING ELSE. DO NOT ASSUME ANYTHING FROM YOUR SIDE."

    Job Description: {job_description}
    """

    promt = ChatPromptTemplate.from_template(simple_prompt)

    # Build the model locally. The notebook never defines `llama_model` at
    # module level (it only exists as a local inside get_candidates_profiles),
    # so relying on a global raised NameError on a fresh kernel.
    llama_model = ChatGroq(api_key=os.environ["GROQ_API_KEY"], model=model_name)

    chain = promt | llama_model | StrOutputParser()

    # Name the template variable explicitly instead of relying on the
    # single-variable shortcut for non-dict inputs.
    return chain.invoke({"job_description": job_description})

In [108]:
# Ask the LLM for the most important keywords of the posting; the reply is a
# single string that the prompt requests to be comma-separated.
keywords = extract_keywords(JOB_DESCRIPTION)

In [109]:
# Inspect the raw reply before it is parsed into a list.
print(keywords)

Data Scientist, Statistical Modeling, Python, Data Analysis, Machine Learning


In [110]:
# Split on bare commas and strip every piece, so parsing does not depend on the
# model emitting exactly ", " between items or on the reply having no trailing
# newline. Empty fragments (e.g. from a trailing comma) are dropped. The pieces
# are used as metadata keys later, so stray whitespace would break the filter.
keywords_list = [kw.strip() for kw in keywords.split(",") if kw.strip()]

keywords_list

['Data Scientist',
 'Statistical Modeling',
 'Python',
 'Data Analysis',
 'Machine Learning']

In [111]:
# Build one {keyword: keyword} metadata filter per extracted keyword instead of
# indexing positions 0..4, which raised IndexError whenever the LLM returned
# fewer than five keywords and silently ignored any extras.
keyword_filters = [{keyword: keyword} for keyword in keywords_list]

# Chroma requires at least two expressions inside "$or": use the single filter
# directly when there is only one keyword, and no filter when there are none.
if len(keyword_filters) > 1:
    metadata_filter = {"$or": keyword_filters}
elif keyword_filters:
    metadata_filter = keyword_filters[0]
else:
    metadata_filter = None

retrieved_documents = collection.query(
    query_texts=JOB_DESCRIPTION,
    n_results=7,
    include=["documents"],
    where=metadata_filter,
)["documents"]

In [112]:
# Display the keyword-filtered results (a list containing one list of document
# strings for the single query text).
retrieved_documents

[['{"summary": "As a Data Science professional holding a Post Graduate Diploma, I am deeply committed to the process of extracting actionable insights from complex datasets. My expertise spans a broad spectrum of data-related domains, encompassing advanced data analytics and machine learning techniques. I have honed my proficiency in utilizing an extensive array of cutting-edge tools and technologies, ensuring that I possess the versatility required to excel in diverse data-driven environments. My dedication to harnessing the power of data for informed decision-making is a driving force behind my professional endeavors.", "industryName": "Information Technology & Services", "lastName": "Narwani", "locationName": "India", "firstName": "Sakshi", "headline": "Data Scientist", "img_641_641": "800_800/profile-displayphoto-shrink_800_800/0/1694867340481?e=1731542400&v=beta&t=CjqB-VZoSUqQEhpxAXAAl60m-ssqzvotpYlrElmXu0c", "public_id": "sakshinarwani", "experience": [{"locationName": "Gurugram,