In [1]:
import json
import os
from collections import defaultdict

import chromadb
import pandas as pd
from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction
from dotenv import load_dotenv, find_dotenv
from langchain_core.documents import Document
from linkedin_api import Linkedin

In [2]:
# Load the .env file located by find_dotenv(). Later cells read the LinkedIn
# and OpenAI credentials from os.environ; presumably they are defined there.
load_dotenv(find_dotenv())

True

# Loading Data

In [3]:
# Authenticate against LinkedIn. Any LinkedIn account's credentials will do;
# they are read from the LINKEDIN_USERNAME / LINKEDIN_PASSWORD environment
# variables, so a missing variable raises KeyError right here.
api = Linkedin(
    username=os.environ["LINKEDIN_USERNAME"],
    password=os.environ["LINKEDIN_PASSWORD"]
)

In [4]:
# Keyword handed to the LinkedIn people search.
SEARCH_KEYWORD = "Data Scientist"

In [5]:
# Run a keyword search for people (capped at 20 results) and load the returned
# records into a DataFrame for inspection.
all_peoples = api.search_people(keywords=SEARCH_KEYWORD, limit=20)
all_peoples_df = pd.DataFrame(data=all_peoples)

# Bare last expression so the notebook renders the frame.
all_peoples_df

Unnamed: 0,urn_id,distance,jobtitle,location,name
0,ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4,DISTANCE_2,Data Scientist,Gurugram,Vidur Srivastava
1,ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk,DISTANCE_2,Data Scientist IBM,Noida,Isha Aneja
2,ACoAACzdEGcBQjWaqlIKydeYpk-9n7NpLDpXbdY,DISTANCE_2,Data Scientist,Gurugram,Sakshi Narwani
3,ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY,DISTANCE_2,Data Scientist,Noida,Tushar Mathur
4,ACoAACMdmgABEKidiIRTUeY4ZC_etvTvOPPAWRA,DISTANCE_2,Data Scientist at TCS,New Delhi,Shakshi Neha
5,ACoAACqnIWQBi3JKxfm8FVwW8SfjZj6U2YmfUms,DISTANCE_2,Data Scientist at KnowDis Data Science,India,Deepak Kumar
6,ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY,DISTANCE_2,Data Scientist @ R Systems | Master's in Data ...,Gurugram,Rakesh Nain
7,ACoAAA1k1eUB0Ylj--mqKdQ67o6wnie7-A5afjY,DISTANCE_2,Data Scientist / AI Consultant @ IBM Consultin...,Greater Delhi Area,Rakshit Singh
8,ACoAACzqAZ8BNYEJjGPpk_l6DlwoAzZJUXVYd0Y,DISTANCE_2,Data Scientist,Gurgaon,Riya Bhatnagar
9,ACoAACbYB28BGp07ec7sQFVsqdcNnzxVsBhwScc,DISTANCE_2,Data Scientist,Gurugram,Navdeep Singh


In [7]:
# Collect the URN id of every person returned by the search; these ids drive
# the per-profile lookups that follow.
# A direct column lookup replaces the row-by-row iterrows() loop. The membership
# guard keeps an empty search result (a frame with no "urn_id" column) yielding
# an empty list instead of raising KeyError.
people_urn_ids = (
    all_peoples_df["urn_id"].tolist() if "urn_id" in all_peoples_df.columns else []
)

people_urn_ids

['ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
 'ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
 'ACoAACzdEGcBQjWaqlIKydeYpk-9n7NpLDpXbdY',
 'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
 'ACoAACMdmgABEKidiIRTUeY4ZC_etvTvOPPAWRA',
 'ACoAACqnIWQBi3JKxfm8FVwW8SfjZj6U2YmfUms',
 'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
 'ACoAAA1k1eUB0Ylj--mqKdQ67o6wnie7-A5afjY',
 'ACoAACzqAZ8BNYEJjGPpk_l6DlwoAzZJUXVYd0Y',
 'ACoAACbYB28BGp07ec7sQFVsqdcNnzxVsBhwScc',
 'ACoAABuJAnYBOub-HVz9rxZx0YCscxv0KDx7be0',
 'ACoAACUNkNwBsZws6K44VX40ECBT3woAaJttIhQ',
 'ACoAACE9Q2MBe_i_ol-tkC8L5lAv12dRAZez-n0',
 'ACoAABJ97q0BFcxSTVQ9_y1dgDAXNMpNiUtF93k',
 'ACoAABBPpTkBE83UntVzca-RXL5ggcsfztv9RRs',
 'ACoAACA43R8Buv3_11zWBpYvQ0u3_u_S89F3Clc',
 'ACoAACuf0GEBxqLlm79WsFe0eagVOHzxvQhltaA',
 'ACoAACu4NbUBH95IDJX8ynU_hDNCiu4uWC3F4qI',
 'ACoAAA_8N7UBcb426isJoolyZrF0Lgh2oyOZbcc',
 'ACoAABIURlEB0IaIlQL8yLwChwLnQFDTIiBtFbY']

In [8]:
# Fetch the full profile and the skill list for every URN id from the search.
# Each URN id maps to a list of {"profile_details": ..., "skills": ...} records.
peoples_profiles: dict[str, list] = defaultdict(list)

# URN ids whose lookup raised KeyError, mapped to the error text, so that
# dropped profiles stay visible instead of disappearing silently.
skipped_urn_ids: dict[str, str] = {}

for urn_id in people_urn_ids:
    try:
        profile_details = api.get_profile(urn_id=urn_id)
        skills = api.get_profile_skills(urn_id=urn_id)
    except KeyError as error:
        # Best-effort: a profile that cannot be fetched is skipped, not fatal.
        skipped_urn_ids[urn_id] = f"{type(error).__name__}: {error}"
        continue
    peoples_profiles[urn_id].append(
        {
            "profile_details": profile_details,
            "skills": skills,
        }
    )

if skipped_urn_ids:
    print(f"Skipped {len(skipped_urn_ids)} profile(s) that raised KeyError: {skipped_urn_ids}")

peoples_profiles

defaultdict(list,
            {'ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4': [{'profile_details': {'summary': 'Experienced data scientist with a track record of translating complex data into actionable insights across diverse sectors spanning FMCG, Sports and Supply Chain.',
                'industryName': 'Information Technology & Services',
                'lastName': 'Srivastava',
                'locationName': 'India',
                'student': False,
                'geoCountryName': 'India',
                'geoCountryUrn': 'urn:li:fs_geo:102713980',
                'geoLocationBackfilled': False,
                'elt': False,
                'industryUrn': 'urn:li:fs_industry:96',
                'firstName': 'Vidur',
                'entityUrn': 'urn:li:fs_profile:ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
                'geoLocation': {'geoUrn': 'urn:li:fs_geo:115884833'},
                'geoLocationName': 'Gurugram, Haryana',
                'location': {'basicLocation': {'co

In [9]:
# Strip identifiers, URNs, picture fields and other unwanted keys from every
# fetched profile. All edits happen in place on `peoples_profiles`.

# Top-level keys removed from each profile dict.
PROFILE_KEYS_TO_DROP = (
    "student",
    "geoCountryUrn",
    "geoLocationBackfilled",
    "entityUrn",
    "geoCountryName",
    "elt",
    "profilePictureOriginalImage",
    "industryUrn",
    "profilePicture",
    "geoLocation",
    "geoLocationName",
    "location",
    "backgroundPicture",
    "backgroundPictureOriginalImage",
    "displayPictureUrl",
    "img_400_400",
    "img_200_200",
    "img_800_800",
    "img_100_100",
    "img_767_767",
    "profile_id",
    "profile_urn",
    "member_urn",
    "volunteer",
    "honors",
    # Dropped with a plain key lookup. Testing membership in the *value* of
    # "urn_id" raises KeyError when the key is absent and never matches otherwise.
    "urn_id",
)

# Keys removed from every item of the named list-valued profile section.
NESTED_KEYS_TO_DROP = {
    "experience": ("entityUrn", "geoUrn", "region", "companyUrn", "companyLogoUrl"),
    "certifications": ("company", "displaySource", "companyUrn", "url"),
    "projects": ("members",),
}

# Keys removed from each education entry AND from its nested "school" dict, so
# the school-level copies of these fields are cleaned as well.
EDUCATION_KEYS_TO_DROP = ("objectUrn", "entityUrn", "trackingId", "logoUrl", "schoolUrn")


def _drop_keys(record: dict, keys) -> None:
    """Delete each key in `keys` from `record` in place; absent keys are ignored."""
    for key in keys:
        record.pop(key, None)


def _clean_profile(profile: dict) -> None:
    """Remove the unwanted top-level and nested fields from one profile dict, in place."""
    _drop_keys(profile, PROFILE_KEYS_TO_DROP)

    for section, keys in NESTED_KEYS_TO_DROP.items():
        # `or []` tolerates a section that is missing or explicitly None.
        for item in profile.get(section) or []:
            _drop_keys(item, keys)

    for education in profile.get("education") or []:
        _drop_keys(education, EDUCATION_KEYS_TO_DROP)
        school = education.get("school")
        if isinstance(school, dict):
            _drop_keys(school, EDUCATION_KEYS_TO_DROP)


for profile_records in peoples_profiles.values():
    for record in profile_records:
        for section_value in record.values():
            # Only dict-valued entries are cleaned; other values (for example
            # the list stored under "skills") are left untouched.
            if isinstance(section_value, dict):
                _clean_profile(section_value)

In [10]:
# Inspect `profile_details` as left behind by the last iteration of the fetch
# loop. The clean-up cell mutates the profile dicts in place, so (assuming this
# profile was stored in `peoples_profiles`) the cleaned version is shown here.
profile_details

{'summary': 'Experienced Data Scientist with a demonstrated history of working in the computer software industry. Skilled in Computer Vision, Power BI, Image Processing, Text Extraction using Python Programming. Strong engineering professional with a PG Diploma focused in Data Science from Great Lakes Institute of Management, Gurgaon. ',
 'industryName': 'Computer Software',
 'lastName': 'Khandelwal',
 'locationName': 'India',
 'firstName': 'Sushant',
 'headline': 'Data Scientist | AI | ML | Python | SQL |  Automation | Analytics | Azure',
 'public_id': 'sushant-khandelwal-7296b785',
 'experience': [{'locationName': 'Gurugram, Haryana, India',
   'geoLocationName': 'Gurugram, Haryana, India',
   'companyName': 'Optum',
   'timePeriod': {'startDate': {'month': 8, 'year': 2022}},
   'company': {'employeeCountRange': {'start': 10001},
    'industries': ['Hospital & Health Care']},
   'title': 'Data Scientist'},
  {'locationName': 'Gurugram, Haryana, India',
   'geoLocationName': 'Gurugram

In [11]:
# Build one LangChain Document per stored profile record, plus a parallel list
# of ids. Nothing is written to ChromaDB here; that happens in a later cell.
#   - page_content: the profile details serialised as JSON (any "skills" key removed)
#   - metadata: one {skill name: skill name} entry per skill that has a "name"
all_documents = []
ids = []
for urn_id, profile_info in peoples_profiles.items():
    for info in profile_info:
        documents_without_skills = {
            field: value
            for field, value in info.get("profile_details", {}).items()
            if field != "skills"
        }
        metadatas = {
            skill["name"]: skill["name"]
            for skill in info["skills"]
            if "name" in skill
        }
        # The id and its document are appended together, inside the same loop,
        # so the two lists always stay index-aligned for the later zip().
        ids.append(urn_id)
        all_documents.append(
            Document(page_content=json.dumps(documents_without_skills), metadata=metadatas)
        )

In [14]:
# Spot-check the first document by decoding its JSON page content back into a dict.
json.loads(all_documents[0].page_content)

{'summary': 'Experienced data scientist with a track record of translating complex data into actionable insights across diverse sectors spanning FMCG, Sports and Supply Chain.',
 'industryName': 'Information Technology & Services',
 'lastName': 'Srivastava',
 'locationName': 'India',
 'firstName': 'Vidur',
 'headline': 'Data Scientist',
 'public_id': 'vidur-srivastava-a45b85143',
 'experience': [{'locationName': 'Gurugram, Haryana, India',
   'geoLocationName': 'Gurugram, Haryana, India',
   'companyName': 'Tredence Inc.',
   'timePeriod': {'startDate': {'month': 7, 'year': 2024}},
   'company': {'employeeCountRange': {'start': 1001, 'end': 5000},
    'industries': ['Management Consulting']},
   'title': 'Data Scientist'},
  {'locationName': 'Bengaluru, Karnataka, India',
   'geoLocationName': 'Bengaluru, Karnataka, India',
   'companyName': 'Tiger Analytics',
   'timePeriod': {'endDate': {'month': 5, 'year': 2024},
    'startDate': {'month': 5, 'year': 2022}},
   'company': {'employee

In [15]:
# Open a persistent Chroma client whose storage path is ./vector_database2
# (relative to the notebook's working directory).
client = chromadb.PersistentClient(path="./vector_database2")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [16]:
# Open the collection that holds the profile documents, embedding them with
# OpenAI. get_or_create_collection is used rather than create_collection so that
# re-running this cell against the persisted database reuses the existing
# collection instead of failing because the name is already taken.
collection = client.get_or_create_collection(
    name="linkedin_people_profiles_4",
    embedding_function=OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"]),
)

In [17]:
# Write the documents to the collection one at a time, so a single record that
# cannot be embedded or stored does not block the rest. Failures are recorded
# and reported rather than swallowed, which keeps the final collection count
# explainable.
failed_ids: dict[str, str] = {}

for doc_id, doc in zip(ids, all_documents):
    try:
        collection.add(
            ids=doc_id,
            documents=doc.page_content,
            metadatas=doc.metadata,
        )
    except Exception as error:
        # Best-effort: remember what went wrong and carry on with the next one.
        failed_ids[doc_id] = f"{type(error).__name__}: {error}"

if failed_ids:
    print(f"{len(failed_ids)} of {len(ids)} document(s) could not be added:")
    for doc_id, reason in failed_ids.items():
        print(f"  {doc_id}: {reason}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 400 Bad Request"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/

In [18]:
# Look the stored collection up again by name, attaching an OpenAI embedding
# function built from the same environment variable used when it was created.
linkedin_collection = client.get_collection(
    embedding_function=OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"]),
    name="linkedin_people_profiles_4",
)

In [19]:
# Ask the collection how many records it currently holds.
linkedin_collection.count()

19

In [20]:
# Semantic query restricted by skill. Each document's metadata was written as
# {skill name: skill name}, so an equality test on a skill's own name matches
# documents carrying that skill; "$or" combines the five alternatives.
# Only the "documents" field of the query result is kept.
retrieved_documents = linkedin_collection.query(
    query_texts="People with experience of 5 years",
    n_results=5,
    include=["documents"],
    where={
        "$or": [
            {skill: skill}
            for skill in (
                "Data Science",
                "Machine Learning",
                "Python",
                "Statistical Techniques",
                "Software Engineering",
            )
        ]
    },
)["documents"]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [21]:
# Display the documents returned by the query.
retrieved_documents

[['{"summary": "Experience in Cognitive Computing with leverage of Machine Learning, Deep Learning, Artificial Intelligence, Natural Language Processing skills . Data driven analyst with the ability to apply ML/DL/NLP/Computer Vision Techniques and leverage algorithms to solve real world problems. Established ability in deploying effective predictive/NLP/Computer Vision models across banking/HealthCare industries to accurately forecast and deliver proven results. Strong understanding of Oracle DB and PLSQL with data migration exp\\u00e9rience. Domain Knowledge of SupplyChain/finance/banking/Retail domain with communication skills, excellent product and process design knowledge. Domain HealthCare,Mortgage Banking, Consumer, Commercial Lending, Escrow, Loan processing and servicing, Deposits. Onsite experience (US/UK) with business domain experience in Retail and supply chain with special focus on Customer engagement and managing end-user expectations. Experience in Analysis, Design, Dev