In [62]:
import json
import os
from collections import defaultdict

import chromadb
import pandas as pd
from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction
from dotenv import load_dotenv, find_dotenv
from langchain_core.documents import Document
from linkedin_api import Linkedin

In [2]:
# Load variables from the .env file located by find_dotenv(); later cells read
# the LinkedIn credentials and the OpenAI key through os.environ.
load_dotenv(find_dotenv())

True

In [3]:
# Authenticate using any LinkedIn account credentials.
# Both values are read from the environment; os.environ[...] raises KeyError
# when either variable is unset.
api = Linkedin(
    username=os.environ["LINKEDIN_USERNAME"],
    password=os.environ["LINKEDIN_PASSWORD"]
)

In [4]:
# Keyword used for the LinkedIn people search.
SEARCH_KEYWORD = "Data Scientist"

In [5]:
# Search LinkedIn for people matching SEARCH_KEYWORD; limit=20 caps the number
# of results requested.
all_peoples = api.search_people(
    keywords=SEARCH_KEYWORD,
    limit=20
)

# Display the raw search results.
all_peoples

[{'urn_id': 'ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist',
  'location': 'Gurugram',
  'name': 'Vidur Srivastava'},
 {'urn_id': 'ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist IBM',
  'location': 'Noida',
  'name': 'Isha Aneja'},
 {'urn_id': 'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist',
  'location': 'Noida',
  'name': 'Tushar Mathur'},
 {'urn_id': 'ACoAACMdmgABEKidiIRTUeY4ZC_etvTvOPPAWRA',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist at TCS',
  'location': 'New Delhi',
  'name': 'Shakshi Neha'},
 {'urn_id': 'ACoAACqnIWQBi3JKxfm8FVwW8SfjZj6U2YmfUms',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist at KnowDis Data Science',
  'location': 'India',
  'name': 'Deepak Kumar'},
 {'urn_id': 'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
  'distance': 'DISTANCE_2',
  'jobtitle': "Data Scientist @ R Systems | Mas

In [6]:
# Tabulate the search results (one row per person) for easier inspection.
all_peoples_df = pd.DataFrame(all_peoples)

all_peoples_df

Unnamed: 0,urn_id,distance,jobtitle,location,name
0,ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4,DISTANCE_2,Data Scientist,Gurugram,Vidur Srivastava
1,ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk,DISTANCE_2,Data Scientist IBM,Noida,Isha Aneja
2,ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY,DISTANCE_2,Data Scientist,Noida,Tushar Mathur
3,ACoAACMdmgABEKidiIRTUeY4ZC_etvTvOPPAWRA,DISTANCE_2,Data Scientist at TCS,New Delhi,Shakshi Neha
4,ACoAACqnIWQBi3JKxfm8FVwW8SfjZj6U2YmfUms,DISTANCE_2,Data Scientist at KnowDis Data Science,India,Deepak Kumar
5,ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY,DISTANCE_2,Data Scientist @ R Systems | Master's in Data ...,Gurugram,Rakesh Nain
6,ACoAAA1k1eUB0Ylj--mqKdQ67o6wnie7-A5afjY,DISTANCE_2,Data Scientist / AI Consultant @ IBM Consultin...,Greater Delhi Area,Rakshit Singh
7,ACoAACzqAZ8BNYEJjGPpk_l6DlwoAzZJUXVYd0Y,DISTANCE_2,Data Scientist,Gurgaon,Riya Bhatnagar
8,ACoAABuJAnYBOub-HVz9rxZx0YCscxv0KDx7be0,DISTANCE_2,Looking for Data Science Opportunities | Manag...,Gurgaon,Sumeet Kharbanda
9,ACoAACbYB28BGp07ec7sQFVsqdcNnzxVsBhwScc,DISTANCE_2,Data Scientist,Gurugram,Navdeep Singh


In [7]:
# Collect the URN id of every person found by the search, in result order.
# Reading the column directly replaces the row-by-row iterrows() loop; the
# membership check keeps the result an empty list when the search returned
# nothing (an empty frame has no "urn_id" column).
people_urn_ids = (
    all_peoples_df["urn_id"].tolist() if "urn_id" in all_peoples_df else []
)

people_urn_ids

['ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
 'ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
 'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
 'ACoAACMdmgABEKidiIRTUeY4ZC_etvTvOPPAWRA',
 'ACoAACqnIWQBi3JKxfm8FVwW8SfjZj6U2YmfUms',
 'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
 'ACoAAA1k1eUB0Ylj--mqKdQ67o6wnie7-A5afjY',
 'ACoAACzqAZ8BNYEJjGPpk_l6DlwoAzZJUXVYd0Y',
 'ACoAABuJAnYBOub-HVz9rxZx0YCscxv0KDx7be0',
 'ACoAACbYB28BGp07ec7sQFVsqdcNnzxVsBhwScc',
 'ACoAABJ97q0BFcxSTVQ9_y1dgDAXNMpNiUtF93k',
 'ACoAACUNkNwBsZws6K44VX40ECBT3woAaJttIhQ',
 'ACoAABBPpTkBE83UntVzca-RXL5ggcsfztv9RRs',
 'ACoAACu4NbUBH95IDJX8ynU_hDNCiu4uWC3F4qI',
 'ACoAACA43R8Buv3_11zWBpYvQ0u3_u_S89F3Clc',
 'ACoAACuf0GEBxqLlm79WsFe0eagVOHzxvQhltaA',
 'ACoAABIURlEB0IaIlQL8yLwChwLnQFDTIiBtFbY',
 'ACoAAA_8N7UBcb426isJoolyZrF0Lgh2oyOZbcc',
 'ACoAABr-5jIB9VLlcHDFacNUqBWr7BWroxPEhRI',
 'ACoAAAzHcn4BGz95j9zh8hNbBxgBM2kdM9L7I9w']

In [8]:
# Fetch the profile details and skill list for every URN id from the search.
# Each value is a list holding one {"profile_details", "skills"} dict, which is
# the shape the later cells iterate over.
peoples_profiles: dict[str, list] = defaultdict(list)

# URN ids whose lookup raised KeyError, kept so that dropped profiles are
# visible instead of silently disappearing from the result.
skipped_urn_ids: list[str] = []

for urn_id in people_urn_ids:
    try:
        profile_details = api.get_profile(urn_id=urn_id)
        skills = api.get_profile_skills(urn_id=urn_id)
    except KeyError as exc:
        # Best-effort fetch: skip this profile and carry on with the rest.
        skipped_urn_ids.append(urn_id)
        print(f"Skipping {urn_id}: KeyError {exc}")
        continue

    peoples_profiles[urn_id].append(
        {
            "profile_details": profile_details,
            "skills": skills
        }
    )

if skipped_urn_ids:
    print(f"Skipped {len(skipped_urn_ids)} of {len(people_urn_ids)} profiles")

peoples_profiles


defaultdict(list,
            {'ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4': [{'profile_details': {'summary': 'Experienced data scientist with a track record of translating complex data into actionable insights across diverse sectors spanning FMCG, Sports and Supply Chain.',
                'industryName': 'Information Technology & Services',
                'lastName': 'Srivastava',
                'locationName': 'India',
                'student': False,
                'geoCountryName': 'India',
                'geoCountryUrn': 'urn:li:fs_geo:102713980',
                'geoLocationBackfilled': False,
                'elt': False,
                'industryUrn': 'urn:li:fs_industry:96',
                'firstName': 'Vidur',
                'entityUrn': 'urn:li:fs_profile:ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
                'geoLocation': {'geoUrn': 'urn:li:fs_geo:115884833'},
                'geoLocationName': 'Gurugram, Haryana',
                'location': {'basicLocation': {'co

In [9]:
# code to clean up unnecessary details
# The cleanup mutates peoples_profiles in place and is safe to re-run: keys
# that are already gone are simply ignored.

# Keys removed from the top level of each profile dict.
PROFILE_DROP_KEYS = (
    "student",
    "geoCountryUrn",
    "geoLocationBackfilled",
    "entityUrn",
    "geoCountryName",
    "elt",
    "profilePictureOriginalImage",
    "industryUrn",
    "profilePicture",
    "geoLocation",
    "geoLocationName",
    "location",
    "backgroundPicture",
    "backgroundPictureOriginalImage",
    "displayPictureUrl",
    "img_400_400",
    "img_200_200",
    "img_800_800",
    "img_100_100",
    "img_767_767",
    "profile_id",
    "profile_urn",
    "member_urn",
    "volunteer",
    "honors",
)

# Keys removed from every item of the named list-valued profile field.
SECTION_ITEM_DROP_KEYS = {
    "experience": ("entityUrn", "geoUrn", "region", "companyUrn", "companyLogoUrl"),
    "education": ("entityUrn", "objectUrn", "trackingId", "logoUrl", "schoolUrn"),
    "certifications": ("company", "displaySource", "companyUrn", "url"),
    "projects": ("members",),
}

# Keys removed from the nested "school" dict of each education item.
SCHOOL_DROP_KEYS = ("objectUrn", "entityUrn", "trackingId", "logoUrl", "schoolUrn")


def _drop_keys(record, keys):
    """Delete every key in `keys` from `record` in place.

    Absent keys are ignored, and anything that is not a dict is left untouched
    so that an unexpected payload shape does not raise.
    """
    if isinstance(record, dict):
        for key in keys:
            record.pop(key, None)


def _clean_profile(profile):
    """Strip the unnecessary fields from one profile dict, in place."""
    _drop_keys(profile, PROFILE_DROP_KEYS)
    for section, item_keys in SECTION_ITEM_DROP_KEYS.items():
        # `or []` tolerates a field that is missing or explicitly None.
        for item in profile.get(section) or []:
            _drop_keys(item, item_keys)
            if section == "education" and isinstance(item, dict):
                # Fix: the previous version looped over the school entry but
                # deleted from the education item, so the school's own
                # URN/logo/tracking fields were never removed.
                _drop_keys(item.get("school"), SCHOOL_DROP_KEYS)


for profile_entries in peoples_profiles.values():
    for entry in profile_entries:
        for field_value in entry.values():
            # Only dict-valued fields are cleaned; list-valued fields such as
            # "skills" are left as they are.
            if isinstance(field_value, dict):
                _clean_profile(field_value)

In [64]:
# Store data into chromadb
# Build one Document per stored profile entry together with its id. `ids` and
# `all_documents` are appended in the same iteration so they always stay
# aligned for the zip() in the insert cell.
all_documents = []
ids = []
for urn_id, profile_info in peoples_profiles.items():
    for info in profile_info:
        # Metadata maps each skill name to itself, which lets a query filter
        # with where={"<skill>": "<skill>"}.
        skill_metadata = {
            skill["name"]: skill["name"]
            for skill in info.get("skills") or []
            if "name" in skill
        }
        ids.append(urn_id)
        # The whole entry (profile details + skills) is serialised to JSON and
        # used as the text that gets embedded.
        all_documents.append(
            Document(page_content=json.dumps(info), metadata=skill_metadata)
        )

In [69]:
# Inspect the Document objects built for the vector store.
all_documents

[Document(metadata={'Data Mining': 'Data Mining', 'Deep Neural Networks (DNN)': 'Deep Neural Networks (DNN)', 'Computer Architecture': 'Computer Architecture', 'Optimization': 'Optimization', 'TensorFlow': 'TensorFlow', 'Apache Spark': 'Apache Spark', 'Algorithm Design': 'Algorithm Design', 'Statistical Learning': 'Statistical Learning', 'Unstructured Data': 'Unstructured Data', 'Teamwork': 'Teamwork', 'Problem Solving': 'Problem Solving', 'Cluster Analysis': 'Cluster Analysis', 'Analytical Skills': 'Analytical Skills', 'Natural Language Processing (NLP)': 'Natural Language Processing (NLP)', 'Predictive Analytics': 'Predictive Analytics', 'Statistical Data Analysis': 'Statistical Data Analysis', 'Statistics': 'Statistics', 'Python (Programming Language)': 'Python (Programming Language)', 'SQL': 'SQL', 'Java': 'Java', 'Object-Oriented Programming (OOP)': 'Object-Oriented Programming (OOP)', 'Machine Learning': 'Machine Learning', 'Data Visualization': 'Data Visualization', 'Cypher Quer

In [13]:
# Persistent (on-disk) Chroma client rooted at ./vector_database2.
client = chromadb.PersistentClient(path="./vector_database2")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [70]:
# Reuse the collection when it already exists in the persistent database and
# create it otherwise, so re-running this cell does not fail on an existing
# name. Texts are embedded with OpenAI using the key from the environment.
collection = client.get_or_create_collection(
    name="linkedin_people_profiles_3",
    embedding_function=OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"]),
)

In [71]:
# Insert the profiles one at a time so a single rejected document (for example
# a failed embedding request) does not abort the remaining inserts.
# Failures are recorded and reported instead of being silently discarded.
failed_inserts: dict[str, str] = {}

for doc_id, doc in zip(ids, all_documents):
    try:
        collection.add(
            ids=[doc_id],
            documents=[doc.page_content],
            metadatas=[doc.metadata],
        )
    except Exception as exc:
        # Best-effort: remember why this profile was not stored and continue.
        failed_inserts[doc_id] = repr(exc)

if failed_inserts:
    print(f"{len(failed_inserts)} of {len(ids)} documents were not stored:")
    for doc_id, reason in failed_inserts.items():
        print(f"  {doc_id}: {reason}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 400 Bad Request"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/

In [72]:
# Number of collections currently stored in this Chroma database.
client.count_collections()

3

In [73]:
# List the collections currently stored in this Chroma database.
client.list_collections()

[Collection(id=2e261f40-ab3b-4ff6-b7d5-428835c428c3, name=linkedin_people_profiles_3),
 Collection(id=47ff5815-b380-4ed3-9074-c9b35deb06bc, name=linkedin_people_profiles_2),
 Collection(id=71f01971-885a-4a7a-b06f-6fc82ed6005e, name=linkedin_people_profiles)]

In [233]:
# Destructive maintenance step: deletes EVERY collection in the database.
# It is disabled by default so that running the notebook top to bottom does not
# wipe the collection that the query cells below read. Set the flag to True
# only when a clean rebuild is wanted.
RESET_ALL_COLLECTIONS = False

if RESET_ALL_COLLECTIONS:
    # A dedicated loop name avoids rebinding the `collection` variable that
    # holds the collection populated earlier.
    for existing_collection in client.list_collections():
        client.delete_collection(name=existing_collection.name)

In [74]:
# Re-open the populated collection for querying. The OpenAI key is passed
# explicitly, the same way as when the collection was created, so this cell
# does not rely on key configuration left behind by an earlier cell.
linkedin_collection = client.get_collection(
    name="linkedin_people_profiles_3",
    embedding_function=OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"]),
)

In [75]:
# Number of records stored in the collection.
linkedin_collection.count()

19

In [78]:
# Semantic search restricted to profiles whose metadata carries the AWS skill
# (the metadata maps each skill name to itself); only the document texts are
# requested and kept.
aws_skill_filter = {"Amazon Web Services (AWS)": "Amazon Web Services (AWS)"}
aws_query_result = linkedin_collection.query(
    query_texts="People with experience of 5 years",
    include=["documents"],
    where=aws_skill_filter,
)
retrieved_documents = aws_query_result["documents"]

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [86]:
# retrieved_documents holds one list of hits per query text: [0] selects the
# hits of the single query text and [1] the second hit, whose JSON payload is
# decoded to show its skills.
# NOTE(review): raises IndexError when the query returns fewer than two hits.
json.loads(retrieved_documents[0][1])["skills"]

[{'name': 'LangChain'},
 {'name': 'Amazon Web Services (AWS)'},
 {'name': 'openai'},
 {'name': 'Microsoft Azure'},
 {'name': 'Large Language Models (LLM)'},
 {'name': 'Generative AI'},
 {'name': 'Data Science'},
 {'name': 'Machine Learning'},
 {'name': 'Deep Learning'},
 {'name': 'Object Detection'},
 {'name': 'Image Segmentation'},
 {'name': 'Image Processing'},
 {'name': 'Recommender Systems'},
 {'name': 'Recurrent Neural Networks (RNN)'},
 {'name': 'Convolutional Neural Networks (CNN)'},
 {'name': 'Transformer Models'},
 {'name': 'TensorFlow'},
 {'name': 'Predictive Modeling'},
 {'name': 'PyTorch'},
 {'name': 'Computer Vision'},
 {'name': 'Data Analysis'},
 {'name': 'Data Wrangling'},
 {'name': 'Docker Products'},
 {'name': 'Data Visualization'},
 {'name': 'Data Modeling'},
 {'name': 'Data Warehousing'},
 {'name': 'Artificial Intelligence (AI)'},
 {'name': 'Data Structures'},
 {'name': 'Object-Oriented Programming (OOP)'},
 {'name': 'Big Data'},
 {'name': 'Statistics'},
 {'name': 'P

In [21]:
# Same query text, but filtered on the document text rather than on metadata:
# where_document={"$contains": "AWS"} keeps only documents containing "AWS".
linkedin_collection.query(
    query_texts="People with experience of 5 years",
    include=["documents"],
    where_document={"$contains": "AWS"}
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


{'ids': [['ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
   'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
   'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
   'ACoAABuJAnYBOub-HVz9rxZx0YCscxv0KDx7be0',
   'ACoAABr-5jIB9VLlcHDFacNUqBWr7BWroxPEhRI',
   'ACoAABJ97q0BFcxSTVQ9_y1dgDAXNMpNiUtF93k']],
 'distances': None,
 'metadatas': None,
 'embeddings': None,
 'documents': [['{"profile_details": {"summary": "Experience in Cognitive Computing with leverage of Machine Learning, Deep Learning, Artificial Intelligence, Natural Language Processing skills . Data driven analyst with the ability to apply ML/DL/NLP/Computer Vision Techniques and leverage algorithms to solve real world problems. Established ability in deploying effective predictive/NLP/Computer Vision models across banking/HealthCare industries to accurately forecast and deliver proven results. Strong understanding of Oracle DB and PLSQL with data migration exp\\u00e9rience. Domain Knowledge of SupplyChain/finance/banking/Retail domain with com