In [217]:
import os
import json
import pandas as pd
from linkedin_api import Linkedin
import chromadb
from chromadb.utils.embedding_functions.openai_embedding_function import OpenAIEmbeddingFunction
from dotenv import load_dotenv, find_dotenv

In [3]:
# Load the variables from the .env file located by find_dotenv(); the
# credentials read from os.environ in the cells below are expected to be there.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)

True

In [4]:
# Authenticate using any Linkedin account credentials.
# Both values are read from environment variables, so nothing secret is
# stored in the notebook; a missing variable raises KeyError right here.
api = Linkedin(
    username=os.environ["LINKEDIN_USERNAME"],
    password=os.environ["LINKEDIN_PASSWORD"]
)

In [5]:
# Keyword passed to the LinkedIn people search in the next cell.
SEARCH_KEYWORD = "Data Scientist"

In [6]:
# Search LinkedIn for people matching SEARCH_KEYWORD, capped at 20 results.
search_kwargs = {"keywords": SEARCH_KEYWORD, "limit": 20}
all_peoples = api.search_people(**search_kwargs)

all_peoples

[{'urn_id': 'ACoAAAZZciMBmlehJoR3zG5AnILlz_0LDgRmano',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist @ Hunch',
  'location': 'Gurugram',
  'name': 'Sahil Maheshwari'},
 {'urn_id': 'ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist',
  'location': 'Gurugram',
  'name': 'Vidur Srivastava'},
 {'urn_id': 'ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist IBM',
  'location': 'Noida',
  'name': 'Isha Aneja'},
 {'urn_id': 'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scientist',
  'location': 'Noida',
  'name': 'Tushar Mathur'},
 {'urn_id': 'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
  'distance': 'DISTANCE_2',
  'jobtitle': "Data Scientist @ R Systems | Master's in Data Science",
  'location': 'Gurugram',
  'name': 'Rakesh Nain'},
 {'urn_id': 'ACoAACeUD1cBHSiBiEFTJnJc3WWxAnH1y_1erQA',
  'distance': 'DISTANCE_2',
  'jobtitle': 'Data Scient

In [7]:
# Tabular view of the search hits: one row per person, one column per result key.
all_peoples_df = pd.DataFrame(all_peoples)

In [8]:
# Collect the LinkedIn URN id of every person returned by the search.
# The column is read directly instead of looping over `iterrows()`, whose
# `row, col` names were misleading (the second item is the row, not a column).
# An empty search result still yields an empty list, as before.
people_urn_ids = [] if all_peoples_df.empty else all_peoples_df["urn_id"].tolist()

people_urn_ids

['ACoAAAZZciMBmlehJoR3zG5AnILlz_0LDgRmano',
 'ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4',
 'ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
 'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
 'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
 'ACoAACeUD1cBHSiBiEFTJnJc3WWxAnH1y_1erQA',
 'ACoAACzqAZ8BNYEJjGPpk_l6DlwoAzZJUXVYd0Y',
 'ACoAABJ97q0BFcxSTVQ9_y1dgDAXNMpNiUtF93k',
 'ACoAABuJAnYBOub-HVz9rxZx0YCscxv0KDx7be0',
 'ACoAACR1zh0BnySZente_0caSEZGn2Qx1I8I_78',
 'ACoAAA_8N7UBcb426isJoolyZrF0Lgh2oyOZbcc',
 'ACoAACX92-ABB7kKVBi6f_EXf2zXIOObQpB9Js4',
 'ACoAACMdmgABEKidiIRTUeY4ZC_etvTvOPPAWRA',
 'ACoAABIURlEB0IaIlQL8yLwChwLnQFDTIiBtFbY',
 'ACoAACA43R8Buv3_11zWBpYvQ0u3_u_S89F3Clc',
 'ACoAACu4NbUBH95IDJX8ynU_hDNCiu4uWC3F4qI',
 'ACoAAAzHcn4BGz95j9zh8hNbBxgBM2kdM9L7I9w',
 'ACoAACuf0GEBxqLlm79WsFe0eagVOHzxvQhltaA',
 'ACoAAB72j5kB3LCVCtnZjP-7Bl4mpfNQhEi8GJ8',
 'ACoAAC1VcqsByzp1o1GEmybISevasiO7VjSe39U']

In [186]:
from collections import defaultdict

# Fetch the full profile and the skills list for every person found by the search.
# Resulting shape, relied on by the later cells:
#   {urn_id: [{"profile_details": <profile>, "skills": <skills>}]}
peoples_profiles: dict[str, list] = defaultdict(list)

# URN ids whose lookup raised KeyError, mapped to the error text, so that
# dropped profiles stay visible instead of silently disappearing.
skipped_urn_ids: dict[str, str] = {}

for urn_id in people_urn_ids:
    try:
        profile_details = api.get_profile(urn_id=urn_id)
        skills = api.get_profile_skills(urn_id=urn_id)
    except KeyError as exc:
        # Best-effort fetch: skip this person, but remember why.
        # NOTE(review): presumably raised when the API response lacks an
        # expected field — confirm against linkedin_api.
        skipped_urn_ids[urn_id] = repr(exc)
        continue

    peoples_profiles[urn_id].append(
        {
            "profile_details": profile_details,
            "skills": skills,
        }
    )

if skipped_urn_ids:
    print(f"Skipped {len(skipped_urn_ids)} profile(s) due to KeyError: {skipped_urn_ids}")

peoples_profiles


defaultdict(list,
            {'ACoAAAZZciMBmlehJoR3zG5AnILlz_0LDgRmano': [{'profile_details': {'summary': '11+ years of experience with a proven track record of driving revenue growth and operational efficiency through data-driven solutions. Skilled in data manipulation, visualization, statistical inference, machine learning, and MLOps with strong theoretical foundations. Effective communication skills, adept at distilling complex ideas and concepts into easy-to- understand language. Experienced in leading end-to-end ML projects and solving the unique challenges that come with them.',
                'industryName': 'Information Technology & Services',
                'lastName': 'Maheshwari',
                'locationName': 'India',
                'student': False,
                'geoCountryName': 'India',
                'geoCountryUrn': 'urn:li:fs_geo:102713980',
                'geoLocationBackfilled': False,
                'elt': False,
                'industryUrn': 'urn:li:f

In [187]:
# code to clean up unnecessary details
# Identifiers, URNs, tracking ids and picture payloads are removed IN PLACE from
# every dict-valued section of each fetched profile. Removing a key that is
# absent is a no-op, so this cell can safely be re-run.

# Keys removed from the top level of a profile dict.
PROFILE_KEYS_TO_DROP = (
    "student",
    "geoCountryUrn",
    "geoLocationBackfilled",
    "entityUrn",
    "geoCountryName",
    "elt",
    "profilePictureOriginalImage",
    "industryUrn",
    "profilePicture",
    "geoLocation",
    "geoLocationName",
    "location",
    "backgroundPicture",
    "backgroundPictureOriginalImage",
    "displayPictureUrl",
    "img_400_400",
    "img_200_200",
    "img_800_800",
    "img_100_100",
    "img_767_767",
    "profile_id",
    "profile_urn",
    "member_urn",
    "volunteer",
    "honors",
)
# Keys removed from each entry of the nested lists.
EXPERIENCE_KEYS_TO_DROP = ("entityUrn", "geoUrn", "region", "companyUrn", "companyLogoUrl")
EDUCATION_KEYS_TO_DROP = ("entityUrn",)
SCHOOL_KEYS_TO_DROP = ("objectUrn", "entityUrn", "trackingId", "logoUrl", "schoolUrn")
CERTIFICATION_KEYS_TO_DROP = ("company", "displaySource", "companyUrn", "url")
PROJECT_KEYS_TO_DROP = ("members",)


def _drop_keys(record, keys):
    """Delete every key in `keys` from `record` in place.

    Absent keys are ignored, and so is any `record` that is not a dict.
    """
    if isinstance(record, dict):
        for key in keys:
            record.pop(key, None)


def _clean_profile(profile):
    """Strip the unwanted top-level and nested keys from one profile dict, in place."""
    _drop_keys(profile, PROFILE_KEYS_TO_DROP)

    for experience in profile.get("experience") or []:
        _drop_keys(experience, EXPERIENCE_KEYS_TO_DROP)

    for education in profile.get("education") or []:
        _drop_keys(education, EDUCATION_KEYS_TO_DROP)
        school = education.get("school") if isinstance(education, dict) else None
        if school:
            # The previous loop tested and deleted these keys on the education
            # entry itself; keep that effect for backward compatibility.
            _drop_keys(education, SCHOOL_KEYS_TO_DROP)
            # Fix: also clean the nested school record, which the previous loop
            # iterated over but never touched.
            # NOTE(review): assumes "school" is a dict (or a list of dicts) — confirm.
            school_records = school if isinstance(school, list) else [school]
            for school_record in school_records:
                _drop_keys(school_record, SCHOOL_KEYS_TO_DROP)

    for certification in profile.get("certifications") or []:
        _drop_keys(certification, CERTIFICATION_KEYS_TO_DROP)

    for project in profile.get("projects") or []:
        _drop_keys(project, PROJECT_KEYS_TO_DROP)


for profile_entries in peoples_profiles.values():
    for profile_entry in profile_entries:
        for section in profile_entry.values():
            # Only dict-valued sections are cleaned; list-valued ones are left untouched.
            if isinstance(section, dict):
                _clean_profile(section)

In [212]:
# Preview every profile's fields (all except "skills") as [field, value] pairs.
for profile_entries in peoples_profiles.values():
    for entry in profile_entries:
        if "profile_details" not in entry:
            continue
        for field_name, field_value in entry["profile_details"].items():
            if field_name == "skills":
                continue
            print([field_name, field_value])

['summary', '11+ years of experience with a proven track record of driving revenue growth and operational efficiency through data-driven solutions. Skilled in data manipulation, visualization, statistical inference, machine learning, and MLOps with strong theoretical foundations. Effective communication skills, adept at distilling complex ideas and concepts into easy-to- understand language. Experienced in leading end-to-end ML projects and solving the unique challenges that come with them.']
['industryName', 'Information Technology & Services']
['lastName', 'Maheshwari']
['locationName', 'India']
['firstName', 'Sahil']
['headline', 'Data Scientist @ Hunch']
['public_id', 'maheshwarisahil']
['experience', [{'locationName': 'Delhi, India', 'geoLocationName': 'Delhi, India', 'companyName': 'Hunch', 'timePeriod': {'startDate': {'month': 11, 'year': 2023}}, 'company': {'employeeCountRange': {'start': 51, 'end': 200}, 'industries': ['Internet']}, 'title': 'Head of Data Science'}, {'location

In [214]:
# Prepare the three parallel lists that are later written to chromadb:
#   ids       -> the profile's URN id
#   documents -> the profile details without the "skills" field
#   metadatas -> the raw skills list wrapped as {"skills": [...]}
ids, documents, metadatas = [], [], []

for urn_id, profile_info in peoples_profiles.items():
    for info in profile_info:
        ids.append(urn_id)
        details = info.get("profile_details", {})
        documents.append(
            {field: value for field, value in details.items() if field != "skills"}
        )
        metadatas.append({"skills": info["skills"]})

In [215]:
documents

[{'summary': '11+ years of experience with a proven track record of driving revenue growth and operational efficiency through data-driven solutions. Skilled in data manipulation, visualization, statistical inference, machine learning, and MLOps with strong theoretical foundations. Effective communication skills, adept at distilling complex ideas and concepts into easy-to- understand language. Experienced in leading end-to-end ML projects and solving the unique challenges that come with them.',
  'industryName': 'Information Technology & Services',
  'lastName': 'Maheshwari',
  'locationName': 'India',
  'firstName': 'Sahil',
  'headline': 'Data Scientist @ Hunch',
  'public_id': 'maheshwarisahil',
  'experience': [{'locationName': 'Delhi, India',
    'geoLocationName': 'Delhi, India',
    'companyName': 'Hunch',
    'timePeriod': {'startDate': {'month': 11, 'year': 2023}},
    'company': {'employeeCountRange': {'start': 51, 'end': 200},
     'industries': ['Internet']},
    'title': 'H

In [216]:
# Flatten each profile's skill records into one comma-separated string of
# skill names, giving one {"skills": "<a,b,c>"} metadata dict per profile.
skills_list = [
    {"skills": ",".join(skill_record["name"] for skill_record in profile_metadata["skills"])}
    for profile_metadata in metadatas
]

skills_list

[{'skills': 'Probability Theory,Probabilistic Programming,Amazon Web Services (AWS),Mathematics,Artificial Neural Networks,Natural Language Processing (NLP),Software Development,Credit Scoring,Business Development,Risk Based Pricing,Price Elasticity,Genetic Algorithms,Causal Inference,Time Series Forecasting,Recommender Systems,team lead,Team Mentoring,A/B Testing,Dynamic Pricing,Inventory Optimization,MLOps,Business Strategy,Machine Learning,Deep Learning,Statistical Data Analysis,Statistical Modeling,SQL,R,Microsoft Excel,Tableau,C,Java,Data Analytics,Data Science,Statistics,Artificial Intelligence (AI),Python (Programming Language),Data Mining,Algorithms,Data Visualization,Bayesian inference,Design of Experiments (DOE),Reinforcement Learning,Automated Machine Learning (AutoML),Graphical Models'},
 {'skills': 'Data Mining,Deep Neural Networks (DNN),Computer Architecture,Optimization,TensorFlow,Apache Spark,Algorithm Design,Statistical Learning,Unstructured Data,Teamwork,Problem Solvi

In [102]:
# Chroma client backed by the local ./vector_database directory
# (path is relative to the notebook's working directory).
client = chromadb.PersistentClient(path="./vector_database")

INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


In [234]:
# Open the profiles collection, creating it on the first run.
# `get_or_create_collection` keeps this cell re-runnable: with a persistent
# client the collection outlives the kernel, and `create_collection` raises
# when the name already exists.
collection = client.get_or_create_collection(
    name="linkedin_people_profiles",
    embedding_function=OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"]),
)

In [235]:
# Insert every profile into the collection: the JSON-serialised profile is the
# document and the comma-separated skills string is its metadata.
# Insertion stays best-effort (one bad profile does not stop the rest), but
# failures are recorded and reported instead of being swallowed silently.
failed_profile_ids: dict[str, str] = {}

# `profile_id` rather than `id`, so the builtin id() is not shadowed.
for profile_id, profile_doc, skills_metadata in zip(ids, documents, skills_list):
    try:
        collection.add(
            ids=profile_id,
            documents=json.dumps(profile_doc),
            metadatas=skills_metadata,
        )
    except Exception as exc:
        failed_profile_ids[profile_id] = repr(exc)

if failed_profile_ids:
    print(f"Failed to store {len(failed_profile_ids)} profile(s): {failed_profile_ids}")

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embedding

In [236]:
# Sanity check: number of collections currently in the persistent store.
client.count_collections()

1

In [232]:
# Show the collections currently in the persistent store.
client.list_collections()

[]

In [233]:
# WARNING: destructive reset — deletes EVERY collection in the persistent store.
# Run it only to start over, then re-run the cells that create and fill the collection.
# The loop variable is deliberately not named `collection`, so the handle
# created by the collection cell is not rebound as a side effect.
for existing_collection in client.list_collections():
    # Some chromadb releases return Collection objects here, others plain names.
    client.delete_collection(name=getattr(existing_collection, "name", existing_collection))

In [243]:
# Re-open the stored collection for querying. The embedding function is built
# with the same explicit API key as when the collection was created, so query
# texts are embedded the same way as the stored documents.
linkedin_collection = client.get_collection(
    name="linkedin_people_profiles",
    embedding_function=OpenAIEmbeddingFunction(api_key=os.environ["OPENAI_API_KEY"]),
)

In [244]:
# Number of profile documents stored in the collection.
linkedin_collection.count()

19

In [246]:
# Semantic search over the stored profiles. Only the documents are requested,
# so distances, metadatas and embeddings come back as None.
search_text = "People with experience of 5 years"
linkedin_collection.query(query_texts=search_text, include=["documents"])

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


{'ids': [['ACoAAAbvohIBhdiLqQ5FCbtA40AFvg3f-T7fKfk',
   'ACoAAAROFiYBT5QvFHuxzGcEIeV9MLsZrg66TyY',
   'ACoAABuJAnYBOub-HVz9rxZx0YCscxv0KDx7be0',
   'ACoAAAZZciMBmlehJoR3zG5AnILlz_0LDgRmano',
   'ACoAAB72j5kB3LCVCtnZjP-7Bl4mpfNQhEi8GJ8',
   'ACoAAB_Pb3QB2wyW1d2XQVv2cjEDJPM4lRD9WQY',
   'ACoAAAzHcn4BGz95j9zh8hNbBxgBM2kdM9L7I9w',
   'ACoAACzqAZ8BNYEJjGPpk_l6DlwoAzZJUXVYd0Y',
   'ACoAABIURlEB0IaIlQL8yLwChwLnQFDTIiBtFbY',
   'ACoAACLhI5UB5P9yeNUgC5weJ_vgDPQFZQOanX4']],
 'distances': None,
 'metadatas': None,
 'embeddings': None,
 'documents': [['{"summary": "Experience in Cognitive Computing with leverage of Machine Learning, Deep Learning, Artificial Intelligence, Natural Language Processing skills . Data driven analyst with the ability to apply ML/DL/NLP/Computer Vision Techniques and leverage algorithms to solve real world problems. Established ability in deploying effective predictive/NLP/Computer Vision models across banking/HealthCare industries to accurately forecast and deliver prov