In [1]:
import os
import random
import time
from dataclasses import dataclass
from typing import Optional

from linkedin_scraper import Person, actions
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait





In [2]:
@dataclass
class Experience:
    """One position listed in the experience section of a profile.

    Every field defaults to ``None`` so an instance can be created from
    whatever subset of details the scraper managed to extract.
    """

    position_title: Optional[str] = None
    from_date: Optional[str] = None
    to_date: Optional[str] = None
    description: Optional[str] = None
    duration: Optional[str] = None
    location: Optional[str] = None
    institution_name: Optional[str] = None  # company / organisation name
    linkedin_url: Optional[str] = None  # href taken from the company logo link

class ScrapedProfile:
    """Plain container holding everything collected for a single profile."""

    def __init__(self, profile_name, experiences, profile_school, profile_dist, profile_description, profile_link):
        # Store each constructor argument under an attribute of the same name,
        # in declaration order.
        collected = (
            ("profile_name", profile_name),
            ("experiences", experiences),
            ("profile_school", profile_school),
            ("profile_dist", profile_dist),
            ("profile_description", profile_description),
            ("profile_link", profile_link),
        )
        for attribute, value in collected:
            setattr(self, attribute, value)

def wait_for_element_to_load(by=By.CLASS_NAME, name="pv-top-card", base=None, timeout=180):
    """Wait until an element matching ``(by, name)`` is present and return it.

    Args:
        by: Selenium locator strategy (defaults to ``By.CLASS_NAME``).
        name: Locator value to look for.
        base: Object to search from (a driver or an element). When omitted,
            the module-level ``driver`` is used.
        timeout: Maximum number of seconds to wait. Defaults to 180, the
            value that used to be hard-coded.

    Returns:
        The result of ``WebDriverWait.until`` for
        ``presence_of_element_located``.
    """
    # Fall back to the notebook-wide driver only when no base was supplied.
    search_root = driver if base is None else base
    condition = EC.presence_of_element_located((by, name))
    return WebDriverWait(search_root, timeout).until(condition)

def wait_for_all_elements_to_load(by=By.CLASS_NAME, name="pv-top-card", base=None, timeout=180):
    """Wait until elements matching ``(by, name)`` are present and return them.

    Args:
        by: Selenium locator strategy (defaults to ``By.CLASS_NAME``).
        name: Locator value to look for.
        base: Object to search from (a driver or an element). When omitted,
            the module-level ``driver`` is used.
        timeout: Maximum number of seconds to wait. Defaults to 180, the
            value that used to be hard-coded.

    Returns:
        The result of ``WebDriverWait.until`` for
        ``presence_of_all_elements_located``.
    """
    # Fall back to the notebook-wide driver only when no base was supplied.
    search_root = driver if base is None else base
    condition = EC.presence_of_all_elements_located((by, name))
    return WebDriverWait(search_root, timeout).until(condition)

def get_pvs_list_element(position_summary_text):
    """Return the innermost "pvs-list" element below ``position_summary_text``.

    Args:
        position_summary_text: Element to search in, or a falsy value.

    Returns:
        ``None`` when the argument is falsy. Otherwise the "pvs-list" nested
        inside the first "pvs-list" when there is one, else that first
        "pvs-list" itself.

    Raises:
        Whatever ``find_element`` raises when no "pvs-list" exists at all.
    """
    if not position_summary_text:
        return None

    outer_list = position_summary_text.find_element(By.CLASS_NAME, "pvs-list")
    try:
        return outer_list.find_element(By.CLASS_NAME, "pvs-list")
    except Exception:
        # No nested list: the outer one is the list we want.
        return outer_list

def _first_span_text(element):
    """Return the text of the first <span> found under ``element``."""
    return element.find_element(By.TAG_NAME, "span").text


def _split_work_times(work_times):
    """Split a work-period string into ``(from_date, to_date, duration)``.

    The part before the first "·" holds the dates and the part after it the
    duration, e.g. "Jan 2020 - Mar 2021 · 1 yr 3 mos". ``duration`` is
    ``None`` when there is no "·"; the dates are "" when the input is empty.
    """
    parts = work_times.split("·") if work_times else [""]
    times = parts[0].strip()
    duration = parts[1].strip() if len(parts) > 1 else None
    tokens = times.split(" ") if times else []
    # First two tokens are the start date; token 2 is the separator and the
    # rest is the end date.
    return " ".join(tokens[:2]), " ".join(tokens[3:]), duration


def get_experiences(driver):
    """Scrape the experience entries of the page currently open in ``driver``.

    Args:
        driver: Selenium driver already showing a ".../details/experience"
            page.

    Returns:
        A list of ``Experience`` objects: one per entry, or one per nested
        role when an entry groups several roles under a single company.

    Notes:
        If the page never becomes ready, ``reinstantiate_driver`` is asked for
        a fresh session and the page is reloaded in it. That replacement is
        only visible inside this function; when no replacement is returned the
        original error is re-raised.
    """
    driver.execute_script('alert("Focus window")')
    driver.switch_to.alert.accept()
    page_url = driver.current_url  # remembered so a replacement driver can reopen it

    def _locate_main(active_driver):
        # Wait for the document to finish loading, then for the <main> element.
        WebDriverWait(active_driver, 240).until(
            lambda d: d.execute_script('return document.readyState') == 'complete'
        )
        return wait_for_element_to_load(by=By.TAG_NAME, name="main", base=active_driver)

    try:
        main = _locate_main(driver)
    except Exception:
        replacement = reinstantiate_driver(driver)
        if replacement is None:
            # Nothing to retry with: surface the original failure.
            raise
        driver = replacement
        driver.get(page_url)
        main = _locate_main(driver)

    # Scroll half way and then to the bottom of the page.
    driver.execute_script(
        "window.scrollTo(0, Math.ceil(document.body.scrollHeight/2));"
    )
    driver.execute_script(
        "window.scrollTo(0, document.body.scrollHeight);"
    )

    main_list = wait_for_element_to_load(name="pvs-list", base=main)
    experiences = []

    for entry in main_list.find_elements(By.XPATH, "li"):
        entity = entry.find_element(By.CLASS_NAME, "pvs-entity")
        company_logo_elem, position_details = entity.find_elements(By.XPATH, "*")

        # company elem
        company_linkedin_url = company_logo_elem.find_element(By.XPATH, "*").get_attribute("href")

        # position details
        position_details_list = position_details.find_elements(By.XPATH, "*")
        position_summary_details = position_details_list[0] if len(position_details_list) > 0 else None
        position_summary_text = position_details_list[1] if len(position_details_list) > 1 else None
        outer_positions = position_summary_details.find_element(By.XPATH, "*").find_elements(By.XPATH, "*")

        # Reset for every entry so nothing leaks over from the previous one
        # when a layout provides fewer parts.
        position_title = ""
        company = ""
        location = ""
        work_times = ""

        if len(outer_positions) == 4:
            position_title = _first_span_text(outer_positions[0])
            company = _first_span_text(outer_positions[1])
            work_times = _first_span_text(outer_positions[2])
            location = _first_span_text(outer_positions[3])
        elif len(outer_positions) == 3:
            if "·" in outer_positions[2].text:
                # title / company / work period, no location
                position_title = _first_span_text(outer_positions[0])
                company = _first_span_text(outer_positions[1])
                work_times = _first_span_text(outer_positions[2])
            else:
                # company / work period / location, no title
                company = _first_span_text(outer_positions[0])
                work_times = _first_span_text(outer_positions[1])
                location = _first_span_text(outer_positions[2])
        elif len(outer_positions) == 2:
            company = outer_positions[0].text.split('\n')[0]

        company = company.split(' · ')[0]  # 6/14/23 added this line to handle showing "full-time" in company name
        from_date, to_date, duration = _split_work_times(work_times)

        pvs_list_element = get_pvs_list_element(position_summary_text)
        nested_roles = pvs_list_element.find_elements(By.XPATH, "li") if pvs_list_element is not None else []

        if len(nested_roles) > 1:
            # Several roles grouped under one company: one Experience per role.
            for role in nested_roles:
                res = role.find_element(By.TAG_NAME, "a").find_elements(By.XPATH, "*")
                position_title_elem = res[0] if len(res) > 0 else None
                work_times_elem = res[1] if len(res) > 1 else None
                location_elem = res[2] if len(res) > 2 else None

                location = location_elem.find_element(By.XPATH, "*").text if location_elem else None
                position_title = (
                    position_title_elem.find_element(By.XPATH, "*").find_element(By.TAG_NAME, "*").text
                    if position_title_elem
                    else ""
                )
                work_times = work_times_elem.find_element(By.XPATH, "*").text if work_times_elem else ""
                from_date, to_date, duration = _split_work_times(work_times)

                experiences.append(
                    Experience(
                        position_title=position_title,
                        from_date=from_date,
                        to_date=to_date,
                        duration=duration,
                        location=location,
                        # Keep the text, not the live element: the field is a string.
                        description=role.text,
                        institution_name=company,
                        linkedin_url=company_linkedin_url,
                    )
                )
        else:
            description = position_summary_text.text if position_summary_text else ""

            experiences.append(
                Experience(
                    position_title=position_title,
                    from_date=from_date,
                    to_date=to_date,
                    duration=duration,
                    location=location,
                    description=description,
                    institution_name=company,
                    linkedin_url=company_linkedin_url,
                )
            )
    return experiences

def _wait_until_loaded(driver, url, timeout=240):
    """Wait for ``document.readyState`` to be 'complete'.

    Returns the driver to keep using: the one passed in, or, when the wait
    fails and ``reinstantiate_driver`` returns a replacement, that replacement
    after it has loaded ``url``. Re-raises the original error when no
    replacement is returned.
    """
    def _page_ready(d):
        return d.execute_script('return document.readyState') == 'complete'

    try:
        WebDriverWait(driver, timeout).until(_page_ready)
        return driver
    except Exception:
        replacement = reinstantiate_driver(driver)
        if replacement is None:
            raise
        replacement.get(url)
        WebDriverWait(replacement, timeout).until(_page_ready)
        return replacement


def scrape_profile_live_filtering(driver, profile_link):
    """Scrape one profile and return it as a ``ScrapedProfile``.

    Args:
        driver: Logged-in Selenium driver.
        profile_link: URL of the profile, with or without a trailing slash.

    Returns:
        ``ScrapedProfile`` with name, experiences, education (a list of
        ``{"school", "degree"}`` dicts, empty when the page has no education
        section), connection distance, headline text and the final URL.

    Notes:
        Random sleeps between steps are kept as in the original flow. A driver
        replaced during recovery is only used inside this function.
    """
    # Build the URL by string concatenation: os.path.join would insert the
    # platform's path separator.
    experiences_url = profile_link.rstrip("/") + "/details/experience"
    print(experiences_url)
    driver.get(experiences_url)
    driver = _wait_until_loaded(driver, experiences_url)
    time.sleep(5 + random.random() * 10)
    experiences = get_experiences(driver)

    # FILTERING (currently disabled)
    # found_target_company = False
    # if len(experiences) > 1 and experiences[0].duration in RECENT_LIST and is_likely_startup(experiences[0]):
    #     for experience in experiences[1:5]:
    #         company = experience.institution_name.lower()
    #         for target_company in COMPANY_LIST:
    #             if target_company.lower() in company:
    #                 filter_company_match_dict[profile_link] = target_company
    #                 found_target_company = True
    #                 break
    # if not found_target_company:
    #     return None

    # NOTE(review): the Person object itself is never used. The call is kept
    # because constructing it presumably navigates the driver to
    # profile_link; confirm against linkedin_scraper before removing.
    Person(profile_link, driver=driver, scrape=False, experiences=[None])
    driver = _wait_until_loaded(driver, profile_link)
    time.sleep(2 + random.random() * 7)

    # name
    profile_name = driver.find_element(By.CLASS_NAME, "text-heading-xlarge").text
    time.sleep(1 + random.random())

    # education (optional: not every profile has the section)
    education = []
    edu_anchors = driver.find_elements(By.ID, "education")
    if edu_anchors:
        parent_element = edu_anchors[0].find_element(By.XPATH, "./..")
        for entry in parent_element.find_elements(By.CLASS_NAME, "pvs-entity"):
            texts = [el.text for el in entry.find_elements(By.CLASS_NAME, "visually-hidden")]
            education.append({
                "school": texts[0] if len(texts) > 0 else "",
                "degree": texts[1] if len(texts) > 1 else "",
            })
    time.sleep(1 + random.random())

    # degree of connection
    profile_dist = driver.find_element(By.CLASS_NAME, "dist-value").text
    time.sleep(1 + random.random())

    # description
    profile_description = driver.find_element(By.CLASS_NAME, "text-body-medium").text
    time.sleep(1 + random.random())

    # profile link (as shown by the browser after navigation)
    profile_link = driver.current_url
    time.sleep(1 + random.random())

    return ScrapedProfile(
        profile_name,
        experiences,
        education,
        profile_dist,
        profile_description,
        profile_link,
    )

In [3]:
# Start a headless Firefox session and sign in.
options = FirefoxOptions()
options.add_argument("--headless")
driver = webdriver.Firefox(options=options)
# NOTE(review): purpose of this initial page load is not visible here; kept as-is.
driver.get("https://dev.to")
# Credentials are read from the environment (LINKEDIN_EMAIL / LINKEDIN_PASSWORD)
# so they are never stored in the notebook. Any password that was previously
# committed here should be considered leaked and rotated.
actions.login(driver, os.environ["LINKEDIN_EMAIL"], os.environ["LINKEDIN_PASSWORD"])

def reinstantiate_driver(driver):
    """Replace a broken browser session with a fresh, logged-in one.

    Args:
        driver: The session to dispose of.

    Returns:
        The new headless Firefox driver, already logged in. Callers assign the
        result back to their ``driver`` variable, so it must be returned.
    """
    try:
        driver.quit()
    except Exception as exc:
        # A session that is already dead must not prevent creating a new one.
        print('could not quit old driver:', repr(exc))
    options = FirefoxOptions()
    options.add_argument("--headless")
    new_driver = webdriver.Firefox(options=options)
    # Credentials come from the environment, never from the notebook source.
    actions.login(new_driver, os.environ["LINKEDIN_EMAIL"], os.environ["LINKEDIN_PASSWORD"])
    print('recreated driver')
    return new_driver

In [4]:
# Profile URLs collected from the search results are appended to this list.
profile_urls = list()

In [37]:
url="https://www.linkedin.com/sales/search/people#page=35&query=(recentSearchParam%3A(id%3A2703706810%2CdoLogHistory%3Atrue)%2Cfilters%3AList((type%3ACURRENT_COMPANY%2Cvalues%3AList((id%3Aurn%253Ali%253Aorganization%253A18583501%2Ctext%3AStealth%2520Startup%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18454116%2Ctext%3AStealth%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18016269%2Ctext%3AStealth%2520Mode%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A91313799%2Ctext%3AStealth%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))))%2C(type%3APAST_COMPANY%2Cvalues%3AList((id%3Aurn%253Ali%253Aorganization%253A1815218%2Ctext%3AUber%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A309694%2Ctext%3AAirbnb%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A2857634%2Ctext%3ACoinbase%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A2135371%2Ctext%3AStripe%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A20708%2Ctext%3APalantir%2520Technologies%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3608%2Ctext%3ANVIDIA%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3205573%2Ctext%3ADoorDash%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A675562%2Ctext%3ASquare%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A30846%2Ctext%3ASpaceX%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A30086%2Ctext%3APalo%2520Alto%2520Networks%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3131483%2Ctext%3AFlexport%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A
3477522%2Ctext%3ADatabricks%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A748731%2Ctext%3AKlarna%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3254263%2Ctext%3ARobinhood%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A6575553%2Ctext%3AByteDance%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18505670%2Ctext%3ABrex%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A2732417%2Ctext%3AInstacart%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A17998520%2Ctext%3AScale%2520AI%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A2684737%2Ctext%3APlaid%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3767529%2Ctext%3ANubank%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3991822%2Ctext%3AAirtable%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A10043614%2Ctext%3ASnyk%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A10607336%2Ctext%3AChainalysis%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A10893210%2Ctext%3Adbt%2520Labs%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A11062162%2Ctext%3AGrafana%2520Labs%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A11130470%2Ctext%3AOpenAI%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A11193683%2Ctext%3AHugging%2520Face%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A11247457%2Ctext%3ASolugen%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A11741116%2Ctext%3ARunway%2CselectionType
%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A11869260%2Ctext%3ARetool%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A1406226%2Ctext%3ARamp%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A14824547%2Ctext%3AFireblocks%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A1594050%2Ctext%3AGoogle%2520DeepMind%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A16181286%2Ctext%3AVercel%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A17932068%2Ctext%3ALacework%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A17988315%2Ctext%3ARippling%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18013280%2Ctext%3AFaire%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18293159%2Ctext%3AAnduril%2520Industries%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18309569%2Ctext%3ASemgrep%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18586257%2Ctext%3AAbnormal%2520Security%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18593641%2Ctext%3AWeights%2520%2526%2520Biases%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18742807%2Ctext%3ATRM%2520Labs%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18769344%2Ctext%3AModern%2520Treasury%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18777798%2Ctext%3ACribl%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A18922914%2Ctext%3ADeel%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A19107985%2Ctext%3AMercury%2Cselec
tionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A24024765%2Ctext%3ACohere%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A2418251%2Ctext%3AZapier%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A2497653%2Ctext%3ACrowdStrike%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A27159855%2Ctext%3AStarburst%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A2850862%2Ctext%3ACanva%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A30898036%2Ctext%3ANotion%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3502352%2Ctext%3AWebflow%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A35462987%2Ctext%3AVanta%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3650502%2Ctext%3AFigma%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A37564254%2Ctext%3APersona%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3769390%2Ctext%3ABenchling%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A3954657%2Ctext%3AFivetran%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A40671813%2Ctext%3ARobust%2520Intelligence%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A4803356%2Ctext%3ASourcegraph%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A6424460%2Ctext%3ASentry%2520%2528sentry.io%2529%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A64890982%2Ctext%3AWiz%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A65281968%2Ctext%3ATecton%2CselectionType%3AINCLUDED%2Cparent%3A(i
d%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A65638805%2Ctext%3AMaterial%2520Security%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A67081245%2Ctext%3ATemporal%2520Technologies%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A68023390%2Ctext%3AIsland%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A68047275%2Ctext%3AUniswap%2520Labs%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A70975817%2Ctext%3AVarda%2520Space%2520Industries%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A71668100%2Ctext%3AHadrian%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A74126343%2Ctext%3AAnthropic%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A74882602%2Ctext%3AGlean%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A7602863%2Ctext%3AZipline%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A76262108%2Ctext%3AKumo.AI%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A80114151%2Ctext%3AClickHouse%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A81330326%2Ctext%3AAdept%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A81491861%2Ctext%3APredibase%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A82318617%2Ctext%3AMidjourney%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A83019124%2Ctext%3AEigenLayer%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A89486558%2Ctext%3ACharacter.AI%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A89962189%2Ctext%3AThe%2520Arbitrum%2520Foundation%2C
selectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A926041%2Ctext%3AOkta%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))%2C(id%3Aurn%253Ali%253Aorganization%253A9309408%2Ctext%3ACockroach%2520Labs%2CselectionType%3AINCLUDED%2Cparent%3A(id%3A0))))))&sessionId=kRjHJsRMT2W5DI61EZQXnQ%3D%3D"

In [40]:
# A results page with fewer entries than this is treated as the last page.
RESULTS_PER_PAGE = 25


def _document_ready(d):
    """True once the page reports document.readyState == 'complete'."""
    return d.execute_script('return document.readyState') == 'complete'


driver.get(url)

try:
    WebDriverWait(driver, 240).until(_document_ready)
except Exception:
    replacement = reinstantiate_driver(driver)
    if replacement is None:
        # No replacement session available: surface the original failure.
        raise
    driver = replacement
    driver.get(url)
    WebDriverWait(driver, 240).until(_document_ready)
time.sleep(2 + random.random() * 6)

while True:
    profiles = driver.find_elements(By.CLASS_NAME, "artdeco-list__item")

    for profile in profiles:
        # scroll to the profile
        driver.execute_script("arguments[0].scrollIntoView();", profile)
        wait_for_element_to_load(name="artdeco-entity-lockup__title")

        # click the profile
        open_profile_button = profile.find_element(By.CLASS_NAME, "artdeco-entity-lockup__title")
        open_profile_button.click()
        wait_for_element_to_load(name="_actions-container_1dg5u8")
        time.sleep(2 + random.random() * 6)

        try:
            # click the three dots button on the salesnav popout
            action_container = driver.find_element(By.CLASS_NAME, "_actions-container_1dg5u8")
            three_dots_button = action_container.find_element(By.CLASS_NAME, "_icon_ps32ck")
            three_dots_button.click()
            wait_for_element_to_load(name="_visible_x5gf48")
            time.sleep(2 + random.random() * 6)

            # get an <a> tag which is a child of dropdown menu
            dropdown_container = driver.find_element(By.CLASS_NAME, "_visible_x5gf48")
            normal_linkedin_url = dropdown_container.find_elements(By.TAG_NAME, "a")[1].get_attribute("href")
            wait_for_element_to_load(name="artdeco-pagination__button--next")

            profile_urls.append(normal_linkedin_url)

            # close the popout
            header = driver.find_element(By.CLASS_NAME, "_inline-sidesheet-header-actions_1cn7lg")
            close_button = header.find_elements(By.CLASS_NAME, "_button_ps32ck")[1]
            close_button.click()
        except Exception as exc:
            # Best effort per profile, but keep the reason visible and let
            # KeyboardInterrupt stop the crawl.
            print("skipped", repr(exc))

    # navigate to next page
    if len(profiles) < RESULTS_PER_PAGE:
        break
    next_page_button = driver.find_element(By.CLASS_NAME, "artdeco-pagination__button--next")
    next_page_button.click()
    wait_for_element_to_load(name="artdeco-list__item")
    time.sleep(2 + random.random() * 6)

print(profile_urls)

skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
skipped
['https://www.linkedin.com/in/masonswofford', 'https://www.linkedin.com/in/eyqs', 'https://www.linkedin.com/in/georgemli', 'https://www.linkedin.com/in/barwi', 'https://www.linkedin.com/in/chris-harding', 'https://www.linkedin.com/in/jason-hoch', 'https://www.linkedin.com/in/indyg', 'https://www.linkedin.com/in/nanzheng', 'https://www.linkedin.com/in/eyal-susser-2a35112', 'https://www.linkedin.com/in/sharvanath', 'https://www.linkedin.com/in/adam-behrens', 'https://www.linkedin.com/in/subu-biswas-5114251b', 'https://www.linkedin.com/in/wukaling', 'https://www.linkedin.com/in/mattjcoop', 'https://www.linkedin.com/in/kunal-tangri-61ba48121', 'https://www.linkedin.com/in/shrutigupta22', 'https://www.linkedin.com/in/-adrian', 'https://www.linkedin.com/in/hamelhusain', 'https://www.linkedin.com/in/ngavin', 'https://www.linkedin.com/in/willempienaar', 'https://www.linkedin.com/in/s

In [44]:
# Deduplicate and save profile_urls.
# dict.fromkeys keeps the first-seen order, so the file (and the index-based
# batches that are later read from it) is the same on every run; a set would
# reorder the URLs differently per process.
profile_urls = list(dict.fromkeys(profile_urls))
with open('my_list.txt', 'w') as f:
    for item in profile_urls:
        f.write("%s\n" % item)

In [5]:
# Restore profile_urls from my_list.txt: one URL per line.
with open('my_list.txt', 'r') as url_file:
    saved_text = url_file.read()
profile_urls = saved_text.splitlines()

In [49]:
# Load the historical candidate list and keep the first element of every record.
# NOTE(review): pickle.load executes arbitrary code from the file; only use
# it on a file this project produced itself.
import pickle

with open('historical_candidate_list.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

scraped = [record[0] for record in data]

In [17]:
import json

# Successfully scraped profiles are appended to this list.
candidates = list()

In [18]:
# Half-open index range [start, end) of profile_urls to scrape in this batch.
start = 900
end = 925

for idx in range(start, min(end, len(profile_urls))):
    url = profile_urls[idx]
    # scrape profiles, and write results to a file
    try:
        profile = scrape_profile_live_filtering(driver, url)
        candidates.append(profile)

        with open('scraped_urls.txt', 'a') as f:
            f.write(url + '\n')
        print(((idx+1)/len(profile_urls)) * 100, '% Done - at index:', idx)
    except Exception as exc:
        # `except Exception` (not a bare except) so interrupting the kernel
        # actually stops the loop; the reason is printed for later debugging.
        print('Failed to scrape profile: ', url)
        print('  reason:', repr(exc))
        with open('failed_urls.txt', 'a') as f:
            f.write(url + '\n')

https://www.linkedin.com/in/ross-olason-834464194/details/experience
99.44812362030905 % Done - at index: 900
https://www.linkedin.com/in/elynntucker/details/experience
Failed to scrape profile:  https://www.linkedin.com/in/elynntucker
https://www.linkedin.com/in/daniel-sim-jing-yuan-675963171/details/experience
99.66887417218543 % Done - at index: 902
https://www.linkedin.com/in/jessicankropf/details/experience
99.77924944812362 % Done - at index: 903
https://www.linkedin.com/in/jeffreydf/details/experience
99.8896247240618 % Done - at index: 904
https://www.linkedin.com/in/markdalas/details/experience
100.0 % Done - at index: 905


In [19]:
import pandas as pd

# Empty results table that starts with the two identifying columns.
BASE_COLUMNS = ["url", "name"]
df = pd.DataFrame(columns=BASE_COLUMNS)

In [20]:
def parseCandidate(x):
    """Flatten a scraped profile into a single dict (one spreadsheet row).

    Args:
        x: Object with ``profile_link``, ``profile_name``, ``profile_dist``,
            ``profile_description``, ``profile_school`` (iterable of dicts
            with "school" and "degree") and ``experiences`` (iterable of
            objects with ``position_title``, ``institution_name``,
            ``duration`` and ``from_date``).

    Returns:
        Dict with the keys ``url``, ``name``, ``dist``, ``description`` plus
        ``edu{i} school`` / ``edu{i} degree`` per education entry and
        ``exp{i} title`` / ``company`` / ``duration`` / ``start`` per
        experience.
    """
    res = {
        'url': x.profile_link,
        'name': x.profile_name,
        'dist': x.profile_dist,
        'description': x.profile_description,
    }
    for i, e in enumerate(x.profile_school):
        res[f'edu{i} school'] = e["school"]
        res[f'edu{i} degree'] = e["degree"]
    for i, e in enumerate(x.experiences):
        company = e.institution_name
        res[f'exp{i} title'] = e.position_title
        # Drop everything after " ·"; a missing name is passed through unchanged.
        res[f'exp{i} company'] = company.split(" ·")[0] if company else company
        res[f'exp{i} duration'] = e.duration
        res[f'exp{i} start'] = e.from_date
    return res

# Build the table once from all candidates instead of concatenating row by row
# (which copies the frame on every iteration and duplicates rows whenever the
# cell is re-run).
rows = [parseCandidate(candidate) for candidate in candidates]
df = pd.DataFrame(rows) if rows else pd.DataFrame(columns=["url", "name"])
    

In [21]:
# Companies of interest, grouped by category.
categories = {
    "SECURITY": [
        "Okta", "Snyk", "R2C/Semgrep", "Wiz", "Lacework", "Crowdstrike",
        "Palo Alto Networks", "Island", "Vanta", "Material Security",
        "Abnormal Security",
    ],
    "OTHER": [
        "Figma", "Airtable", "Notion", "Canva", "Webflow", "Faire", "Deel",
        "Rippling", "Flexport", "Benchling", "Solugen",
    ],
    "PUBLIC": ["Doordash", "Uber", "Palantir", "Airbnb", "Instacart"],
    "INFRA": [
        "Fivetran", "DBT", "Temporal", "Cockroach Labs", "Grafana", "Zapier",
        "Starburst", "Retool", "Sentry", "Sourcegraph", "Cribl", "Vercel",
        "Clickhouse",
    ],
    "FINTECH": [
        "Robinhood", "Square", "Stripe", "Ramp", "Brex", "Plaid",
        "Modern Treasury", "Mercury", "Persona", "Klarna", "Nubank",
    ],
    "CRYPTO": [
        "Coinbase", "Uniswap", "Chainalysis", "Arbitrum", "TRM", "Fireblocks",
        "Eigenlayer",
    ],
    "FRONTIER": ["Anduril", "SpaceX", "Zipline", "Varda", "Hadrian"],
    "AI": [
        "Bytedance", "Scale AI", "Anthropic", "Robust intelligence", "OpenAI",
        "Predibase", "Cohere", "Databricks", "Hugging Face", "RunwayML",
        "Tecton", "Weights & Biases", "Kumo AI", "NVIDIA", "Adept", "Glean",
        "Character.ai", "Midjourney", "Facebook AI", "FAIR", "Google brain",
    ],
}

# Background colour used for each category in the exported sheet.
category_colors = dict(
    SECURITY='red',
    OTHER='blue',
    PUBLIC='green',
    INFRA='yellow',
    FINTECH='orange',
    CRYPTO='purple',
    FRONTIER='cyan',
    AI='magenta',
)

# Reverse lookup {company: category}; a company listed twice keeps its last category.
company_category = {
    member: group
    for group, members in categories.items()
    for member in members
}

# Modify the style function
def highlight_by_category(val):
    """Return the CSS to apply to one cell of the styled table.

    Args:
        val: Cell value; it is looked up as-is in ``company_category``.

    Returns:
        ``'background-color: <color>'`` when the value is a known company
        whose category has a colour, otherwise ``''`` (no styling).
        ``'none'`` is not a colour, so it is no longer emitted.
    """
    category = company_category.get(val)
    color = category_colors.get(category) if category else None
    return f'background-color: {color}' if color else ''

# Apply the highlighter to every cell. Newer pandas names the element-wise
# method Styler.map and deprecates Styler.applymap, so prefer `map` when the
# installed version has it.
styler = df.style
apply_elementwise = getattr(styler, "map", None) or styler.applymap
styled_df = apply_elementwise(highlight_by_category)

# Save the styled DataFrame to an Excel file
styled_df.to_excel(f'results_{start}-{end}.xlsx', engine='openpyxl', index=False)


In [157]:
from html import escape


def _cell(value):
    """HTML-escape a scraped value for the table; ``None`` becomes an empty cell."""
    return "" if value is None else escape(str(value))


def _company(experience):
    """Company name of ``experience`` with anything after " ·" removed ("" if unavailable)."""
    name = experience.institution_name if experience is not None else None
    return name.split(" ·")[0] if name else ""


html_parts = ["""
<html>
<head>
</head>
<body style="font-family: Arial, sans-serif;">
<h2>Hi Liu,</h2>

<p>Here are the latest sourcing updates from Linkedin:</p>

<table style="border-collapse: collapse; width: 100%; margin-bottom: 25px;">
<tr style="background-color: #76bbef;">
  <th style="border: 1px solid #ddd; padding: 8px; text-align: left; background-color: #76bbef; color: white;">Name</th>
  <th style="border: 1px solid #ddd; padding: 8px; text-align: left; background-color: #76bbef; color: white;">Role</th>
  <th style="border: 1px solid #ddd; padding: 8px; text-align: left; background-color: #76bbef; color: white;">New Company</th>
  <th style="border: 1px solid #ddd; padding: 8px; text-align: left; background-color: #76bbef; color: white;">Time in Role</th> 
  <th style="border: 1px solid #ddd; padding: 8px; text-align: left; background-color: #76bbef; color: white;">Past Company</th> 
  <th style="border: 1px solid #ddd; padding: 8px; text-align: left; background-color: #76bbef; color: white;">Profile Link</th>
</tr>
"""]

for row_counter, candidate in enumerate(candidates):
    # Alternate row colours.
    bg_color = "#f2f2f2" if row_counter % 2 else "#ffffff"

    # A candidate may have fewer than two experiences; missing ones render as
    # empty cells instead of aborting the whole email.
    experiences = candidate.experiences or []
    current = experiences[0] if len(experiences) > 0 else None
    previous = experiences[1] if len(experiences) > 1 else None

    # All scraped text is escaped before it is placed into the markup.
    html_parts.append("""
    <tr style="background-color: {bg_color};">
      <td>{}</td>
      <td>{}</td>
      <td>{}</td>
      <td>{}</td>
      <td>{}</td>
      <td><a href="{}">Linkedin</a></td>
    </tr>
    """.format(
        _cell(candidate.profile_name),
        _cell(current.position_title if current is not None else None),
        _cell(_company(current)),
        _cell(current.duration if current is not None else None),
        _cell(_company(previous)),
        _cell(candidate.profile_link),
        bg_color=bg_color,
    ))

html_parts.append("""
</table>
.......

<p>Best,</p>
<p>Sourcing Bot</p>
</body>
</html>
""")

html = "".join(html_parts)

# print(html)
# save html to file (explicit encoding so non-ASCII names are written reliably)
with open("sourcing_updates.html", "w", encoding="utf-8") as f:
    f.write(html)