In [None]:
from selenium import webdriver  # For browser automation
from selenium.webdriver.common.by import By  # To specify HTML element locating strategies
from selenium.webdriver.support.ui import WebDriverWait  # For waiting until certain conditions are met
from selenium.webdriver.support import expected_conditions as EC  # To specify conditions for WebDriverWait
import time  # For sleep and timestamp functionality
import pandas as pd  # For handling Excel files and dataframes
import random as rd  # For generating random delays
from time import sleep  # Alternative for time.sleep
from datetime import datetime  # For timestamp handling

# ---------------------------------------------------------------------------
# Step 1: look up each company on the YC directory and record the URL of the
# first search result.  Results are written to `output_file`.
# ---------------------------------------------------------------------------

# Input file containing the list of company names
input_file = "company_name_list.xlsx"
# Base URL of the Y Combinator companies page
base_url = "https://www.ycombinator.com/companies"
# Workbook that receives one row per company: "Company Name", "Link"
output_file = "ycombinator_founder_links.xlsx"

# XPath of the search input.
# NOTE(review): the original locator pinned the full mount-point id, including
# a UUID suffix that looks generated. Matching on the id prefix still matches
# that exact id, and keeps working if the suffix changes -- confirm on the
# live page.
search_box_xpath = (
    "//*[starts-with(@id, 'ycdc_new/pages/Companies/IndexPage-react-component-')]"
    "/div[2]/section[2]/div/div[2]/div[2]/div[1]/input"
)

# XPath of the first search result's <a> element.  The previous locator went
# down to a <span> inside this anchor and then read its "href"; a span has no
# href, so every company ended up as "No link found".  The link is an
# attribute of the anchor itself.
first_result_xpath = "/html/body/div[1]/div[2]/section[2]/div/div[2]/div[2]/div[4]/a[1]"

# Read the company names; blank spreadsheet rows come back as NaN and would
# otherwise be typed into the search box as the text "nan".
company_df = pd.read_excel(input_file)
company_names = [
    name
    for name in company_df["Company Name"].dropna().astype(str).str.strip()
    if name
]

# List to store the company names and their corresponding links
company_links = []

# Initialize the Selenium WebDriver for Chrome browser
driver = webdriver.Chrome()
try:
    driver.maximize_window()  # Maximize the browser window for better visibility

    for company in company_names:
        print(f"Searching for: {company}")

        try:
            # Navigation lives inside the try so that one failed page load is
            # recorded as an error row instead of aborting the whole run.
            driver.get(base_url)

            # Wait (up to 10 s) for the search box to be present in the DOM
            search_box = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, search_box_xpath))
            )

            # Clear the search box, type the company name, and submit the search
            search_box.clear()
            search_box.send_keys(company)
            search_box.submit()

            # Wait (up to 10 s) for the first result anchor to be present
            first_result = WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, first_result_xpath))
            )

            link = first_result.get_attribute("href")
            company_links.append({
                "Company Name": company,
                "Link": link if link else "No link found"
            })
        except Exception as e:
            # Best effort: log the failure and keep a placeholder row so the
            # output still has one row per searched company.
            print(f"Error searching for {company}: {e}")
            company_links.append({"Company Name": company, "Link": "Error or no result"})

        # Random 2-5 s pause between requests
        time.sleep(rd.randint(2, 5))
finally:
    # Always close the browser, and always persist whatever was collected --
    # even if the loop was interrupted part-way through.
    try:
        driver.quit()
    finally:
        # Explicit columns keep the header row present when no rows were collected.
        pd.DataFrame(company_links, columns=["Company Name", "Link"]).to_excel(
            output_file, index=False
        )

In [None]:
# ---------------------------------------------------------------------------
# Step 2: visit every company URL collected earlier and scrape the company
# name plus up to five people (name, Twitter/X link, LinkedIn link).
# ---------------------------------------------------------------------------

# Input file containing company URLs
company_url_directory = "ycombinator_founder_links.xlsx"
# Output workbook with one row per scraped company page
profile_output_file = "ycombinator_founder_linkedin_complete.xlsx"

# XPath templates.  `{n}` is the 1-based position of the person's card.
# People 1-3 sit under one container, people 4-5 under another.
founder_card_xpath = "/html/body/div[1]/div[2]/div/section/div[2]/div[{n}]/div[2]/div/div"
extra_card_xpath = "/html/body/div[1]/div[2]/section/div[2]/div[2]/div[3]/div[{n}]/div/div"

# Person 1's name was located through the React mount-point id rather than an
# absolute path.
# NOTE(review): the original pinned the full id including a UUID suffix that
# looks generated; matching on the id prefix still matches that exact id and
# keeps working if the suffix differs between pages -- confirm on a live page.
p1_name_xpath = (
    "//*[starts-with(@id, 'ycdc_new/pages/Companies/ShowPage-react-component-')]"
    "/div[2]/div/section/div[2]/div[1]/div[2]/div/div/div[1]"
)


def _text_at(driver, xpath):
    """Return the text of the first element matching `xpath`, or "" on any lookup failure."""
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:  # not a bare except: Ctrl-C must still stop the run
        return ""


def _href_at(driver, xpath):
    """Return the href of the first element matching `xpath`, or "" if missing or not found."""
    try:
        return driver.find_element(By.XPATH, xpath).get_attribute("href") or ""
    except Exception:  # not a bare except: Ctrl-C must still stop the run
        return ""


def _scrape_profile(driver):
    """Collect the company name and up to five people from the page currently loaded in `driver`.

    Returns a dict with the keys Company_name, P{1..3}_name/_twitter/_Linkedin
    and P{4,5}_name/_Linkedin, in that order.  Missing fields are "".
    """
    record = {"Company_name": _text_at(driver, "//h1")}

    # People 1-3: name in div[1]; links in div[2], read positionally
    # (a[1] is stored as the Twitter/X link, a[2] as the LinkedIn link).
    for person in (1, 2, 3):
        card = founder_card_xpath.format(n=person)
        name_xpath = p1_name_xpath if person == 1 else f"{card}/div[1]"
        record[f"P{person}_name"] = _text_at(driver, name_xpath)
        record[f"P{person}_twitter"] = _href_at(driver, f"{card}/div[2]/a[1]")
        record[f"P{person}_Linkedin"] = _href_at(driver, f"{card}/div[2]/a[2]")

    # People 4-5: cards 1 and 2 of the second container; only a LinkedIn link is read.
    for person, slot in ((4, 1), (5, 2)):
        card = extra_card_xpath.format(n=slot)
        record[f"P{person}_name"] = _text_at(driver, f"{card}/div[1]")
        record[f"P{person}_Linkedin"] = _href_at(driver, f"{card}/div[3]/a")

    return record


# List to store the profile data for all companies
all_profile_data = []

links_df = pd.read_excel(company_url_directory)  # Read the Excel file into a DataFrame

# The link-collection step writes the URL column as "Link"; reading "Links"
# raised KeyError.  "Links" is still accepted for workbooks that use that header.
link_column = "Link" if "Link" in links_df.columns else "Links"

# Initialize the Selenium WebDriver for Chrome
driver = webdriver.Chrome()
try:
    driver.maximize_window()  # Maximize the browser window

    for unique_url in links_df[link_column]:
        # The link column can hold placeholders ("No link found",
        # "Error or no result") or blanks; those are not navigable.
        if not (isinstance(unique_url, str) and unique_url.startswith("http")):
            print(f"Skipping non-URL entry: {unique_url!r}")
            continue

        try:
            driver.get(unique_url)  # Navigate to the company's page
        except Exception as e:
            # One unreachable page should not discard everything scraped so far.
            print(f"Could not open {unique_url}: {e}")
            continue

        time.sleep(5)  # Fixed pause to let the page finish rendering

        profile_links = _scrape_profile(driver)
        all_profile_data.append(profile_links)
        print(f"Scrape done for {len(all_profile_data)} companies")  # Print progress
finally:
    # Always close the browser, and always save whatever was scraped -- even
    # if the loop was interrupted part-way through.
    try:
        driver.quit()
    finally:
        df = pd.DataFrame(all_profile_data)
        df.to_excel(profile_output_file, index=False)  # Save data without an index