# Scrape Y-combinator Company's Data
This is a scraper that uses both Selenium and BeautifulSoup.

Selenium: Selenium is used to scroll to the end of the page, handling the dynamic part of the scraping
BeautifulSoup: Since BeautifulSoup is faster than Selenium, it is used to parse the page contents obtained from Selenium after the end of the page is reached. In the first stage, BeautifulSoup is used to get each company's page link on Y-combinator (all 1000 companies found). In the second stage, every link acquired from the Y-combinator page is fetched and loaded into BeautifulSoup to scrape the page contents.

The information needed is each company's name, link, size, summary, location, and the year founded.

In [6]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import time
import re
import os


# Launch Chrome. Prefer the chromedriver binary named by the CHROMEDRIVER_HOME
# environment variable; if that attempt fails for any reason, fall back to a
# driver installed by webdriver_manager.
try:
    path = os.getenv('CHROMEDRIVER_HOME')
    driver = webdriver.Chrome(executable_path=path)
except Exception:
    driver = webdriver.Chrome(ChromeDriverManager().install())

url = 'https://www.ycombinator.com/companies'

driver.get(url)
# Give the initial page time to render before scrolling starts.
time.sleep(10)

# Keep jumping to the bottom of the page until the document height stops
# growing, i.e. two consecutive measurements are equal.
check_page_length = 0
try:
    while True:
        page_len = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
        # Pause so content triggered by the scroll can load before re-measuring.
        time.sleep(1)

        if check_page_length == page_len:
            break

        check_page_length = page_len
except BaseException:
    # Release the browser and re-raise. Swallowing the error here would let
    # the script continue with a dead driver and fail later with a misleading
    # message. BaseException is used so a Ctrl-C also cleans up.
    driver.quit()
    raise



# Parse the fully scrolled page once, then release the browser straight away:
# everything below works on the parsed snapshot, so the driver is not needed
# and should not stay open if a later step fails.
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()

# Container that holds the list of company entries.
get_company_list_block = soup.body.find("div", attrs={"class":"SharedDirectory-module__section___1ljf9 SharedDirectory-module__results___3SG0w"})
if get_company_list_block is None:
    raise RuntimeError("Could not find the company results block; the page markup may have changed.")

# hrefs are concatenated onto this site root to build each company page URL.
a = 'https://www.ycombinator.com'

# Keep only anchors that carry a company-name span, so an unrelated link
# inside the block is skipped instead of raising AttributeError.
y_company_page_urls = [
    a + company['href']
    for company in get_company_list_block.find_all('a', href=True)
    if company.find('span', attrs={"class": "SharedDirectory-module__coName___gbFfW"}) is not None
]

print(len(y_company_page_urls))
# Fetch every company page and pull out its name, link, summary and facts.
companies = []

# A single Session reuses the underlying connection across the many requests
# made to the same host.
with requests.Session() as session:
    for count, url in enumerate(y_company_page_urls, start=1):
        # Progress indicator: 1-based index of the page being fetched.
        print(count)

        # The timeout stops one stalled request from hanging the whole run;
        # raise_for_status surfaces HTTP errors here rather than as a
        # confusing parsing failure further down.
        response = session.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        section = soup.section
        company_name = section.div.h1.text
        summary = section.div.p.text
        link = section.div.a.text

        box = section.find("div", attrs={'class':'highlight-box'})
        facts = box.find("div", attrs={'class':'facts'})

        # Expected order of the facts: founded, team size, location.
        info = [fact.span.text for fact in facts.find_all('div')]

        companies.append({
            "company_name": company_name,
            "link": link,
            "summary": summary,
            "team_size": info[1],
            "founded": info[0],
            "location": info[2],
        })

# Tabulate the scraped records and write them to disk as CSV, leaving out the
# DataFrame's row index.
df = pd.DataFrame(companies)
path = "/home/dit/DiT/GitHub/Pylingo/Jupyters/Mr. Pipe work/ycombunator_data.csv"
df.to_csv(path_or_buf=path, index=False)

1000


Unnamed: 0,company_name,link,summary,team_size,founded,location
0,Dropbox,http://dropbox.com,Dropbox is building the world’s first smart wo...,4000,2008,San Francisco
1,PagerDuty,http://pagerduty.com,PagerDuty is an operations performance platfor...,775,,San Francisco
2,Embark Trucks,http://embarktrucks.com,We are a San Francisco based team building sel...,100,2016,San Francisco
3,GoCardless,http://gocardless.com,GoCardless is on a mission to take the pain ou...,400,2011,"London, United Kingdom"
4,Lattice,https://lattice.com,Lattice is the #1 performance management solut...,160,2015,San Francisco
...,...,...,...,...,...,...
995,Ambient,https://ambient.ai,,11,,Palo Alto
996,Armory,http://armory.io,Enterprises break customer trust in very expen...,69,,"San Mateo, CA"
997,RankScience,https://www.rankscience.com,RankScience improves website SEO through autom...,12,2017,New York
998,XIX.ai,http://getxix.com,"Meet XIX Entry, a biometric identity provider ...",7,,San Francisco


In [None]:
from selenium import webdriver
import time

from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import time
import pandas as pd
import re
import os


# Launch Chrome. Prefer the chromedriver binary named by the CHROMEDRIVER_HOME
# environment variable; if that attempt fails for any reason, fall back to a
# driver installed by webdriver_manager.
try:
    path = os.getenv('CHROMEDRIVER_HOME')
    driver = webdriver.Chrome(executable_path=path)
except Exception:
    driver = webdriver.Chrome(ChromeDriverManager().install())

url = 'https://www.ycombinator.com/companies'

driver.get(url)
# Give the initial page time to render before scrolling starts.
time.sleep(6)

# Keep jumping to the bottom of the page until the document height stops
# growing, i.e. two consecutive measurements are equal.
check_page_length = 0
try:
    while True:
        page_len = driver.execute_script("window.scrollTo(0, document.body.scrollHeight);var lenOfPage=document.body.scrollHeight;return lenOfPage;")
        # Pause so content triggered by the scroll can load before re-measuring.
        time.sleep(1)

        if check_page_length == page_len:
            break
        check_page_length = page_len
except BaseException:
    # Release the browser and re-raise. Swallowing the error here would let
    # the script continue with a dead driver and fail later with a misleading
    # message. BaseException is used so a Ctrl-C also cleans up.
    driver.quit()
    raise

def _text_or_default(element, xpath, default=-1):
    """Return the text of the first node matching *xpath* under *element*.

    Falls back to *default* when no such node exists, so a company entry with
    a missing field still produces a record.
    """
    try:
        return element.find_element_by_xpath(xpath).text
    except NoSuchElementException:
        return default


companies = []

# The try/finally guarantees the browser is released even if reading an
# element fails part-way through the list.
try:
    # get the list of all companies
    company_details = driver.find_elements_by_xpath('//a[@class="SharedDirectory-module__company___AVmr6 no-hovercard"]')

    for company_detail in company_details:
        companies.append({
            "company_name": _text_or_default(
                company_detail,
                './/span[@class="SharedDirectory-module__coName___gbFfW"]',
            ),
            # The location is read from the element following the name span.
            "location": _text_or_default(
                company_detail,
                './/span[@class="SharedDirectory-module__coName___gbFfW"]//following-sibling::*',
            ),
            "detail": _text_or_default(
                company_detail,
                './/span[@class="SharedDirectory-module__coDescription___1LfuP"]',
            ),
        })
finally:
    driver.quit()

In [None]:
# Display the DataFrame; as the cell's last expression, the notebook renders it.
df

# BeautifulSoup Prac

In [9]:
# Preview the first rows of the DataFrame.
df.head()

Unnamed: 0,company_name,link,summary,team_size,founded,location
0,Dropbox,http://dropbox.com,Dropbox is building the world’s first smart wo...,4000,2008.0,San Francisco
1,PagerDuty,http://pagerduty.com,PagerDuty is an operations performance platfor...,775,,San Francisco
2,Embark Trucks,http://embarktrucks.com,We are a San Francisco based team building sel...,100,2016.0,San Francisco
3,GoCardless,http://gocardless.com,GoCardless is on a mission to take the pain ou...,400,2011.0,"London, United Kingdom"
4,Lattice,https://lattice.com,Lattice is the #1 performance management solut...,160,2015.0,San Francisco
