In [None]:
import pandas as pd
import requests
import csv
import time
from bs4 import BeautifulSoup
from selenium import webdriver
import pickle

In [None]:
# Extract urls from website
def get_company_links():
    """Collect the relative profile links of the startups listed on startbase.com.

    Opens a Firefox browser through Selenium and walks the paginated listing
    (page 1, 2, ...), gathering every ``href`` that starts with
    ``/organization/``. Crawling stops at the first page that yields no such
    link, or at the first page that raises an error while being loaded or
    parsed.

    Returns:
        list of str: the relative links collected so far (for example
        ``'/organization/...'``). The list may be partial, or empty, when an
        error interrupted the crawl; in that case a message is printed.
    """
    # Imported locally so this cell does not depend on an edit to the import cell.
    from selenium.webdriver.common.by import By

    template = 'https://www.startbase.com/startups/?listOptions%5Bcompany-startup%5D=%7B%22version%22%3A1.3%2C%22sort%22%3A%22company.startbase_score%22%2C%22sortDirection%22%3A%22desc%22%2C%22display%22%3A%22small%22%2C%22itemsPerPage%22%3A4567%2C%22page%22%3A{}%2C%22userLocation%22%3Anull%2C%22filters%22%3A%7B%7D%7D'
    driver = webdriver.Firefox() # open browser

    links = []
    count = 1

    try:
        while True: # go through all urls
            try:
                driver.get(template.format(count)) # go to url
                time.sleep(10) # wait for page to load
                # find_element(By..., ...) replaces find_element_by_class_name,
                # which newer Selenium releases no longer provide.
                element = driver.find_element(By.CLASS_NAME, 'main-body') # get where links are
                html = element.get_attribute('innerHTML') # obtain updated html with links

                soup = BeautifulSoup(html, 'html.parser')
                page_links = []
                for a in soup.find_all('a'):
                    link = a.get('href')
                    if link is not None and link.startswith('/organization/'):
                        page_links.append(link)
            except Exception as error:
                # Best effort: keep what was collected, but say why we stopped
                # instead of silently returning a short (or empty) list.
                print('Stopped at listing page {}: {!r}'.format(count, error))
                break

            if not page_links: # when there are no more startups in the url
                break

            links.extend(page_links)
            count += 1
    finally:
        # Always close the browser, including on Ctrl-C.
        driver.quit()

    return links

In [None]:
# Extract the info from a company url
def get_company_info(url, timeout=30):
    """Download one startup profile page and extract its fields.

    Args:
        url (str): absolute URL of the company page.
        timeout (float): seconds to wait for the server before giving up, so
            a single stalled connection cannot hang the whole crawl.

    Returns:
        dict: maps 'Name', 'Logo' and 'Teaser', then one key per ``<li>`` of
        the info list (the text before the first ':'), then 'About' and
        'Members'. Info values are cleaned strings ('' when the entry cannot
        be split into label and value), except the 10th entry, which is the
        list of social-media hrefs. 'Members' maps member name to label.

    Raises:
        AttributeError: when the title card or the info list is missing from
            the page (``soup.find`` returned ``None``).
        requests.exceptions.RequestException: when the request fails or
            times out.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Name, logo and teaser
    title_card = soup.find('div', 'flex-grow-1 mx-lg-4')
    startup_name = title_card.h1.get('data-name')
    logo = title_card.h1.get('data-icon')
    teaser = title_card.find('div', 'sb-teaser').text.strip()

    # Body info
    body_card = soup.find('ul', 'unformated row justify-content-start mb-0')
    info_cards = body_card.find_all('li')
    startup_info = []
    labels_info = []

    for count, info in enumerate(info_cards):
        raw_text = info.text.strip()
        labels_info.append(raw_text.split(':')[0])

        if count == 9: # 10th item is links to social media
            info_piece = [media.get('href') for media in info.find_all('a')]
        else:
            try:
                # The value follows either a newline+space or the ': ' separator
                if '\n ' in raw_text:
                    info_piece = raw_text.split('\n ')[1]
                else:
                    info_piece = raw_text.split(': ')[1]

                # Drop a single leading space left over from the split
                if info_piece[0] == ' ':
                    info_piece = info_piece[1:]
            except IndexError:
                # No separator found, or nothing after it: treat as empty
                info_piece = ''

        startup_info.append(info_piece)

    # About info
    try:
        about_company = soup.find('div', {'class':'sb-card mb-4'}, id='company').p.text
    except AttributeError:
        # The section or its first paragraph is absent on this page
        about_company = ''

    # Team info: names and labels are paired by position; a repeated name
    # keeps only its last label because the result is a dict
    names = [name.text for name in soup.find_all('span', {'class':'name'})]
    labels = [label.text for label in soup.find_all('span', {'class':'label'})]
    team_members = dict(zip(names, labels))

    # Join extracted data
    info_data = [startup_name, logo, teaser] + startup_info + [about_company, team_members]
    info_labels = ['Name', 'Logo', 'Teaser'] + labels_info + ['About', 'Members']

    return dict(zip(info_labels, info_data))

In [None]:
# Crawl the listing for every startup link, then persist the list to disk
links = get_company_links()
with open('links.pickle', 'wb') as f:
    f.write(pickle.dumps(links))

In [None]:
# Get all data from links
data = []
failed_links = [] # links whose page could not be downloaded or parsed

for i, link in enumerate(links):
    try:
        data.append(get_company_info('https://www.startbase.com' + link))
    except Exception as error:
        # One broken page must not abort the whole crawl before the
        # results are saved; remember it so it can be retried or inspected.
        failed_links.append(link)
        print('Skipping {}: {!r}'.format(link, error))
    if i % 50 == 0:
        print(i) # progress marker every 50 links

if failed_links:
    print('{} of {} links failed'.format(len(failed_links), len(links)))

dataframe = pd.DataFrame(data)
with open('startbasedata.pickle', 'wb') as f:
    pickle.dump(dataframe, f)

In [None]:
# Export the scraped table as a CSV file, leaving out the index column
dataframe.to_csv(path_or_buf='startbase.csv', index=False)