## Libraries

In [2]:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd

## Initialization

In [3]:
# Download the MSP 500 list page and parse it with the built-in HTML parser.
# The context manager closes the HTTP response once the parser has read it.
list_url = "https://www.crn.com/news/managed-services/the-2020-managed-service-provider-500"
with urlopen(list_url) as response:
    list_soup = BeautifulSoup(response, 'html.parser')

In [4]:
# Start a headless Chrome session used to render each provider's detail page.
# NOTE(review): the chromedriver path below is machine-specific — confirm it
# before running this notebook on another machine.
chromedriver_path = r"\Users\collinsm4\Desktop\chromedriver_win32\chromedriver.exe"

chrome_options = Options()
chrome_options.add_argument("--headless")

browser = webdriver.Chrome(
    executable_path=chromedriver_path,
    options=chrome_options,
)

In [5]:
# One accumulator list per scraped field; every name is bound to its own
# independent empty list.
(
    providers,
    sources,
    ids,
    executives,
    websites,
    locations,
    appearances,
    awards,
) = ([] for _ in range(8))

## Functions

In [10]:
# Define function to open link to company data table and find all available data points
def get_data(url):
    """Render a provider's detail page and return its answer fields as text.

    Loads ``url`` in the module-level ``browser``, parses the rendered HTML,
    and collects the text of every ``div.ans`` element found inside the
    ``div#databaseResults`` container, in document order.

    Args:
        url: Address of the provider detail page to open.

    Returns:
        list[str]: Text of each ``div.ans`` element, or an empty list when
        the page has no ``databaseResults`` container.
    """
    browser.get(url)

    # Name the parser explicitly (the same one used for the list page) so the
    # parse does not depend on which optional parsers happen to be installed.
    table_soup = BeautifulSoup(browser.page_source, 'html.parser')

    table = table_soup.find('div', attrs={'id': 'databaseResults'})
    if table is None:
        # The page rendered without the results container: nothing to extract.
        return []

    return [entry.get_text() for entry in table.find_all('div', attrs={'class': 'ans'})]

In [7]:
# Define function to append data to corresponding list
def store_data(name, source, id, data):
    """Append one provider's record to the module-level column lists.

    Every call appends exactly one value to each of the eight lists, so the
    columns always stay the same length.

    Args:
        name: Provider name, appended to ``providers``.
        source: URL the record came from, appended to ``sources``.
        id: Provider identifier, appended to ``ids``.
        data: Sequence of scraped fields. Positions 0-4 are appended to
            ``executives``, ``websites``, ``locations``, ``appearances`` and
            ``awards`` respectively. Missing positions are stored as ``None``
            and positions past the fifth are ignored.
    """
    field_count = 5

    # Normalise to exactly five fields *before* touching any list, so a short
    # record cannot leave some columns longer than others.
    fields = list(data or [])[:field_count]
    fields += [None] * (field_count - len(fields))
    executive, website, location, appearance, award = fields

    providers.append(name)
    sources.append(source)
    ids.append(id)
    executives.append(executive)
    websites.append(website)
    locations.append(location)
    appearances.append(appearance)
    awards.append(award)

## Main

In [11]:
# Iterate over all providers listed, gather available data, and save to corresponding list
try:
    for provider in list_soup.find_all('div', attrs={'class': 'data1'}):
        link = provider.find('a')
        url = link.get('href') if link is not None else None
        if not url:
            # This entry has no detail-page link, so there is nothing to fetch.
            continue

        company = provider.get_text()
        # The provider id is whatever remains once the detail-page URL prefix
        # is removed. Named company_id so the builtin id() is not shadowed.
        company_id = url.replace('https://www.crn.com/rankings-and-lists/msp2020-details.htm?c=', '')

        company_data = get_data(url)

        store_data(company, url, company_id, company_data)
finally:
    # quit() ends the whole WebDriver session (close() only closes the current
    # window), and running it in finally releases the browser even when
    # scraping fails part-way through.
    browser.quit()

## Data Export

In [12]:
# Assemble the collected lists into a single table; the order of the keys
# below is the column order of the resulting DataFrame.
table_columns = {
    'id': ids,
    'provider': providers,
    'location': locations,
    'executive': executives,
    'website': websites,
    'msp500_award': awards,
    'msp500_appearances': appearances,
    'source': sources,
}
data_table = pd.DataFrame(table_columns)

In [13]:
# Write the table to a CSV file, leaving out the DataFrame index column
output_path = 'msp_2020_test2.csv'
data_table.to_csv(output_path, index=False)