## Imports

In [1]:
import pandas as pd
import numpy as np
import requests
import time

from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By

import os

from fuzzywuzzy import fuzz, process

import warnings
warnings.filterwarnings('ignore')

## Gather Fortune 100 list info

The code below uses Selenium to gather information on the top 100 companies (the Fortune 100) from the [2020 Fortune 500 list](https://fortune.com/fortune500/2020/search/).

The code uses the `By.XPATH` option and the `.click()` method to set the page to 100 Rows from the beginning. It then finds the rows of the table that lists all of the companies and gathers each company's name and rank, as well as the link to Fortune's profile page for each company (e.g., [Walmart's profile page](https://fortune.com/company/walmart/fortune500/)). For each row in the table, it appends a dictionary to the `cos` list created earlier in the code, which is then used to create a DataFrame.

In [2]:
# prepare the options for the chrome driver
options = webdriver.ChromeOptions()
options.add_argument('headless')

# start chrome browser
browser = webdriver.Chrome(options=options)

# create list that all of the info can be appended to
cos = []

# always shut the browser down, even if scraping fails part-way through,
# so that headless Chrome processes are not left running in the background
try:
    # load the Fortune list page (a single page request)
    browser.get('https://fortune.com/fortune500/2020/search/')

    # set the page length to 100 rows instead of 10 rows
    options_rows = browser.find_elements(
        By.XPATH, '//select[@aria-label="rows per page"]/option')
    for o in options_rows:
        if o.text == '100 Rows':
            o.click()
            break

    # wait once for the table to re-render with 100 rows; reading the rows
    # below only inspects the already-loaded page, so no per-row pause is needed
    time.sleep(4)

    # gather all of the row data
    elements = browser.find_elements(By.CLASS_NAME, 'rt-tr-group')

    # iterate through the elements variable to get the information that will
    # be appended to the list `cos`
    for e in elements:
        # read the row text once: the first line is the rank, the second the name
        fields = e.text.split('\n')
        links = e.find_elements(By.TAG_NAME, 'a')

        # skip rows that carry no company (e.g. empty filler rows)
        if len(fields) < 2 or not links:
            continue

        # get the company name, rank and link to the fortune profile page
        cos.append({
            'company': fields[1],
            'rank': fields[0],
            'fortune_link': links[0].get_attribute('href'),
        })
finally:
    browser.quit()

# create a dataframe from the final list (columns are fixed so the frame has
# the expected schema even if no rows were found)
cos = pd.DataFrame(cos, columns=['company', 'rank', 'fortune_link'])
cos

Unnamed: 0,company,rank,fortune_link
0,Walmart,1,https://fortune.com/company/walmart/fortune500/
1,Amazon,2,https://fortune.com/company/amazon-com/fortune...
2,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...
3,Apple,4,https://fortune.com/company/apple/fortune500/
4,CVS Health,5,https://fortune.com/company/cvs-health/fortune...
...,...,...,...
95,Northrop Grumman,96,https://fortune.com/company/northrop-grumman/f...
96,Capital One Financial,97,https://fortune.com/company/capital-one-financ...
97,Plains GP Holdings,98,https://fortune.com/company/plains-gp-holdings...
98,AbbVie,99,https://fortune.com/company/abbvie/fortune500/


## Gather company website information from each Fortune profile page

In [3]:
# create an empty-string column that will hold each company's website
# NOTE: re-running this cell resets any websites that were already collected

cos['co_website'] = ''

The code below iterates through each row in the `cos` DataFrame and uses the link collected from the table to visit the Fortune profile page of each Fortune 100 company. The code then collects the link Fortune has listed as the company's website and adds it to the DataFrame.

In [5]:
# prepare the options for the chrome driver
options = webdriver.ChromeOptions()
options.add_argument('headless')

# start chrome browser
browser = webdriver.Chrome(options=options)

# always shut the browser down, even if a page fails to load, so that
# headless Chrome processes are not left running in the background
try:
    # iterate through the links in `cos` to get to the fortune
    # profile page for each of the companies
    for i in range(len(cos)):
        url = cos.loc[i, 'fortune_link']
        browser.get(f'{url}')
        vals = browser.find_elements(By.TAG_NAME, 'a')

        # read each href once; anchors without an href yield None and are dropped
        hrefs = [v.get_attribute('href') for v in vals]
        external = [
            h for h in hrefs
            if h is not None and 'fortune.com' not in h
        ]

        # get the company website from the fortune profile page: the third
        # link that does not point to fortune.com is used
        # NOTE(review): this position depends on the profile page layout - confirm
        if len(external) > 2:
            # write with .loc (not chained indexing) so the DataFrame itself
            # is updated rather than a temporary copy
            cos.loc[i, 'co_website'] = external[2]
        else:
            # leave the row blank and report it instead of aborting the whole run
            cos.loc[i, 'co_website'] = ''
            print(f'no company website found for row {i}: {url}')

        # pause between page loads so as not to hit the site too often
        time.sleep(3)
finally:
    browser.quit()

In [6]:
# display the DataFrame to inspect the collected `co_website` values
cos

Unnamed: 0,company,rank,fortune_link,co_website
0,Walmart,1,https://fortune.com/company/walmart/fortune500/,https://www.stock.walmart.com/
1,Amazon,2,https://fortune.com/company/amazon-com/fortune...,https://www.amazon.com/
2,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...,https://www.exxonmobil.com/
3,Apple,4,https://fortune.com/company/apple/fortune500/,https://www.apple.com/
4,CVS Health,5,https://fortune.com/company/cvs-health/fortune...,https://www.cvshealth.com/
...,...,...,...,...
95,Northrop Grumman,96,https://fortune.com/company/northrop-grumman/f...,https://www.northropgrumman.com/
96,Capital One Financial,97,https://fortune.com/company/capital-one-financ...,https://www.capitalone.com/
97,Plains GP Holdings,98,https://fortune.com/company/plains-gp-holdings...,https://www.plainsallamerican.com/
98,AbbVie,99,https://fortune.com/company/abbvie/fortune500/,https://www.abbvie.com/


The cell below manually corrects some of the websites that were collected incorrectly.

In [7]:
# corrected websites keyed by the row label they belong to
website_fixes = {
    0: 'https://corporate.walmart.com',
    13: 'https://investor.costco.com',
    40: 'http://www.freddiemac.com/investors/',
    70: 'https://corporate.charter.com',
    74: 'https://corporate.bestbuy.com',
}

# overwrite the scraped value one row at a time
for row_label, website in website_fixes.items():
    cos.loc[row_label, 'co_website'] = website

In [8]:
# write the finished table to disk as a csv, leaving out the index column
# NOTE: running this cell overwrites any existing file at this path

cos.to_csv(path_or_buf='./fortune_100_data.csv', index=False)