## Imports

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import time

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By

import os

from fuzzywuzzy import fuzz, process

import warnings
warnings.filterwarnings('ignore')

## Read in the data

In [2]:
# Load the Fortune 100 companies (with their newsroom links) from the local CSV.

cos = pd.read_csv(filepath_or_buffer='./fortune_100_data_w_links.csv')

In [5]:
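# save the data (left disabled on purpose: this writes to the same CSV that is
# read above, so uncommenting it overwrites the source file with the current `cos`)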

# cos.to_csv('./fortune_100_data_w_links.csv', index = False)

### Limit to just the top five Fortune 100 companies

Given this project is a proof of concept, I am limiting the DataFrame to just the top five Fortune 100 companies. 

In [3]:
# Keep only the top five companies by Fortune rank.
# `.copy()` makes `cos` an independent frame rather than a slice of the full
# table, so adding columns to it later is well defined (the resulting
# SettingWithCopyWarning would otherwise be hidden by the global warnings filter).
# `.reset_index(drop=True)` guarantees the labels are 0..n-1, so label-based
# lookups such as `cos.loc[0, 'company']` line up with row position.
cos = cos[cos['rank'] <= 5].copy().reset_index(drop=True)
cos

Unnamed: 0,company,rank,fortune_link,co_website,newsroom_link,pressroom_link,corporate_link,final
0,Walmart,1,https://fortune.com/company/walmart/fortune500/,https://corporate.walmart.com,https://corporate.walmart.com/newsroom/2021/03...,https://www.diabetes.org/newsroom/press-releas...,https://corporate.walmart.com/#,https://corporate.walmart.com/newsroom/company...
1,Amazon,2,https://fortune.com/company/amazon-com/fortune...,https://www.amazon.com/,https://www.amazon.com/gp/customer-preferences...,https://www.amazon.com/Starbucks-Espresso-Roas...,https://www.amazon.com/Protector-Addtam-Extens...,https://press.aboutamazon.com/press-releases
2,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...,https://www.exxonmobil.com/,https://corporate.exxonmobil.com/About-us/Busi...,https://corporate.exxonmobil.com/News/Newsroom...,https://corporate.exxonmobil.com/#main-content,https://corporate.exxonmobil.com/News/Newsroom...
3,Apple,4,https://fortune.com/company/apple/fortune500/,https://www.apple.com/,https://www.apple.com/apple-news/,https://www.apple.com/us/shop/goto/temporary_c...,https://www.apple.com/us/shop/goto/trade_in,https://www.apple.com/newsroom/archive/
4,CVS Health,5,https://fortune.com/company/cvs-health/fortune...,https://www.cvshealth.com/,https://www.cvshealth.com/news-and-insights/to...,https://www.cvshealth.com/news-and-insights/pr...,https://www.cvshealth.com/social-responsibilit...,https://www.cvshealth.com/news-and-insights/pr...


## Create the `loop_url`, `type` and `page_type` columns 

Following a review of each of the final newsroom links, the three columns below are added; each of them is used in the code further below.

* **loop_url**: The base URL of the newsroom listing; the code appends a page number or a year to it on each loop iteration.
* **type**: The category of iteration used - for the top five companies, the types are `pages` and `years`
* **page_type**: This is used in a function created below to return the right ending as the code loops through the values. 

In [4]:
# Per-company scraping settings, keyed by the value in the `company` column.
#   loop_url  - URL prefix that the page number / year is appended to
#   type      - how the newsroom is iterated: 'pages' or 'years'
#   page_type - selector passed to the "ending" helper functions
loop_dict = {
    'Walmart': dict(
        loop_url='https://corporate.walmart.com/newsroom/company-news?p=',
        type='pages',
        page_type='page',
    ),
    'Amazon': dict(
        loop_url=(
            'https://press.aboutamazon.com/press-releases'
            '?a9d908dd_year%5Bvalue%5D='
        ),
        type='years',
        page_type='year',
    ),
    'Exxon Mobil': dict(
        loop_url=(
            'https://corporate.exxonmobil.com/api/v2/related/collection'
            '?itemid=3f311ddd-4cb5-489b-a768-325e94ee0ef1'
            '&contextid=3f311ddd-4cb5-489b-a768-325e94ee0ef1'
            '&language=en&pagesize=5&page='
        ),
        type='pages',
        page_type='page',
    ),
    'Apple': dict(
        loop_url='https://www.apple.com/newsroom/archive/?page=',
        type='pages',
        page_type='page',
    ),
    'CVS Health': dict(
        loop_url=(
            'https://www.cvshealth.com/news-and-insights/press-releases?page='
        ),
        type='pages',
        page_type='page',
    ),
}

In [5]:
# Add the `loop_url`, `type` and `page_type` columns by looking each company up
# in `loop_dict`.
#
# The lookup is keyed on the `company` value rather than on row position, so it
# does not depend on the index being exactly 0..n-1. `assign` returns a new
# frame, so nothing is written into a filtered slice and re-running the cell
# gives the same result. A company missing from `loop_dict` raises KeyError.
cos = cos.assign(**{
    column: cos['company'].map(
        lambda name, column=column: loop_dict[name][column])
    for column in ('loop_url', 'type', 'page_type')
})
cos

Unnamed: 0,company,rank,fortune_link,co_website,newsroom_link,pressroom_link,corporate_link,final,loop_url,type,page_type
0,Walmart,1,https://fortune.com/company/walmart/fortune500/,https://corporate.walmart.com,https://corporate.walmart.com/newsroom/2021/03...,https://www.diabetes.org/newsroom/press-releas...,https://corporate.walmart.com/#,https://corporate.walmart.com/newsroom/company...,https://corporate.walmart.com/newsroom/company...,pages,page
1,Amazon,2,https://fortune.com/company/amazon-com/fortune...,https://www.amazon.com/,https://www.amazon.com/gp/customer-preferences...,https://www.amazon.com/Starbucks-Espresso-Roas...,https://www.amazon.com/Protector-Addtam-Extens...,https://press.aboutamazon.com/press-releases,https://press.aboutamazon.com/press-releases?a...,years,year
2,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...,https://www.exxonmobil.com/,https://corporate.exxonmobil.com/About-us/Busi...,https://corporate.exxonmobil.com/News/Newsroom...,https://corporate.exxonmobil.com/#main-content,https://corporate.exxonmobil.com/News/Newsroom...,https://corporate.exxonmobil.com/api/v2/relate...,pages,page
3,Apple,4,https://fortune.com/company/apple/fortune500/,https://www.apple.com/,https://www.apple.com/apple-news/,https://www.apple.com/us/shop/goto/temporary_c...,https://www.apple.com/us/shop/goto/trade_in,https://www.apple.com/newsroom/archive/,https://www.apple.com/newsroom/archive/?page=,pages,page
4,CVS Health,5,https://fortune.com/company/cvs-health/fortune...,https://www.cvshealth.com/,https://www.cvshealth.com/news-and-insights/to...,https://www.cvshealth.com/news-and-insights/pr...,https://www.cvshealth.com/social-responsibilit...,https://www.cvshealth.com/news-and-insights/pr...,https://www.cvshealth.com/news-and-insights/pr...,pages,page


## Get html for companies with `type` == `pages`

This section gathers HTML from each of the newsrooms identified with a `type` of `pages` and saves the information in the HTML folder.

In [6]:
# Companies whose newsroom is walked page by page, re-labelled 0..n-1.
pages = (
    cos
    .loc[cos['type'].eq('pages')]
    .reset_index(drop=True)
)
pages

Unnamed: 0,company,rank,fortune_link,co_website,newsroom_link,pressroom_link,corporate_link,final,loop_url,type,page_type
0,Walmart,1,https://fortune.com/company/walmart/fortune500/,https://corporate.walmart.com,https://corporate.walmart.com/newsroom/2021/03...,https://www.diabetes.org/newsroom/press-releas...,https://corporate.walmart.com/#,https://corporate.walmart.com/newsroom/company...,https://corporate.walmart.com/newsroom/company...,pages,page
1,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...,https://www.exxonmobil.com/,https://corporate.exxonmobil.com/About-us/Busi...,https://corporate.exxonmobil.com/News/Newsroom...,https://corporate.exxonmobil.com/#main-content,https://corporate.exxonmobil.com/News/Newsroom...,https://corporate.exxonmobil.com/api/v2/relate...,pages,page
2,Apple,4,https://fortune.com/company/apple/fortune500/,https://www.apple.com/,https://www.apple.com/apple-news/,https://www.apple.com/us/shop/goto/temporary_c...,https://www.apple.com/us/shop/goto/trade_in,https://www.apple.com/newsroom/archive/,https://www.apple.com/newsroom/archive/?page=,pages,page
3,CVS Health,5,https://fortune.com/company/cvs-health/fortune...,https://www.cvshealth.com/,https://www.cvshealth.com/news-and-insights/to...,https://www.cvshealth.com/news-and-insights/pr...,https://www.cvshealth.com/social-responsibilit...,https://www.cvshealth.com/news-and-insights/pr...,https://www.cvshealth.com/news-and-insights/pr...,pages,page


In [7]:
# create a function that will return the appropriate page ending 

def get_page_ending(i, page_type):
    """Return the text to append to a company's `loop_url` for iteration `i`.

    Parameters
    ----------
    i : int
        The page number for the current iteration.
    page_type : str
        The company's `page_type` value. Only 'page' is supported.

    Returns
    -------
    str
        `i` converted to a string.

    Raises
    ------
    ValueError
        If `page_type` is not 'page'. Previously the function fell through and
        returned None, which only failed later when the ending was concatenated
        to the URL.
    """
    if page_type == 'page':
        return str(i)

    raise ValueError(f"Unsupported page_type: {page_type!r}")

In [8]:
# set up the webdriver (headless Chrome)
options = webdriver.ChromeOptions()
options.page_load_strategy = 'normal'
options.add_argument('headless')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
browser = webdriver.Chrome(options=options)

# everything that uses the browser sits inside try/finally so the Chrome
# process is always shut down, even if the run fails or is interrupted
try:
    # The user agent is built with implicit string concatenation: a backslash
    # continuation inside the quotes would embed the next line's indentation
    # in the header value.
    browser.execute_cdp_cmd(
        'Network.setUserAgentOverride', {
            "userAgent":
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'
        })

    # make sure the output folder exists before spending time scraping
    os.makedirs('./html', exist_ok=True)

    # loop through each row in the `pages` dataframe
    for row in range(len(pages)):

        # values that stay the same for every page of this company
        company = pages.loc[row, 'company']
        page_type = pages.loc[row, 'page_type']
        url = pages.loc[row, 'loop_url']

        # one dictionary per successfully fetched page
        htmls = []

        for i in range(50):
            if i % 5 == 0:
                print(f"Company: {company}: {row} | Iteration: {i} | Page type: {page_type}")

            try:
                # build the url for this page, calling on the previously
                # created function for the ending
                page_url = url + get_page_ending(i, page_type)

                # load the page and give it time to render
                browser.get(page_url)
                time.sleep(5)

                browser.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")

                # keep the html plus some context in case it is needed later
                htmls.append({
                    'company': company,
                    'base_url': pages.loc[row, 'final'],
                    'url': page_url,
                    'page_num': i,
                    'html': browser.page_source,
                })
                time.sleep(3)

            # `Exception` (not a bare except) so that interrupting the kernel
            # still stops this long-running loop
            except Exception as err:
                print()
                print(f"Company: {company} | Web page: {i} | Page type: {page_type} |  Status: Error")
                print(f"    {type(err).__name__}: {err}")

        # create a dataframe and save locally
        html_df = pd.DataFrame(htmls)
        html_df.to_csv(f'./html/{company.replace(" ","_").lower()}_html.csv', index=False)

finally:
    browser.quit()

# executed in 32m 38s

Company: Walmart: 0 | Iteration: 0 | Page type: page
Company: Walmart: 0 | Iteration: 5 | Page type: page
Company: Walmart: 0 | Iteration: 10 | Page type: page
Company: Walmart: 0 | Iteration: 15 | Page type: page
Company: Walmart: 0 | Iteration: 20 | Page type: page
Company: Walmart: 0 | Iteration: 25 | Page type: page
Company: Walmart: 0 | Iteration: 30 | Page type: page
Company: Walmart: 0 | Iteration: 35 | Page type: page
Company: Walmart: 0 | Iteration: 40 | Page type: page
Company: Walmart: 0 | Iteration: 45 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 0 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 5 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 10 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 15 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 20 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 25 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 30 | Page type: page
Company: Exxon Mobil: 1 | Iteration: 35 | Page type: page


## Getting html for companies with `type` == `years`

This section gathers HTML from each of the newsrooms identified with a `type` of `years` and saves the information in the HTML folder.

In [9]:
# Companies whose newsroom is walked year by year, re-labelled 0..n-1.
years = (
    cos
    .loc[cos['type'].eq('years')]
    .reset_index(drop=True)
)
years

Unnamed: 0,company,rank,fortune_link,co_website,newsroom_link,pressroom_link,corporate_link,final,loop_url,type,page_type
0,Amazon,2,https://fortune.com/company/amazon-com/fortune...,https://www.amazon.com/,https://www.amazon.com/gp/customer-preferences...,https://www.amazon.com/Starbucks-Espresso-Roas...,https://www.amazon.com/Protector-Addtam-Extens...,https://press.aboutamazon.com/press-releases,https://press.aboutamazon.com/press-releases?a...,years,year


In [10]:
def get_year_ending(i, page_type):
    """Return the text to append to a company's `loop_url` for year `i`.

    Parameters
    ----------
    i : int
        The year for the current iteration.
    page_type : str
        The company's `page_type` value. Only 'year' is supported.

    Returns
    -------
    str
        `i` converted to a string.

    Raises
    ------
    ValueError
        If `page_type` is not 'year'. Previously the function fell through and
        returned None, which only failed later when the ending was concatenated
        to the URL.
    """
    if page_type == 'year':
        return str(i)

    raise ValueError(f"Unsupported page_type: {page_type!r}")

In [11]:
# set up the webdriver (headless Chrome)
options = webdriver.ChromeOptions()
options.page_load_strategy = 'normal'
options.add_argument('headless')
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)
browser = webdriver.Chrome(options=options)

# everything that uses the browser sits inside try/finally so the Chrome
# process is always shut down, even if the run fails or is interrupted
try:
    # The user agent is built with implicit string concatenation: a backslash
    # continuation inside the quotes would embed the next line's indentation
    # in the header value.
    browser.execute_cdp_cmd(
        'Network.setUserAgentOverride', {
            "userAgent":
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
            '(KHTML, like Gecko) Chrome/83.0.4103.53 Safari/537.36'
        })

    # make sure the output folder exists before spending time scraping
    os.makedirs('./html', exist_ok=True)

    # loop through each row in the `years` dataframe
    for row in range(len(years)):

        # values that stay the same for every year of this company
        company = years.loc[row, 'company']
        page_type = years.loc[row, 'page_type']
        url = years.loc[row, 'loop_url']

        # one dictionary per successfully fetched year
        htmls = []

        for i in range(2019,2022):
            print(f"Company: {company}: {row} | Year: {i} | Page type: {page_type}")

            try:
                # build the url for this year, calling on the previously
                # created function for the ending
                page_url = url + get_year_ending(i, page_type)

                # load the page and give it time to render
                browser.get(page_url)
                time.sleep(5)

                browser.execute_script("window.scrollTo(0, document.body.scrollHeight/2);")

                # keep the html plus some context in case it is needed later
                htmls.append({
                    'company': company,
                    'base_url': years.loc[row, 'final'],
                    'url': page_url,
                    'page_num': i,
                    'html': browser.page_source,
                })
                time.sleep(3)

            # `Exception` (not a bare except) so that interrupting the kernel
            # still stops the loop
            except Exception as err:
                print()
                print(f"Company: {company} | Web page: {i} | Page type: {page_type} |  Status: Error")
                print(f"    {type(err).__name__}: {err}")

        # create a dataframe and save locally
        html_df = pd.DataFrame(htmls)
        html_df.to_csv(f'./html/{company.replace(" ","_").lower()}_html.csv', index=False)

finally:
    browser.quit()

# executed in 34.8s

Company: Amazon: 0 | Year: 2019 | Page type: year
Company: Amazon: 0 | Year: 2020 | Page type: year
Company: Amazon: 0 | Year: 2021 | Page type: year
