## Imports

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm
import time

import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.by import By

import os

from fuzzywuzzy import fuzz

import warnings
warnings.filterwarnings('ignore')

In [2]:
# load the Fortune 100 company table (company, rank, fortune link, website)
# and display it as the cell output
cos = pd.read_csv(filepath_or_buffer='./fortune_100_data.csv')
cos

Unnamed: 0,company,rank,fortune_link,co_website
0,Walmart,1,https://fortune.com/company/walmart/fortune500/,https://corporate.walmart.com
1,Amazon,2,https://fortune.com/company/amazon-com/fortune...,https://www.amazon.com/
2,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...,https://www.exxonmobil.com/
3,Apple,4,https://fortune.com/company/apple/fortune500/,https://www.apple.com/
4,CVS Health,5,https://fortune.com/company/cvs-health/fortune...,https://www.cvshealth.com/
...,...,...,...,...
95,Northrop Grumman,96,https://fortune.com/company/northrop-grumman/f...,https://www.northropgrumman.com/
96,Capital One Financial,97,https://fortune.com/company/capital-one-financ...,https://www.capitalone.com/
97,Plains GP Holdings,98,https://fortune.com/company/plains-gp-holdings...,https://www.plainsallamerican.com/
98,AbbVie,99,https://fortune.com/company/abbvie/fortune500/,https://www.abbvie.com/


## Finding potential newsrooms from corporate websites

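Add one empty column to `cos` for each kind of link I'm searching for (newsroom, pressroom, and corporate). Each column will hold the best fuzzy-matched link found on the company's website.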

In [3]:
# create the newsroom, press room and corporate link columns, each
# initialised to an empty string for every company
for link_col in ('newsroom_link', 'pressroom_link', 'corporate_link'):
    cos[link_col] = ''


The code below visits each `co_website` link and collects every link found in the page's HTML. It then scores each link with `fuzzywuzzy` for its similarity to 'news', 'press' and 'corporate', to account for differences in how each company may refer to its newsroom page. Only links that contain the base of the company's own URL are considered. Finally, for each of the three terms, it pulls the highest-scoring link into the `cos` DataFrame for that company.

In [4]:
# for any companies that simply don't work, catch them in
# this list to review later
error_cos = []

# (search term, score column, destination column in `cos`) for each kind of
# page being looked for; driving the scoring and the "pick the top link"
# steps from one table keeps the three cases from drifting apart
link_targets = [
    ('news', 'news_ratio', 'newsroom_link'),
    ('press', 'press_ratio', 'pressroom_link'),
    ('corporate', 'corporate_ratio', 'corporate_link'),
]


def _page_state(driver):
    """Return '<current url>, <title>' of the driver's page for log output.

    Returns a placeholder string instead of raising when the driver cannot
    report its state, so that logging a failure never aborts the scrape.
    """
    try:
        return f'{driver.current_url}, {driver.title}'
    except Exception as state_err:
        return f'<browser state unavailable: {state_err!r}>'


# prepare the options for the chrome driver
options = webdriver.ChromeOptions()

# making headless so as not to bombard my screen
options.add_argument('headless')

# getting around website features that stop bots
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option('useAutomationExtension', False)

# start chrome browser
browser = webdriver.Chrome(options=options)

# everything that uses the browser lives inside try/finally so the headless
# chrome process is always shut down, even on an interrupt or a crash
try:
    # getting around website features that stop bots
    # (adjacent string literals are joined by Python; a backslash
    # continuation inside the quotes would embed the indentation spaces
    # in the user agent)
    browser.execute_cdp_cmd(
        'Network.setUserAgentOverride', {
            "userAgent":
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/83.0.4103.53 Safari/537.36'
        })

    # iterate through each of the companies in the dataframe
    # use multiple try/except to be able to continue gathering data
    # for other websites even if the website doesnt work on the first pass.
    # `except Exception` (not a bare `except`) is used so that Ctrl-C /
    # kernel interrupt still stops the loop.
    for i in range(len(cos)):
        company = cos.loc[i, 'company']
        try:
            # get the site url
            site_url = cos.loc[i, 'co_website']

            # find the base of the url to use later: the label just before
            # the top-level domain of the host, e.g.
            #   https://corporate.walmart.com -> walmart
            #   https://energytransfer.com/   -> energytransfer
            # (splitting the whole url on '.' and taking element 1 yields
            # 'com/' for sites that have no subdomain)
            host = site_url.split('//')[-1].split('/')[0]
            host_labels = host.split('.')
            base = host_labels[-2] if len(host_labels) > 1 else host_labels[0]

            try:
                # open the website & sleep to avoid security features
                browser.get(f'{site_url}')
                time.sleep(2)
            except Exception:
                # best effort: report the failed page load, then still try
                # to read whatever the browser is currently showing
                print(f'{company} browser')
                print(_page_state(browser))

            # get all the links from the main website and score each one
            site_links = []
            for anchor in browser.find_elements(By.TAG_NAME, 'a'):
                url = anchor.get_attribute('href')
                link_info = {'link': url}

                # use fuzzywuzzy package to assess the similarity of each
                # link to a string to find the newsroom, pressroom, and
                # corporate website links
                for term, ratio_col, _ in link_targets:
                    link_info[ratio_col] = fuzz.partial_ratio(term, url)

                # anchors without an href come back as None; they get no
                # `url_len` and are removed by the dropna() below
                if url is not None:
                    link_info['url_len'] = len(url)

                site_links.append(link_info)

            # putting the site links into a data frame and keeping only the
            # links that mention the company's own base name; regex=False
            # because `base` is a literal substring, not a pattern
            site_links_df = pd.DataFrame(site_links).drop_duplicates().dropna()
            site_links_df = site_links_df[
                site_links_df['link'].str.contains(base, regex=False)]

            # saving dataframe in case we need to refer to later
            # site_links_df.to_csv(
            #     f'./data/{cos.loc[i,"company"].replace(" ","_").lower()}_links.csv',
            #     index=False)

            # pulling the top link for each target into cos: only links
            # scoring above the mean for this site are candidates, and the
            # highest-scoring candidate wins; 'N/A' when there is none
            for _, ratio_col, dest_col in link_targets:
                scores = site_links_df[ratio_col]
                ranked = site_links_df[scores > scores.mean()].sort_values(
                    ratio_col, ascending=False).reset_index(drop=True)
                cos.loc[i, dest_col] = (
                    ranked.loc[0, 'link'] if len(ranked) else 'N/A')

        except Exception as err:
            # remember the company for manual review and log enough context
            # to troubleshoot later
            error_cos.append(company)
            print(f'{company}')
            print(_page_state(browser))
            print(f'error: {err!r}')

        # printing current iteration if divisible by 5 to give sense of
        # where this function is in the process
        if (i + 1) % 5 == 0:
            print(i + 1)
finally:
    browser.quit()

5
10
15
20
25
30
35
40
45
50
55
60
65
70
75
80
85
90
95
100


In [5]:
# write the dataframe, now holding the candidate link columns, to a new
# csv so later cells can reload it
cos.to_csv(path_or_buf='./fortune_100_data_w_links.csv', index=False)

## Finding the official newsroom link

In [6]:
# reload the company table together with its scraped candidate links
cos = pd.read_csv(filepath_or_buffer='./fortune_100_data_w_links.csv')

In [7]:
# display the reloaded dataframe so the scraped newsroom, pressroom and
# corporate link columns can be reviewed
cos

Unnamed: 0,company,rank,fortune_link,co_website,newsroom_link,pressroom_link,corporate_link
0,Walmart,1,https://fortune.com/company/walmart/fortune500/,https://corporate.walmart.com,https://corporate.walmart.com/newsroom/2021/03...,https://www.diabetes.org/newsroom/press-releas...,https://corporate.walmart.com/#
1,Amazon,2,https://fortune.com/company/amazon-com/fortune...,https://www.amazon.com/,https://www.amazon.com/gp/customer-preferences...,https://www.amazon.com/Starbucks-Espresso-Roas...,https://www.amazon.com/Protector-Addtam-Extens...
2,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...,https://www.exxonmobil.com/,https://corporate.exxonmobil.com/About-us/Busi...,https://corporate.exxonmobil.com/News/Newsroom...,https://corporate.exxonmobil.com/#main-content
3,Apple,4,https://fortune.com/company/apple/fortune500/,https://www.apple.com/,https://www.apple.com/apple-news/,https://www.apple.com/us/shop/goto/temporary_c...,https://www.apple.com/us/shop/goto/trade_in
4,CVS Health,5,https://fortune.com/company/cvs-health/fortune...,https://www.cvshealth.com/,https://www.cvshealth.com/news-and-insights/to...,https://www.cvshealth.com/news-and-insights/pr...,https://www.cvshealth.com/social-responsibilit...
...,...,...,...,...,...,...,...
95,Northrop Grumman,96,https://fortune.com/company/northrop-grumman/f...,https://www.northropgrumman.com/,https://news.northropgrumman.com/,https://www.northropgrumman.com/who-we-are/glo...,https://www.northropgrumman.com/corporate-resp...
96,Capital One Financial,97,https://fortune.com/company/capital-one-financ...,https://www.capitalone.com/,https://www.capitalone.com/about/newsroom,https://www.capitalone.com/digital/digitalspen...,https://www.capitalone.com/about/corporate-inf...
97,Plains GP Holdings,98,https://fortune.com/company/plains-gp-holdings...,https://www.plainsallamerican.com/,https://www.plainsallamerican.com/about-us/new...,https://www.plainsallamerican.com/sustainabili...,https://www.plainsallamerican.com/contact-us/c...
98,AbbVie,99,https://fortune.com/company/abbvie/fortune500/,https://www.abbvie.com/,https://news.abbvie.com/news/press-releases/ne...,https://news.abbvie.com/news/press-releases/ne...,https://www.abbvie.com/careers/roles/corporate...


The dictionary below is the result of reviewing the links obtained by the scraping loop above and manually double-checking them to make sure they are accurate for this project's purposes. It will be used to fill the `final` column in the cells below.

In [8]:
# dictionary for each company's official newsrooms
# keys are company names and values are the newsroom / press-release URLs;
# a later cell compares each key against `cos['company']` with `==`, so a
# key must match the company name in the dataframe exactly to be used

final_websites_dict = {'Walmart': 'https://corporate.walmart.com/newsroom/company-news',
 'Amazon': 'https://press.aboutamazon.com/press-releases',
 'Exxon Mobil': 'https://corporate.exxonmobil.com/News/Newsroom/News-releases',
 'Apple': 'https://www.apple.com/newsroom/archive/',
 'CVS Health': 'https://www.cvshealth.com/news-and-insights/press-releases',
 'Berkshire Hathaway': 'https://www.berkshirehathaway.com/news/2020news.html',
 'UnitedHealth Group': 'https://www.unitedhealthgroup.com/search.html?query=*&facets=contenttype_s;uhg:content-type/story&sort=newest&start=0&rows=10',
 'McKesson': 'https://www.mckesson.com/About-McKesson/Newsroom/Press-Releases/?topic=0&business=0&industry=0&month=0&page=1',
 'AT&T': 'https://about.att.com/allnews.html',
 'AmerisourceBergen': 'https://www.amerisourcebergen.com/newsroom/press-releases?filter=all',
 'Alphabet': 'https://abc.xyz/investor/#news',
 'Ford Motor': 'https://media.ford.com/content/fordmedia/fna/us/en/news.html',
 'Cigna': 'https://www.cigna.com/about-us/newsroom/news-and-views/press-releases/',
 'Costco Wholesale': 'https://investor.costco.com/news-releases',
 'Chevron': 'https://www.chevron.com/stories',
 'Cardinal Health': 'https://cardinalhealth.mediaroom.com/newsreleasearchive?l=100',
 'JPMorgan Chase': 'https://www.jpmorganchase.com/news-stories/news',
 'General Motors': 'https://plants.gm.com/media/us/en/gm/news/news_archive.year.All.month.All.html',
 'Walgreens Boots Alliance': 'https://www.walgreensbootsalliance.com/news-media/press-releases/archive',
 'Verizon Communications': 'https://www.verizon.com/about/media-center',
 'Microsoft': 'https://news.microsoft.com/category/press-releases/',
 'Marathon Petroleum': 'https://www.marathonpetroleum.com/Newsroom/Company-News/',
 'Kroger': 'http://ir.kroger.com/CorporateProfile/press-releases/default.aspx',
 'Fannie Mae': 'https://www.fanniemae.com/newsroom/fannie-mae-news',
 'Bank of America': 'https://newsroom.bankofamerica.com/content/newsroom/press-releases.html',
 'Home Depot': 'https://corporate.homedepot.com/newsroom',
 'Phillips 66': 'https://investor.phillips66.com/financial-information/default.aspx',
 'Comcast': 'https://corporate.comcast.com/press',
 'Anthem': 'https://ir.antheminc.com/press-releases?c=130104&nyo=0&p=irol-news',
 'Wells Fargo': 'https://newsroom.wf.com/English/news-releases/default.aspx',
 'Citigroup': 'https://www.citigroup.com/citi/news/news_list_view.html',
 'Valero Energy': 'https://www.valero.com/about/news-room',
 'General Electric': 'https://www.ge.com/news/press-releases',
 'Dell Technologies': 'https://corporate.delltechnologies.com/en-us/newsroom/announcements.htm#/filter-on/Country:en-us',
 'Johnson & Johnson': 'https://www.jnj.com/latest-news?all',
 'State Farm Insurance': 'https://newsroom.statefarm.com/?h=1&t=releases',
 'Target': 'https://corporate.target.com/search-results?mediaType=Press%20release&page=1&sort=publishDate&sortOrder=desc',
 'IBM': 'https://newsroom.ibm.com/announcements?l=100',
 'Raytheon Technologies': 'https://www.rtx.com/news#2',
 'Boeing': 'https://boeing.mediaroom.com/news-releases-statements?l=100',
 'Freddie Mac': 'https://freddiemac.gcs-web.com/news-releases/',
 'Centene': 'https://investors.centene.com/news-releases?field_nir_news_date_value%5Bmin%5D=2021',
 'UPS': 'https://stories.ups.com/upsstories/us/en/newsroom/press-releases.html',
 "Lowe's": 'https://corporate.lowes.com/newsroom/press-releases',
 'Intel': 'https://newsroom.intel.com/news/',
 'Facebook': 'https://investor.fb.com/investor-news/default.aspx',
 'FedEx': 'https://newsroom.fedex.com',
 'MetLife': 'https://www.metlife.com/about-us/newsroom/',
 'Walt Disney': 'https://thewaltdisneycompany.com/news/',
 'Procter & Gamble': 'https://news.pg.com/news-releases/default.aspx',
 'PepsiCo': 'https://www.pepsico.com/news/media-resources/press-releases',
 'Humana': 'https://press.humana.com/news/default.aspx#gsc.tab=0',
 'Prudential Financial (U.S.)': 'https://news.prudential.com/prudential/news/',
 'Archer Daniels Midland': 'https://www.adm.com/news/news-releases',
 'Albertsons': 'https://www.albertsonscompanies.com/newsroom.html',
 'Sysco': 'https://investors.sysco.com/annual-reports-and-sec-filings/news-releases/',
 'Lockheed Martin': 'https://news.lockheedmartin.com/news-releases?l=100',
 'HP': 'https://press.hp.com/us/en/press-releases.html',
 'Energy Transfer': 'https://energytransfer.com/newsroom/',
 'Goldman Sachs Group': 'https://www.goldmansachs.com/media-relations/press-releases-and-comments/current/index.html',
 'Morgan Stanley': 'https://www.morganstanley.com/about-us-newsroom?showAll=true#-536583991-tab',
 'Caterpillar': 'https://www.caterpillar.com/en/news/corporate-press-releases.html',
 'Cisco Systems': 'https://newsroom.cisco.com/pressreleases',
 'Pfizer': 'https://www.pfizer.com/news/press-release/press-releases-archive',
 'HCA Healthcare': 'https://investor.hcahealthcare.com/news/2021/default.aspx',
 'American International Group': 'https://www.aig.com/about-us/news-and-media#press-release-archive',
 'American Express': 'https://about.americanexpress.com/all-news/default.aspx',
 'Delta Air Lines': 'https://news.delta.com/news-archive',
 'Merck': 'https://www.merck.com/media/news/',
 'American Airlines Group': 'http://news.aa.com/news/default.aspx',
 'Charter Communications': 'https://corporate.charter.com/newsroom',
 'Allstate': 'https://www.allstatenewsroom.com/news/',
 'New York Life Insurance': 'https://www.newyorklife.com/newsroom/2021',
 'Nationwide': 'https://news.nationwide.com/?h=1&t=news',
 'Best Buy': 'https://corporate.bestbuy.com/archive',
 'United Airlines Holdings': 'https://hub.united.com/newsroom/',
 'Liberty Mutual Insurance Group': 'https://www.libertymutualgroup.com/about-lm/news/news-release-archive',
 'Dow': 'https://www.dow.com/en-us/news.html',
 'Tyson Foods': 'https://www.tysonfoods.com/news/news-releases',
 'TJX': 'https://investor.tjx.com/investors/press-releases',
 'TIAA': 'https://www.tiaa.org/public/about-tiaa/news-press/press-releases',
 'Oracle': 'https://www.oracle.com/search/press?No=0&Nr=101&Nrpp=100',
 'General Dynamics': 'https://www.gd.com/news/press-releases?page=0&tags=&types=&years=&months=',
 'Deere': 'https://www.deere.com/en/our-company/news-and-announcements/news-releases/',
 'Nike': 'https://news.nike.com/news',
 'Progressive': 'https://progressive.mediaroom.com/news-releases/?l=100',
 'Publix Super Markets': 'https://corporate.publix.com/about-publix/newsroom/news-releases',
 'Coca-Cola': 'https://www.coca-colacompany.com/media-center',
 'Massachusetts Mutual Life': 'https://www.massmutual.com/about-us/news-and-press-releases?page=1',
 'Tech Data': 'https://investor.techdata.com/news/default.aspx',
 'World Fuel Services': 'https://ir.wfscorp.com/news-releases/?b29e099a_year%5Bvalue%5D=_none&op=Filter&b29e099a_widget_id=b29e099a&form_build_id=form-GJk_zT62vVFgzNKdmQ1Dhwuw9nFQ8KvU9I66nafTgmY&form_id=widget_form_base',
 'Honeywell International': 'https://www.honeywell.com/us/en/press',
 'ConocoPhillips': 'https://www.conocophillips.com/news-media/archive/?page=1',
 'USAA': 'https://communities.usaa.com/t5/Press-Releases/bg-p/press-releases',
 'Exelon': 'https://www.exeloncorp.com/newsroom/press-releases',
 'Northrop Grumman': 'https://news.northropgrumman.com/news/releases',
 'Capital One Financial': 'https://www.capitalone.com/about/newsroom/',
 'Plains GP Holdings': 'https://ir.paalp.com/News_Releases',
 'AbbVie': 'https://news.abbvie.com/news/press-releases/',
 'StoneX Group': 'https://www.stonex.com/Media-Room/Press-Releases/'}

Match each company in `final_websites_dict` to its row in the `cos` DataFrame and store its official newsroom link in a new `final` column.

In [10]:
# look up every company's official newsroom URL by name in one vectorized
# pass (instead of scanning every row once per dictionary entry); a company
# that has no entry in `final_websites_dict` gets NaN in `final`
cos['final'] = cos['company'].map(final_websites_dict)
cos

Unnamed: 0,company,rank,fortune_link,co_website,newsroom_link,pressroom_link,corporate_link,final
0,Walmart,1,https://fortune.com/company/walmart/fortune500/,https://corporate.walmart.com,https://corporate.walmart.com/newsroom/2021/03...,https://www.diabetes.org/newsroom/press-releas...,https://corporate.walmart.com/#,https://corporate.walmart.com/newsroom/company...
1,Amazon,2,https://fortune.com/company/amazon-com/fortune...,https://www.amazon.com/,https://www.amazon.com/gp/customer-preferences...,https://www.amazon.com/Starbucks-Espresso-Roas...,https://www.amazon.com/Protector-Addtam-Extens...,https://press.aboutamazon.com/press-releases
2,Exxon Mobil,3,https://fortune.com/company/exxon-mobil/fortun...,https://www.exxonmobil.com/,https://corporate.exxonmobil.com/About-us/Busi...,https://corporate.exxonmobil.com/News/Newsroom...,https://corporate.exxonmobil.com/#main-content,https://corporate.exxonmobil.com/News/Newsroom...
3,Apple,4,https://fortune.com/company/apple/fortune500/,https://www.apple.com/,https://www.apple.com/apple-news/,https://www.apple.com/us/shop/goto/temporary_c...,https://www.apple.com/us/shop/goto/trade_in,https://www.apple.com/newsroom/archive/
4,CVS Health,5,https://fortune.com/company/cvs-health/fortune...,https://www.cvshealth.com/,https://www.cvshealth.com/news-and-insights/to...,https://www.cvshealth.com/news-and-insights/pr...,https://www.cvshealth.com/social-responsibilit...,https://www.cvshealth.com/news-and-insights/pr...
...,...,...,...,...,...,...,...,...
95,Northrop Grumman,96,https://fortune.com/company/northrop-grumman/f...,https://www.northropgrumman.com/,https://news.northropgrumman.com/,https://www.northropgrumman.com/who-we-are/glo...,https://www.northropgrumman.com/corporate-resp...,https://news.northropgrumman.com/news/releases
96,Capital One Financial,97,https://fortune.com/company/capital-one-financ...,https://www.capitalone.com/,https://www.capitalone.com/about/newsroom,https://www.capitalone.com/digital/digitalspen...,https://www.capitalone.com/about/corporate-inf...,https://www.capitalone.com/about/newsroom/
97,Plains GP Holdings,98,https://fortune.com/company/plains-gp-holdings...,https://www.plainsallamerican.com/,https://www.plainsallamerican.com/about-us/new...,https://www.plainsallamerican.com/sustainabili...,https://www.plainsallamerican.com/contact-us/c...,https://ir.paalp.com/News_Releases
98,AbbVie,99,https://fortune.com/company/abbvie/fortune500/,https://www.abbvie.com/,https://news.abbvie.com/news/press-releases/ne...,https://news.abbvie.com/news/press-releases/ne...,https://www.abbvie.com/careers/roles/corporate...,https://news.abbvie.com/news/press-releases/


In [11]:
# count the rows whose `final` link is still missing; 0 means every
# company received a newsroom link
cos.loc[:, 'final'].isna().sum()

0

In [12]:
# write the dataframe, now including the `final` column, back to the
# same csv it was read from
cos.to_csv(path_or_buf='./fortune_100_data_w_links.csv', index=False)