In [1]:
# -*- coding: utf-8 -*-
"""
Scrape tables from Wikipedia using Python. Handles cells spanning multiple rows and/or columns. Writes one
tab-separated text file (named with a .csv extension) for each table

license: MIT
"""

from bs4 import BeautifulSoup
import requests
import os
import codecs
# Page to scrape; the last path component is reused later to name the output files.
wiki = "https://en.wikipedia.org/wiki/List_of_largest_companies_in_the_United_States_by_revenue"
header = {
    'User-Agent': 'Mozilla/5.0'
}  # Needed to prevent 403 error on Wikipedia

# A timeout keeps the script from hanging forever on a stalled connection.
page = requests.get(wiki, headers=header, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page
# (which would simply yield zero tables).
page.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results could differ per machine.
soup = BeautifulSoup(page.content, "html.parser")

# Every <table class="wikitable"> on the page, in document order.
tables = soup.find_all("table", {"class": "wikitable"})

# show tables: print a separator and the first 100 characters of each table's
# text so the tables can be told apart at a glance.
for table in tables:
    print("###############")
    print(table.text[:100])

def _cell_span(cell, attr):
    """Return the ``rowspan``/``colspan`` value of *cell* as an int >= 1.

    A missing, non-numeric or non-positive attribute falls back to 1 so that
    one malformed cell cannot abort the whole scrape.
    """
    try:
        return max(1, int(cell.get(attr, 1)))
    except (TypeError, ValueError):
        return 1


# Convert every scraped table into a rectangular grid of strings and write it
# out, one file per table.
for tn, table in enumerate(tables):
    rows = table.find_all("tr")
    nrows = len(rows)

    # One dict per table row, mapping column index -> cell text.  Occupancy is
    # tracked by key presence rather than by text truthiness, so a cell whose
    # text is empty still reserves its slot.
    grid = [{} for _ in range(nrows)]

    # process html
    for i, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"]):
            # Skip columns already claimed by cells spanning down from
            # earlier rows.
            while col in grid[i]:
                col += 1

            # lots of cells span cols and rows so lets deal with that
            cspan = _cell_span(cell, 'colspan')
            rspan = _cell_span(cell, 'rowspan')
            text = cell.text

            # Copy the text into every slot of the spanned rectangle.  The
            # rowspan is clamped to the table height so an over-long span
            # cannot index past the last row; setdefault leaves any slot that
            # an earlier cell already claimed untouched.
            for r in range(i, min(i + rspan, nrows)):
                for c in range(col, col + cspan):
                    grid[r].setdefault(c, text)

            col += cspan

    # The table width is the right-most filled column (colspans included),
    # not the number of cell tags in the longest row.
    widths = [max(filled) + 1 for filled in grid if filled]
    ncols = max(widths) if widths else 0
    data = [[filled.get(c, '') for c in range(ncols)] for filled in grid]

    # write data out to tab separated format (the file keeps a .csv extension)
    page_name = os.path.split(wiki)[1]
    fname = 'output_{}_t{}.csv'.format(page_name, tn)
    # Explicit UTF-8 so non-ASCII cell text is written the same way on every
    # platform; the with-block closes the file even if a write fails.
    with codecs.open(fname, 'w', encoding='utf-8') as f:
        for cells in data:
            row_str = '\t'.join(cells)
            # Cell text can contain newlines; drop them so each table row
            # stays on a single output line.
            row_str = row_str.replace('\n', '')
            print(row_str)
            f.write(row_str + '\n')

###############




Rank

Name

Industry

Revenue(USD millions)

Revenue growth

Employees

Headquarters

Ref


1

W
###############




Rank

Name

Industry

Profits(USD millions)


1

Apple

Electronics

59,531


2

JPMorgan Chase

Rank	Name	Industry	Revenue(USD millions)	Revenue growth	Employees	Headquarters	Ref
1	Walmart	Retail	514,405	  2.8%	2,200,000	Bentonville, AR	[1]
2	ExxonMobil	Oil and gas	290,212	  18.8%	71,000	Irving, TX	[2]
3	Apple	Electronics	265,595	  15.9%	132,000	Cupertino, CA	[3]
4	Berkshire Hathaway	Conglomerate	247,837	  2.4%	389,000	Omaha, NE	[4]
5	Amazon	Retail	232,887	  30.9%	647,500	Seattle, WA	[5]
6	UnitedHealth Group	Healthcare	226,247	  12.5%	300,000	Minnetonka, MN	[6]
7	McKesson	Healthcare	208,357	  4.9%	68,000	Irving, TX	[7]
8	CVS Health	Healthcare	194,579	  5.3%	295,000	Woonsocket, RI	[8]
9	AT&T	Telecommunications	170,756	  6.4%	268,220	Dallas, TX	[9]
10	AmerisourceBergen	Pharmaceuticals	167,940	  9.7%	20,500	Chesterbrook, PA	[10]
11	Chevron	Oil and gas	1