In [4]:
# -*- coding: utf-8 -*-
"""
Scrape the tables from a Wikipedia page using Python. Handles cells spanning multiple rows and/or columns. Writes one
tab-separated file (named with a .csv extension) for each table.

license: MIT
"""

from bs4 import BeautifulSoup
import requests
import os
import codecs
# Page whose "wikitable" tables are scraped.
wiki = "https://en.wikipedia.org/wiki/List_of_regions_of_China"
header = {
    'User-Agent': 'Mozilla/5.0'
}  # Needed to prevent 403 error on Wikipedia

# A timeout keeps the script from hanging forever on a stalled connection,
# and raise_for_status() stops us from silently parsing an HTTP error page.
page = requests.get(wiki, headers=header, timeout=30)
page.raise_for_status()

# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed, so the parse tree can differ between machines.
soup = BeautifulSoup(page.content, "html.parser")

# Every <table class="wikitable"> on the page, in document order.
tables = soup.find_all("table", {"class": "wikitable"})

# Preview every scraped table: a separator line, then the first 100
# characters of the table's text content.
for table in tables:
    snippet = table.text[:100]
    print("###############")
    print(snippet)

def _span(cell, attr):
    """Return the cell's ``rowspan``/``colspan`` attribute as an int >= 1.

    Missing, non-numeric, zero or negative values all fall back to 1 so a
    malformed attribute never aborts the scrape or drops the cell.
    """
    try:
        return max(int(cell.get(attr, 1)), 1)
    except (TypeError, ValueError):
        return 1


# Expand each table into a rectangular grid of strings (repeating the text of
# spanning cells in every grid position they cover) and write it out.
for tn, table in enumerate(tables):
    rows = table.find_all("tr")
    nrows = len(rows)

    # data[r][c] holds the cell text; taken[r][c] records whether that slot is
    # already claimed. Occupancy is tracked separately from the text because a
    # cell whose text is empty must still block its slot.
    data = [[] for _ in range(nrows)]
    taken = [[] for _ in range(nrows)]

    # process html
    for i, row in enumerate(rows):
        col = 0
        for cell in row.find_all(["td", "th"]):
            cspan = _span(cell, 'colspan')
            # Clip rowspans that run past the last row of the table.
            rspan = min(_span(cell, 'rowspan'), nrows - i)

            # Skip slots already claimed by a rowspan from a row above.
            while col < len(taken[i]) and taken[i][col]:
                col += 1

            for r in range(i, i + rspan):
                # Grow the row on demand so colspans can never index past
                # the end of the grid.
                missing = col + cspan - len(data[r])
                if missing > 0:
                    data[r].extend([''] * missing)
                    taken[r].extend([False] * missing)
                for c in range(col, col + cspan):
                    data[r][c] = cell.text
                    taken[r][c] = True

            col += cspan

    # Pad every row to the widest one so the output stays rectangular.
    ncols = max([len(cells) for cells in data] or [0])
    for cells in data:
        cells.extend([''] * (ncols - len(cells)))

    # write data out to tab separated format
    page_name = os.path.split(wiki)[1]
    fname = 'output_{}_t{}.csv'.format(page_name, tn)
    # Explicit UTF-8: the cell text contains non-ASCII characters, and the
    # platform default encoding may not be able to represent them. The
    # ``with`` block closes the file even if a write fails.
    with codecs.open(fname, 'w', encoding='utf-8') as f:
        for cells in data:
            # Newlines inside a cell would break the one-row-per-line format.
            rowStr = '\t'.join(cells).replace('\n', '')
            print(rowStr)
            f.write(rowStr + '\n')

###############


Region
Map
Area
Population(2010)
PopulationDensity
Provinces/Region
Provincial/Regional Seat


Nor
###############


Region
Area
Population(2010)
PopulationDensity
Provinces included & Notes


North China(without Ea
Region	Map	Area	Population(2010)	PopulationDensity	Provinces/Region	Provincial/Regional Seat
North ChinaNorth China (Huáběi)		1,556,061 km²	164,823,226	105/km²	Beijing	Dongcheng District
North ChinaNorth China (Huáběi)		1,556,061 km²	164,823,226	105/km²	Tianjin	Heping District
North ChinaNorth China (Huáběi)		1,556,061 km²	164,823,226	105/km²	Hebei	Shijiazhuang
North ChinaNorth China (Huáběi)		1,556,061 km²	164,823,226	105/km²	Shanxi	Taiyuan
North ChinaNorth China (Huáběi)		1,556,061 km²	164,823,226	105/km²	Inner Mongolia	Hohhot
Northeast ChinaNortheast China (Dōngběi)		793,300 km²	109,520,844	138/km²	Liaoning	Shenyang
Northeast ChinaNortheast China (Dōngběi)		793,300 km²	109,520,844	138/km²	Jilin	Changchun
Northeast ChinaNortheast China (Dōngběi)		793,300