In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Session-wide pandas settings.
# Turn off the chained-assignment (SettingWithCopy) warning.
pd.set_option('mode.chained_assignment', None)
# Display up to 25 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 25)

In [2]:
def _split_row_text(row_text, n_columns):
    """Split the flattened text of one ``<tr>`` into its first ``n_columns`` cell strings.

    Doubled newlines are collapsed, the text is split on newlines, and element 0
    of the split is skipped (it is the empty string when the row text starts
    with a newline).
    """
    return row_text.replace('\n\n', '\n').split('\n')[1:n_columns + 1]


def scrape_page_for_community_table(url, n_columns=7, max_rows=77, timeout=30):
    """Download ``url`` and turn its leading table rows into a DataFrame of strings.

    The first ``<tr>`` found anywhere on the page is used as the header row and
    the following ``max_rows`` ``<tr>`` elements as data rows; rows are not
    filtered by which ``<table>`` they belong to.

    Parameters
    ----------
    url : str
        Address of the page to download.
    n_columns : int, default 7
        Number of leading cells kept from every row.
    max_rows : int, default 77
        Maximum number of data rows read after the header row.
    timeout : float, default 30
        Seconds passed to ``requests.get`` so a stalled connection cannot hang
        the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        One column per header label, every value kept as the scraped string.
        A page with a header row but no data rows gives an empty frame that
        still carries the header labels as columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page has no ``<tr>`` element, or a data row has fewer cells
        than the header row.
    """
    response = requests.get(url, timeout=timeout)
    # Stop on 4xx/5xx instead of trying to parse an error page as the table.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    row_texts = [tr.text for tr in soup.find_all('tr')]
    if not row_texts:
        raise IndexError('no <tr> rows found at {!r}'.format(url))

    column_labels = _split_row_text(row_texts[0], n_columns)
    data_rows = [
        _split_row_text(row_text, n_columns)
        for row_text in row_texts[1:max_rows + 1]
    ]

    # Pre-seed every label so a header-only page still yields named columns.
    values_by_column = {label: [] for label in column_labels}
    for cells in data_rows:
        if len(cells) < len(column_labels):
            raise IndexError(
                'row {!r} has fewer than {} cells'.format(cells, len(column_labels))
            )
        for label, value in zip(column_labels, cells):
            values_by_column[label].append(value)

    return pd.DataFrame(values_by_column)

In [3]:
# Live Wikipedia article to scrape (the URL pins no revision, so the page
# content can differ between runs).
url = 'https://en.wikipedia.org/wiki/Community_areas_in_Chicago'

In [4]:
# Download the page and parse its table rows into a DataFrame of strings
# (this performs a network request).
wikipedia_community_table = scrape_page_for_community_table(url)

In [5]:
# Inspect the scraped header labels; they include Wikipedia footnote markers
# such as "[8]", which matters when selecting columns by name.
wikipedia_community_table.columns

Index(['Number[8]', 'Name[8]', '2017[update] population[9]',
       'Area (sq mi.)[10]', 'Area (km2)',
       '2017[update] populationdensity (/sq mi.)',
       '2017[update] populationdensity (/km2)'],
      dtype='object')

In [6]:
# Scraped header text ends with Wikipedia footnote markers (e.g. "Name[8]")
# whose numbers can change when the article is edited, so columns are matched
# on the header text with any trailing "[n]" markers removed.
target_names = {'Name': 'community_name', 'Area (sq mi.)': 'sq_mi'}
headers_without_footnotes = wikipedia_community_table.columns.str.replace(
    r'(?:\[\d+\])+$', '', regex=True
)
column_renames = {
    scraped: target_names[plain]
    for scraped, plain in zip(wikipedia_community_table.columns, headers_without_footnotes)
    if plain in target_names
}
# Rebind instead of renaming in place; re-running this cell is then a no-op.
wikipedia_community_table = wikipedia_community_table.rename(columns=column_renames)

# Fail here with a clear message rather than silently renaming nothing and
# hitting a KeyError when the columns are selected later.
missing_columns = set(target_names.values()) - set(wikipedia_community_table.columns)
if missing_columns:
    raise KeyError(
        'expected column(s) not found after rename: {}'.format(sorted(missing_columns))
    )

In [7]:
# Keep only the name and area columns. The explicit .copy() makes
# community_size an independent frame, so later column assignments are not
# writes into a selection of wikipedia_community_table.
community_size = wikipedia_community_table[['community_name', 'sq_mi']].copy()

In [8]:
# Lower-case the names and convert the scraped area strings to floats.
# assign() returns a new frame instead of writing into a column of a selection,
# so this cell does not depend on the chained-assignment warning being disabled.
# .str.lower() is the vectorised equivalent of apply(lambda x: x.lower()) for
# string values, which is what the scraper produces.
community_size = community_size.assign(
    community_name=community_size['community_name'].str.lower(),
    sq_mi=community_size['sq_mi'].astype(float),
)

In [15]:
# Normalise the two names that differ from the plain spelling.
# "o'hare" is an exact whole-value replacement. The Loop entry is scraped as
# "(the) loop" followed by a footnote marker such as "[11]"; the marker number
# is matched generically so the fix keeps working if the footnote is renumbered
# or removed.
normalised_names = community_size['community_name'].replace("o'hare", 'ohare')
normalised_names = normalised_names.str.replace(
    r'^\(the\) loop(?:\[\d+\])*$', 'loop', regex=True
)
community_size = community_size.assign(community_name=normalised_names)

In [16]:
# Write the community_name / sq_mi frame to a pickle file; the relative path
# resolves against the current working directory.
community_size.to_pickle('community_sq_mi.pkl')