In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

# Notebook-wide pandas settings.
# Setting 'mode.chained_assignment' to None disables pandas' chained-assignment
# warning for the whole session.
pd.set_option('mode.chained_assignment', None)
# Allow up to 75 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 75)

In [2]:
# Table-view pages of the housingstudies.org data portal, one per indicator.
# Each URL is split by query parameter so the differences are easy to see.

# NOTE(review): unlike the three URLs below, this one does not set
# `area=chicago-community-areas` or `property_type`; confirm the portal's
# default returns the same rows in the same order as the other tables.
pct_business_share_sales_url = (
    'https://www.housingstudies.org/data-portal/browse/'
    '?indicator=share-sales-business'
    '&view_as=view-table'
)

total_mortgages_url = (
    'https://www.housingstudies.org/data-portal/browse/'
    '?indicator=total-mortgage-activity'
    '&area=chicago-community-areas'
    '&property_type=0'
    '&view_as=view-table'
)

total_foreclosures_url = (
    'https://www.housingstudies.org/data-portal/browse/'
    '?indicator=total-foreclosure-activity'
    '&area=chicago-community-areas'
    '&property_type=0'
    '&view_as=view-table'
)

housing_units_by_type_url = (
    'https://www.housingstudies.org/data-portal/browse/'
    '?indicator=housing-units-composition'
    '&area=chicago-community-areas'
    '&view_as=view-table'
)

In [3]:
def scrape_page_for_housing_table(url, timeout=30):
    """Download a page and return its first HTML table as a DataFrame.

    Column labels come from the ``<th>`` cells of the first ``<thead>`` on the
    page; each ``<tr>`` under the first ``<tbody>`` contributes one row built
    from the text of its ``<td>`` cells. Cell text is returned unmodified, as
    strings.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get`` so a stalled server cannot hang the
        notebook indefinitely. Defaults to 30 seconds.

    Returns
    -------
    pandas.DataFrame
        One column per header cell and one row per table row. A table with no
        data rows yields an empty frame that still carries the header columns.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page has no ``<thead>``/``<tbody>``, or a data row's cell count
        does not match the number of header cells.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    header = soup.find('thead')
    body = soup.find('tbody')
    if header is None or body is None:
        raise ValueError('No <thead>/<tbody> table found at {!r}'.format(url))

    column_labels = [th.text for th in header.find_all('th')]

    rows = []
    for tr in body.find_all('tr'):
        cells = [td.text for td in tr.find_all('td')]
        if not cells:
            # Rows without any <td> contribute no data.
            continue
        # Build rows per <tr> rather than re-chunking a flat list of cells:
        # a single short row would otherwise shift every later value into the
        # wrong column without any error.
        if len(cells) != len(column_labels):
            raise ValueError(
                'Table row has {} cells but the header has {} columns at {!r}'.format(
                    len(cells), len(column_labels), url
                )
            )
        rows.append(cells)

    return pd.DataFrame(rows, columns=column_labels)

def clean_housing_df(dataframe):
    """Return a cleaned copy of a scraped housing table.

    The 'Geography' column has newline characters removed and is lower-cased.
    Every other column is converted to float after replacing '--' with '0'
    and removing '%' and ',' characters.

    The input frame is left untouched; a new frame is returned. Columns are
    selected by name (everything except 'Geography'), so the label column does
    not have to be the first one. Values are cast to text before cleaning so
    that a frame which is already numeric (e.g. when a cell is re-run on
    cleaned data) passes through instead of raising.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table with a 'Geography' text column and number-like columns.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy with the same columns in the same order.

    Raises
    ------
    KeyError
        If there is no 'Geography' column.
    ValueError
        If a value cannot be parsed as a float after cleaning.
    """
    cleaned = dataframe.copy()

    cleaned['Geography'] = (
        cleaned['Geography']
        .str.replace('\n', '', regex=False)
        .str.lower()
    )

    for col in cleaned.columns:
        if col == 'Geography':
            continue
        # NOTE: '--' placeholders are turned into 0, so missing values and
        # true zeros cannot be told apart downstream.
        cleaned[col] = (
            cleaned[col]
            .astype(str)
            .str.replace('--', '0', regex=False)
            .str.replace('%', '', regex=False)
            .str.replace(',', '', regex=False)
            .astype(float)
        )

    return cleaned

In [18]:
# Scrape the table at pct_business_share_sales_url, then pass it through clean_housing_df.
pct_business_share_sales = scrape_page_for_housing_table(pct_business_share_sales_url).pipe(clean_housing_df)

In [19]:
# Scrape the table at total_mortgages_url, then pass it through clean_housing_df.
total_mortgages = scrape_page_for_housing_table(total_mortgages_url).pipe(clean_housing_df)

In [20]:
# Scrape the table at total_foreclosures_url, then pass it through clean_housing_df.
total_foreclosures = scrape_page_for_housing_table(total_foreclosures_url).pipe(clean_housing_df)

In [21]:
# Scrape the table at housing_units_by_type_url, then pass it through clean_housing_df.
housing_share = scrape_page_for_housing_table(housing_units_by_type_url).pipe(clean_housing_df)

In [22]:
# Keep only the 2019 column and rename it after the metric it holds.
# The rename is chained and returns a new frame, so nothing is modified in
# place on a slice and no chained-assignment warning is involved.
merging_pct_business_share_sales = pct_business_share_sales[['2019']].rename(
    columns={'2019': 'pct_business_share_sales'}
)

In [23]:
# Keep only the 2019 column and rename it after the metric it holds.
# The rename is chained and returns a new frame, so nothing is modified in
# place on a slice and no chained-assignment warning is involved.
merging_total_foreclosures = total_foreclosures[['2019']].rename(
    columns={'2019': 'total_foreclosures'}
)

In [24]:
# Keep only the 2019 column and rename it after the metric it holds.
# The rename is chained and returns a new frame, so nothing is modified in
# place on a slice and no chained-assignment warning is involved.
merging_total_mortgages = total_mortgages[['2019']].rename(
    columns={'2019': 'total_mortgages'}
)

In [25]:
# Remove the 'All Residential Properties' column from housing_share in place;
# errors='ignore' makes this a no-op when the column is not present.
housing_share.drop(columns='All Residential Properties', inplace=True, errors='ignore')

In [26]:
# Place the three single-column 2019 metrics next to the housing_share table.
# NOTE(review): a column-wise concat aligns rows on the index, not on
# 'Geography'; this presumes all four scraped tables list the same areas in
# the same order - confirm before relying on the combined frame.
community_housing_metrics = pd.concat(
    objs=[
        housing_share,
        merging_pct_business_share_sales,
        merging_total_foreclosures,
        merging_total_mortgages,
    ],
    axis='columns',
)

In [29]:
# Rewrite the exact 'Geography' value "o'hare" as 'ohare'; other values are unchanged.
community_housing_metrics['Geography'] = community_housing_metrics['Geography'].replace({"o'hare": 'ohare'})

In [30]:
# Persist the combined table as a pickle; the path is relative, so the file
# is written to the current working directory.
pd.to_pickle(community_housing_metrics, 'community_housing_metrics.pkl')