# Scraping website with multiple pages

This notebook scrapes the 2020 FFIEC Census Report (demographic data) from the FFIEC website, which splits each county's report across multiple pages.

In [1]:
from lxml import html
import requests
import pandas as pd
from time import sleep
from random import randint

In [2]:
# Function that allows us to pass different parameters to change the url
def create_url(county, state, page=1):
    """Build the URL of one page of the 2020 FFIEC demographic census report.

    Parameters
    ----------
    county : county code inserted into the ``county`` query parameter.
    state : state code inserted into the ``state`` query parameter.
    page : page number inserted into the ``page`` query parameter (default 1).

    Returns
    -------
    str
        The report URL with the three values substituted in.
    """
    template = "https://www.ffiec.gov/census/report.aspx?year=2020&county={}&tract=ALL&state={}&report=demographic&page={}"
    return template.format(county, state, page)

# Function to search for the maximum number of pages of this report 
def find_max_page(url):
    """Return the number of the last page of the report found at ``url``.

    Downloads ``url``, takes the first element matched by the XPath
    ``//*[contains(text(),'Page')]`` and reads the run of digits at the very
    end of that element's text, which is treated as the maximum page number.

    Parameters
    ----------
    url : str
        Address of a report page.

    Returns
    -------
    str
        The trailing digits of the page indicator (e.g. ``"5"`` or ``"12"``);
        pass it through ``int()`` to use it as a number.

    Raises
    ------
    ValueError
        If no element containing "Page" is found, or if its text does not end
        with a digit.
    """
    # Parse the main url and find the section that says "Page"
    r = requests.get(url)
    parser = html.fromstring(r.content)
    find_page = parser.xpath("//*[contains(text(),'Page')]")
    if not find_page:
        raise ValueError("No page indicator found at {}".format(url))

    # Surrounding whitespace (including non-breaking spaces) is not part of the number
    page_str = find_page[0].text_content().strip()

    # Keep the WHOLE trailing run of digits rather than only the last character,
    # so reports with 10 or more pages are not truncated ("12" must not become "2").
    # NOTE(review): assumes the page numbers in the indicator are separated by a
    # non-digit character — confirm against the live page markup.
    pages = page_str[len(page_str.rstrip("0123456789")):]
    if not pages:
        raise ValueError(
            "Page indicator {!r} at {} does not end with a page number".format(page_str, url)
        )

    return pages

# Function to Loop through all pages and store data in one data frame
def parse_and_save(county,state):
    """Scrape every page of a county's demographic report into one data frame.

    The number of pages is obtained from ``find_max_page``; each page is then
    downloaded in turn and the rows of the table with id
    ``Report1_dgReportDemographic`` are collected. The column names come from
    the first table row of page 1, prefixed with ``State``, ``County`` and
    ``Page``.

    Parameters
    ----------
    county : county code, forwarded to ``create_url`` and stored in the
        ``County`` column.
    state : state code, forwarded to ``create_url`` and stored in the
        ``State`` column.

    Returns
    -------
    pandas.DataFrame
        One row per scraped table row, object dtype, with a row index that
        starts at 1. Cell values are the text content of the table cells.

    Raises
    ------
    ValueError
        If the report table cannot be found on the first page.
    """
    pages = int(find_max_page(create_url(county,state)))

    cols = ['State','County','Page']
    n_cells = None  # number of cells in the header row, known after page 1
    rows = []       # collected once here, converted to a data frame at the end

    for page in range(1, pages + 1):

        url = create_url(county,state,page) # Change url as we move from page to page
        r = requests.get(url)
        parser = html.fromstring(r.content)
        tb = parser.xpath("//table[@id='Report1_dgReportDemographic']//tr") # Find the section where the demographic table is located

        # Parse the table header on the first page only
        if page == 1:
            if not tb:
                raise ValueError("Demographic report table not found at {}".format(url))
            header = [col.text_content() for col in tb[0]]
            n_cells = len(header)
            cols.extend(header)

        # Start from the second row to exclude the header
        for tr in tb[1:]:
            # A row whose cell count differs from the header is not a data row;
            # stop reading this page at that point.
            if len(tr) != n_cells:
                break

            rows.append([state, county, page] + [cell.text_content() for cell in tr])

        sleep(randint(5,10)) # Control the scraping rate - avoid stressing the server and being banned

    # Build the frame in one go instead of appending row by row (which copies
    # the frame on every insert). dtype=object keeps the values exactly as
    # collected, and the index is numbered from 1.
    df = pd.DataFrame(rows, columns=cols, dtype=object)
    df.index = pd.RangeIndex(1, len(df) + 1)

    return df

Test the function above with Cuyahoga County, Ohio (county code `035`, state code `39`) and check the resulting data frame.

In [3]:
# Scrape all report pages for county code '035' in state code '39'
# (this pauses 5-10 seconds after each page, so it takes a while)
Cuyahoga_Ohio = parse_and_save(state='39', county='035')

In [4]:
# (rows, columns) of the scraped frame: State, County, Page plus the report's own columns
Cuyahoga_Ohio.shape

(447, 15)

In [5]:
# Preview the first rows to check that the header and the values line up
Cuyahoga_Ohio.head()

Unnamed: 0,State,County,Page,Tract Code,Tract Income Level,Distressed or Under -served Tract,Tract Median Family Income %,2020 FFIEC Est. MSA/MD non-MSA/MD Median Family Income,2020 Est. Tract Median Family Income,2015 Tract Median Family Income,Tract Population,Tract Minority %,Minority Population,Owner Occupied Units,1- to 4- Family Units
1,39,35,1,1011.01,Low,No,29.11,"$76,000","$22,124","$19,167",1903,65.37,1244,107,543
2,39,35,1,1011.02,Moderate,No,70.39,"$76,000","$53,496","$46,333",4516,34.3,1549,704,1495
3,39,35,1,1012.0,Moderate,No,53.53,"$76,000","$40,683","$35,234",2640,46.29,1222,258,1016
4,39,35,1,1013.0,Low,No,32.93,"$76,000","$25,027","$21,679",1897,53.56,1016,144,492
5,39,35,1,1014.0,Low,No,46.7,"$76,000","$35,492","$30,742",1932,49.79,962,365,921
