In [1]:
# Dependencies
import time
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from splinter import Browser

In [2]:
# Start a visible (non-headless) Chrome session; the driver binary is looked up
# under the name 'chromedriver'.
executable_path = dict(executable_path='chromedriver')
browser = Browser('chrome', headless=False, **executable_path)

In [3]:
# Address of the page to scrape
url = 'https://www.usich.gov/tools-for-action/map/#fn[]=1400&fn[]=2900&fn[]=6000&fn[]=9900&fn[]=13500'

# Send the automated browser to that address
browser.visit(url)

# Pause for a few seconds so the page has time to load before its markup is read
time.sleep(5)

# Take the current page markup from the browser object
html = browser.html

# Parse the markup into a BeautifulSoup tree using the 'html.parser' backend
soup = BeautifulSoup(html, features='html.parser')

In [4]:
# <ul id="stats"> is the homeless-statistics section of the page
stats_tag = soup.find("ul", attrs={"id": "stats"})

# <ul id="selectMenuBox"> is the dropdown menu: a list of states, each entry
# linking to the page that holds the data for that particular state
state_menu_tag = soup.find("ul", attrs={"id": "selectMenuBox"})

# Every <li> inside the dropdown corresponds to one state
states_tag_list = state_menu_tag.find_all("li")

# From the <a> tag of each entry, collect the link to the appropriate state page...
href_list = [entry.a['href'] for entry in states_tag_list]

# ...and the link text, which is used as the state name
states = [entry.a.get_text() for entry in states_tag_list]

In [5]:
# One {statistic label: value} dictionary is collected per state; the resulting
# list of records is what the pandas DataFrame is later built from.
homeless_data = []

# href_list holds one link per entry of `states`, in the same order.
# NOTE(review): the click_link_by_* helpers may be deprecated in newer splinter
# releases - confirm against the installed version before upgrading.
for state_href in href_list:
    # Open the dropdown menu (element id 'selectBox') and let it render
    browser.click_link_by_id('selectBox')
    time.sleep(1)

    # Follow the link to this state's page and wait for it to load
    browser.click_link_by_partial_href(state_href)
    time.sleep(5)

    # Parse the markup of the state page that is now displayed
    state_soup = BeautifulSoup(browser.html, 'html.parser')

    # Each <li> under <ul id="stats"> holds one statistic: the <span> text is
    # used as the key and the <p> text as the value
    stat_items = state_soup.find("ul", id="stats").find_all("li")
    homeless_data.append({item.span.text: item.p.text for item in stat_items})

In [9]:
# Build the table from the scraped records: one row per state, labelled by the
# state name, with one column per statistic key.
homeless_df = pd.DataFrame(data=homeless_data, index=states)
homeless_df

Unnamed: 0,Nighttime Residence: Doubled up,Nighttime Residence: Hotels/motels,Nighttime Residence: Shelters,Nighttime Residence: Unsheltered,Persons Experiencing Chronic Homelessness,Total Family Households Experiencing Homelessness,Total Homeless Population,Total Number of Homeless Students,Total Number of Unaccompanied Homeless Students,Unaccompanied Young Adults (Aged 18-24) Experiencing Homelessness,Veterans Experiencing Homelessness
Alabama,12021,681,735,675,540,280,3434,14112,583,158,339
Alaska,2257,270,920,337,357,193,2016,3784,877,163,132
Arizona,15049,1404,6146,577,1774,792,9865,24770,2850,638,893
Arkansas,10178,524,1014,268,466,147,2712,11984,592,240,251
California,211607,10095,17061,7533,34332,6702,129972,246296,7495,12396,10836
Colorado,17146,2205,2837,826,2738,990,10857,23014,2034,593,1073
Connecticut,2702,396,2492,121,341,553,3976,6260,174,187,190
Delaware,2449,561,202,15,189,125,1082,3227,324,53,70
District of Columbia,0,0,0,0,1781,924,6904,0,180,318,306
Florida,53582,8113,8346,2001,5302,2757,31030,72042,6679,1892,2543


In [55]:
# Snapshot of the column labels present in the table at this point
df_columns = homeless_df.columns.tolist()

In [41]:
# Remove thousands separators ("12,021" -> "12021") and convert every listed
# column to a numeric dtype.
#
# Each column is cleaned and written back as a whole. The element-wise form
# `homeless_df[col][i] = ...` is chained assignment: pandas may apply it to a
# temporary copy (it has no effect at all under copy-on-write), and it depends
# on integer keys being treated as positions on a Series indexed by state name.
for col in df_columns:
    column = homeless_df[col]

    # A column that is already numeric (for example when this cell is run a
    # second time) has no separators to strip and no .str accessor, so only
    # text columns go through the replacement.
    if not pd.api.types.is_numeric_dtype(column):
        column = column.str.replace(',', '')

    homeless_df[col] = pd.to_numeric(column)

In [56]:
# Share of the summed 'Total Homeless Population' contributed by each row,
# expressed as a percentage.
population = homeless_df['Total Homeless Population']
homeless_num = population.sum()

# Same arithmetic as (value / total) * 100 per row, done on the whole column
percentage = population.div(homeless_num).mul(100).tolist()

homeless_df['Percentage of Homeless Population'] = percentage

homeless_df



Unnamed: 0,Nighttime Residence: Doubled up,Nighttime Residence: Hotels/motels,Nighttime Residence: Shelters,Nighttime Residence: Unsheltered,Persons Experiencing Chronic Homelessness,Total Family Households Experiencing Homelessness,Total Homeless Population,Total Number of Homeless Students,Total Number of Unaccompanied Homeless Students,Unaccompanied Young Adults (Aged 18-24) Experiencing Homelessness,Veterans Experiencing Homelessness,percentages,Percentage of Homeless Population
Alabama,12021,681,735,675,540,280,3434,14112,583,158,339,0.622343,0.622343
Alaska,2257,270,920,337,357,193,2016,3784,877,163,132,0.365359,0.365359
Arizona,15049,1404,6146,577,1774,792,9865,24770,2850,638,893,1.787831,1.787831
Arkansas,10178,524,1014,268,466,147,2712,11984,592,240,251,0.491495,0.491495
California,211607,10095,17061,7533,34332,6702,129972,246296,7495,12396,10836,23.554784,23.554784
Colorado,17146,2205,2837,826,2738,990,10857,23014,2034,593,1073,1.967611,1.967611
Connecticut,2702,396,2492,121,341,553,3976,6260,174,187,190,0.720569,0.720569
Delaware,2449,561,202,15,189,125,1082,3227,324,53,70,0.196091,0.196091
District of Columbia,0,0,0,0,1781,924,6904,0,180,318,306,1.25121,1.25121
Florida,53582,8113,8346,2001,5302,2757,31030,72042,6679,1892,2543,5.623557,5.623557


In [58]:
# Save the finished table as CSV. to_csv does not create missing folders, so
# the output directory is created first (a no-op when it already exists);
# otherwise the write fails when the notebook is run from a fresh checkout.
output_csv = "dataframes/2018_homeless_data.csv"
Path(output_csv).parent.mkdir(parents=True, exist_ok=True)
homeless_df.to_csv(output_csv)