# Approximating the number of Hispanic inmates

Loading the libraries to be used:

In [None]:
# NOTE(review): presumably the IPython `%qtconsole` magic invoked through
# automagic (no leading `%`) — confirm; outside IPython this is a bare name.
qtconsole

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

## Getting a list of common Hispanic names in the US

We are going to extract a list of common Hispanic last names in the US. The list comes from the website Mongabay.com, and the underlying data is taken from the 2010 US Census. It will be compared against the inmates' last names to guess whether or not they are Hispanic.

In [None]:
url_common_lastnames = 'https://global.mongabay.com/es/nombres/hispanic.html'

# Download the page and pick out the surnames table by its HTML id.
response_common_lastnames = requests.get(url_common_lastnames)
soup_common_lastnames = BeautifulSoup(response_common_lastnames.text)
table = soup_common_lastnames.find_all(id='myTable')[0]

# Flatten the table into its text fragments. The table has 10 columns, but the
# HTML contributes an extra '\n' fragment at the start of each row, so every
# row spans 11 fragments and the name sits at offset 1 within each group.
common_names_list = list(table.strings)[1::11]

# Inspect both ends of the extracted list.
print(common_names_list[:3], '...', common_names_list[-3:])

We notice that the column title is at the beginning and there are two unwanted characters at the tail, so they are removed.

In [None]:
# Drop the column title at the front, then the two stray entries at the tail.
for position in (0, -1, -1):
    common_names_list.pop(position)

In [None]:
# Report how many surnames remain after trimming.
print('Total number of names: {}'.format(len(common_names_list)))

In [None]:
# Write the surnames to a text file, one name per line.
# The encoding is given explicitly so that the result does not depend on the
# platform's default encoding (which may fail on non-ASCII characters).
with open('common_list_names.txt', 'w', encoding='utf-8') as f:
    f.writelines(name + '\n' for name in common_names_list)

The next function makes it easier to access the search page with parameters such as 'Search Aliases = No'.

In [None]:
def make_url(last_name):
    """Return the offender-search URL that filters results by ``last_name``.

    The remaining query parameters are fixed: aliases are not searched
    (``SearchAliases=0``) and 50 matches are requested per page
    (``matches=50``).

    Args:
        last_name: Surname to search for. It is converted to ``str`` and
            percent-encoded, so characters such as spaces, ``&``, ``=`` or
            apostrophes cannot break or alter the query string.

    Returns:
        The complete search URL as a string.
    """
    from urllib.parse import quote

    return (
        'http://www.dc.state.fl.us/OffenderSearch/list.aspx'
        '?TypeSearch=AI&Page=List&DataAction=Filter&dcnumber='
        '&LastName={}&FirstName=&SearchAliases=0&OffenseCategory='
        '&CurrentLocation=&CountyOfCommitment=&photosonly=0&nophotos=1'
        '&matches=50'
    ).format(quote(str(last_name), safe=''))

Setting up the webdriver which will simulate accessing the website in a Firefox Browser.
We set it to headless so that an actual browser window is not opened.

In [None]:
# Configure Firefox to run without opening a visible browser window.
ffoptions = webdriver.firefox.options.Options()
# Pass the command-line flag directly: the `headless` property setter was
# deprecated and later removed in Selenium 4, whereas `add_argument` is
# available in both Selenium 3 and Selenium 4.
ffoptions.add_argument('-headless')
driver = webdriver.Firefox(options=ffoptions)
# Element lookups keep retrying for up to 30 seconds before failing.
driver.implicitly_wait(30)

Setting up the list of tables which will be collected and then running the collection loop.

In [None]:
# Collect one DataFrame per page of search results, for every surname.
from io import StringIO

list_of_tables = []

# Number of results requested per page of the listing.
number_of_results = 50

for name in common_names_list:
    # Row count expected at the end of the page being collected; used below to
    # tell a freshly loaded page from one that has not refreshed yet.
    last_elem_page = number_of_results
    try:
        driver.get(make_url(name))
        # `find_element('id', ...)` is the locator form that exists in both
        # Selenium 3 and 4 (the `find_element_by_*` helpers were removed in 4.3).
        total_label_text = driver.find_element(
            'id', 'ctl00_ContentPlaceHolder1_lblgrdListPage').text
        # Keep only the number that follows "of " in the label text.
        last_elem_total = int(total_label_text[total_label_text.find('of') + 3:])
    except Exception:
        # No results and no table found, so continue with the next name.
        # `Exception` (not a bare except) lets Ctrl-C interrupt the scrape.
        continue

    while True:
        # Wrap the HTML in StringIO: newer pandas versions deprecate passing a
        # literal HTML string to read_html.
        temp_table = pd.read_html(StringIO(driver.page_source),
                                  attrs={'id': 'ctl00_ContentPlaceHolder1_grdList'})[0]
        time.sleep(1)
        # NOTE(review): assumes the first cell of the last row is text that
        # contains the row's running number — confirm against the site.
        last_cell = temp_table.iloc[-1, 0]
        if (str(last_elem_page) in last_cell) or (str(last_elem_total) in last_cell):
            # Sometimes the next page does not load quickly enough and the same
            # table is read again; this check tracks where we are in the
            # collection and avoids storing duplicate tables.
            list_of_tables.append(temp_table)
            last_elem_page += number_of_results
            try:
                # Move on to the next page of results.
                elem = driver.find_element('name', 'ctl00$ContentPlaceHolder1$btnListNext')
                elem.send_keys(Keys.RETURN)
            except Exception:
                # No more results: move on to the next name.
                break

We close the webdriver object and concatenate all DataFrames collected.

In [None]:
# quit() ends the whole WebDriver session and stops the driver process;
# close() only closes the current window and can leave the driver running.
driver.quit()
# Stack all collected page tables into one DataFrame. reset_index() moves each
# row's previous (per-page) index into a new 'index' column and renumbers the
# rows consecutively.
hispanic_inmates = pd.concat(list_of_tables).reset_index()

Table inspection

In [None]:
# Show the last rows (pandas defaults to five) of the combined table.
hispanic_inmates.tail()

In [None]:
# Print a summary of the combined table: columns, non-null counts and dtypes.
hispanic_inmates.info()