# Polk County Current Inmates

Loading libraries...

In [1]:
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

## Getting the inmate data

This is the web address where we get the inmate data.

In [2]:
url_polk_domain = 'https://apps.polkcountyiowa.gov' #used to access inmate details
url_polk_curr_inmates = url_polk_domain +'/PolkCountyInmates/CurrentInmates'

The tables are not shown in the source html, so a Selenium WebDriver is used to emulate the browser and automate the extraction of the data.

In [3]:
#I am emulating a Firefox browser.
#The driver is set to work in headless mode so it doesn't bring up a browser window.
ffoptions = webdriver.firefox.options.Options()
ffoptions.headless = True
#The driver is defined and wait time is set (though I am not 100% sure how it works).
driver = webdriver.Firefox(options=ffoptions)
driver.implicitly_wait(30)

Getting the website with the list of current inmates.

In [4]:
driver.get(url_polk_curr_inmates)
time.sleep(1)
#using Beautiful Soup to parse the html and make it easier to extract data.
soup_polk = BeautifulSoup(driver.page_source)
#In the driver page source the table with the data is present.
rows = soup_polk.find_all('tr')
rows.pop(0) #removing table header with titles
total_inmates = len(rows)
print('Number of inmates in sample =', total_inmates)

Number of inmates in sample = 881


The table extracted above has partial names and a link to each inmate's personal data, so we need to access that link.
A list of those links is created with the following cell.

In [5]:
details_page_list = []

for row in rows:
    details_page_list.append(row.find('a')['href'])
#
print('# of detail pages =',len(details_page_list))

# of detail pages = 881


Each details page is accessed and the name and race is extracted. This step takes quite a bit of time.

In [6]:
inmate_names = []
inmate_races = []

for i, detail_page in enumerate(details_page_list):
    driver.get(url_polk_domain+detail_page)
    #time.sleep(.1) #just in case, maybe not
    details_page_text = BeautifulSoup(driver.page_source)
    labels = details_page_text.find_all('label')
    name = labels[1].next_sibling.strip()
    race = labels[8].next_sibling.strip()
    inmate_names.append(name)
    inmate_races.append(race)
    print('item #{}:'.format(i),'name =', name, '; race =', race)
#
assert len(inmate_names) == total_inmates, "Some inmates' data is missing!"
assert len(inmate_races) == total_inmates, "Some inmates' data is missing!"

item #0: name = SHANE D MILLER ; race = White
item #1: name = MARK ANTHONY WILSON ; race = Black
item #2: name = SHAVAUGHN ELIZABETH KAY GARDNER ; race = Black
item #3: name = JULIE ANN LUNDY ; race = Unknown
item #4: name = TYLOR MICHAEL DAVIS ; race = White
item #5: name = DAVID LEE MOORE ; race = Black
item #6: name = KELLEY EUGENE ALLBEE ; race = White
item #7: name = AUSTIN DOMINIC BARKER ; race = White
item #8: name = KALEB VAN LUONG ; race = Asian
item #9: name = DONALD LEROY STEELE ; race = Black
item #10: name = CLARENCE RAPIER NANCE ; race = Black
item #11: name = JIMMY RAY SMITH ; race = Black
item #12: name = TRACY LEIGH JONES ; race = White
item #13: name = ALLISON DANIELLE DOYLE ; race = White
item #14: name = CHARLES LAKEYRIC FUNCHES ; race = Black
item #15: name = BRANDON JOSEPH GIBSON ; race = White
item #16: name = ASAAD RESHAN JABIR ; race = White
item #17: name = LESLEY DALE DUCKWORTH ; race = White
item #18: name = ALEC WADE LARSON ; race = White
item #19: name = S

item #155: name = CHRISTOFER JOHNATHAN KELLING ; race = White
item #156: name = TROY ANTHONY BURNEY ; race = White
item #157: name = MICHAEL JAY WHALEN ; race = White
item #158: name = DONTE JOMARTENEU JAMES ; race = Black
item #159: name = GASTON  KEAHNA ; race = Pacific Islander
item #160: name = TRAYANTWON DESHAE THOMAS ; race = Black
item #161: name = KATIE ANNE HERGERT ; race = White
item #162: name = JOSHUA DEE HILL ; race = White
item #163: name = ALEC RYAN STEFFES ; race = White
item #164: name = JULIE ARLENE CHRISTY ; race = White
item #165: name = KEVIN JACOB CARLTON ; race = White
item #166: name = BRIAN ROSCOE WELTZIN ; race = White
item #167: name = BILL LEE SIMET ; race = White
item #168: name = DENTERRELL LOUIS EWING BUTTS ; race = Black
item #169: name = DEANTA RAMON TAYLOR ; race = Black
item #170: name = WILLIAM ROSS CLARK ; race = White
item #171: name = GILBERTO JESUS MOTA ; race = White
item #172: name = JEREMY EDWARD RIDNOUR ; race = White
item #173: name = DAVID 

item #307: name = CURTIS MARTELL GINES ; race = Black
item #308: name = DONTRE MAURICE ENGLISH LEWIS ; race = Black
item #309: name = ASHLEY RENEE BROWN ; race = Black
item #310: name = VANCE EDWARD WILLIAMS ; race = Black
item #311: name = JAMES THOMAS VASEY ; race = Black
item #312: name = ROBERT J VANGUNDY ; race = White
item #313: name = ANISSA RENEA ANDERSON ; race = White
item #314: name = JEFFREY TRAE PHILLIPS ; race = White
item #315: name = ANDREW RICHARD KOLOMER ; race = White
item #316: name = JOE NAPEH YANKOON ; race = Black
item #317: name = DEBORAH LYNN WEIR ; race = White
item #318: name = DAVID KEITH SAWHILL ; race = White
item #319: name = MARIO LUCIANO ANTHONY ; race = Pacific Islander
item #320: name = CHRISTOPHER WILLIAM COATES ; race = White
item #321: name = LAURIE ANN WARNER ; race = White
item #322: name = DYLAN CHRISTIAN KAVAN ; race = Asian
item #323: name = MICHAEL SCOTT NEUBAUER ; race = White
item #324: name = JENNIFER LEA REYNOLDS ; race = White
item #325:

item #459: name = NICHOLAS DEAN OVERTON ; race = White
item #460: name = PHILLIP GAI POK ; race = Black
item #461: name = SEAN  NOEL ; race = Black
item #462: name = JAMES ALLEN ROBUCK ; race = White
item #463: name = BRIAN KING STOVER ; race = White
item #464: name = JEFFREY PAUL ERICKSON ; race = White
item #465: name = GREGORY ALLEN WRIGHT ; race = White
item #466: name = STANLEY EUGENE HOLT ; race = White
item #467: name = DANIEL LYNN WAUTERS ; race = White
item #468: name = ZACHARY SCOTT BRADISH ; race = White
item #469: name = LUKE STEPHEN PETER PETTEY ; race = White
item #470: name = WILLIAM CHARLES GOOSSENS ; race = White
item #471: name = ANDREW KENYON HIGHT ; race = White
item #472: name = JAMES ELMER STRUTH ; race = White
item #473: name = AISIAH CHRISTOPHER TALTON ; race = Black
item #474: name = LUIS DAMIAN HERNANDEZ ; race = White
item #475: name = RUSSELL SCOTT PETTYJOHN ; race = White
item #476: name = JOSEPH MICHAEL STEWART ; race = White
item #477: name = TYRONE JEROM

item #611: name = NICOLE ELIZABETH BAGBY ; race = White
item #612: name = AMBER DAWN JOHNSON ; race = White
item #613: name = BRANDON PAUL HARRISON ; race = White
item #614: name = BRIAN RAY WELTZIN ; race = White
item #615: name = ALBERT  WHITESIDE ; race = Black
item #616: name = ROBERT KYLE GATLIN ; race = White
item #617: name = ANTHONY HERBERT SMITH ; race = White
item #618: name = CLAUDE ROBERT CHIEF EAGLE ; race = Pacific Islander
item #619: name = SONYA RAE GORDON ; race = White
item #620: name = MYRON DESHAWN NEWTON ; race = Black
item #621: name = TEOFILO  AGAPITO APOLINAR ; race = White
item #622: name = NATALE NORMAN ISOLINI ; race = White
item #623: name = DEVEN KILEY ANDERSON ; race = White
item #624: name = LASANA VAYMULLION TOURE ; race = Black
item #625: name = JIMMY DALE LEAF ; race = White
item #626: name = NICOLE LYNN PARKER ; race = White
item #627: name = JOHN RAY VILLALOBOS ; race = White
item #628: name = KOLTON MICHAEL LINDMAN ; race = White
item #629: name = F

item #762: name = ANDREW WILLIAM LAPPE ; race = White
item #763: name = VALERY  GARCIA ; race = White
item #764: name = BRITTANY NICOLE NOELLE MUSICK ; race = White
item #765: name = JOANNA ENDICOTT HULSEY ; race = White
item #766: name = MICHAEL JEROME MARTIN ; race = Black
item #767: name = FLOYD EUGENE STOCKDALL ; race = White
item #768: name = MARVIN LEWIS NEWSOM ; race = White
item #769: name = BANG TANG WUOL ; race = Black
item #770: name = RICHARD ALLEN OBRYAN ; race = White
item #771: name = KEITH FRANCIS FLOWER ; race = White
item #772: name = NICOLE RENEE WALKER ; race = White
item #773: name = TREVOR JOHN WOZNIAK ; race = White
item #774: name = STEPHANIE ANN HARLOW ; race = White
item #775: name = DUSTIN ELLIOTT WILLIAMS ; race = Black
item #776: name = JACE PHILLIP CHALUP ; race = White
item #777: name = KELLY EUGENE BURKS ; race = Black
item #778: name = TYREE DE JOHN TAYLOR ; race = Black
item #779: name = RYAN BLAKE WOODWARD ; race = White
item #780: name = JOSHUA  BARJ

AssertionError: Some inmates' data is missing!

We can now close the Selenium WebDriver.

In [12]:
driver.close()

Now the table with the inmates' names and races is created. Notice that "Hispanic" or "Latino" is not shown in the list of unique values.

In [13]:
inmate_table = pd.DataFrame.from_dict({'Name':inmate_names, 'Race':inmate_races})
print("Race categories used in database: ", inmate_table['Race'].unique())

Race categories used in database:  ['White' 'Black' 'Unknown' 'Asian' 'Pacific Islander']


Inspecting the head of the table.

In [14]:
inmate_table.head()

Unnamed: 0,Name,Race
0,SHANE D MILLER,White
1,MARK ANTHONY WILSON,Black
2,SHAVAUGHN ELIZABETH KAY GARDNER,Black
3,JULIE ANN LUNDY,Unknown
4,TYLOR MICHAEL DAVIS,White


## Getting a list of common Hispanic names in the US

We are going to extract a list of common Hispanic names in the US. This is from the website Mongobay.com and the data is taken from the 2010 US Census. This list will be used to compare with the names of the inmates and guess whether or not they are hispanic.

In [15]:
url_common_lastnames = 'https://global.mongabay.com/es/nombres/hispanic.html'

response_common_lastnames = requests.get(url_common_lastnames)
soup_common_lastnames = BeautifulSoup(response_common_lastnames.text)
table = soup_common_lastnames.find_all(id='myTable')[0]

common_names_list = []
#There are 10 columns in this table, which is being processed as a list of strings using the .strings method.
#However, an extra column with '\n' is present at the beginning because of the HTML,
#so we are taking the name to be in the second column.
for i, s in enumerate(table.strings):
    if (i%11==1): common_names_list.append(s)
print(common_names_list[:3], '...', common_names_list[-3:]) #inspection of the result

['Apellido', 'GARCIA', 'RODRIGUEZ'] ... ['CORIA', '\n', '\n']


We notice that the column title is at the beginning and there are two unwanted characters at the tail, so they are removed.

In [16]:
common_names_list.pop(0)
common_names_list.pop()
common_names_list.pop()
print('Total number of names:', len(common_names_list))

Total number of names: 979


Saving the list to a text file.

In [None]:
with open('common_list_names.txt', 'w') as f:
    for i in range(len(common_names_list)):
        f.write(common_names_list[i]+'\n')

In [None]:
#inmate_table['Race (compared)'] = 
inmate_table.apply(lambda x: True if len(set(inmate_table['Name'].str.split()).intersection(common_names))>0 else False)

In [34]:
common_names_set = set(common_names_list)

In [37]:
len(common_names_set)

979

In [46]:
inmate_table.head()

Unnamed: 0,Name,Race
0,SHANE D MILLER,White
1,MARK ANTHONY WILSON,Black
2,SHAVAUGHN ELIZABETH KAY GARDNER,Black
3,JULIE ANN LUNDY,Unknown
4,TYLOR MICHAEL DAVIS,White


In [53]:
def race_reviewed(df):
    if len(df['Name'].str.split().apply(set).intersection(common_names_set))>0:
        return "Hispanic"
    else:
        return df['Race']

In [58]:
inmate_table['Name'].str.split().apply(set).apply(set.intersection(common_names_set))

TypeError: 'set' object is not callable

In [54]:
inmate_table.apply(race_reviewed)

KeyError: ('Name', 'occurred at index Name')

In [45]:
inmate_table['Race (reviewed)'] = inmate_table['Name'].apply(lambda x: "Hispanic" if 
                                                             len(set(x.split()).intersection(common_names_set))>0
                                                             else
                          )

0       True
1       True
2      False
3      False
4       True
       ...  
876    False
877    False
878     True
879     True
880    False
Name: Name, Length: 881, dtype: bool

In [None]:
import numpy as np

In [None]:
np.loadtxt?

In [None]:
common_names = np.loadtxt('common_list_names.txt', dtype=str, unpack=True)

In [None]:
common_names[-1]

In [None]:
len(common_names)

In [None]:
test_name = 'MARIJKE JOY HODGSON'#'FELIXA  GOMEZ'

In [None]:
test_name.split()

In [None]:
np.where(common_names==test_name)