# Webscraper
#### TX Death Row Last Words
#### Kojo Alfajiri Shah
#### 9 July 2024

Webscraping contains two main tasks
* Scraping the main table of [Death Row Information](https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html) which includes identifying information for prisoners and links to personal details and last words.
* Programmatically accessing the links in the main table and scraping prisoner details and last words from all.

In [58]:
import requests
import bs4
import pandas as pd

### Scraping the main table

In [3]:
## review
page = 'https://www.tdcj.texas.gov/death_row/dr_executed_offenders.html'
headers = {'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'}

In [5]:
## access
status = requests.get(page, headers = headers, verify = False)
status



<Response [200]>

In [6]:
status.close()

In [7]:
## parse
soup = bs4.BeautifulSoup(status.text, features = 'html.parser')

In [8]:
html_table = soup.find_all('tr')
column_headers = html_table.pop(0)

In [9]:
column_headers

<tr>
<th scope="col" style="text-align: center">Execution</th>
<th scope="col" style="text-align: center; width: 16%">Link</th>
<th scope="col" style="text-align: center; width: 13%">Link</th>
<th scope="col" style="text-align: center">Last Name</th>
<th scope="col" style="text-align: center">First Name</th>
<th scope="col" style="text-align: center; width: 7%">TDCJ<br/>Number</th>
<th scope="col" style="text-align: center">Age</th>
<th scope="col" style="text-align: center">Date</th>
<th scope="col" style="text-align: center">Race</th>
<th scope="col" style="text-align: center">County</th>
</tr>

In [62]:
test = column_headers.find_all('th')

name_list = []

for tag in test:
    name = tag.text
    name_list.append(name)
    
name_list[1] = "Inmate Information"
name_list[2] = "Last Words"
name_list

['Execution',
 'Inmate Information',
 'Last Words',
 'Last Name',
 'First Name',
 'TDCJNumber',
 'Age',
 'Date',
 'Race',
 'County']

In [11]:
html_table

[<tr>
 <td style="text-align: center">588</td>
 <td style="text-align: center"><a href="dr_info/gonzalesramiro.html" title="Inmate Information for Ramiro Gonzales">Inmate Information</a></td>
 <td style="text-align: center"><a href="dr_info/gonzalesramirolast.html" title="Last Statement of Ramiro Gonzales">Last Statement</a></td>
 <td style="text-align: center">Gonzales</td>
 <td style="text-align: center">Ramiro</td>
 <td style="text-align: center">999513</td>
 <td style="text-align: center">41</td>
 <td style="text-align: center">6/26/2024</td>
 <td style="text-align: center">Hispanic</td>
 <td style="text-align: center"> Medina</td>
 </tr>,
 <tr>
 <td style="text-align: center">587</td>
 <td style="text-align: center"><a href="dr_info/cantuivan.html" title="Inmate Information for Ivan Cantu">Inmate Information</a></td>
 <td style="text-align: center"><a href="dr_info/cantuivanlast.html" title="Last Statement of Ivan Cantu">Last Statement</a></td>
 <td style="text-align: center">Cant

In [56]:
table_list = []

for row in html_table:
    row_values = []
    for cell in row.find_all('td'):
        if cell.a is None:
            cell_value = cell.text
        else:
            cell_value = cell.a['href']
        row_values.append(cell_value)
    table_list.append(row_values)
    
table_list

[['588',
  'dr_info/gonzalesramiro.html',
  'dr_info/gonzalesramirolast.html',
  'Gonzales',
  'Ramiro',
  '999513',
  '41',
  '6/26/2024',
  'Hispanic',
  ' Medina'],
 ['587',
  'dr_info/cantuivan.html',
  'dr_info/cantuivanlast.html',
  'Cantu',
  'Ivan',
  '999399',
  '50',
  '2/28/2024',
  'Hispanic',
  ' Collin'],
 ['586',
  'dr_info/renteriadavid.html',
  'dr_info/renteriadavidlast.html',
  'Renteria',
  'David',
  '999460',
  '53',
  '11/16/2023',
  'Other',
  ' El Paso'],
 ['585',
  'dr_info/brewer.jpg',
  'dr_info/brewerbrentlast.html',
  'Brewer',
  'Brent',
  '999000',
  '53',
  '11/9/2023',
  'White',
  ' Randall'],
 ['584',
  'dr_info/murphyjedidiah.html',
  'dr_info/murphyjedidiahlast.html',
  'Murphy',
  'Jedidiah',
  '999392',
  '48',
  '10/10/2023',
  'White',
  ' Dallas'],
 ['583',
  'dr_info/brownarthur.jpg',
  'dr_info/brownarthurlast.html',
  'Brown, Jr.',
  'Arthur',
  '999110',
  '52',
  '3/9/2023',
  'Black',
  ' Harris'],
 ['582',
  'dr_info/greengary.html',
  

In [55]:
table_list = [name_list] + table_list
table_list

[['Execution',
  'Link',
  'Link',
  'Last Name',
  'First Name',
  'TDCJNumber',
  'Age',
  'Date',
  'Race',
  'County'],
 ['588',
  'dr_info/gonzalesramiro.html',
  'dr_info/gonzalesramirolast.html',
  'Gonzales',
  'Ramiro',
  '999513',
  '41',
  '6/26/2024',
  'Hispanic',
  ' Medina'],
 ['587',
  'dr_info/cantuivan.html',
  'dr_info/cantuivanlast.html',
  'Cantu',
  'Ivan',
  '999399',
  '50',
  '2/28/2024',
  'Hispanic',
  ' Collin'],
 ['586',
  'dr_info/renteriadavid.html',
  'dr_info/renteriadavidlast.html',
  'Renteria',
  'David',
  '999460',
  '53',
  '11/16/2023',
  'Other',
  ' El Paso'],
 ['585',
  'dr_info/brewer.jpg',
  'dr_info/brewerbrentlast.html',
  'Brewer',
  'Brent',
  '999000',
  '53',
  '11/9/2023',
  'White',
  ' Randall'],
 ['584',
  'dr_info/murphyjedidiah.html',
  'dr_info/murphyjedidiahlast.html',
  'Murphy',
  'Jedidiah',
  '999392',
  '48',
  '10/10/2023',
  'White',
  ' Dallas'],
 ['583',
  'dr_info/brownarthur.jpg',
  'dr_info/brownarthurlast.html',
  

In [51]:
table_list

['Execution',
 'Link',
 'Link',
 'Last Name',
 'First Name',
 'TDCJNumber',
 'Age',
 'Date',
 'Race',
 'County',
 ['588',
  'dr_info/gonzalesramiro.html',
  'dr_info/gonzalesramirolast.html',
  'Gonzales',
  'Ramiro',
  '999513',
  '41',
  '6/26/2024',
  'Hispanic',
  ' Medina'],
 ['587',
  'dr_info/cantuivan.html',
  'dr_info/cantuivanlast.html',
  'Cantu',
  'Ivan',
  '999399',
  '50',
  '2/28/2024',
  'Hispanic',
  ' Collin'],
 ['586',
  'dr_info/renteriadavid.html',
  'dr_info/renteriadavidlast.html',
  'Renteria',
  'David',
  '999460',
  '53',
  '11/16/2023',
  'Other',
  ' El Paso'],
 ['585',
  'dr_info/brewer.jpg',
  'dr_info/brewerbrentlast.html',
  'Brewer',
  'Brent',
  '999000',
  '53',
  '11/9/2023',
  'White',
  ' Randall'],
 ['584',
  'dr_info/murphyjedidiah.html',
  'dr_info/murphyjedidiahlast.html',
  'Murphy',
  'Jedidiah',
  '999392',
  '48',
  '10/10/2023',
  'White',
  ' Dallas'],
 ['583',
  'dr_info/brownarthur.jpg',
  'dr_info/brownarthurlast.html',
  'Brown, Jr.

In [65]:
info = pd.DataFrame(table_list, columns = name_list)
info

Unnamed: 0,Execution,Inmate Information,Last Words,Last Name,First Name,TDCJNumber,Age,Date,Race,County
0,588,dr_info/gonzalesramiro.html,dr_info/gonzalesramirolast.html,Gonzales,Ramiro,999513,41,6/26/2024,Hispanic,Medina
1,587,dr_info/cantuivan.html,dr_info/cantuivanlast.html,Cantu,Ivan,999399,50,2/28/2024,Hispanic,Collin
2,586,dr_info/renteriadavid.html,dr_info/renteriadavidlast.html,Renteria,David,999460,53,11/16/2023,Other,El Paso
3,585,dr_info/brewer.jpg,dr_info/brewerbrentlast.html,Brewer,Brent,999000,53,11/9/2023,White,Randall
4,584,dr_info/murphyjedidiah.html,dr_info/murphyjedidiahlast.html,Murphy,Jedidiah,999392,48,10/10/2023,White,Dallas
...,...,...,...,...,...,...,...,...,...,...
583,5,dr_info/skillerndoyle.jpg,dr_info/skillerndoylelast.html,Skillern,Doyle,518,49,01/16/1985,White,Lubbock
584,4,dr_info/barefootthomas.jpg,dr_info/barefootthomaslast.html,Barefoot,Thomas,621,39,10/30/1984,White,Bell
585,3,dr_info/obryanronald.jpg,dr_info/obryanronaldlast.html,O'Bryan,Ronald,529,39,03/31/1984,White,Harris
586,2,dr_info/autryjames.html,dr_info/no_last_statement.html,Autry,James,670,29,03/14/1984,White,Jefferson


In [66]:
## save
info.to_csv('../data/executed_inmates.csv', index=False)