# Extracting Data from NamUs

NamUs (https://www.findthemissing.org/en) is the National Missing and Unidentified Persons System funded by the U.S. Department of Justice. NamUs does not provide an API, but does provide a searchable interface. 

In this notebook, I want to show how to retrieve missing persons data from NamUs using Selenium and save it to disk as a CSV file. 

In [1]:
import time, re, json
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import numpy as np
import pandas as pd

Try to search for all missing persons in Alabama.

In [2]:
# Start Chrome with a driver binary fetched by webdriver_manager, then open
# the NamUs landing page.
driver_path = ChromeDriverManager().install()
browser = webdriver.Chrome(driver_path)
# Alternatively, pass the path of a locally installed chromedriver binary.
browser.get("https://www.findthemissing.org/en")
time.sleep(5)  # fixed pause to let the page finish rendering

# Dismiss the notification dialog that is shown when the page opens.
dismiss_buttons = browser.find_elements_by_xpath('''//*[@id="LegacyMigrationNotification"]/div[1]/div[2]/button''')
dismiss_buttons[0].click()

# Focus the "state" input of the quick-search form so its option list opens.
state_inputs = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/section/div/div/div[2]/aside/quick-search/div/div[2]/form/fieldset/label[4]/div/ul/li/input''')
states_input = state_inputs[0]
states_input.click()

time.sleep(3)

# The option list renders one state per line; split its text into a list.
state_list_element = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/section/div/div/div[2]/aside/quick-search/div/div[2]/form/fieldset/label[4]/div/div/ul/li''')[0]
states = state_list_element.text.split("\n")
print(states)

# Choose the first state in the list by typing its name and pressing Enter.
first_state = states[0]
states_input.send_keys(first_state + Keys.ENTER)

time.sleep(2)

# Submit the quick-search form and pause while the results page loads.
search = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/section/div/div/div[2]/aside/quick-search/div/div[2]/form/div[2]/input[2]''')[0]
search.submit()
time.sleep(2)

[WDM] - Current google-chrome version is 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183
[WDM] - Get LATEST driver version for 85.0.4183


 


[WDM] - Trying to download new driver from http://chromedriver.storage.googleapis.com/85.0.4183.87/chromedriver_mac64.zip
[WDM] - Driver has been saved in cache [/Users/flynn_chen/.wdm/drivers/chromedriver/mac64/85.0.4183.87]


['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Guam', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming']


In [3]:
# Read the number of matching cases from the results heading, whose text
# ends with " Missing Persons Cases Found". Thousands separators are removed
# before converting so that a count such as "1,163" does not make int() fail.
heading_text = browser.find_elements_by_class_name("case-results-heading")[0].text
total_in_state = int(
    heading_text.replace(" Missing Persons Cases Found", "").replace(",", "")
)
print(total_in_state)

235


Instead of paging through 15 pages of results with about 10 cases per page, it makes more sense to display 100 cases per page.

In [4]:
# Locate the "results per page" drop-down of the search-results pager.
page_size_xpath = '''//*[@id="visitor"]/div[1]/div[4]/form/div[2]/section[2]/div/div/div/div/div[3]/div[3]/search-results-pager/ng-include/div/div/div/label/select'''
expand_table = browser.find_elements_by_xpath(page_size_xpath)[0]

# Collect the visible label of every <option>, then click the one labelled
# '100' so each results page shows up to 100 rows.
expand_options = expand_table.find_elements_by_tag_name("option")
expand_text = list(map(lambda option: option.text, expand_options))
hundred_position = expand_text.index('100')
expand_options[hundred_position].click()

# Fixed pause while the grid re-renders with the new page size.
time.sleep(2)

In [5]:
def get_data_table(browser, n_cols=11):
    """Scrape the results grid currently displayed in *browser*.

    Every element with the CSS class ``ui-grid-cell-contents`` is read in
    document order and its text is grouped into rows of ``n_cols`` cells.
    The first row is used as the column header and removed from the data,
    so the returned frame is indexed from 1.

    Args:
        browser: Object exposing ``find_elements_by_class_name`` (a Selenium
            WebDriver) positioned on a search-results page.
        n_cols: Number of cells per grid row. Defaults to 11, the width the
            original implementation hard-coded.

    Returns:
        pandas.DataFrame with one row per grid row below the header.

    Raises:
        IndexError: If no grid cells are found (for example because the
            table has not rendered yet).
    """
    cells = browser.find_elements_by_class_name("ui-grid-cell-contents")
    cell_texts = [cell.text for cell in cells]
    if not cell_texts:
        # Same exception type pandas would raise on the empty frame below,
        # but with a message that points at the actual cause.
        raise IndexError(
            "no 'ui-grid-cell-contents' cells found; "
            "the results grid may not have rendered yet"
        )

    # Chop the flat list of cell texts into consecutive rows of n_cols cells.
    rows = [
        cell_texts[start:start + n_cols]
        for start in range(0, len(cell_texts), n_cols)
    ]
    grid_df = pd.DataFrame.from_records(rows)

    # Promote the first row to the header, then drop it from the data.
    grid_df.columns = grid_df.iloc[0]
    return grid_df.drop(index=0)

# Preview the table scraped from the results page currently shown in the browser.
get_data_table(browser)

Unnamed: 0,Case Number,DLC,Last Name,First Name,Missing Age,City,County,State,Sex,Race / Ethnicity,Date Modified
1,MP73132,08/23/2020,Ragland,Kristie,52 Years,Coker,Tuscaloosa,AL,Female,White / Caucasian,08/31/2020
2,MP67982,03/08/2020,Watson,Douglas,29 Years,West Jefferson,Jefferson,AL,Male,White / Caucasian,06/22/2020
3,MP64666,01/16/2020,Edwards,Katrina,16 Years,Mobile,Mobile,AL,Female,White / Caucasian,08/12/2020
4,MP67978,01/08/2020,Kennebrew,Judy,65 Years,Tallassee,Tallapoosa,AL,Female,Black / African American,04/08/2020
5,MP71439,10/21/2019,Taylor,Lonnie,78 Years,Jasper,Walker,AL,Male,White / Caucasian,08/31/2020
...,...,...,...,...,...,...,...,...,...,...,...
96,MP23660,04/25/2009,Key,Scott,45 Years,Oakman,Walker,AL,Male,White / Caucasian,05/08/2020
97,MP7380,04/08/2009,Rose,Warner,29 Years,Selma,Dallas,AL,Male,Black / African American,02/04/2020
98,MP5014,03/23/2009,Sanchez,Victor,25 Years,Abbeville,Henry,AL,Male,Hispanic / Latino,02/11/2020
99,MP5211,01/21/2009,Adkins,Layla,32 Years,Leeds,Shelby,AL,Female,White / Caucasian,07/20/2020


Since we cannot really tell if the table has loaded correctly, we should write a function to keep checking if the rows have increased to 100.

In [6]:
# one_states_df = pd.DataFrame()
# for i in range(int(total_in_state/100) + 1):
#     print(i)
#     temp_df = get_data_table(browser)
    
#     if one_states_df.empty == True:
#         one_states_df = temp_df
#     else:
#         one_states_df = pd.concat([one_states_df, temp_df])
    
#     next_page = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/div[4]/form/div[2]/section[2]/div/div/div/div/div[5]/div/search-results-pager/ng-include/div/div/div/nav/button[2]''')[0]
#     next_page.click()
#     time.sleep(2)
    
# print(one_states_df)
    

Iterate over all the states and get the data

In [None]:
# Scrape every state listed in `states` (built by the earlier search cell)
# and combine the per-state tables into `all_states_df`.
#
# For each state the full flow is repeated from the landing page: dismiss
# the notification, pick the state, submit the search, switch the pager to
# 100 rows, then walk the result pages. Any exception restarts that state
# from scratch, up to MAX_ATTEMPTS times; states that never succeed are
# collected in `failed_states` instead of retrying forever.
PAGE_SIZE = 100     # rows per page after selecting '100' in the pager
MAX_ATTEMPTS = 10   # upper bound on tries per state

all_states_df = pd.DataFrame()
state_frames = []   # one DataFrame per successfully scraped state
failed_states = []  # states abandoned after MAX_ATTEMPTS failures

for one_state in states:
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            browser.get("https://www.findthemissing.org/en")
            time.sleep(2)

            # Dismiss the notification dialog shown when the page opens.
            browser.find_elements_by_xpath('''//*[@id="LegacyMigrationNotification"]/div[1]/div[2]/button''')[0].click()
            states_input = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/section/div/div/div[2]/aside/quick-search/div/div[2]/form/fieldset/label[4]/div/ul/li/input''')[0]
            states_input.click()
            time.sleep(3)

            # Select the state and submit the quick-search form.
            states_input.send_keys(one_state + Keys.ENTER)
            search = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/section/div/div/div[2]/aside/quick-search/div/div[2]/form/div[2]/input[2]''')[0]
            search.submit()
            time.sleep(3)

            # Result count from the heading; commas are thousands separators.
            total_in_state = int(browser.find_elements_by_class_name("case-results-heading")[0].text.replace(" Missing Persons Cases Found", "").replace(",", ""))
            print(total_in_state)

            # Switch the pager to 100 rows per page.
            expand_table = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/div[4]/form/div[2]/section[2]/div/div/div/div/div[3]/div[3]/search-results-pager/ng-include/div/div/div/label/select''')[0]
            expand_options = expand_table.find_elements_by_tag_name("option")
            expand_text = [e.text for e in expand_options]
            expand_options[expand_text.index('100')].click()
            time.sleep(2)

            # Ceiling division, so an exact multiple of PAGE_SIZE does not
            # produce an extra iteration; always scrape at least one page.
            n_pages = max(1, (total_in_state + PAGE_SIZE - 1) // PAGE_SIZE)

            page_frames = []
            for page in range(n_pages):
                temp_df = get_data_table(browser)
                if not temp_df.empty:
                    page_frames.append(temp_df)

                # Advance only while another page remains; there is nothing
                # to load after the last one.
                if page < n_pages - 1:
                    next_page = browser.find_elements_by_xpath('''//*[@id="visitor"]/div[1]/div[4]/form/div[2]/section[2]/div/div/div/div/div[5]/div/search-results-pager/ng-include/div/div/div/nav/button[2]''')[0]
                    next_page.click()
                    time.sleep(2)

            # Concatenate once per state instead of once per page.
            one_states_df = pd.concat(page_frames) if page_frames else pd.DataFrame()

        except Exception as e:
            # Best-effort scraping: report the problem, pause, and retry
            # this state from the landing page.
            print(e)
            time.sleep(3)
        else:
            if not one_states_df.empty:
                state_frames.append(one_states_df)
            break
    else:
        # Every attempt failed; remember the state and move on.
        failed_states.append(one_state)
        print("Giving up on " + one_state + " after " + str(MAX_ATTEMPTS) + " attempts")

if state_frames:
    all_states_df = pd.concat(state_frames)
        

235
Message: element not interactable
  (Session info: chrome=85.0.4183.102)

Message: element not interactable
  (Session info: chrome=85.0.4183.102)

Message: element not interactable
  (Session info: chrome=85.0.4183.102)

Message: element not interactable
  (Session info: chrome=85.0.4183.102)

1163


In [None]:
# Write the combined table for all states to namus.csv in the working directory.
all_states_df.to_csv("namus.csv")

In [None]:
# Close the browser window now that scraping is finished. The parentheses
# are required: a bare `browser.close` only references the method and does
# not call it.
browser.close()

In [None]:
#facebook
#google
#twitter
#google