In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException, ElementNotInteractableException, TimeoutException
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.expected_conditions import element_to_be_clickable, presence_of_element_located
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import csv 
import pandas as pd
import time
import math

In [2]:
# Chrome launch flags shared by every browser session started in this notebook.
options = Options()
for chrome_flag in (
    '--headless',                                    # run without a visible window
    "--window-size=1000,1000",                       # fixed viewport size
    '--disable-blink-features=AutomationControlled',
    '--no-sandbox',
):
    options.add_argument(chrome_flag)

# Populations of Towns

<font size="4"> The goal is to find the total population of every town in *final_data_CD.csv.* </font> <br>
* **Input**
    * ```final_data_CD.csv``` <br>
       - This contains the continental debt held by individuals. For this particular project, we only care about the towns that these individuals are from. <br>
* **Output**
    * ```town_pops_clean.csv``` <br>
       - This file will contain the populations of every town. 

In [3]:
# Load the continental-debt records and reduce them to the distinct
# (state, county, town) combinations that have no missing component.
cd_df = pd.read_csv("final_data_CD.csv")
towns = (
    cd_df[['Group State', 'Group County', 'Group Town']]
    .drop_duplicates()
    .dropna()
    .reset_index()  # keeps the original row label as an 'index' column; later cells index rows positionally
)
print(towns)

     index Group State           Group County       Group Town
0        2          RI         Bristol County          Bristol
1        3          CT        Hartford County         Hartford
2        5          RI      Providence County   North Scituate
3        6          CT       New Haven County        New Haven
4        7          NH      Rockingham County       Portsmouth
..     ...         ...                    ...              ...
460   3779          PA  Northumberland County          Sunbury
461   3796          PA     York County County             Fawn
462   3818          PA  Chester County County       New London
463   3836          NY   Queens County County  South Hempstead
464   3838          VA     King George County      King George

[465 rows x 4 columns]


In [4]:
# Remove (in place) every town in a state for which the Ancestry census
# collection has no records: VA, GA, NJ, DE.
states_without_records = ['VA', 'GA', 'NJ', 'DE']
rows_without_records = towns.index[towns['Group State'].isin(states_without_records)]
towns.drop(index=rows_without_records, inplace=True)
print(towns)

     index Group State               Group County       Group Town
0        2          RI             Bristol County          Bristol
1        3          CT            Hartford County         Hartford
2        5          RI          Providence County   North Scituate
3        6          CT           New Haven County        New Haven
4        7          NH          Rockingham County       Portsmouth
..     ...         ...                        ...              ...
459   3771          NY  Westchester County County          Stephen
460   3779          PA      Northumberland County          Sunbury
461   3796          PA         York County County             Fawn
462   3818          PA      Chester County County       New London
463   3836          NY       Queens County County  South Hempstead

[445 rows x 4 columns]


In [5]:
# Two-letter state code -> full state name, as typed into the search form
# (spelling the name out avoids potential errors).
codes = dict(
    CT='Connecticut',
    ME='Maine',
    MD='Maryland',
    MA='Massachusetts',
    NH='New Hampshire',
    NY='New York',
    NC='North Carolina',
    PA='Pennsylvania',
    RI='Rhode Island',
    SC='South Carolina',
    VT='Vermont',
)

In [6]:
# County names that need an explicit replacement before being searched.
exceptions = dict([
    ('Philadelphia County', 'Philadelphia'),
    ('Charleston County', 'Charleston'),
    ('New Haven County', 'New Haven'),
])

## Accessing Ancestry

1. Access the 1790 census on Ancestry.com using the Selenium library. 
2. Handle county names. Some county names have '[county name] County County'.
3. Access searchbar using Selenium. Selenium inputs [town, county, state, USA] into searchbar. 
4. Selenium clicks on the "Search" button and waits for 0.75 seconds. 
5. Once the new webpage is loaded, ```&event_x=0-0-0_1-0``` is appended to the current URL. This restricts the search to the town itself and excludes surrounding counties/towns. 
6. Selenium finds the total number of results, which corresponds to the population of the town. This population is added to the ```town_pops``` dictionary. If there are no results for that town, 'NR' is added to the dictionary instead. 
7. Steps 1-6 are repeated for every town. 

<span style="color: red;">**Note: Runtime will be long. There were approximately 400 unique towns.**</span>

In [10]:
town_pops = {} # dictionary of all the populations of each town 
error_handling = False

# uncomment when handling errors (post-running main script) 
# towns_l = er_towns

# uncomment when going through entire table
towns_l = towns.values.tolist()

# if we want to do it by section --> towns_l = towns_l[88:]
census_url = "https://www.ancestry.com/search/collections/5058/"

# Absolute XPaths into the collection's search form and results page.
SEARCH_BOX_XPATH = "/html/body/div[3]/div/div/div/div/section/div/div/div/div/div/form/div[1]/div/fieldset[2]/div[2]/div/input"
SEARCH_BUTTON_XPATH = "/html/body/div[3]/div/div/div/div/section/div/div/div/div/div/form/div[1]/div/div[9]/div[1]/input"
RESULT_TITLE_XPATH = "//*[@id='refineView']/form/div[3]/div/div[1]/div"
# Query-string suffix that restricts results to the exact town.
EXACT_PLACE_FILTER = "&event_x=0-0-0_1-0"
COUNTY_SUFFIX = " County"

# Resolve the chromedriver binary once instead of on every browser restart.
chromedriver_path = ChromeDriverManager().install()


def _start_driver():
    """Start a Chrome session with the notebook's options; return (driver, wait)."""
    new_driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)
    return new_driver, WebDriverWait(new_driver, 30)


def _quit_quietly(old_driver):
    """Shut a browser session down, ignoring failures from an already-dead session."""
    try:
        old_driver.quit()
    except Exception:
        # Best effort: the session may already have crashed, which is why we are here.
        pass


def _restart_driver(old_driver):
    """Dispose of old_driver and return a fresh (driver, wait) pair."""
    _quit_quietly(old_driver)
    return _start_driver()


def _strip_county_suffix(name):
    """Remove every trailing ' County' so 'York County County' and 'York County' both give 'York'."""
    name = name.strip()
    while name.endswith(COUNTY_SUFFIX):
        name = name[:-len(COUNTY_SUFFIX)].rstrip()
    return name


driver, wait = _start_driver()
try:
    for town in towns_l:
        # Rows are [index, state, county, town]. In the first pass the state is a
        # two-letter code and the county may read '[name] County County'; in the
        # error-handling pass the values are already clean.
        if not error_handling:
            # Naming exceptions are looked up on the raw county name, before
            # the location string is built, so they actually take effect.
            county = exceptions.get(town[2], _strip_county_suffix(town[2]))
            state = codes[town[1]]
        else:
            county = town[2]
            state = town[1]
        # The trailing space is intentional: later cells match on this exact key.
        loc_add = town[3] + ", " + county + ", " + state + ", USA "

        try:
            # open 1790 census; if the session is broken, replace the browser
            # (closing the only window and reusing the session cannot work)
            try:
                driver.get(census_url)
            except Exception:
                driver, wait = _restart_driver(driver)
                driver.get(census_url)

            print("-------------------------")
            print(loc_add) ## 

            # wait until event searchbar is clickable, then click on it
            try:
                wait.until(element_to_be_clickable((By.XPATH, SEARCH_BOX_XPATH))).click()
            except Exception:  # mostly timeouts: start over in a fresh browser and try once more
                driver, wait = _restart_driver(driver)
                driver.get(census_url)
                time.sleep(0.25)
                wait.until(element_to_be_clickable((By.XPATH, SEARCH_BOX_XPATH))).click()

            input_t = driver.find_element(By.XPATH, SEARCH_BOX_XPATH)
            input_t.send_keys(loc_add)
            time.sleep(0.75)

            # click on search button 
            wait.until(element_to_be_clickable((By.XPATH, SEARCH_BUTTON_XPATH))).click()
            time.sleep(0.75)

            # add restrictions: we want exact town population
            print(driver.current_url) ##
            driver.get(driver.current_url + EXACT_PLACE_FILTER)

            # check that the page resolved to the location we asked for
            title_pl = wait.until(presence_of_element_located((By.XPATH, RESULT_TITLE_XPATH))).get_attribute("title")
            if (title_pl + " " == loc_add):
                # the results label ends in "... of <total>"; the total is the population
                try:
                    pop = driver.find_element(By.CLASS_NAME, "resultsLabel")
                    town_pops[loc_add] = int(pop.text.split("of")[1].replace(',',''))
                except NoSuchElementException:
                    town_pops[loc_add] = "NR"
            else:
                print("Titles don't match")
                print("title on page = " + title_pl)
                town_pops[loc_add] = "NR"

            print(town_pops[loc_add]) ##
            print(len(town_pops)) ##
            print(driver.current_url) ##
            print("-------------------------")
        except Exception as e:
            print("___________________________________")
            print("ERROR! Moving swiftly to next town")
            print(e)
            print("___________________________________")
            town_pops[loc_add] = 'ER'
            # replace the browser, quitting the old one so Chrome processes do not pile up
            driver, wait = _restart_driver(driver)
finally:
    # always release the browser, including when the run is interrupted
    _quit_quietly(driver)

-------------------------
Bristol, Bristol, Rhode Island, USA 
https://www.ancestry.com/search/collections/5058/?event=_bristol-bristol-rhode+island-usa_5479
253
1
https://www.ancestry.com/search/collections/5058/?event=_bristol-bristol-rhode+island-usa_5479&event_x=0-0-0_1-0
-------------------------
-------------------------
Hartford, Hartford, Connecticut, USA 
https://www.ancestry.com/search/collections/5058/?event=_hartford-hartford-connecticut-usa_999
666
2
https://www.ancestry.com/search/collections/5058/?event=_hartford-hartford-connecticut-usa_999&event_x=0-0-0_1-0
-------------------------
-------------------------
North Scituate, Providence, Rhode Island, USA 
https://www.ancestry.com/search/collections/5058/?event=_north+scituate-providence-rhode+island-usa_5526
NR
3
https://www.ancestry.com/search/collections/5058/?event=_north+scituate-providence-rhode+island-usa_5526&event_x=0-0-0_1-0
-------------------------
-------------------------
New Haven, New Haven, Connecticut, 

KeyboardInterrupt: 

In [218]:
# Write the scraped populations to disk (one row per location key), then
# reload the file so the printed frame reflects exactly what was saved.
pd.DataFrame.from_dict(town_pops, orient="index").to_csv("town_pops.csv")
df = pd.read_csv("town_pops.csv", index_col=0)
# show the reloaded result
print(df)

                city          county          state country population
0            Bristol         Bristol   Rhode Island    USA         253
1           Hartford        Hartford    Connecticut    USA         666
2     North Scituate      Providence   Rhode Island    USA          NR
3          New Haven       New Haven    Connecticut    USA         919
4         Portsmouth      Rockingham  New Hampshire    USA         893
..               ...             ...            ...     ...        ...
398          Stephen     Westchester       New York    USA          NR
399          Sunbury  Northumberland   Pennsylvania    USA          NR
400             Fawn            York   Pennsylvania    USA         230
401       New London         Chester   Pennsylvania    USA         120
402  South Hempstead          Queens       New York    USA          NR

[403 rows x 5 columns]
Frequency of ER is : 0


## Fix Formatting

Why? As of right now, there are only two columns. The first column has the location, which includes the town, county, state, and country all in one cell. This is not easy to read. Therefore, town, county, state, and country must become their own columns. The next column has the population. There are no column titles either. The code below fixes these issues. 

In [166]:
# read csv 
towns_df = pd.read_csv("town_pops.csv")

# split the location string ("town, county, state, country") into Info_0, Info_1, ... columns
towns_df = towns_df.assign(**towns_df['Unnamed: 0'].str.split(', ', expand=True).add_prefix('Info_'))

# rename columns ('0' is the unnamed population column written by the previous step)
towns_df = towns_df.rename(columns={'0':'population', 'Info_0':'city', 'Info_1':'county', 
                                    'Info_2':'state', 'Info_3':'country'})

# keep only the named columns, in display order. This also discards the original
# location column and any extra Info_N columns, so no explicit drop is needed
# (dropping Info_4..Info_6 raised KeyError whenever no location had that many parts).
towns_df = towns_df[['city', 'county', 'state', 'country', 'population']]
print(towns_df)

towns_df.to_csv("town_pops_clean.csv")

                city          county          state country population
0            Bristol         Bristol   Rhode Island    USA         253
1           Hartford        Hartford    Connecticut    USA         666
2     North Scituate      Providence   Rhode Island    USA          NR
3          New Haven       New Haven    Connecticut    USA         919
4         Portsmouth      Rockingham  New Hampshire    USA         893
..               ...             ...            ...     ...        ...
398          Stephen     Westchester       New York    USA          NR
399          Sunbury  Northumberland   Pennsylvania    USA          NR
400             Fawn            York   Pennsylvania    USA         230
401       New London         Chester   Pennsylvania    USA         120
402  South Hempstead          Queens       New York    USA          NR

[403 rows x 5 columns]


## Handle Mistakes

Why? The code isn't perfect. If Selenium crashed, the program marked that town as 'ER' in ```town_pops``` and moved on to the next town. It's time to go back and fix these. There are also individual discrepancies that must be taken care of. The code below finds the rows in ```town_pops_clean.csv``` whose population is 'ER' or whose row index is in the ```er_indexes``` list, which holds the indexes of the individual towns to recheck. 

In [265]:
# Row indexes of individual towns to re-query even though they were not marked 'ER'.
er_indexes = [
    11,
    52,
    88,
    287,
]

In [266]:
# Collect the towns that need to be looked up again.
csv_town = pd.read_csv("town_pops_clean.csv")
csv_town = csv_town[["Unnamed: 0", "state", "county", "city", "country", "population"]]

# Each entry handed back to the scraper is [row id, state, county, city].
retry_columns = ["Unnamed: 0", "state", "county", "city"]

# rows where the webdriver crashed (population recorded as 'ER')
crashed_rows = csv_town['population'].isin(["ER"])
errors = csv_town.loc[crashed_rows, retry_columns].values.tolist()

# rows flagged by hand as individual discrepancies
flagged_rows = csv_town.index.isin(er_indexes)
indiv_disc = csv_town.loc[flagged_rows, retry_columns].values.tolist()

er_towns = errors + indiv_disc
print(er_towns)

[[11, 'Connecticut', 'Windham', 'Woodstock'], [52, 'Rhode Island', 'Kent', 'Warwick'], [88, 'North Carolina', 'Halifax', 'Halifax'], [287, 'New Hampshire', 'Grafton', 'Alexandria']]


## Rerunning the Program

At this point, run the cell under **Accessing Ancestry** again with the following changes: set ```error_handling``` equal to ```True```, uncomment the line ```towns_l = er_towns```, and comment out the line ```towns_l = towns.values.tolist()```. Once that's done, the code below replaces the populations of the re-run towns with the new values. 

In [269]:
# after rerunning program
print(town_pops) # should print out the new populations for each wrong town 
town_copy = pd.read_csv("town_pops_clean.csv")

# rebuild the "town, county, state, country" key used by town_pops
town_copy["location"] = town_copy["city"] + ", " + town_copy["county"] + ", " + town_copy["state"] + ", " + town_copy["country"]
print(town_copy[["population", "location"]]) 

# town_pops keys end in a trailing space; compare on stripped text so the
# match does not depend on that space surviving the CSV round trip
location_key = town_copy["location"].str.strip()
for town, population in town_pops.items():
    town_copy.loc[location_key == town.strip(), "population"] = population

town_copy = town_copy.drop(columns=["Unnamed: 0", "location"])
print(town_copy) 

# overwrite the cleaned file (previously written to "town_pops clean.csv",
# which no other cell reads, so the corrections were never picked up)
town_copy.to_csv("town_pops_clean.csv")

{'Woodstock, Windham, Connecticut, USA ': 331, 'Warwick, Kent, Rhode Island, USA ': 'NR', 'Halifax, Halifax, North Carolina, USA ': 'NR', 'Alexandria, Grafton, New Hampshire, USA ': 54}
    population                                        location
0          253            Bristol, Bristol, Rhode Island, USA 
1          666           Hartford, Hartford, Connecticut, USA 
2           NR  North Scituate, Providence, Rhode Island, USA 
3          919         New Haven, New Haven, Connecticut, USA 
4          893     Portsmouth, Rockingham, New Hampshire, USA 
..         ...                                             ...
398         NR            Stephen, Westchester, New York, USA 
399         NR     Sunbury, Northumberland, Pennsylvania, USA 
400        230                  Fawn, York, Pennsylvania, USA 
401        120         New London, Chester, Pennsylvania, USA 
402         NR         South Hempstead, Queens, New York, USA 

[403 rows x 2 columns]
                city          coun

In [270]:
# Count how many towns ended up with "no records" (population == 'NR').
town_pop_csv = pd.read_csv("town_pops_clean.csv")
res = town_pop_csv["population"].values.tolist().count('NR')
print(f"Frequency of NR is : {res}")

Frequency of NR is : 151
