This one cleans all files in pre1790 debt certs (ie. who the gov needs to pay back and when)
90th is a cent (ie. instead of 100 cents per dollar, 90 cents per dollar)

In [62]:
#Imports
import pandas as pd
import datetime
import numpy as np
import json
import os
from fuzzywuzzy import fuzz

import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
import ssl

from nameparser import HumanName

In [63]:
#Load the aggregated file
og_df = pd.read_csv("../../cleaning_CD/pre1790/data/final_agg_debt.csv")

#Load the changes dataframe
corrections_df = None
if not os.path.exists("../../cleaning_CD/pre1790/name_changes_liam.csv"):
    corrections_df = pd.DataFrame({'og_title': pd.Series(dtype='str'),
                       'og_fname': pd.Series(dtype='str'),
                       'og_lname': pd.Series(dtype='str'),
                       'new_title': pd.Series(dtype='str'),
                       'new_fname': pd.Series(dtype='str'),
                       'new_lname': pd.Series(dtype='str'),
                       'cleaning_case': pd.Series(dtype='int'),
                       'file_loc': pd.Series(dtype='str'),
                       'org_index': pd.Series(dtype='int')})
else:
    corrections_df = pd.read_csv("../../cleaning_CD/pre1790/name_changes_liam.csv")

  og_df = pd.read_csv("../../cleaning_CD/pre1790/data/final_agg_debt.csv")


### Helper functions
 - retrieve_correction: Get the correction for the title, fname and lname in the dataframe
 - save_correction: Save the correction, given the original and new names
 - process_date: (Unused) Correct dates by prompting the user
 - text_contains_human_name: Returns an array of human names in the supplied text, empty array if no human names (https://unbiased-coder.com/extract-names-python-nltk/)

In [64]:
#Ask running user if they want to enable manual corrections
enable_manual_corrections = input("Enable manual correction system (yes, no)? (DO NOT ENABLE IF YOU ARE NOT READY TO MAKE MANUAL CORRECTIONS) > ")
enable_manual_corrections = True if enable_manual_corrections == "yes" else False

def retrieve_correction(og_title, og_fname, og_lname):
    '''
    Looks for a correction in the corrections dataframe
    '''
    for index, row in corrections_df.iterrows():
        if row["og_title"] == og_title and row["og_fname"] == og_fname and row["og_lname"] == og_lname:
            return (row["new_title"], row["new_fname"], row["new_lname"])
    return None

def save_manual_correction(og_title, og_fname, og_lname, new_title, new_fname, new_lname, clean_case, file, org_i, is_manual):
    """
    Saves a correction to the correction df
    """
    if is_manual and not enable_manual_corrections: return
    corrections_df.loc[len(corrections_df.index)] = [
        og_title, og_fname, og_lname,
        new_title, new_fname, new_lname,
        clean_case, file, org_i]

#Download the necessary NLTK models for the below function
#Change the below to True to use the workaround in case downloads don't work
if True:
    try:
        _unverified = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _unverified
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
def get_tags(text):
    nltk_results = ne_chunk(pos_tag(word_tokenize(text)))
    tags = {}
    for nltk_result in nltk_results:
        if type(nltk_result) == Tree:
            name = ''
            for nltk_result_leaf in nltk_result.leaves():
                name += nltk_result_leaf[0] + ' '
            tags[name] = nltk_result.label()
    return tags

def process_date(yr, mon, day, is_issued_date: bool, state_code, index):
    """ Dates in the files can sometimes be invalid, specifically:\n
     - month and year are swapped\n
     - Typos in the year column (ex. 17780)\n
     - Dates that are impossible (Feburary 31, there are only 28 days in feburary)\n
    Args:
        yr (int): Year
        mon (int): Month
        day (int): Day
        is_issued_date (bool): specifies whether this date is the date a certificate is issued or the date is the maturity.
        state_code (str): state code
        index (int): index of row

    Returns:
        (int: ordinal of the date (datetime.toordinal(s)), bool: did a manual correction need to be made?)
    """
    try:
        d = datetime.date(int(yr), int(mon), int(day))
        return (d.toordinal(), False)
    except Exception as e:
        if "10: ''" in str(e): #ie. the "Invalid literal for base 10: ''" error, which means blank, which means just make it 0
            return (0, False)
        manual = retrieve_manual_correction(state_code, index)
        if manual == None:
            if 'month must' in str(e): #ie. month must be in range 1..12 - just swap month and day
                d = datetime.date(yr, day, mon)
                return (d.toordinal(), False)
            new = input(f"{state_code}: {'RE, ' if ('range' in str(e)) else ''}{'Issued: ' if is_issued_date else 'Expiries: '} {yr} {mon} {day} (yr-mon-day):")
            if new == "" and is_issued_date == False:
                return (0, False)
            d = datetime.date(int(new.split()[0]), int(new.split()[1]), int(new.split()[2]))
            return (d.toordinal(), True)
        else:
            return (int(manual[1].split('-')[0]) if is_issued_date else int(manual[1].split('-')[1]), False)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/liamloughead/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/liamloughead/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/liamloughead/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/liamloughead/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [31]:
dups_removed = og_df.drop_duplicates(subset=["to whom due | first name"])

{'Roxbury ': 'ORGANIZATION'}

Removal of uncleanable entries below. These include:
 - Names that are more than 5 words

In [65]:
#Super fast method - instead of going through it and adding to a new dataset,
#use apply with a simple function that doesn't include long strings in a new dataset
og_df = og_df[og_df['to whom due | first name'].apply(lambda name: len(str(name).split()) > 5) == False]
og_df = og_df[og_df['to whom due | last name'].apply(lambda name: len(str(name).split()) > 5) == False]

Objectives 4 & 8: Standardizing Estate/Town/State/Heir of

For state and town of, leave in the state and town in to describe the organization type, ie. First name: "State of New York", Last name: "" -> First name: "State", Last name: "New York"

For estate and heir of, just get rid of Estate of and Heir of, ie. First name: "Estate of William Garrett", Last name: "" -> First name: "William", Last name: "Garrett"

Uses fuzzy to make sure that spelling mistakes don't trip it up

Standardizes names AND ticks the "organization?" flag if state or town

In [70]:
agg_debt = pd.DataFrame(columns=og_df.columns)
agg_debt["organization?"] = False

manual_corrections = [
    {"og_fname": "State of William Sweet",
     "new_title": "",
     "new_fname": "William", 
     "new_lname": "Sweet"},
    {"og_fname": "Estateof Doct James Front",
     "new_title": "Doct",
     "new_fname": "James",
     "new_lname": "Front"},
    {"og_fname": "Estate of Capt John Williams",
     "new_title": "Capt",
     "new_fname": "John",
     "new_lname": "Williams"}
]

def handle_ofs(row):
    og_fname = str(row["to whom due | first name"])
    og_lname = str(row["to whom due | last name"])
    title = str(row["to whom due | title"])
    
    for c in manual_corrections:
        if c["og_fname"] == og_fname:
            row["to whom due | first name"] = c["new_fname"]
            row["to whom due | last name"] = c["new_lname"]
            row["to whom due | title"] = c["new_title"]
            return row
    
    og_fname = og_fname.replace("the ", "").replace("The ", "")
    og_lname = og_lname.replace("the ", "").replace("The ", "")
    
    if len(og_fname.split()) > 2:
        prefix = og_fname.split()[0] + og_fname.split()[1]
        prefix = prefix.lower()
        if fuzz.ratio(prefix, "state of") >= 88 and "est" not in prefix: #"not in" so that this one won't pick up "Estate of"
            lname =  "-".join(og_fname.split()[2:])
            fname = "State"
            save_manual_correction(title, row["to whom due | first name"], row["to whom due | last name"], title, fname, lname, 8, row["org_file"], row["org_index"], is_manual=False)
            row["to whom due | first name"] = fname
            row["to whom due | last name"] = lname
            row["organization?"] = True
        elif fuzz.ratio(prefix, "town of") >= 88:
            lname =  "-".join(og_fname.split()[2:])
            fname = "Town"
            save_manual_correction(title, row["to whom due | first name"], row["to whom due | last name"], title, fname, lname, 8, row["org_file"], row["org_index"], is_manual=False)
            row["to whom due | first name"] = fname
            row["to whom due | last name"] = lname
            row["organization?"] = True
        elif fuzz.ratio(prefix, "estate of") >= 85 and "est" in prefix: #"in prefix" so that this one won't pick up "State of"
            #print(og_fname.split()[2:])
            name = " ".join(og_fname.split()[2:])
            fname =  name.split()[0]
            lname = name.split()[1:] if len(name.split()) > 1 else ""
            if len(lname) == 0 and row["to whom due | last name"] != "": lname = row["to whom due | last name"]
            if type(lname) == list: lname = " ".join(lname)
            save_manual_correction(title, row["to whom due | first name"], row["to whom due | last name"], title, fname, lname, 4, row["org_file"], row["org_index"], is_manual=False)
            row["to whom due | first name"] = fname
            row["to whom due | last name"] = lname
        elif fuzz.ratio(prefix, "heir of") >= 85 or fuzz.ratio(prefix, "heirs of") >= 85:
            name = " ".join(og_fname.split()[2:])
            fname =  name.split()[0]
            lname = name.split()[1:] if len(name.split()) > 1 else ""
            save_manual_correction(title, row["to whom due | first name"], row["to whom due | last name"], title, fname, lname, 4, row["org_file"], row["org_index"], is_manual=False)
            row["to whom due | first name"] = fname
            row["to whom due | last name"] = lname
    return row

agg_debt = og_df.apply(lambda row: handle_ofs(row), axis=1)

Objective 14: Standardizing organization names entirely in the first name colummn

This is very similiar to State and Town of, except it is supposed to catch ALL organizations using NLTK

In [None]:
def handle_all_orgs(row):
    og_fname = str(row["to whom due | first name"])
    og_lname = str(row["to whom due | last name"])
    title = row["to whom due | title"]
    
    fname, lname = "", ""
    if len(og_fname.split()) > 2 and (("of " in og_fname) or (" of" in og_fname)):
        tags = get_tags(og_fname)
        is_org = False
        print(tags)
        for token, tag in tags.items():
            if tag == "ORGANIZATION" or tag == "GPE": #Geo political entity
                is_org = True
        if not is_org: return row
        row["organization?"] = True
        before_of, after_of = og_fname.split("of")
        fname = before_of.strip().replace("-", "")
        lname = after_of.strip().replace("-", "")
        save_manual_correction(title, og_fname, og_lname, title, fname, lname, 14, row["org_file"], row["org_index"], is_manual=False)
        row["to whom due | first name"] = fname
        row["to whom due | last name"] = lname
    return row

agg_debt = agg_debt.apply(lambda row: handle_all_orgs(row), axis=1)

In [79]:
corrections_df = corrections_df.iloc[0:0]

Objective 9: Names that are entirely in the first or last name column

The following checks if one column is completley blank and there seems to be a name in the other, if so use HumanName to automatically parse it and put first and last names where they belong

In [80]:
def correct_full_names_in_column(row):
    if row["organization?"] == True: return row #ignore orgnizations
    fname = str(row["to whom due | first name"])
    lname = str(row["to whom due | last name"])
    name = None
    if lname == "" and len(fname.split()) >= 2:
        name = HumanName(fname)
    if fname == "" and len(lname.split()) >= 2:
        name = HumanName(lname)
    if name == None:
        return row
    else:
        save_manual_correction(row["to whom due | title"], fname, lname, row["to whom due | title"], name.first, name.last, 9, row["org_file"], row["org_index"], is_manual=False)
        row["to whom due | first name"] = name.first
        row["to whom due | last name"] = name.last
        return row

agg_debt = agg_debt.apply(lambda row: correct_full_names_in_column(row), axis=1)

KeyError: 'organization?'

Objective 7: Filling in blank columns

This one is simple: Just make any blank column UNDEFINED. Note that this has to run AFTER objective 9 (Names that are entirely in the first or last name column fixes).

In [76]:
def handle_blank_name_cols(row):
    fname = str(row["to whom due | first name"])
    lname = str(row["to whom due | last name"])
    if fname == "": fname = "UNDEFINED" # if there is no first name, make it undefined
    elif lname == "": lname = "UNDEFINED" # if there is no last name, make it undefined
    else: return row # if both aren't blank, return the row now
    save_manual_correction(row["to whom due | title"], row["to whom due | first name"], row["to whom due | last name"], row["to whom due | title"], fname, lname, 7, row["org_file"], row["org_index"], is_manual=False)
    row["to whom due | first name"] = fname
    row["to whom due | last name"] = lname
    return row

agg_debt = og_df.apply(lambda row: handle_blank_name_cols(row), axis=1)

Objective 12: Deceased individuals

Checks if "dec'd" or other words that indicated a deceased individual

In [78]:
agg_debt["deceased?"] = False

def check_deceased(row):
    fname = str(row["to whom due | first name"])
    lname = str(row["to whom due | last name"])
    fullname = str(fname) + " " + str(lname)
    for word in fullname.lower().split():
        if " dead" or " decease" or " passed" or " dec'd" or " dec." or " decd" or " deceasd" in word:
            row["deceased?"] = True
            fname = fname.replace(word, "")
            lname = lname.replace(word, "")
            save_manual_correction(row["to whom due | title"], row["to whom due | first name"], row["to whom due | last name"], row["to whom due | title"], fname, lname, 12, row["org_file"], row["org_index"], is_manual=False)
            row["to whom due | first name"] = fname
            row["to whom due | last name"] = lname
    return row

agg_debt = og_df.apply(lambda row: check_deceased(row), axis=1)

TypeError: can only concatenate str (not "float") to str

In [42]:
def check_note_present(note):
    """Checks if a transcriber note is just a number or actually useful

    Args:
        note (str): the note

    Returns:
        bool: if the transcriber note is useful
    """
    if note != "":
        if "Lo" in note or "Ro" in note:
            return False
        try:
            float(note.strip())
            return False
        except:
            pass
        return True
    return False

def add_dash_to_prefix(name, prefix):
    name = name.replace(prefix + " ", prefix + "-")
    name = name.replace(" " + prefix, "-" + prefix)
    return name

agg_cols = list(og_df.columns)
agg_cols.append("deceased?")
agg_df = pd.DataFrame(columns=agg_cols)

for index, row in og_df.iterrows():
    row = row.replace(np.nan,'',regex=True)
    row["deceased?"] = False
    title, fname, lname = row["to whom due | title"].strip(), row["to whom due | first name"].strip(), row["to whom due | last name"].strip()
    if len(fname.split()) == 1 and len(lname.split()) == 1:
        agg_df.loc[len(agg_df.index)] = row
        #agg_df = pd.concat([agg_df, row.to_frame().T], ignore_index=True)
        continue
    #"Van <something>" and Zee last name and first name replacements
    lname = add_dash_to_prefix(lname, "Van")
    fname = add_dash_to_prefix(fname, "Van")
    lname = add_dash_to_prefix(lname, "Zee")
    fname = add_dash_to_prefix(fname, "Zee")
    lname = add_dash_to_prefix(lname, "Zie")
    fname = add_dash_to_prefix(fname, "Zie")
    #Le, Mc, De name prefixes/suffixes
    lname = add_dash_to_prefix(lname, "Le")
    fname = add_dash_to_prefix(fname, "Le")
    lname = add_dash_to_prefix(lname, "De")
    fname = add_dash_to_prefix(fname, "De")
    lname = add_dash_to_prefix(lname, "Mc")
    fname = add_dash_to_prefix(fname, "Mc")

    #Get rid of "the" prefixes
    fname = fname.replace("The ", "", 1)
    fname = fname.replace("the ", "", 1)
    
    #Jr, Sr, 1st, 2nd
    lname = add_dash_to_prefix(lname, "Jr")
    fname = add_dash_to_prefix(fname, "Jr")
    lname = add_dash_to_prefix(lname, "Sr")
    fname = add_dash_to_prefix(fname, "Sr")
    lname = add_dash_to_prefix(lname, "1st")
    fname = add_dash_to_prefix(fname, "1st")
    lname = add_dash_to_prefix(lname, "2nd")
    fname = add_dash_to_prefix(fname, "2nd")
    lname = add_dash_to_prefix(lname, "2d")
    fname = add_dash_to_prefix(fname, "2d")
    #And co handling
    fname = fname.replace(" & Co", "").replace(" & co", "").replace(" and Co", "").replace(" and co", "")
    lname = lname.replace(" & Co", "").replace(" & co", "").replace(" and Co", "").replace(" and co", "")
    
    #Deceased handling
    fullname = fname + " " + lname
    for word in fullname.split():
        if "dead" or "decease" or "passed" or "dec'd" or "dec." or "decd" in word:
            row["deceased?"] = True
            fname.replace(word, "")
            lname.replace(word, "")
    
    #State of/Hiers of/Estate of/Town of handling - makes first name "<thing>", last name: "<name>"
    #use fuzz because there are misspellings of Estate of and Heirs of
    if len(fname.split()) > 2:
        #State of/town of - "<thing>" 
        prefix = fname.split()[0] + fname.split()[1]
        prefix = prefix.lower()
        if fuzz.ratio(prefix, "state of") >= 88:
            lname =  "-".join(fname.split()[2:])
            fname = "State"
        elif fuzz.ratio(prefix, "town of") >= 88:
            lname =  "-".join(fname.split()[2:])
            fname = "Town"
        elif fuzz.ratio(prefix, "estate of") >= 85:
            name = fname.replace(fname.split()[0] + fname.split()[1], "")
            lname =  name.split()[0]
            fname = name.split()[1]
        elif fuzz.ratio(prefix, "heir of") >= 85:
            name = fname.replace(fname.split()[0] + fname.split()[1], "")
            lname =  name.split()[0]
            fname = name.split()[1]
    
    if " or " in row["to whom due | last name"]: continue
    
    if fname == "": fname = "UNDEFINED" # if there is no first name, make it undefined
    if lname == "": lname = "UNDEFINED" # if there is no last name, make it undefined
    
    #if len(fname.split()) > 5 or len(lname.split()) > 5: continue # do not even try with ones that are crazy long
    
    if len(fname.split()) == 2:
        #drop the dot - for example: "James F." -> "James F"
        if len(fname.replace(".", "").split()[1]) == 1: fname = fname.split()[0] # if middle initial in fname, drop it
        elif len(fname.split()[1]) >= 3: # usually means 2 names
            fname = fname.replace(" ", "-")
            if len(lname.split()) == 0: #usually means that first and last names are put into just first name column
                lname = fname.split("-")[1]
                fname = fname.split("-")[0]
    #Do the same above for the last name
    if len(lname.split()) == 2:
        if len(lname.replace(".", "").split()[1]) == 1: lname = lname.split()[0] # if initial in lname, drop it
        elif len(lname.split()[1]) >= 3: lname = lname.replace(" ", "-") # usually means 2 names
    if fname == "" and lname == "": continue # Drop ones with no name data
    
    if ('&' in fname) or (' and' in fname) or ('|' in fname):
        #Means there is co-ownership
        to_add = []
        sepr = ""
        if ("&" in fname): sepr = "&"
        if (" and" in fname): sepr = "and"
        if ("|" in fname): sepr = "|"
        for i in range(len(fname.split(sepr))):
            row["to whom due | first name"] = fname.split(sepr)[i].strip()
            row["to whom due | last name"] = "undefined"
            agg_df.loc[len(agg_df.index)] = row
            #agg_df = pd.concat([agg_df, row.to_frame().T], ignore_index=True)
        continue
    if len(fname.split()) != 1 or len(lname.split()) != 1:
        print(f"First: {fname}, Last: {lname}, Title: {title}")
        correction = retrieve_manual_correction(title + " " + fname + " " + lname)
        if correction != None:
            title_new, first_new, last_new, drop = correction
            if drop: continue
            row["to whom due | title"] = title_new
            row["to whom due | first name"] = first_new
            row["to whom due | last name"] = last_new
            agg_df.loc[len(agg_df.index)] = row
            #agg_df = pd.concat([agg_df, row.to_frame().T], ignore_index=True)
        else:
            new = input(f"In {row['org_file']}, First: {fname}, Last: {lname}, Title: {title} >>> ")
            if new == "s":
                break
            elif new == "k":
                fullname = title + " " + fname + " " + lname
                save_manual_correction(fullname, fullname.replace(" ", "-"), False)
            elif new == "":
                print("dropping")
                save_manual_correction(title + " " + fname + " " + lname, " ", True)
            else:
                split = new.split()
                nt, nf, nl = "", "", ""
                if len(split) == 2:
                    nf, nl = new.split()
                if len(split) == 3:
                    nt, nf, nl = new.split()
                save_manual_correction(title + " " + fname + " " + lname, nt + " " + nf + " " + nl, False)
    else:
        agg_df.loc[len(agg_df.index)] = row
        #agg_df = pd.concat([agg_df, row.to_frame().T], ignore_index=True)

First: Arch William Yard, Last: UNDEFINED, Title: 


In [None]:
#Loop that covers Objective 1 (aggregate data files)
def element_to_int(ele): # handles all kinds of Nans (returns 0 for nans)
    if type(ele) == np.float64:
        ele = round(ele)
    if ele == np.nan: return 0
    if str(ele) == "nan": return 0
    return round(np.float64(ele))

def get_dollar(row): #gets the dollar from a row by checking both dollar columns
    dollar = 0
    ninety = 0
    #if dollar (90th) is a decimal, then split it
    if '.' in str(element_to_int(row[11])): # "amount | dollar"
        split = str(element_to_int(row[11])).split(".")
        dollar, ninety = element_to_int(split[0]), element_to_int(split[1])
    elif str(row[11]) == "": # "amount | dollar"
        if '.' in str(element_to_int(row[24])): # "amount | specie"
            split = str(element_to_int(row[24])).split() # "amount | specie"
            dollar, ninety = element_to_int(split[0]), element_to_int(split[1])
    else:
        dollar = element_to_int(row[11]) # "amount | dollar"
        ninety = element_to_int(row[12]) # "amount | 90th"
    return float(str(dollar) + "." + str(ninety))

def new_tup(old_row, new_dol, new_ninety, new_title): # returns a new tuple, specifically for totaled debt amounts (since you can't assign new values in tuples)
    return (old_row[0], old_row[1], old_row[2], old_row[3], old_row[4], old_row[5], old_row[6],
            new_title, old_row[8], old_row[9], old_row[10], new_dol, new_ninety, old_row[13],
            old_row[14], old_row[15], old_row[16], old_row[17], old_row[18], old_row[19],
            old_row[20], old_row[21], old_row[22], old_row[23], old_row[24], old_row[25],
            old_row[26], old_row[27], old_row[28], old_row[29], old_row[30], old_row[31])

agg_df = pd.DataFrame(columns=og_df.columns)
last_f, last_l, last_t = "", "", ""
last_row = None
#save the sum of money
current_sum = 0
for row in og_df.itertuples(name=None, index=False): #main processing function
    fname, lname = str(row[5]).strip(), str(row[6]).strip()
    last_t = last_t if str(row[7]).strip().lower() == "nan" else str(row[7]).strip()
    if fname == last_f and lname == last_l: #If the next name is the same as the last one, add onto the amount
        dol = get_dollar(row)
        print(f"adding {dol} to {fname} {lname}'s total")
        current_sum += dol
    else: #If the next name is not the same as the last one:
        if current_sum > 0: #If the sum is more than 0 (ie. this is the end of consecutive same-name entries), then only add this on
            print(f"{last_row[5]} {last_row[6]} is consecutively owed {current_sum}")
            #consecutive has ended
            split = str(current_sum).split(".")
            agg_df.loc[len(agg_df.index)] = new_tup(last_row, int(split[0]), int(split[1]), last_t if last_t != "" else "")
            current_sum = 0
        else: #If the sum is not more than 0 (ie. this is one unique entry, add it on now)
            #Normal
            agg_df.loc[len(agg_df.index)] = last_row
    last_f, last_l = fname, lname
    last_row = row