This notebook cleans all files in the pre-1790 debt certificates (i.e. whom the government needs to pay back and when).
A 90th is the fractional unit of a dollar used here (i.e. a dollar is divided into 90 parts instead of 100 cents).

In [22]:
#Imports
import pandas as pd
import datetime
import numpy as np
import json
import os
from fuzzywuzzy import fuzz
import nltk

#Solution to get around certificate error: https://www.youtube.com/watch?v=IBmZAYR0pns
# NOTE(review): this swaps ssl's default HTTPS context factory for the
# unverified one, so the change is not limited to the nltk download below -
# presumably every later HTTPS request that relies on the default context
# also skips certificate verification; confirm this is acceptable.
import ssl
try:
    e = ssl._create_unverified_context
except AttributeError:
    # this ssl module has no _create_unverified_context; keep the default factory
    pass
else:
    ssl._create_default_https_context = e
# download the 'omw-1.4' nltk data package (wordnet is imported right after)
nltk.download('omw-1.4')
from nltk.corpus import wordnet

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/liamloughead/nltk_data...


In [2]:
#Helpers
def deNaN(series):
    """
    returns a new pandas series in which every non-string value (NaN, None,
    numbers, ...) is replaced with an empty string; the input is not modified
    :param series: pandas series
    :return: pandas series of the same length; string values, including
        instances of str subclasses, are kept unchanged
    """

    # isinstance (not an exact type comparison) so str subclasses are kept
    # instead of being blanked out
    return series.apply(lambda x: x if isinstance(x, str) else "")

In [3]:
#Load the aggregated debt file; og_df is the raw table the cleaning loops below iterate over
og_df = pd.read_csv("../../cleaning_CD/pre1790/data/final_agg_debt.csv")

  og_df = pd.read_csv("../../cleaning_CD/pre1790/data/final_agg_debt.csv")


Cleaning process:
1. fix all records (ie. remove extraneous information + combine them into agg)
   - possibilities of stuff to remove: "Estate of", "Heirs of",
   - possibilities of stuff to fix (ie. find the right name for): "and Co.", "Owners of",
   - first or last name missing: either make the other one undefined, or the entire name is in the first or last name column and it needs to be spread out
   - occupation could be in the name
   - some last names have spaces in them (specifically Van) - replace the spaces with dashes
   - so it needs to look like:
     - if fname split or lname split != 1:
       - if its just lname contains van, replace "Van " with "Van-"
       - check and auto remove Estate of, Heirs of (using the fuzzy matching in case of misspellings)
2. Combine consecutive rows with the same name into one row with a total amount of debt

In [6]:
#corrections.json file format - a list holding one object per corrected name:
# [
#     {
#         "name_original": "",
#         "title_new": "",
#         "first_new": "",
#         "last_new": "",
#         "drop": False
#     }
# ]

#Ask the running user whether manual corrections may be saved; only the exact answer "yes" enables them
enable_manual_corrections = input("Enable manual correction system (yes, no)? (DO NOT ENABLE IF YOU ARE NOT READY TO MAKE MANUAL CORRECTIONS) > ") == "yes"

def retrieve_manual_correction(original_name):
    '''
    Looks for a correction in the corrections.json file

    corrections.json is created in the working directory (holding an empty
    list) when it does not exist yet.

    Args:
        original_name (str): name to look up; compared for exact equality with
            each entry's "name_original"

    Returns:
        tuple (title_new, first_new, last_new, drop) of the first matching
        entry, or None when no entry matches
    '''
    if not os.path.exists("corrections.json"):
        with open("corrections.json", "w") as f:
            f.write("[]")
    # context manager so the handle is closed even if the JSON is malformed
    with open("corrections.json", "r") as f:
        corrections = json.load(f)
    for obj in corrections:
        if obj["name_original"] == original_name:
            return (obj["title_new"], obj["first_new"], obj["last_new"], obj["drop"])
    return None

def save_manual_correction(name_original, name_correct, drop: bool):
    """
    Saves a correction to corrections.json

    Does nothing unless enable_manual_corrections is truthy. corrections.json
    is created (holding an empty list) when it does not exist yet.

    Args:
        name_original (str): name as it appeared before correction; stored as
            the lookup key "name_original"
        name_correct (str): corrected name, split on whitespace: 2 tokens are
            stored as first + last name, 3 tokens as title + first + last name
        drop (bool): stored as the entry's "drop" flag

    A drop request whose name_correct has no usable name (e.g. " ") is stored
    with empty name fields. Any other token count cannot be mapped onto the
    title/first/last fields; it is reported and nothing is saved.
    """
    if not enable_manual_corrections: return
    if not os.path.exists("corrections.json"):
        with open("corrections.json", "w") as f:
            f.write("[]")
    with open("corrections.json", "r") as f:
        corrections = json.load(f)
    print("SMC: " + str(drop))

    parts = name_correct.split()
    if len(parts) == 2:
        title_new, first_new, last_new = "", parts[0], parts[1]
    elif len(parts) == 3:
        title_new, first_new, last_new = parts
    elif drop:
        # a dropped row needs no replacement name; without this branch the
        # drop decision was never written to disk
        title_new, first_new, last_new = "", "", ""
    else:
        # previously skipped without any message
        print("SMC: not saved, expected 2 or 3 name parts but got " + str(len(parts)))
        return

    corrections.append({
        "name_original": name_original,
        "title_new": title_new,
        "first_new": first_new,
        "last_new": last_new,
        "drop": drop
    })
    with open("corrections.json", "w") as f:
        json.dump(corrections, f)

def process_date(yr, mon, day, is_issued_date: bool, state_code, index):
    """ Converts a (year, month, day) triple into a date ordinal, repairing invalid dates.

    Dates in the files can sometimes be invalid, specifically:
     - month and day are swapped
     - Typos in the year column (ex. 17780)
     - Dates that are impossible (ex. February 31)

    Args:
        yr (int): Year
        mon (int): Month
        day (int): Day
        is_issued_date (bool): specifies whether this date is the date a certificate is issued or the date is the maturity.
        state_code (str): state code
        index (int): index of row

    Returns:
        (int: ordinal of the date (date.toordinal()), or 0 for a blank date,
         bool: did the user have to type in a corrected date?)
    """
    try:
        d = datetime.date(int(yr), int(mon), int(day))
        return (d.toordinal(), False)
    except (TypeError, ValueError, OverflowError) as e:
        reason = str(e)

    if "10: ''" in reason: #ie. the "Invalid literal for base 10: ''" error, which means blank, which means just make it 0
        return (0, False)

    # NOTE(review): retrieve_manual_correction as defined in this file takes a
    # single name argument and returns (title, first, last, drop); this call
    # and the manual[1] "issued-expiry" parsing below appear to expect a
    # different, date-oriented lookup - confirm which helper is intended.
    manual = retrieve_manual_correction(state_code, index)
    if manual is not None:
        return (int(manual[1].split('-')[0]) if is_issued_date else int(manual[1].split('-')[1]), False)

    if 'month must' in reason: #ie. month must be in range 1..12 - just swap month and day
        try:
            # same int() casts as the first attempt; the raw values may be strings or floats
            d = datetime.date(int(yr), int(day), int(mon))
            return (d.toordinal(), False)
        except ValueError:
            pass # the swapped date is invalid as well, fall through to asking the user

    new = input(f"{state_code}: {'RE, ' if ('range' in reason) else ''}{'Issued: ' if is_issued_date else 'Expiries: '} {yr} {mon} {day} (yr-mon-day):")
    if new == "" and is_issued_date == False:
        return (0, False)
    d = datetime.date(int(new.split()[0]), int(new.split()[1]), int(new.split()[2]))
    return (d.toordinal(), True)

In [31]:
def check_note_present(note):
    """Checks if a transcriber note is just a number or actually useful

    A note counts as not useful when it is empty, contains "Lo" or "Ro", or
    parses as a number once surrounding whitespace is stripped.

    Args:
        note (str): the note

    Returns:
        bool: if the transcriber note is useful
    """
    if note == "":
        return False
    if "Lo" in note or "Ro" in note:
        return False
    try:
        float(note.strip())
    except ValueError:
        # not a plain number, so the note carries real text
        return True
    return False

def add_dash_to_prefix(name, prefix):
    """Joins `prefix` to its neighbouring text with dashes instead of spaces

    First every "<prefix> " becomes "<prefix>-", then every " <prefix>" in the
    result becomes "-<prefix>".

    Args:
        name (str): the name to amend
        prefix (str): the name fragment, for example "Van" or "Jr"

    Returns:
        str: the amended name
    """
    trailing_joined = (prefix + "-").join(name.split(prefix + " "))
    return ("-" + prefix).join(trailing_joined.split(" " + prefix))

#Name-cleaning pass: walks every row of og_df, normalises the "to whom due"
#name columns and collects the usable rows in agg_df (with an added
#"deceased?" column)

#Fragments that flag the creditor as deceased when found inside a name token
DECEASED_MARKERS = ("dead", "decease", "passed")

#(keyword, minimum fuzz.ratio score, replacement first name) for first-name cells
#that start with a two-word descriptor. A replacement of None means the
#descriptor is stripped and the person's own name is kept
NAME_DESCRIPTORS = (
    ("state of", 88, "State"),
    ("town of", 88, "Town"),
    ("estate of", 85, None),
    ("heir of", 85, None),
)

def _is_deceased_marker(word):
    """Returns True when the token contains one of DECEASED_MARKERS (case-insensitive)"""
    lowered = word.lower()
    return any(marker in lowered for marker in DECEASED_MARKERS)

i = 0 #number of rows that had to go through the manual correction system
agg_cols = list(og_df.columns)
agg_cols.append("deceased?")
agg_df = pd.DataFrame(columns=agg_cols)

for index, row in og_df.iterrows():
    row = row.replace(np.nan,'',regex=True)
    row["deceased?"] = False
    title, fname, lname = row["to whom due | title"].strip(), row["to whom due | first name"].strip(), row["to whom due | last name"].strip()
    #Already one token each: nothing to clean
    if len(fname.split()) == 1 and len(lname.split()) == 1:
        agg_df.loc[len(agg_df.index)] = row
        continue

    #"Van <something>", Zee/Zie and Le, Mc, De prefixes: glue to the neighbouring word
    for prefix in ("Van", "Zee", "Zie", "Le", "De", "Mc"):
        lname = add_dash_to_prefix(lname, prefix)
        fname = add_dash_to_prefix(fname, prefix)

    #Get rid of "the" prefixes
    fname = fname.replace("The ", "", 1)
    fname = fname.replace("the ", "", 1)

    #Jr, Sr, 1st, 2nd
    for suffix in ("Jr", "Sr", "1st", "2nd", "2d"):
        lname = add_dash_to_prefix(lname, suffix)
        fname = add_dash_to_prefix(fname, suffix)

    #And co handling
    for co_marker in (" & Co", " & co", " and Co", " and co"):
        fname = fname.replace(co_marker, "")
        lname = lname.replace(co_marker, "")

    #Deceased handling: flag the row and take the marker tokens out of the name.
    #(The previous condition `"dead" or ... in word` was always true and the
    #results of str.replace were thrown away.)
    first_words, last_words = fname.split(), lname.split()
    if any(_is_deceased_marker(word) for word in first_words + last_words):
        row["deceased?"] = True
        fname = " ".join(word for word in first_words if not _is_deceased_marker(word))
        lname = " ".join(word for word in last_words if not _is_deceased_marker(word))

    #State of/Heirs of/Estate of/Town of handling
    #use fuzz because there are misspellings of Estate of and Heirs of
    name_words = fname.split()
    if len(name_words) > 2:
        prefix = " ".join(name_words[:2]).lower()
        #take the best scoring descriptor, not the first one over its threshold:
        #"estate of" also scores >= 88 against "state of"
        best_score, best_replacement, matched = 0, None, False
        for keyword, threshold, replacement in NAME_DESCRIPTORS:
            score = fuzz.ratio(prefix, keyword)
            if score >= threshold and score > best_score:
                best_score, best_replacement, matched = score, replacement, True
        if matched:
            remainder = name_words[2:]
            if best_replacement is not None:
                #State of/Town of: first name "<thing>", last name "<name>"
                lname = "-".join(remainder)
                fname = best_replacement
            else:
                #Estate of/Heirs of: keep only the person's name; the splitting
                #rules below spread it over first/last name when lname is empty
                fname = " ".join(remainder)

    if " or " in row["to whom due | last name"]: continue

    if len(fname.split()) == 1 and lname == "": lname = "undefined" # if there is no last name, make it undefined

    if len(fname.split()) == 2:
        #drop the dot - for example: "James F." -> "James F"
        if len(fname.replace(".", "").split()[1]) == 1: fname = fname.split()[0] # if middle initial in fname, drop it
        elif len(fname.split()[1]) >= 3: # usually means 2 names
            fname = fname.replace(" ", "-")
            if len(lname.split()) == 0: #usually means that first and last names are put into just first name column
                lname = fname.split("-")[1]
                fname = fname.split("-")[0]
    #Do the same above for the last name
    if len(lname.split()) == 2:
        if len(lname.replace(".", "").split()[1]) == 1: lname = lname.split()[0] # if initial in lname, drop it
        elif len(lname.split()[1]) >= 3: lname = lname.replace(" ", "-") # usually means 2 names
    if fname == "" and lname == "": continue # Drop ones with no name data

    if ('&' in fname) or (' and' in fname) or ('|' in fname):
        #Means there is co-ownership: one output row per co-owner
        sepr = ""
        if ("&" in fname): sepr = "&"
        if (" and" in fname): sepr = " and" #same token as the test above, so "Alexander" is not cut at its "and"
        if ("|" in fname): sepr = "|"
        #separate loop variable: reusing i here overwrote the manual-review counter
        for co_owner in fname.split(sepr):
            row["to whom due | first name"] = co_owner.strip()
            row["to whom due | last name"] = "undefined"
            agg_df.loc[len(agg_df.index)] = row
        continue
    if len(fname.split()) != 1 or len(lname.split()) != 1:
        #Still not one token each: use a stored manual correction or ask the user
        i += 1
        print(f"First: {fname}, Last: {lname}, Title: {title}")
        correction = retrieve_manual_correction(title + " " + fname + " " + lname)
        if correction is not None:
            title_new, first_new, last_new, drop = correction
            if drop: continue
            row["to whom due | title"] = title_new
            row["to whom due | first name"] = first_new
            row["to whom due | last name"] = last_new
            agg_df.loc[len(agg_df.index)] = row
        else:
            #NOTE(review): a correction entered here is only saved; the row itself
            #is not added to agg_df during this run - confirm that is intended
            new = input(f"In {row['org_file']}, First: {fname}, Last: {lname}, Title: {title} >>> ")
            if new == "s":
                #stop processing
                break
            elif new == "k":
                #keep the name as one dashed token
                fullname = title + " " + fname + " " + lname
                save_manual_correction(fullname, fullname.replace(" ", "-"), False)
            elif new == "":
                print("dropping")
                save_manual_correction(title + " " + fname + " " + lname, " ", True)
            else:
                split = new.split()
                nt, nf, nl = "", "", ""
                if len(split) == 2:
                    nf, nl = split
                if len(split) == 3:
                    nt, nf, nl = split
                save_manual_correction(title + " " + fname + " " + lname, nt + " " + nf + " " + nl, False)
    else:
        #write the cleaned names back; before, the untouched original row was stored
        row["to whom due | first name"] = fname
        row["to whom due | last name"] = lname
        agg_df.loc[len(agg_df.index)] = row
print(i)

First: Arch William Yard, Last: , Title: 


KeyboardInterrupt: 

In [26]:
#Gather the lemma names of every wordnet synset of "deceased" and show the distinct ones
synonyms = [lemma.name() for synset in wordnet.synsets("deceased") for lemma in synset.lemmas()]
print (set(synonyms))

{'croak', 'go', 'pop_off', 'kick_the_bucket', 'choke', 'asleep', 'conk', 'decease', "cash_in_one's_chips", 'snuff_it', 'departed', 'give-up_the_ghost', 'buy_the_farm', 'exit', 'die', 'at_peace', 'perish', 'expire', 'pass_away', 'deceased_person', 'at_rest', 'gone', 'pass', 'decedent', 'drop_dead', 'dead_soul', 'deceased', 'dead_person'}


In [None]:
#Loop that covers Objective 1 (aggregate data files)
#save the name
def element_to_int(ele):
    """Converts a cell value to the nearest integer, treating missing values as 0

    Args:
        ele: number or numeric string; None, NaN (python or numpy float), ""
            and "nan" count as missing

    Returns:
        int: round() of the numeric value, or 0 for a missing value

    Raises:
        ValueError: if ele is a non-blank string that is not a number
    """
    if ele is None:
        return 0
    if isinstance(ele, str):
        ele = ele.strip()
        if ele == "" or ele.lower() == "nan":
            return 0
    value = float(ele)
    # NaN never compares equal to itself, so it has to be detected explicitly
    # and before round(), which raises on NaN
    if np.isnan(value):
        return 0
    return round(value)

def _is_blank(cell):
    """Returns True for a missing cell: None, NaN, "" or the text "nan" """
    if cell is None:
        return True
    text = str(cell).strip().lower()
    return text == "" or text == "nan"

def _split_amount(cell):
    """Splits an amount such as 12.45 into (12, 45); the digits after the point are read as 90ths"""
    whole, _, fraction = str(cell).strip().partition(".")
    return (int(whole) if whole else 0, int(fraction) if fraction else 0)

def _amount_parts(row):
    """Returns (dollars, 90ths) for a row

    Uses "amount | dollars" (plus "amount | 90th" when the dollar cell has no
    fractional digits of its own) and falls back to
    "amount in specie | dollars" when the dollar cell is blank.
    """
    if _is_blank(row["amount | dollars"]):
        specie = row["amount in specie | dollars"]
        return (0, 0) if _is_blank(specie) else _split_amount(specie)
    dollars, ninetieths = _split_amount(row["amount | dollars"])
    if ninetieths == 0 and not _is_blank(row["amount | 90th"]):
        ninetieths = element_to_int(row["amount | 90th"])
    return dollars, ninetieths

def get_dollar(row):
    """Returns the row's amount as a float written "<dollars>.<90ths>"

    The 90ths are zero padded to two digits so that 5/90 (x.05) and 50/90
    (x.5) stay distinguishable. The value is a label, not a decimal number:
    do not add these floats together, sum (dollars, 90ths) pairs instead.

    Args:
        row: mapping/pandas series with the amount columns

    Returns:
        float: dollars before the point, 90ths after it
    """
    dollars, ninetieths = _amount_parts(row)
    return float(f"{dollars}.{ninetieths:02d}")

def _append_run(frame, run_rows):
    """Appends one row to frame for a run of consecutive rows with the same name

    A single row is appended unchanged. For a longer run the last row is
    appended with the run's total amount (carried over at 90 ninetieths per
    dollar) and the last non-blank title seen inside the run.
    """
    if not run_rows:
        return
    merged = run_rows[-1]
    if len(run_rows) > 1:
        total = 0 # in 90ths, integer arithmetic avoids float rounding
        run_title = ""
        for member in run_rows:
            dollars, ninetieths = _amount_parts(member)
            print(f"adding {dollars}.{ninetieths:02d} to {str(member['to whom due | first name']).strip()} {str(member['to whom due | last name']).strip()}'s total")
            total += dollars * 90 + ninetieths
            member_title = str(member["to whom due | title"]).strip()
            if member_title != "" and member_title.lower() != "nan":
                run_title = member_title
        dollars, ninetieths = divmod(total, 90)
        print(f"{merged['to whom due | first name']} {merged['to whom due | last name']} is consecutively owed {dollars}.{ninetieths:02d}")
        merged["amount | dollars"] = dollars
        merged["amount | 90th"] = ninetieths
        merged["to whom due | title"] = run_title
    frame.loc[len(frame.index)] = merged

agg_df = pd.DataFrame(columns=og_df.columns)
last_f, last_l = "", ""
last_row = None
#rows of the run (consecutive rows sharing first and last name) being collected
run_rows = []
for index, row in og_df.iterrows():
    fname, lname = str(row["to whom due | first name"]).strip(), str(row["to whom due | last name"]).strip()
    if run_rows and fname == last_f and lname == last_l:
        run_rows.append(row)
    else:
        #name changed: the previous run has ended, store it and start a new one
        _append_run(agg_df, run_rows)
        run_rows = [row]
    last_f, last_l = fname, lname
    last_row = row
#the loop only stores a run when the next name shows up, so store the final one here
_append_run(agg_df, run_rows)