# Generate the Distance Matrices
For each name in the Stock Watchers data, we will compute a normalized Levenshtein similarity score against each name in the ProPublica data (matching House names only against House names, and Senate names only against Senate names). First we will clean up the names a bit.

In [55]:
import os
from dotenv import load_dotenv
import sys
import pandas as pd
import jellyfish

sys.path.append("../")

from resources.bitio_helper import download_dataset

# Load environment settings, then read the bit.io connection string and user name.
# Either value is None when its variable is not set.
load_dotenv()
PG_STRING = os.environ.get("BITIO_PG_STRING")
USERNAME = os.environ.get("BITIO_USERNAME")

In [3]:
# Fetch both name tables from the "bitdotio/congress-names" repository on bit.io.
propublica_data = download_dataset(
    '"bitdotio/congress-names"."propublica_names"',
    PG_STRING,
)
stockwatchers_data = download_dataset(
    '"bitdotio/congress-names"."stock_watchers_names"',
    PG_STRING,
)

In [104]:
# some basic cleaning
## remove missing name components from propub data
## (plain substring match; regex=False is explicit because the pandas default
## for this argument changed between major versions)
propublica_data["name_clean"] = propublica_data["name"].str.replace(" nothing", "", regex=False)

## remove honorifics, titles, etc. from SW data
## periods are escaped so that "." matches only a literal period; unescaped,
## '^Hon. ' would also strip the first word of a name such as "Hong ..."
replacements = '|'.join([r'^Mr\. ', r'^Mrs\. ', r'^Hon\. ', r'^None ', r' Honorable'])
stockwatchers_data["name_clean"] = stockwatchers_data["name"].str.replace(replacements, "", regex=True)

In [105]:

def _names_for_chamber(frame, chamber):
    """Return the rows of ``frame`` for ``chamber`` as a one-column frame
    holding the cleaned name under the column label "name"."""
    in_chamber = frame["chamber"] == chamber
    return frame.loc[in_chamber, ["name_clean"]].rename(columns={"name_clean": "name"})


prp_house = _names_for_chamber(propublica_data, "house")
sw_house = _names_for_chamber(stockwatchers_data, "house")

prp_senate = _names_for_chamber(propublica_data, "senate")
sw_senate = _names_for_chamber(stockwatchers_data, "senate")


In [116]:
# Plain Python lists of the ProPublica names, used as the match candidates.
prp_names_house = prp_house["name"].tolist()
prp_names_senate = prp_senate["name"].tolist()

In [119]:
# helper functions for scoring name similarity from the Levenshtein distance
def normalized_ld(name, candidate):
    """Return a normalized Levenshtein similarity between two strings.

    The edit distance is divided by the length of the longer string and the
    result is subtracted from 1. Despite the function's name, the value is
    therefore a similarity: identical strings score 1.0 and lower values mean
    a worse match.

    Two empty strings are treated as identical and score 1.0 (dividing by the
    longer length would otherwise raise ``ZeroDivisionError``).
    """
    longest = max(len(name), len(candidate))
    if longest == 0:
        # Both strings are empty: nothing to edit, so they match exactly.
        return 1.0
    distance = jellyfish.levenshtein_distance(name, candidate)
    return 1 - (distance / longest)

def ls(name, candidates):
    """Score ``name`` against every entry of ``candidates``.

    Returns a list, in candidate order, holding the value of
    ``normalized_ld(name, candidate)`` for each candidate.
    """
    return [normalized_ld(name, candidate) for candidate in candidates]

def make_ls_matrix(names, candidates):
    """Build a table of match scores between names and candidates.

    ``names`` is a dataframe with a "name" column and ``candidates`` is a
    list of candidate names. The result has one row per entry of ``names``
    and one column per candidate holding the score from ``ls``, preceded by
    three columns: "stock_watchers_name" (the name being matched),
    "closest_match" (the candidate with the highest score) and
    "closest_match_score" (that highest score). Rows are sorted by
    "closest_match_score" in ascending order, so the weakest matches come
    first.
    """
    def score_row(row):
        return ls(row["name"], candidates)

    scores = names.apply(score_row, axis=1, result_type="expand")
    scores.columns = candidates

    # Summarise each row while the frame still holds only candidate scores.
    best_candidate = scores.idxmax(axis=1)
    best_score = scores.max(axis=1)

    scores.insert(loc=0, column="stock_watchers_name", value=names["name"])
    scores.insert(loc=1, column="closest_match", value=best_candidate)
    scores.insert(loc=2, column="closest_match_score", value=best_score)

    return scores.sort_values("closest_match_score", axis=0)


In [120]:
# Score every Stock Watchers name against the ProPublica names of the same chamber.
house_mat = make_ls_matrix(sw_house, prp_names_house)
senate_mat = make_ls_matrix(sw_senate, prp_names_senate)

In [183]:
def top_n_matches(member, n=5):
    """Return the ``n`` best-scoring candidates for one row of a match matrix.

    Parameters
    ----------
    member : pandas.Series
        A single row of the matching matrix generated above: three leading
        summary entries followed by one score per candidate.
    n : int, default 5
        Number of candidates to keep. ``0`` gives an empty Series; a value
        larger than the number of candidates returns all of them.

    Returns
    -------
    pandas.Series
        The selected candidates and their scores, ordered from lowest to
        highest score so that the best match is last.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    if n < 0:
        raise ValueError("n must be non-negative")

    # The first three entries are the summary columns, not candidate scores.
    summary_cols = 3
    scores = member.iloc[summary_cols:]

    # Select by position rather than by label: duplicated candidate names
    # cannot multiply the returned rows, and n == 0 selects nothing (a plain
    # ``[-n:]`` slice would return every candidate).
    order = pd.to_numeric(scores).to_numpy().argsort(kind="stable")
    keep = order[len(order) - min(n, len(order)):]
    return scores.iloc[keep]


In [184]:
# Top five candidates for the second row of the sorted Senate matrix.
top_n_matches(senate_mat.iloc[1], n=5)

Richard M. Burr         0.230769
Shelley Moore Capito    0.230769
Mitt Romney             0.269231
Michael Bennet          0.346154
Mitch McConnell         0.538462
Name: 6, dtype: object

In [178]:
# Full second row of the sorted Senate matrix, for comparison with the top matches.
senate_mat.iloc[1, :]

stock_watchers_name    A. Mitchell Mcconnell, Jr.
closest_match                     Mitch McConnell
closest_match_score                      0.538462
Tammy Baldwin                            0.076923
John Barrasso                            0.076923
                                  ...            
Elizabeth Warren                         0.153846
Sheldon Whitehouse                       0.153846
Roger Wicker                             0.192308
Ron Wyden                                0.076923
Todd Young                               0.115385
Name: 6, Length: 105, dtype: object