# Experimentation with fuzzy text matching
Many sessions with Claude have convinced me that the only way to match companies between the AFSC (or other) lists and the SEC data is through *text matching* of company names. A single company's stock comes with too many numbers, exchanges, and share classes to match reliably on anything else.

## author:
- **David W. Hogg** (NYU)

## notes:
- Here's one relevant conversation with Claude: https://claude.ai/share/bde7bc8b-f8d5-40b6-aa0c-e5187dc314a5

## bugs:
- The matching method and the score threshold are just set by hand.

In [None]:
# !pip install rapidfuzz

In [None]:
import os
import re
import pandas as pd
from rapidfuzz import process, fuzz

In [None]:
# Directory that holds the input CSV files, given relative to the current working directory.
DATA_DIR = os.path.expanduser("../data")
# Similarity function handed to rapidfuzz's `process.extractOne` in the cells below.
scorer = fuzz.ratio  # hand-picked ("magic") choice, not tuned

In [None]:
# Toy smoke test of the matcher (originally written by Claude): look up each
# short name in the long list and report the best match if it scores well enough.
long_list = [
    "Apple Inc.",
    "Microsoft Corporation",
    "Alphabet Inc.",
]
short_list = [
    "Microsoft Corp",
    "apple incorporated",
]
for query in short_list:
    match, score, index = process.extractOne(query, long_list, scorer=scorer)
    if score < 50:  # tune this threshold
        continue
    print(f"{query} → {match} (score: {score})")

In [None]:
# Load the shortlist of companies to look for, and show which columns it has.
bad_list = pd.read_csv(
    os.path.join(DATA_DIR, "investigate_divestment_shortlist.csv")
)
print(bad_list.columns)

In [None]:
# Load the holdings of one fund and show which columns it has.
# Swap in one of the commented-out lines to analyze a different fund.
# holdings = pd.read_csv(os.path.join(DATA_DIR, "Nuveen_Large_Cap_Value_Fund_2025q3.csv"))
# holdings = pd.read_csv(os.path.join(DATA_DIR, "College_Retirement_Equities_Fund_-_Equity_Index_Account_2025q3.csv"))
holdings = pd.read_csv(
    os.path.join(
        DATA_DIR,
        "College_Retirement_Equities_Fund_-_Social_Choice_Account_2025q3.csv",
    )
)
print(holdings.columns)

In [None]:
def sanitize_name(name):
    """
    Normalize one company name before fuzzy matching.

    ## inputs:
    `name` - company name (string)

    ## output:
    the name after these substitutions, applied in this order:
    - a trailing "/The" is removed
    - a leading "The " is removed
    - a trailing ", The" is removed
    - every " Corporation" becomes " Corp"
    - every " Company" becomes " Co"
    """
    substitutions = (
        (r"/The$", ""),
        (r"^The ", ""),
        (r", The$", ""),
        (r" Corporation", " Corp"),
        (r" Company", " Co"),
    )
    cleaned = name
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

def sanitize_names(names):
    """
    Apply `sanitize_name` to every entry of a collection of company names.

    ## inputs:
    `names` - pandas Series or list (or other iterable) of company-name strings

    ## output:
    - for a pandas Series: a new Series with the same index and sanitized values
    - for anything else: a new list of sanitized names, in input order

    ## notes:
    - The input is never modified.
    - A Series is handled with `Series.map` instead of assigning `series[i]`
      inside an `enumerate` loop: the counter from `enumerate` is a position,
      but pandas can resolve an integer key against the index *labels*, so a
      Series whose index is not 0..n-1 would get wrong or extra entries.
    """
    if isinstance(names, pd.Series):
        return names.map(sanitize_name)
    return [sanitize_name(name) for name in names]

def compare_fund_to_list(fund, naughty, name_column="ISSUER_NAME", score_threshold=80):
    """
    Flag the holdings of a fund whose company names fuzzy-match a list of companies.

    ## inputs:
    `fund` - pandas data frame with column `name_column` containing company names
    `naughty` - list (or pandas Series) of company names for naughty (or, heck, good) companies
    `name_column` - name of the column of `fund` that holds the company names
    `score_threshold` - minimum score (as returned by the module-level `scorer`)
      for a match to be accepted; tune this threshold

    ## output:
    `new_fund` - copy of the data frame but with new boolean column `NAUGHTY`

    ## notes:
    - When in doubt or when no match, returns `False`.
    - Each listed company is compared to the fund names and only its
      best-scoring fund name is considered.
    - Every row that carries the matched (sanitized) name is flagged, not just
      the first one, so a company held through several rows is fully counted.
    - Rows are addressed by position, so the result does not depend on the
      labels in `fund.index`.
    - Prints one line per accepted match.

    ## example usage:
    `annotated_holdings = compare_fund_to_list(holdings, bad_company_list["Company Name"])`
    """
    # Work on plain lists so that everything below is purely positional.
    fund_names = sanitize_names(fund[name_column].tolist())

    # Map each distinct sanitized name to all row positions that carry it.
    # Dict order keeps first occurrences first, so ties between equally good
    # candidates are broken the same way as when matching the full list.
    rows_by_name = {}
    for position, name in enumerate(fund_names):
        rows_by_name.setdefault(name, []).append(position)
    candidates = list(rows_by_name)

    new_fund = fund.copy()
    new_fund["NAUGHTY"] = False
    naughty_column = new_fund.columns.get_loc("NAUGHTY")

    for query in sanitize_names(list(naughty)):
        best = process.extractOne(query, candidates, scorer=scorer)
        if best is None:  # nothing to compare against, e.g. an empty fund
            continue
        match, score, _ = best
        if score >= score_threshold:
            print(f"{query} → {match} (score: {score})")
            new_fund.iloc[rows_by_name[match], naughty_column] = True
    return new_fund

In [None]:
# Annotate the fund holdings with a NAUGHTY flag for companies on the shortlist.
foo = compare_fund_to_list(fund=holdings, naughty=bad_list["Company Name"])

In [None]:
# Keep only the flagged rows, then show their key columns.
bad_holdings = foo.loc[foo["NAUGHTY"]]
print(bad_holdings.loc[:, ["ISSUER_NAME", "PERCENTAGE", "NAUGHTY"]])

In [None]:
# Share of the fund's total PERCENTAGE that sits in the flagged holdings.
print(
    "fraction of fund that is naughty:",
    bad_holdings["PERCENTAGE"].sum() / holdings["PERCENTAGE"].sum(),
)