# Matching Up the Data
In this notebook, I identify the relationships between all of the datasets. Many of these relationships depend on string matches, but the text data is inconsistent and contains many typos. For example, *'Assemblymember'* could be written as *'Assemblyman'*, *'Assemblywoman'*, *'A semblmember'*, and more. At the same time, because so many words and phrases are repeated, many strings appear to match when they should not. A simple string-distance algorithm might match 'Assemblymember David Chiu' with 'Assemblymember Dave Chu', which is not correct. Therefore, I use an approach that combines fuzzy string matching and regex with spaCy token similarity and entity linking to match the data.

In [1]:
import pandas as pd
import json
import re
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

In [2]:
bill_analysis = pd.read_csv('ca_leg/legislation_data/bill_analysis_tbl.csv')

  bill_analysis = pd.read_csv('ca_leg/legislation_data/bill_analysis_tbl.csv')


In [3]:
committee_codes = bill_analysis.loc[bill_analysis['committee_code'].notna(), ['committee_code', 'committee_name']].drop_duplicates()

In [4]:
committee_codes.to_csv('ca_leg/legislation_data/committee_codes.csv', index=False)

In [5]:
bills = pd.read_csv('ca_leg/legislation_data/bill_tbl.csv')

In [6]:
bill_history = pd.read_csv('ca_leg/legislation_data/bill_history_tbl.csv', dtype={'action_status': str, 'primary_location': str, 'secondary_location': str, 'end_status': str})

In [7]:
bill_versions = pd.read_csv('ca_leg/legislation_data/bill_version_tbl.csv')

In [8]:
# Load the bill-version JSON, then split it into author info and text fields.
with open("ca_leg/legislation_data/bill_version_text.json", "r") as f:
    bill_text = json.load(f)

# 'Authors' entry of every bill version that has one.
authors_data = {}
for bill_id, bill_info in bill_text.items():
    if 'Authors' in bill_info:
        authors_data[bill_id] = bill_info['Authors']

# (JSON key, record key) pairs, in the order the fields are stored on each record.
text_fields = (
    ('Title', 'title'),
    ('GeneralSubject', 'general_subject'),
    ('DigestText', 'digest_text'),
    ('BillContent', 'content'),
)
# Each record holds only the fields that are present for that bill version.
bill_text_data = {
    bill_id: {
        record_key: bill_info.get(json_key)
        for json_key, record_key in text_fields
        if json_key in bill_info
    }
    for bill_id, bill_info in bill_text.items()
}

In [9]:
list(set([a for b in [v.keys() for v in authors_data.values()] for a in b]))

['LEAD_AUTHOR', 'null', 'COAUTHOR', 'PRINCIPAL_COAUTHOR']

In [10]:
# Flatten {bill id: {author type: {house: author name}}} into one row per innermost entry.
# A house key of 'UNKNOWN' is recorded as "COMMITTEE".
records = [
    [bill_id, author_type, house_name if house_name != 'UNKNOWN' else "COMMITTEE", author_name]
    for bill_id, authors in authors_data.items()
    for author_type, house in authors.items()
    for house_name, author_name in house.items()
]

df = pd.DataFrame(records, columns=['bill_id', 'author_type', 'house', 'author_name'])
# Remove double underscores from the ids before joining on bill_version_id.
df['bill_id'] = df['bill_id'].map(lambda raw_id: raw_id.replace('__', ''))
combined = pd.merge(df, bill_versions, how='left', left_on='bill_id', right_on='bill_version_id')

In [11]:
# One row per bill version: its id followed by whichever text fields were extracted for it.
bill_text_records = [
    {'bill_id': bill_id, **text_info}
    for bill_id, text_info in bill_text_data.items()
]
bill_text_df = pd.DataFrame(bill_text_records)

In [12]:
bill_text_df.to_csv('ca_leg/legislation_data/bill_text.csv', index=False)

In [13]:
full = combined.loc[combined['bill_version_action'].notna()].merge(bills, left_on='bill_id_y', right_on='bill_id', how='left')

In [14]:
full.to_csv('ca_leg/legislation_data/combined_table.csv', index=False)

In [15]:
full = pd.read_csv('ca_leg/legislation_data/combined_table.csv')

  full = pd.read_csv('ca_leg/legislation_data/combined_table.csv')


In [16]:
full.columns

Index(['bill_id_x', 'author_type', 'house', 'author_name', 'bill_version_id',
       'bill_id_y', 'version_num', 'bill_version_action_date',
       'bill_version_action', 'request_num', 'subject', 'vote_required',
       'appropriation', 'fiscal_committee', 'local_program',
       'substantive_changes', 'urgency', 'taxlevy', 'bill_xml', 'year_x',
       'bill_id', 'session_year', 'session_num', 'measure_num',
       'measure_state', 'chapter_year', 'chapter_type', 'chapter_session_num',
       'chapter_num', 'latest_bill_version_id', 'current_location',
       'current_status', 'year_y'],
      dtype='object')

In [17]:
disclosure = pd.read_csv('calaccess/CVR_LOBBY_DISCLOSURE_CD.csv', dtype=str)

In [18]:
expenditure = pd.read_csv('calaccess/LEXP_CD.csv', dtype=str)

In [19]:
lobbying = disclosure[['FILING_ID', 'FIRM_NAME']].merge(expenditure, on='FILING_ID', how='inner')

In [20]:
# The format includes an AM/PM marker (%p), so the hour must be parsed with %I (12-hour clock).
# With %H the marker is ignored and e.g. "12:00:00 AM" is read as noon instead of midnight.
# Values that do not fit the format become NaT (errors='coerce').
lobbying['EXPN_DATE'] = pd.to_datetime(lobbying['EXPN_DATE'], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')

In [21]:
# Keep expenditures dated after 2001-01-01 that name a beneficiary or a beneficiary position.
# .copy() makes `lb` an independent frame: a `clean_beneficiary` column is added to it later,
# and assigning into a filtered slice is what pandas flags as chained assignment.
lb = lobbying.loc[
    (lobbying['EXPN_DATE'].notna())
    & (lobbying['EXPN_DATE'] > pd.to_datetime('2001-01-01', format='%Y-%m-%d'))
    & ((lobbying['BENE_NAME'].notna()) | (lobbying['BENE_POSIT'].notna()))
].copy()

In [22]:
assembly_committees = pd.read_csv('pdf_parsing/assembly_committees_clean.csv')
assembly_roster = pd.read_csv('pdf_parsing/assembly_roster.csv')

In [23]:
senate_committees = pd.read_csv('pdf_parsing/senate_committees_cleaned.csv')
senate_roster = pd.read_csv('pdf_parsing/senate_roster.csv')

In [24]:
# Surname: the part of 'Name' before the first comma, trimmed.
senate_roster['Last'] = senate_roster['Name'].str.split(',').str[0].map(lambda surname: surname.strip())


def _pages_to_term(pages):
    """Turn a 'a, b' pages value into the term label '<2000+a>-<2000+b>'."""
    parts = pages.split(',')
    return f"{2000 + int(parts[0].strip())}-{2000 + int(parts[1].strip())}"


senate_roster['Term'] = senate_roster['pages'].map(_pages_to_term)

In [25]:
def politician_table(committees, roster):
    """Join committee assignments to roster rows by legislator name and term.

    ``committees['politician']`` is handled in three groups:

    * names containing a comma are split at the FIRST comma into ``Last`` and
      ``First`` and left-joined to ``roster`` on ``Last``, ``First`` and
      ``Term`` (unmatched rows are kept with empty roster columns);
    * names containing a hyphen get their hyphens replaced by spaces and are
      inner-joined on ``Last`` and term;
    * all remaining names are inner-joined on ``politician`` == ``Last`` and term.

    A name containing both a comma and a hyphen is processed by both of the
    first two groups. Missing (NaN) names match neither test and fall into the
    third group.

    Args:
        committees: DataFrame with at least ``politician`` and ``term`` columns.
            It is not modified.
        roster: DataFrame with ``Last`` and ``Term`` columns, plus ``First``
            when any comma-separated names are present.

    Returns:
        The joined groups concatenated in the order plain, hyphenated, comma.
        The index is not reset.
    """
    names = committees['politician']
    has_comma = names.str.contains(',', regex=False, na=False)
    has_hyphen = names.str.contains('-', regex=False, na=False)

    plain = committees.loc[~has_comma & ~has_hyphen]
    plain_joined = plain.merge(roster, left_on=['politician', 'term'], right_on=['Last', 'Term'], how='inner')

    # Explicit copies: new columns are added below and must not touch `committees`.
    hyphens = committees.loc[has_hyphen].copy()
    hyphens['Last'] = hyphens['politician'].str.replace('-', ' ', regex=False)
    hyp = hyphens.merge(roster, left_on=['Last', 'term'], right_on=['Last', 'Term'], how='inner')

    frames = [plain_joined, hyp]

    doubles = committees.loc[has_comma].copy()
    if len(doubles) > 0:
        # n=1 keeps exactly two parts even when a name contains several commas.
        parts = doubles['politician'].str.split(',', n=1, expand=True)
        doubles['Last'] = parts[0].str.strip()
        doubles['First'] = parts[1].str.strip()
        doubles = doubles.rename(columns={'term': 'Term'})
        frames.append(doubles.merge(roster, on=['Last', 'First', 'Term'], how='left'))

    return pd.concat(frames)

In [26]:
senate_roster.loc[senate_roster['Name'] == 'Valladares, Suzette Martinez', ['Party', 'District No.', 'Seat No.']] = ['R', 23, 7140]
senate_roster.loc[senate_roster['Name'] == 'Weber Pierson, Dr Akilah', ['Party', 'District No.', 'Seat No.']] = ['D', 39, 7310]
senate_roster.loc[senate_roster['Name'] == 'Eggman, Susan Talamantes', ['Party', 'District No.', 'Seat No.']] = ['D', 5, 8530]

In [27]:
assembly = politician_table(assembly_committees, assembly_roster).rename(columns={'name': 'full_name'})
senate = politician_table(senate_committees, senate_roster).rename(columns={'Name': 'full_name'})

In [28]:
assembly['chamber'] = 'assembly'
senate['chamber'] = 'senate'

In [29]:
politicians = pd.concat([assembly[['committee_clean', 'position', 'Occupation', 'Party', 'District No.', 'Seat No.', 'Term', 'Last', 'full_name', 'chamber']], senate[['committee_clean', 'position', 'Occupation', 'Party', 'District No.', 'Seat No.', 'Term', 'Last', 'full_name', 'chamber']]])

In [30]:
missing = politicians.loc[politicians['full_name'].isna()]

In [31]:
# Recover full names for rows whose `full_name` is still empty.
# Candidate (Term, Last, chamber, First) combinations are taken from the assembly table only.
miss = assembly[['Term', 'Last', 'chamber', 'First']].merge(missing[['Term', 'Last', 'chamber']], on=['Last', 'Term', 'chamber']).drop_duplicates()
# Rebuild the name as "Last, First".
miss['full_name'] = miss['Last'] + ', ' + miss['First']
# Attach Seat No. so that it can be part of the lookup key.
mi = miss.merge(assembly[['Term', 'Last', 'chamber', 'First', 'Seat No.']], on=['Term', 'Last', 'chamber', 'First'], how='left')
# (Term, Last, chamber, Seat No.) -> rebuilt full name
mis = {(row['Term'], row['Last'], row['chamber'], row['Seat No.']): row['full_name'] for _, row in mi.iterrows()}
# Fill the gaps in the combined table; keys absent from `mis` stay empty (dict.get returns None).
politicians.loc[politicians['full_name'].isna(), 'full_name'] = politicians.loc[politicians['full_name'].isna()].apply(lambda x: mis.get((x['Term'], x['Last'], x['chamber'], x['Seat No.'])), axis=1)
# Apply the same fill to the assembly table.
assembly.loc[assembly['full_name'].isna(), 'full_name'] = assembly.loc[assembly['full_name'].isna()].apply(
	lambda x: mis.get((x['Term'], x['Last'], x['chamber'], x['Seat No.']), None), axis=1
)

In [32]:
politicians.to_csv('ca_leg/legislation_data/politicians.csv', index=False)

In [33]:
po = pd.read_csv('ca_leg/legislation_data/politicians.csv')

In [34]:
import spacy
from rapidfuzz import fuzz, process
from unidecode import unidecode

In [35]:
def clean_text(text):
    """Lower-case and trim ``text``, pass it through ``unidecode``, and drop punctuation.

    Non-string input (``None``, or the float NaN pandas uses for missing
    values) yields an empty string instead of raising.
    """
    if not isinstance(text, str):
        return ''
    text = unidecode(text.lower().strip())
    # Keep only word characters and whitespace.
    return re.sub(r'[^\w\s]', '', text)

In [52]:
assembly.loc[assembly['politician'] == 'Calderon']

Unnamed: 0,term,politician,committee,position,committee_clean,Occupation,Party,District No.,Seat No.,Term,First,Last,Position,full_name,chamber
70,2001-2002,Calderon,Select Committee on the San Gabriel Valley Gro...,Chair,select committee on the san gabriel valley gro...,Legislator,D,58.0,61.0,2001-2002,Thomas M,Calderon,,"Calderon, Thomas M",assembly
71,2001-2002,Calderon,Education,Member,education,Legislator,D,58.0,61.0,2001-2002,Thomas M,Calderon,,"Calderon, Thomas M",assembly
72,2001-2002,Calderon,Governmental Organization,Member,governmental organization,Legislator,D,58.0,61.0,2001-2002,Thomas M,Calderon,,"Calderon, Thomas M",assembly
73,2001-2002,Calderon,Insurance,Member,insurance,Legislator,D,58.0,61.0,2001-2002,Thomas M,Calderon,,"Calderon, Thomas M",assembly
74,2001-2002,Calderon,Utilities and Commerce,Member,utilities and commerce,Legislator,D,58.0,61.0,2001-2002,Thomas M,Calderon,,"Calderon, Thomas M",assembly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9424,2025-2026,Calderon,Insurance,Chair,insurance,Businesswoman,D,56.0,9.0,2025-2026,Lisa,Calderon,,"Calderon, Lisa",assembly
9425,2025-2026,Calderon,Appropriations,Member,appropriations,Businesswoman,D,56.0,9.0,2025-2026,Lisa,Calderon,,"Calderon, Lisa",assembly
9426,2025-2026,Calderon,Emergency Management,Member,joint committee on emergency management,Businesswoman,D,56.0,9.0,2025-2026,Lisa,Calderon,,"Calderon, Lisa",assembly
9427,2025-2026,Calderon,Human Services,Member,human services,Businesswoman,D,56.0,9.0,2025-2026,Lisa,Calderon,,"Calderon, Lisa",assembly


In [56]:
# Fill `full_name` values that are floats (how missing values read back from the CSV).
# Lookup order: assembly rows with the same surname AND term, then any assembly row with the
# same surname. Surnames found nowhere are printed.
# NOTE(review): the surname-only fallback can pick a different person who shares the surname.
missing_mask = po['full_name'].apply(lambda value: isinstance(value, float))
missing_keys = po.loc[missing_mask, ['Term', 'Last', 'chamber']].drop_duplicates()

for _, key in missing_keys.iterrows():
    term, last = key['Term'], key['Last']
    same_surname = assembly.loc[assembly['politician'] == last]
    same_surname_and_term = same_surname.loc[same_surname['term'] == term]
    source = same_surname_and_term if len(same_surname_and_term) > 0 else same_surname
    if len(source) == 0:
        print(last, term)
        continue
    target_rows = (po['Term'] == term) & (po['Last'] == last) & (po['chamber'] == key['chamber'])
    po.loc[target_rows, 'full_name'] = source['full_name'].values[0]

In [57]:
# Distinct (name, position) pairs to match. Selecting the two columns first avoids copying all of `lb`.
df_lob = lb[['BENE_NAME', 'BENE_POSIT']].drop_duplicates()
# Missing names are dropped before cleaning, and names that clean to an empty string are
# discarded, so that no blank entity ends up in the lookup tables.
df_legislators = [name for name in po['full_name'].dropna().drop_duplicates().apply(clean_text) if name]
df_committees = [name for name in po['committee_clean'].dropna().drop_duplicates().apply(clean_text) if name]

In [58]:
# Give every cleaned name a synthetic id: LEG_<n> for legislators, COM_<n> for committees.
# NOTE(review): both groups share one dict, so a committee name identical to a legislator
# name would overwrite that legislator's entry.
entity_ids = {name: f"LEG_{i}" for i, name in enumerate(df_legislators)}
entity_ids.update({name: f"COM_{i}" for i, name in enumerate(df_committees)})

In [59]:
nlp = spacy.load("en_core_web_md")

In [60]:
def clean_text(text):
    """Normalise a name or title for matching.

    The text is passed through ``unidecode``, upper-cased, punctuation is
    replaced by spaces and runs of whitespace are collapsed. One leading
    title (e.g. SENATOR, ASSEMBLYMEMBER, ASSEMBLYWOMAN, ASSEMBLY MEMBER,
    COMMITTEE ON) and one trailing suffix (JR, SR, II, III, IV, MD, PHD, ESQ)
    are then removed.

    Returns an empty string for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = unidecode(text).upper()
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # "ASSEMBLY MEMBER" is listed before "ASSEMBLY" so that "ASSEMBLY MEMBER X" is not
    # reduced to "MEMBER X"; ASSEMBLYWOMAN is handled like ASSEMBLYMAN / ASSEMBLYMEMBER.
    prefixes = r'^(HON|HONORABLE|REP|REPRESENTATIVE|SEN|SENATOR|ASSEMBLY\s+MEMBER|ASSEMBLY|ASSEMBLYMAN|ASSEMBLYWOMAN|ASSEMBLYMEMBER|COMMITTEE\s+ON|THE|STAFF\s+OF|OFFICE\s+OF)\s+'
    text = re.sub(prefixes, '', text, flags=re.IGNORECASE)
    suffixes = r'(\s+JR|\s+SR|\s+III|\s+II|\s+IV|\s+MD|\s+PHD|\s+ESQ)$'
    text = re.sub(suffixes, '', text, flags=re.IGNORECASE)
    return text

def preprocess_entity_ids(entity_ids):
    """Build the lookup structures used by the matching steps.

    Args:
        entity_ids: mapping of entity name -> id. Ids starting with ``LEG_``
            are typed as legislators, every other id as a committee.

    Returns:
        dict with the keys
        * ``name_mapping``: cleaned name -> {'original', 'id', 'type'};
        * ``processed_entities``: original name -> cleaned name, token set,
          entity type and id;
        * ``ngram_index``: token of 3+ characters, or pair of adjacent tokens
          joined by a space -> list of original names containing it;
        * ``legislator_names`` / ``committee_names``: original names whose id
          starts with ``LEG_`` / ``COM_``.
    """
    name_mapping, processed_entities, ngram_index = {}, {}, {}
    legislator_names, committee_names = [], []

    for original_name, entity_id in entity_ids.items():
        is_legislator = entity_id.startswith("LEG_")
        entity_type = 'legislator' if is_legislator else 'committee'
        if is_legislator:
            legislator_names.append(original_name)
        elif entity_id.startswith("COM_"):
            committee_names.append(original_name)

        clean_name = clean_text(original_name)
        tokens = clean_name.split()

        name_mapping[clean_name] = {'original': original_name, 'id': entity_id, 'type': entity_type}
        processed_entities[original_name] = {
            'clean_name': clean_name,
            'tokens': set(tokens),
            'entity_type': entity_type,
            'id': entity_id,
        }

        # Index keys for this entity: long-enough single tokens first, then adjacent pairs.
        index_keys = [token for token in tokens if len(token) >= 3]
        index_keys.extend(f"{left} {right}" for left, right in zip(tokens, tokens[1:]))
        for index_key in index_keys:
            ngram_index.setdefault(index_key, []).append(original_name)

    return {
        'name_mapping': name_mapping,
        'processed_entities': processed_entities,
        'ngram_index': ngram_index,
        'legislator_names': legislator_names,
        'committee_names': committee_names,
    }

def get_candidates_by_ngrams(text, ngram_index):
    """Rank indexed entity names by how many tokens and token pairs they share with ``text``.

    Every token of at least three characters found in ``ngram_index`` adds 1
    to each name listed under it; every pair of adjacent tokens found in the
    index adds 3.

    Returns:
        List of ``(name, score)`` tuples, highest score first. Ties keep the
        order in which the names were first encountered.
    """
    tokens = clean_text(text).split()

    # (index key, weight) in lookup order: single tokens first, then adjacent pairs.
    weighted_keys = [(token, 1) for token in tokens if len(token) >= 3]
    weighted_keys.extend((f"{left} {right}", 3) for left, right in zip(tokens, tokens[1:]))

    scores = {}
    for index_key, weight in weighted_keys:
        for candidate in ngram_index.get(index_key, ()):
            scores[candidate] = scores.get(candidate, 0) + weight

    return sorted(scores.items(), key=lambda item: item[1], reverse=True)

In [61]:
def calculate_similarity_score(text1, text2):
    """Score how alike two texts are after cleaning, on a 0-100 scale.

    Returns 0 when either cleaned text is empty and 100 when they are
    identical. Otherwise the score is the sum of

    * token-set Jaccard similarity x 25,
    * 30 if one cleaned text is a substring of the other,
    * ``fuzz.token_set_ratio`` x 0.65,

    capped at 100.
    """
    first, second = clean_text(text1), clean_text(text2)

    if not (first and second):
        return 0
    if first == second:
        return 100

    first_tokens, second_tokens = set(first.split()), set(second.split())
    shared = len(first_tokens & second_tokens)
    union_size = len(first_tokens) + len(second_tokens) - shared
    jaccard = shared / union_size if union_size > 0 else 0

    containment_bonus = 30 if (first in second or second in first) else 0
    fuzzy_component = fuzz.token_set_ratio(first, second) * 0.65

    return min((jaccard * 25) + containment_bonus + fuzzy_component, 100)

def extract_referenced_names(position_text):
    """Pull person names that a position/title string refers to.

    A set of regular expressions is tried first (e.g. "staff to Senator X",
    "Senator X's office", "Senator X"). Captured names that contain the words
    'staff', 'office' or 'committee' are discarded. Only when the patterns
    yield nothing is the spaCy pipeline ``nlp`` used, keeping its PERSON
    entities.

    Args:
        position_text: free-text title; anything that is not a non-empty
            string yields an empty list.

    Returns:
        List of distinct names in first-seen order, sliced from the original
        (not lower-cased) text.
    """
    if not position_text or not isinstance(position_text, str):
        return []
    referenced_names = []
    position_lower = position_text.lower()
    excluded_terms = ('staff', 'office', 'committee')
    # bulk regex search
    patterns = [
        r'(?:staff|aide|assist\w*|chief|counsel|direct\w*)(?:\s+\w+)?\s+(?:to|for|of|with)\s+(?:sen\w*|rep\w*|assembl\w*|congress\w*)?\s+([A-Za-z\s\.\-]+?)(?:$|,|\s+\(|\s+[A-Z]{2})',
        r'(?:sen\w*|rep\w*|assembl\w*|congress\w*)\s+([A-Za-z\s\.\-]+?)(?:\'s?)?\s+(?:staff|office|aide|assist\w*|chief)',
        r'(?:office|staff)\s+(?:of|for)\s+(?:sen\w*|rep\w*|assembl\w*|congress\w*)?\s+([A-Za-z\s\.\-]+?)(?:$|,|\s+\(|\s+[A-Z]{2})',
        r'(?:sen\w*|rep\w*|assembl\w*|congress\w*)\s+([A-Za-z\s\.\-]{2,30})(?:$|,|\s+\(|\s+[A-Z]{2})',
        r'\b([A-Za-z\s\.\-]{2,30})\s+\([A-Z]{2}\)',
        r'\b(?:senator|representative|congressman|chairperson|chairman|assembl\w*)\s+([A-Za-z\s\.\-]{2,30})\b'
    ]
    for pattern in patterns:
        for match in re.finditer(pattern, position_lower, re.IGNORECASE):
            name = match.group(1).strip()
            if not name or len(name) <= 2:
                continue
            # Reject captures that contain ANY excluded word. The earlier
            # `any(term not in name ...)` test only rejected a capture that
            # contained all three words at once.
            if any(term in name.lower() for term in excluded_terms):
                continue
            # Recover the original casing by slicing the untouched text at the same span.
            start, end = match.span(1)
            original_case = position_text[start:end].strip()
            if original_case and len(original_case) > 2 and original_case not in referenced_names:
                referenced_names.append(original_case)

    if not referenced_names:  # NER if no matches found
        try:
            doc = nlp(position_text)
            for ent in doc.ents:
                if ent.label_ == "PERSON" and len(ent.text) > 2:
                    if ent.text not in referenced_names:
                        referenced_names.append(ent.text)
        except Exception as exc:
            # Best effort: a failing NER pass must not stop the matching run, but it
            # should not go unnoticed either.
            warnings.warn(f"NER fallback failed for {position_text!r}: {exc}")
    return referenced_names

In [62]:
def extract_last_names_from_politicians(politicians_df):
    """Index politicians by cleaned last name.

    The last name is the text before the first comma of ``full_name`` or,
    when there is no comma, the final whitespace-separated word.

    Rows whose ``full_name`` is missing or blank are skipped, and each
    ``(full_name, chamber)`` pair is recorded only once even if the frame
    contains it on many rows.

    Args:
        politicians_df: DataFrame with ``full_name`` and ``chamber`` columns.

    Returns:
        dict mapping cleaned last name -> list of
        ``{'full_name', 'chamber', 'last_name'}`` dicts in first-seen order.
    """
    last_names_mapping = {}
    seen = set()

    for full_name, chamber in zip(politicians_df['full_name'], politicians_df['chamber']):
        # NaN (float) or empty names cannot be split into a last name.
        if not isinstance(full_name, str) or not full_name.strip():
            continue
        if (full_name, chamber) in seen:
            continue
        seen.add((full_name, chamber))

        if ',' in full_name:
            last_name = full_name.split(',')[0].strip()
        else:
            last_name = full_name.split()[-1].strip()

        last_names_mapping.setdefault(clean_text(last_name), []).append({
            'full_name': full_name,
            'chamber': chamber,
            'last_name': last_name
        })

    return last_names_mapping

def search_by_last_name(position_text, last_names_mapping):
    """Find politicians referred to as '<chamber title> <word>' in a position string.

    The word directly after a senate-style or assembly-style title is cleaned
    and looked up in ``last_names_mapping``; only politicians of the matching
    chamber are returned.

    Args:
        position_text: free-text title; anything that is not a non-empty
            string yields an empty list.
        last_names_mapping: cleaned last name -> list of dicts with at least
            ``full_name`` and ``chamber``.

    Returns:
        List of ``{'full_name', 'match_type', 'extracted_name'}`` dicts, senate
        matches first, then assembly matches.
    """
    if not isinstance(position_text, str) or not position_text:
        return []

    # (chamber, match_type label, title patterns capturing the following word)
    chamber_rules = (
        ('senate', 'senate_last_name', (
            r'senate?\s+(\w+)',
            r'sen\.\s+(\w+)',
            r'senator\s+(\w+)',
        )),
        ('assembly', 'assembly_last_name', (
            r'assembly\s*member?\s+(\w+)',
            r'assemblym[ae]n\s+(\w+)',
            r'assemblywoman\s+(\w+)',
        )),
    )

    lowered = position_text.lower()
    matches = []
    for chamber, match_type, title_patterns in chamber_rules:
        for title_pattern in title_patterns:
            for found in re.finditer(title_pattern, lowered, re.IGNORECASE):
                extracted = found.group(1)
                for politician in last_names_mapping.get(clean_text(extracted), ()):
                    if politician['chamber'] != chamber:
                        continue
                    matches.append({
                        'full_name': politician['full_name'],
                        'match_type': match_type,
                        'extracted_name': extracted
                    })
    return matches

In [63]:
# Match every distinct (BENE_NAME, BENE_POSIT) pair to a legislator or committee.
# Steps, stopping at the first hit:
#   1. exact match of the cleaned name / position against the cleaned entity names
#   2. exact match of a name referenced inside the position or name text
#   3. "<chamber title> <last name>" lookup in the position text
#   4. n-gram candidate retrieval followed by a composite similarity score
#   5. direct fuzzy match (token_sort_ratio) against the relevant entity pool
result_df = df_lob.copy()
entity_data = preprocess_entity_ids(entity_ids)
name_mapping = entity_data['name_mapping']
ngram_index = entity_data['ngram_index']
legislator_names = entity_data['legislator_names']
committee_names = entity_data['committee_names']

last_names_mapping = extract_last_names_from_politicians(po)

GENERIC_LEGISLATOR_TITLES = ['assemblymember', 'senator', 'assemblyman', 'assemblywoman', 'representative', 'rep', 'sen']
GOV_DEPT_INDICATORS = ['ca department', 'ca dept', 'california department', 'california dept','department of', 'dept. of', 'dept of', 'agency', 'bureau', 'division of', 'state of california', 'state board', 'state commission']
LEGISLATOR_TERMS = ['senator', 'representative', 'rep ', 'sen ', 'assemblymember', 'assemblyman', 'assemblywoman', 'assembly member']
COMMITTEE_TERMS = ['committee', 'commission', 'board', 'task force', 'caucus']
RESULT_COLUMNS = ['MATCHED_NAME', 'ENTITY_ID', 'ENTITY_TYPE', 'MATCH_METHOD', 'CONFIDENCE']


def _entity_pool(names):
    """Bundle a list of entity names with its cleaned forms and a set for fast membership tests."""
    return {
        'names': names,
        'clean': [clean_text(name) for name in names],
        'lookup': set(names),
    }


# Built once: step 5 compares cleaned text with cleaned text, and step 4 filters by set lookup.
ENTITY_POOLS = {
    'legislator': _entity_pool(legislator_names),
    'committee': _entity_pool(committee_names),
    'all': _entity_pool(list(entity_ids.keys())),
}

for column in RESULT_COLUMNS:
    result_df[column] = None


def _record_match(idx, matched_name, method, confidence):
    """Write one match into result_df; the id and type are derived from entity_ids."""
    entity_id = entity_ids[matched_name]
    result_df.at[idx, 'MATCHED_NAME'] = matched_name
    result_df.at[idx, 'ENTITY_ID'] = entity_id
    result_df.at[idx, 'ENTITY_TYPE'] = 'legislator' if entity_id.startswith('LEG_') else 'committee'
    result_df.at[idx, 'MATCH_METHOD'] = method
    result_df.at[idx, 'CONFIDENCE'] = confidence


for idx, row in tqdm(result_df.iterrows(), total=len(result_df)):
    bene_name = str(row['BENE_NAME']) if pd.notna(row['BENE_NAME']) else ""
    bene_position = str(row['BENE_POSIT']) if pd.notna(row['BENE_POSIT']) else ""
    if not bene_name.strip() and not bene_position.strip():
        continue
    combined_text = f"{bene_name} {bene_position}".lower()
    # Government departments and agencies are never matched.
    if any(indicator in combined_text for indicator in GOV_DEPT_INDICATORS):
        continue

    # A position that is only a bare title ("Senator") carries no name information.
    position_is_generic = bene_position.strip().lower() in GENERIC_LEGISLATOR_TITLES
    fields = [('BENE_NAME', bene_name),
              ('BENE_POSIT', bene_position if not position_is_generic else "")]

    # Step 1: exact match on the cleaned field.
    matched = False
    for field_name, field_value in fields:
        if not field_value.strip():
            continue
        clean_value = clean_text(field_value)
        if clean_value in name_mapping:
            _record_match(idx, name_mapping[clean_value]['original'], f'exact_{field_name.lower()}', 'high')
            matched = True
            break
    if matched:
        continue

    # Step 2: exact match on a name referenced inside the text.
    all_referenced_names = []
    if not position_is_generic:
        all_referenced_names.extend(extract_referenced_names(bene_position))
    all_referenced_names.extend(extract_referenced_names(bene_name))

    for ref_name in all_referenced_names:
        clean_ref = clean_text(ref_name)
        if clean_ref in name_mapping:
            _record_match(idx, name_mapping[clean_ref]['original'], 'reference_exact', 'high')
            matched = True
            break
    if matched:
        continue

    # Step 3: chamber title followed by a known last name; only the first hit is used.
    last_name_matches = search_by_last_name(bene_position, last_names_mapping)
    if last_name_matches:
        best_last_name_match = last_name_matches[0]
        clean_matched = clean_text(best_last_name_match['full_name'])
        if clean_matched in name_mapping:
            _record_match(idx, name_mapping[clean_matched]['original'], best_last_name_match['match_type'], 'medium')
            continue

    # Restrict the remaining steps to legislators or committees when the text says which it is.
    position_has_legislator = any(term in combined_text for term in LEGISLATOR_TERMS)
    position_has_committee = any(term in combined_text for term in COMMITTEE_TERMS)
    if position_has_legislator:
        pool = ENTITY_POOLS['legislator']
    elif position_has_committee:
        pool = ENTITY_POOLS['committee']
    else:
        pool = ENTITY_POOLS['all']
    if not pool['names']:
        # An empty restricted pool means no restriction at all.
        pool = ENTITY_POOLS['all']

    # Step 4: candidates sharing tokens with the name first, then with the position.
    candidates = []
    if bene_name.strip():
        candidates.extend(get_candidates_by_ngrams(bene_name, ngram_index))
    if bene_position.strip() and not position_is_generic:
        candidates.extend(get_candidates_by_ngrams(bene_position, ngram_index))

    # Keep the first occurrence of each candidate, preserving order.
    seen = set()
    unique_candidates = []
    for name, score in candidates:
        if name in seen:
            continue
        seen.add(name)
        unique_candidates.append((name, score))

    if pool is not ENTITY_POOLS['all']:
        unique_candidates = [(name, score) for name, score in unique_candidates if name in pool['lookup']]

    top_candidates = unique_candidates[:10]

    if top_candidates:
        best_match = None
        best_score = 0
        best_method = None

        for candidate_name, _ in top_candidates:
            if bene_name.strip():
                name_score = calculate_similarity_score(bene_name, candidate_name)
                if name_score > best_score and name_score >= 75:
                    best_score = name_score
                    best_match = candidate_name
                    best_method = "fuzzy_name"
            if bene_position.strip() and not position_is_generic:
                position_score = calculate_similarity_score(bene_position, candidate_name)
                if position_score > best_score and position_score >= 85:
                    best_score = position_score
                    best_match = candidate_name
                    best_method = "fuzzy_position"
            if bene_name.strip() and bene_position.strip():
                # Separate variable: `combined_text` above must keep its lower-cased value.
                name_and_position = f"{bene_name} {bene_position}"
                combined_score = calculate_similarity_score(name_and_position, candidate_name)
                if combined_score > best_score and combined_score >= 70:
                    best_score = combined_score
                    best_match = candidate_name
                    best_method = "fuzzy_combined"

        if best_match:
            confidence = "high" if best_score >= 90 else "medium" if best_score >= 75 else "low"
            _record_match(idx, best_match, best_method, confidence)
            continue

    # Step 5: direct fuzzy match. The query is cleaned, so it is compared with the cleaned
    # pool entries; the position returned by extractOne maps back to the original name.
    for field_name, field_value in fields:
        if not field_value.strip():
            continue

        clean_value = clean_text(field_value)
        if not clean_value:
            continue

        name_match = process.extractOne(
            clean_value,
            pool['clean'],
            scorer=fuzz.token_sort_ratio,
            score_cutoff=85
        )

        if name_match:
            # rapidfuzz returns (choice, score, index) for a list of choices.
            _, score, position = name_match
            _record_match(idx, pool['names'][position], f'direct_fuzzy_{field_name.lower()}', 'high' if score > 90 else 'medium')
            break

100%|██████████| 46309/46309 [02:55<00:00, 263.91it/s]


In [64]:
# (BENE_NAME, BENE_POSIT) -> MATCHED_NAME for every pair that received a confidence label.
# NOTE(review): pairs with a missing name or position carry NaN inside the tuple key, and a NaN
# key is only found again when the lookup supplies the very same NaN object — confirm that
# such rows resolve as intended.
ben_name_positions_dict = result_df.loc[result_df['CONFIDENCE'].notna(), ['BENE_NAME', 'BENE_POSIT', 'MATCHED_NAME']].set_index(['BENE_NAME', 'BENE_POSIT']).to_dict()['MATCHED_NAME']

In [65]:
lb['clean_beneficiary'] = lb[['BENE_NAME', 'BENE_POSIT']].apply(lambda x: ben_name_positions_dict.get(tuple(x), None), axis=1)


In [66]:
lb.loc[lb['clean_beneficiary'].notna()].to_csv('calaccess/lobbying_clean2.csv', index=False)

In [67]:
expenditure_assembly = pd.read_csv('calaccess/expenditure_assembly.csv')

In [68]:
# Year: the integer before the first '-' in DateRange.
expenditure_assembly['year'] = expenditure_assembly['DateRange'].map(lambda date_range: int(date_range.split('-')[0]))
# Two-year term label: an odd year opens its term, an even year closes the term opened the year before.
expenditure_assembly['term'] = expenditure_assembly['year'].map(
    lambda year: f"{year-1}-{year}" if year % 2 == 0 else f"{year}-{year+1}"
)

In [69]:
def match_names(_names, expenditure_df, cutoff=84):
    """Map each distinct ``TargetCandidateName`` to its best fuzzy match in ``_names``.

    Comparison is done on lower-cased strings with ``fuzz.token_sort_ratio``.

    Args:
        _names: iterable of reference names; non-string entries (e.g. NaN) are ignored.
        expenditure_df: DataFrame with a ``TargetCandidateName`` column;
            non-string values in it are skipped.
        cutoff: minimum score for a match to be accepted.

    Returns:
        dict mapping each distinct string target name to the matching entry of
        ``_names`` in its original casing, or ``None`` when nothing reaches ``cutoff``.
    """
    # Filter once and keep both lists aligned, so that the position returned by
    # extractOne always points at the right original name.
    reference_names = [name for name in _names if isinstance(name, str)]
    reference_lower = [name.lower() for name in reference_names]

    name_mapping = {}
    for title in expenditure_df['TargetCandidateName'].unique():
        if not isinstance(title, str):
            continue
        best_match = process.extractOne(
            title.lower(),
            reference_lower,
            scorer=fuzz.token_sort_ratio,
            score_cutoff=cutoff
        )
        # best_match is (choice, score, index) or None.
        name_mapping[title] = reference_names[best_match[2]] if best_match else None

    return name_mapping

# Map expenditure target names onto assembly roster names (cutoff 72).
# NOTE(review): this rebinds `name_mapping`, the name the beneficiary-matching cell also uses
# for its cleaned-name lookup; re-running cells out of order will mix the two.
name_mapping = match_names(assembly['full_name'].unique(), expenditure_assembly, cutoff=72)
expenditure_assembly['matched_target_name'] = expenditure_assembly['TargetCandidateName'].map(name_mapping)
# Attach the assembly rows for every expenditure whose target was matched.
merged_df = expenditure_assembly.merge(
    assembly,
    how='inner',
    left_on='matched_target_name',
    right_on='full_name',
)

In [70]:
# Explicit copy: the override below must modify exp_as itself, not a slice of expenditure_assembly.
exp_as = expenditure_assembly.loc[(expenditure_assembly['matched_target_name'].notna())].copy()
# Rows dated before 2000 are folded into year 2000 / term '2000-2001'.
# NOTE(review): the derived term labels are odd-even pairs such as '2001-2002' — confirm that
# '2000-2001' is the intended label here.
exp_as.loc[exp_as['year'] < 2000, ['year', 'term']] = [2000, '2000-2001']

In [71]:
exp_as.to_csv('calaccess/expend_assembly_matched.csv', index=False)

In [72]:
expenditure_senate = pd.read_csv('calaccess/expenditure_senator.csv')

In [73]:
# Year: the integer before the first '-' in DateRange.
expenditure_senate['year'] = expenditure_senate['DateRange'].map(lambda date_range: int(date_range.split('-')[0]))
# Two-year term label: an odd year opens its term, an even year closes the term opened the year before.
expenditure_senate['term'] = expenditure_senate['year'].map(
    lambda year: f"{year-1}-{year}" if year % 2 == 0 else f"{year}-{year+1}"
)


In [74]:
senate_name_mapping = match_names(senate['full_name'].unique(), expenditure_senate, cutoff=80)
expenditure_senate['matched_target_name'] = expenditure_senate['TargetCandidateName'].map(senate_name_mapping)

In [75]:
expenditure_senate.loc[expenditure_senate['matched_target_name'].notna()].to_csv('calaccess/expend_senate_matched.csv', index=False)