# Fixing Parsed Text

In [None]:
import pandas as pd
import re
from wordsegment import load, segment
from rapidfuzz import fuzz
from collections import defaultdict
from sklearn.cluster import AgglomerativeClustering
import numpy as np
load()

### Senate Committees

In [5]:
# Raw senate rows; the 'committee' column holds the parsed committee names.
senate = pd.read_csv(filepath_or_buffer='senators_committees.csv')

In [13]:
import spacy
# Shared spaCy pipeline used by fuzzy_strings for Doc.similarity; the listed
# components are switched off at load time.
nlp = spacy.load(
    "en_core_web_md",
    disable=["ner", "parser", "lemmatizer", "senter", "tagger"],
)

In [26]:
# Distinct committee names as a plain Python list.
sen_coms = pd.unique(senate['committee']).tolist()

def fuzzy_strings(sen_coms, threshold=15):
    """Group near-duplicate committee names and label each with a cluster id.

    Every name is normalized, then a pairwise distance matrix is built from a
    blend of rapidfuzz string ratios (40%) and spaCy vector dissimilarity
    (60%), both on a 0-100 scale where 0 means "identical". The matrix is fed
    to average-linkage agglomerative clustering cut at ``threshold``.

    Relies on the module-level ``nlp`` pipeline, ``segment`` (wordsegment),
    ``fuzz`` (rapidfuzz) and ``AgglomerativeClustering``.

    Args:
        sen_coms: Sequence of committee name strings (expected to be unique).
        threshold: Maximum average distance (0-100) at which clusters are
            still merged. Defaults to 15.

    Returns:
        dict mapping each original name to its cluster label. Keys are
        ordered cluster by cluster, and in input order within a cluster.
        An empty input gives ``{}``; a single name gets label ``0``.
    """
    n = len(sen_coms)
    if n == 0:
        return {}
    if n == 1:
        # AgglomerativeClustering requires at least two samples.
        return {sen_coms[0]: 0}

    def preprocess_name(name):
        """Lowercase, drop parentheticals/non-letters, strip committee prefixes."""
        name = name.lower()
        name = re.sub(r'\(.*\)', '', name)
        name = re.sub(r'[^A-Za-z\s]', ' ', name)
        name = re.sub(r'\s+', ' ', name).strip()
        # NOTE(review): the alternation binds as `select` | `joint` |
        # `standing (...)*`, so the "committee on" suffix is only consumed
        # after "standing" -- confirm this is the intended behavior.
        name = re.sub(r'select|joint|standing (?:subcommittee on|committee on)*', '', name).strip()
        return name

    def clean_nlp(text):
        """Re-segment the normalized name into words and run it through spaCy."""
        text = preprocess_name(text)
        # Drop all whitespace so wordsegment decides the word boundaries.
        text = re.sub(r'\s+', '', text).strip()
        text = segment(text)
        text = ' '.join(text)
        return nlp(text)

    clean_sen = [preprocess_name(c) for c in sen_coms]
    nlp_sen = [clean_nlp(c) for c in clean_sen]

    def fuzzy_distance(a, b):
        """Return a 0-100 string distance that ignores whitespace differences."""
        n_a = re.sub(r'\s+', '', a)
        n_b = re.sub(r'\s+', '', b)
        if n_a == n_b:
            return 0
        ratio1 = fuzz.token_sort_ratio(n_a, n_b)
        ratio2 = fuzz.partial_ratio(n_a, n_b)
        ratio3 = fuzz.token_set_ratio(n_a, n_b)
        weighted_ratio = (ratio1 * 0.4) + (ratio2 * 0.2) + (ratio3 * 0.4)
        return 100 - weighted_ratio

    # The diagonal stays 0: every name is at distance 0 from itself.
    distance_matrix = np.zeros((n, n))
    for i in range(n):
        for j in range(i + 1, n):
            string_dist = fuzzy_distance(clean_sen[i], clean_sen[j])
            if string_dist == 0:
                # Identical after normalization; no need to consult vectors.
                combined = 0.0
            else:
                # Doc.similarity is high for similar texts, so it has to be
                # inverted before it can be mixed into a distance. Clamp so the
                # vector term stays on the same 0-100 scale as the string term.
                similarity = nlp_sen[i].similarity(nlp_sen[j])
                vector_dist = min(max(1.0 - similarity, 0.0), 1.0) * 100
                combined = (string_dist * 0.4) + (vector_dist * 0.6)
            distance_matrix[i, j] = combined
            distance_matrix[j, i] = combined

    clustering_model = AgglomerativeClustering(
        metric='precomputed',
        linkage='average',
        distance_threshold=threshold,
        n_clusters=None,
    )
    labels = clustering_model.fit_predict(distance_matrix)

    # Group names by label first so the returned dict keeps the
    # cluster-by-cluster key order callers have been receiving.
    clusters = defaultdict(list)
    for label, name in zip(labels, sen_coms):
        clusters[label].append(name)
    cluster_mapping = {variant: label for label, variants in clusters.items() for variant in variants}

    return cluster_mapping

In [27]:
# One row per senate committee name with the cluster label it was assigned.
cluster_frame_sen = pd.DataFrame.from_dict(fuzzy_strings(sen_coms), orient='index')
cluster_frame_sen = cluster_frame_sen.reset_index()
cluster_frame_sen = cluster_frame_sen.rename(columns={'index': 'committee', 0: 'cluster'})

In [28]:
# Take the first name in each cluster as its representative, then re-segment
# it into space-separated words with wordsegment.
cframe = cluster_frame_sen.groupby('cluster').first().reset_index()
cframe['committee'] = [" ".join(segment(name)) for name in cframe['committee']]

In [29]:
# Attach each cluster's representative name (committee_y) to every original
# name (committee_x), then build the original -> representative lookup.
cluster_frame_sen = pd.merge(cluster_frame_sen, cframe, how='left', on='cluster')
sen_full_mappings = dict(zip(cluster_frame_sen['committee_x'], cluster_frame_sen['committee_y']))

In [40]:
# Manual override: force every name containing 'Wine' into its own cluster
# (id 200) with a hand-written canonical name.
wine_rows = cluster_frame_sen['committee_x'].str.contains('Wine', na=False)
cluster_frame_sen.loc[wine_rows, ['cluster', 'committee_y']] = [200, "California's Wine Industry"]
# sen_full_mappings was built before this override, so rebuild it here;
# otherwise the corrected name never reaches senate['committee_clean'].
sen_full_mappings = dict(zip(cluster_frame_sen['committee_x'], cluster_frame_sen['committee_y']))

In [42]:
# Translate each committee to its cluster's representative name; names with
# no mapping keep their original value. Then persist the result.
senate['committee_clean'] = senate['committee'].map(sen_full_mappings)
senate['committee_clean'] = senate['committee_clean'].fillna(senate['committee'])
senate.to_csv(path_or_buf='senate_committees_cleaned.csv', index=False)

### Assembly Committees

In [43]:
# Raw assembly rows; the 'committee' column holds the parsed committee names.
assembly = pd.read_csv(filepath_or_buffer='assembly_committees.csv')

In [44]:
# Distinct assembly committee names, clustered into one row per name with
# the cluster label it was assigned.
assembly_coms = pd.unique(assembly['committee']).tolist()
assembly_frame_com = pd.DataFrame.from_dict(fuzzy_strings(assembly_coms), orient='index')
assembly_frame_com = assembly_frame_com.reset_index()
assembly_frame_com = assembly_frame_com.rename(columns={'index': 'committee', 0: 'cluster'})

In [45]:
# Take the first name in each cluster as its representative, then re-segment
# it into space-separated words with wordsegment.
aframe = assembly_frame_com.groupby('cluster').first().reset_index()
aframe['committee'] = [" ".join(segment(name)) for name in aframe['committee']]

In [46]:
# Attach each cluster's representative name (committee_y) to every original
# name (committee_x) and build the original -> representative lookup.
assembly_frame_com = pd.merge(assembly_frame_com, aframe, how='left', on='cluster')
assembly_full_mappings = dict(zip(assembly_frame_com['committee_x'], assembly_frame_com['committee_y']))
# Apply the lookup; names with no mapping keep their original value.
assembly['committee_clean'] = assembly['committee'].map(assembly_full_mappings)
assembly['committee_clean'] = assembly['committee_clean'].fillna(assembly['committee'])

In [48]:
# Persist the assembly table, including the new committee_clean column.
assembly.to_csv(path_or_buf='assembly_committees_clean.csv', index=False)