# Fixing Parsed Text

In [144]:
import pandas as pd
import re
from wordsegment import load, segment
from rapidfuzz import process, fuzz
from collections import defaultdict
from unidecode import unidecode
from sklearn.cluster import AgglomerativeClustering
import numpy as np
from textblob import TextBlob
# Load wordsegment's word-frequency data once, up front; segment() (used when
# cleaning committee names below) relies on that data being loaded.
load()

### Senate Committees

In [102]:
# Parsed senate committee data; only its 'committee' column is read below.
senate = pd.read_csv('senators_committees.csv')

In [136]:
# Distinct committee names to cluster. Missing values are dropped first:
# the name cleaner only accepts strings, and a NaN here would abort the run.
sen_coms = senate['committee'].dropna().unique().tolist()

# Section headings that the parser fused onto committee names. The optional
# whitespace between letters tolerates names extracted with stray spaces.
# These patterns are case- and punctuation-sensitive, so they must run BEFORE
# the text is lowercased and stripped of punctuation (otherwise they can never
# match).
_HEADER_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'S\s*e\s*l\s*e\s*c\s*t\s*C\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*e\s*s',
        r'S\s*u\s*b\s*c\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*e\s*s',
        r'J\s*o\s*i\s*n\s*t\s*C\s*o\s*m\s*m\s*i\s*t\s*t\s*e\s*e\s*s',
        r'B\s*o\s*a\s*r\s*d\s*,\s*C\s*o\s*m\s*m\s*i\s*s*i\s*o\s*n\s*s*,\s*a\s*n\s*d\s*C\s*o\s*u\s*n\s*c\s*i\s*l\s*s',
    )
)


def _clean_committee_name(text):
    """Normalise one raw committee name into a comparable, re-segmented string."""
    ascii_text = unidecode(text)
    # Strip fused section headings while case and commas are still intact.
    for pattern in _HEADER_PATTERNS:
        ascii_text = pattern.sub('', ascii_text)
    lowered = ascii_text.lower().strip()
    without_punct = re.sub(r'[^\w\s]', '', lowered).strip()
    # Drop every space and let wordsegment re-derive the word boundaries, so
    # names that differ only in where the parser put spaces end up identical.
    return " ".join(segment(without_punct.replace(' ', '')))


def fuzzy_strings(sen_coms, threshold=15):
    """Group near-duplicate committee names and label each with its cluster.

    Every name is cleaned (ASCII-folded, section headings removed, lowercased,
    punctuation dropped, words re-segmented). Pairwise distances are
    ``100 - fuzz.token_sort_ratio`` on the cleaned strings, and the names are
    clustered with average-linkage agglomerative clustering cut at
    ``threshold``.

    Args:
        sen_coms: Iterable of raw committee name strings.
        threshold: Distance (0-100) passed as ``distance_threshold`` to the
            clustering model. Defaults to 15.

    Returns:
        dict mapping each original name to an integer cluster label. Entries
        are ordered cluster by cluster, in order of first appearance. An empty
        input gives ``{}``; a single name gets label ``0``.
    """
    names = list(sen_coms)
    # AgglomerativeClustering rejects fewer than two samples, so the trivial
    # cases are answered directly.
    if not names:
        return {}
    if len(names) == 1:
        return {names[0]: 0}

    cleaned = [_clean_committee_name(name) for name in names]

    n = len(cleaned)
    distance_matrix = np.zeros((n, n))
    # The matrix is symmetric with a zero diagonal: score each unordered pair
    # once and mirror it.
    for i in range(n):
        for j in range(i + 1, n):
            distance = 100 - fuzz.token_sort_ratio(cleaned[i], cleaned[j])
            distance_matrix[i, j] = distance
            distance_matrix[j, i] = distance

    clustering_model = AgglomerativeClustering(
        metric='precomputed',
        linkage='average',
        distance_threshold=threshold,
        n_clusters=None,
    )
    labels = clustering_model.fit_predict(distance_matrix)

    # Group names by label first so the returned mapping lists each cluster's
    # members together, in their input order.
    clusters = defaultdict(list)
    for label, name in zip(labels, names):
        clusters[int(label)].append(name)
    return {name: label for label, members in clusters.items() for name in members}

In [148]:
# One row per distinct committee name with the cluster label it was assigned.
# The columns are named explicitly so that an empty mapping still produces
# both 'committee' and 'cluster'.
cluster_frame_sen = pd.DataFrame(
    list(fuzzy_strings(sen_coms).items()),
    columns=['committee', 'cluster'],
)

In [149]:
# The first name listed for each cluster becomes that cluster's canonical
# spelling.
cframe = cluster_frame_sen.groupby('cluster', as_index=False).first()
# Joining back on 'cluster' pairs every original name ('committee_x') with its
# cluster's canonical name ('committee_y').
cluster_frame_sen = pd.merge(cluster_frame_sen, cframe, how='left', on='cluster')
sen_full_mappings = dict(
    zip(cluster_frame_sen['committee_x'], cluster_frame_sen['committee_y'])
)

In [153]:
# Replace each committee name with its canonical spelling; any name that has
# no entry in the mapping falls back to the original value.
senate['committee_clean'] = (
    senate['committee'].map(sen_full_mappings).combine_first(senate['committee'])
)
senate.to_csv('senate_committees_cleaned.csv', index=False)

### Assembly Committees

In [83]:
# Parsed assembly committee data; only its 'committee' column is read below.
assembly = pd.read_csv('assembly_committees.csv')

In [154]:
# Distinct committee names to cluster. Missing values are dropped first:
# the name cleaner only accepts strings, and a NaN here would abort the run.
assembly_coms = assembly['committee'].dropna().unique().tolist()
# One row per distinct committee name with the cluster label it was assigned.
# The columns are named explicitly so that an empty mapping still produces
# both 'committee' and 'cluster'.
assembly_frame_com = pd.DataFrame(
    list(fuzzy_strings(assembly_coms).items()),
    columns=['committee', 'cluster'],
)

In [155]:
# The first name listed for each cluster becomes that cluster's canonical
# spelling.
aframe = assembly_frame_com.groupby('cluster', as_index=False).first()
# Joining back on 'cluster' pairs every original name ('committee_x') with its
# cluster's canonical name ('committee_y').
assembly_frame_com = pd.merge(assembly_frame_com, aframe, how='left', on='cluster')
assembly_full_mappings = dict(
    zip(assembly_frame_com['committee_x'], assembly_frame_com['committee_y'])
)
# Apply the canonical names; any name that has no entry in the mapping falls
# back to the original value.
assembly['committee_clean'] = (
    assembly['committee'].map(assembly_full_mappings).combine_first(assembly['committee'])
)

In [157]:
# Persist the assembly data, including the new 'committee_clean' column,
# without the index.
# NOTE(review): the senate output is named '..._cleaned.csv' while this one is
# '..._clean.csv' — confirm the difference is intended before relying on it.
assembly.to_csv('assembly_committees_clean.csv', index=False)